## Real Time Testing

In [10]:
from collections import deque
from concurrent.futures import ThreadPoolExecutor
from time import time
from cvzone import FPS

import cv2
import numpy as np
from itertools import chain
import traceback
from time import time
import os

from cvzone.HandTrackingModule import HandDetector
from cvzone.FaceDetectionModule import FaceDetector
from cvzone.PoseModule import PoseDetector

In [11]:
## 2. Feature Extraction (Hand+Face+Pose Detection)
def flatten2dList(arr, dataType=int):
    """Flatten one level of nesting in `arr` into a 1-D NumPy array.

    Args:
        arr: Iterable of iterables of scalars (rows may differ in length).
        dataType: dtype of the returned array (defaults to int).

    Returns:
        1-D np.ndarray holding every inner element, in row order.
    """
    flat_values = (value for row in arr for value in row)
    return np.fromiter(flat_values, dtype=dataType)

def getAbsLargestVal(arr):
    """Return the largest magnitude (absolute value) found in `arr`."""
    magnitudes = np.absolute(arr)
    return magnitudes.max()

def preprocess_landmarks(landmark_list):
    """Re-centre landmarks on the first point and scale them by the peak offset.

    Args:
        landmark_list: Sequence of equally sized coordinate rows; the first
            row is used as the origin.

    Returns:
        1-D float np.ndarray of the offsets from the first point, divided by
        their largest absolute value (left unscaled when that value is 0).
    """
    points = np.array(landmark_list, dtype=float)

    # Express every point relative to the first one, then flatten to 1-D.
    offsets = (points - points[0]).ravel()

    # Skip the division when all offsets are zero to avoid dividing by zero.
    peak = getAbsLargestVal(offsets)
    return offsets if peak == 0 else offsets / peak

def preprocess_bbox(bbox, frameSize):
    """Convert an (x, y, width, height) box into frame-relative corner coordinates.

    Args:
        bbox: Four numbers: x, y, width, height.
        frameSize: Two numbers; index 0 divides the x values and index 1
            divides the y values.

    Returns:
        1-D float np.ndarray [x, y, x + width, y + height], each divided by
        the matching frame dimension. The input is not modified.
    """
    corners = np.array(bbox, dtype=float)
    frame_w, frame_h = frameSize[0], frameSize[1]

    # Replace width/height with the opposite corner of the box.
    corners[2] += corners[0]
    corners[3] += corners[1]

    # x values scale with the frame width, y values with the frame height.
    for idx, extent in enumerate((frame_w, frame_h, frame_w, frame_h)):
        corners[idx] /= extent
    return corners

def preprocess_center(center, frameSize):
    """Scale a centre point by the frame dimensions.

    Args:
        center: Two numbers (x, y).
        frameSize: Two numbers; index 0 divides x and index 1 divides y.

    Returns:
        1-D float np.ndarray [x / frameSize[0], y / frameSize[1]].
        The input is not modified.
    """
    point = np.array(center, dtype=float)
    for axis in (0, 1):
        point[axis] /= frameSize[axis]
    return point

def preprocess_body_part(bodyPart, frameSize):
    """Normalise a detected body part's landmarks, bounding box and centre in place.

    Args:
        bodyPart: Dict with 'lmList', 'bbox' and 'center' entries; each entry
            is overwritten with its preprocessed NumPy array.
        frameSize: Frame dimensions passed on to the bbox and centre scaling.

    Returns:
        The same `bodyPart` dict, after modification.
    """
    bodyPart['lmList'] = preprocess_landmarks(bodyPart['lmList'])
    # The bbox and the centre are both scaled against the frame dimensions.
    for key, scale_to_frame in (('bbox', preprocess_bbox), ('center', preprocess_center)):
        bodyPart[key] = scale_to_frame(bodyPart[key], frameSize)
    return bodyPart

def generate_empty_hand(type):
    """Build all-zero placeholder data for a hand that was not detected.

    Args:
        type: Label stored under the 'type' key (callers pass 'Left' or 'Right').

    Returns:
        Dict with keys 'lmList' (63 int zeros, i.e. 21 * 3), 'bbox' (4 float
        zeros), 'center' (2 float zeros) and 'type'.
    """
    placeholder = dict(
        lmList=np.zeros(21 * 3, dtype=int),
        bbox=np.zeros(4, dtype=float),
        center=np.zeros(2, dtype=float),
    )
    placeholder['type'] = type
    return placeholder

def select_best_matching_face(faces, frameSize):
    """Pick the single face to use, since the network accepts only one face.

    The face whose centre is nearest to the frame centre wins, provided its
    score is above 0.5; otherwise the highest-scoring face is returned.

    Args:
        faces: Sequence of face dicts. Each needs a 'center' entry (x, y) and
            a 'score' sequence whose first element is the score. May be
            empty or None.
        frameSize: (frame width, frame height).

    Returns:
        The chosen face dict, or False when `faces` is empty or None.
    """
    if not faces:
        return False
    if len(faces) == 1:
        return faces[0]

    frame_center = (frameSize[0] / 2, frameSize[1] / 2)

    def squared_distance_to_center(face):
        # Squared distance is enough for ranking, so the sqrt is skipped.
        center = face['center']
        return (center[0] - frame_center[0]) ** 2 + (center[1] - frame_center[1]) ** 2

    # min()/max() compare against the running best, so the closest face is
    # really the closest one (the old loop compared every face against the
    # first face only). On ties the earliest face is kept.
    most_central = min(faces, key=squared_distance_to_center)
    highest_scoring = max(faces, key=lambda face: face['score'][0])

    if most_central['score'][0] > 0.5:
        return most_central
    return highest_scoring

def flattenDetectionResult(obj):
    """Join a detection's 'lmList', 'bbox' and 'center' arrays into one array.

    Args:
        obj: Dict holding array-likes under 'lmList', 'bbox' and 'center'.

    Returns:
        np.ndarray with the three entries concatenated in that order.
    """
    parts = [obj[key] for key in ('lmList', 'bbox', 'center')]
    return np.concatenate(parts)

In [12]:
# Improved/parallelised version
def featureExtractionV3(handDetector, faceDetector, poseDetector, frame, draw=True):
    """Detect hands, pose and face in `frame` and flatten them into one feature vector.

    The three detectors are submitted to a thread pool and all receive the
    same `frame` object.

    Args:
        handDetector: Object whose findHands(frame, draw=...) returns
            (hands, frame) when draw is truthy and only hands otherwise.
        faceDetector: Object whose findFaces(frame, draw=...) returns
            (frame, faces).
        poseDetector: Object providing findPose(frame, draw=...) and
            findPosition(frame, bboxWithHands=False).
        frame: Image array; frame.shape[1] is used as the width and
            frame.shape[0] as the height.
        draw: Forwarded to every detector's `draw` argument.

    Returns:
        (features, frame). `features` is a 1-D float array laid out as
        [left hand, right hand, pose, face bbox, face centre,
        face centre - left hand centre, face centre - right hand centre].
        `frame` is the object that was passed in (presumably annotated in
        place by the detectors when `draw` is set - confirm against the
        detector library).
    """
    # Only the first 23 of the 33 pose landmarks are kept; the remaining ones
    # represent the lower half of the body and are irrelevant to sign
    # language interpretation.
    poseLandmarkCount = 23

    # Hand detection: always returns [left slot, right slot].
    def detectHands(handDetector, frame, frameSize, draw):
        if draw:
            hands, frame = handDetector.findHands(frame, draw=draw)
        else:
            hands = handDetector.findHands(frame, draw=draw)

        if not hands:
            return [generate_empty_hand('Left'), generate_empty_hand('Right')]

        hands = [preprocess_body_part(hand, frameSize) for hand in hands]
        if len(hands) == 1:
            # Fill the missing slot with a placeholder, keeping Left first.
            if hands[0]['type'] == 'Left':
                hands.append(generate_empty_hand('Right'))
            else:
                hands.insert(0, generate_empty_hand('Left'))
        else:
            # Keep the slot order consistent with the one-hand case: the
            # detector may report the right hand first. The sort is stable,
            # so two hands with the same label keep their original order.
            hands.sort(key=lambda hand: hand['type'] != 'Left')
        return hands

    # Pose detection
    def detectPose(poseDetector, frame, draw):
        frame = poseDetector.findPose(frame, draw=draw)
        landmarks, _ = poseDetector.findPosition(frame, bboxWithHands=False)
        if landmarks:
            return preprocess_landmarks(landmarks[:poseLandmarkCount])
        # NOTE(review): this placeholder has 23 values, while a detected pose
        # yields 23 * (values per landmark). If landmarks carry more than one
        # value each, the feature vector changes length when no pose is
        # found - confirm the size the model expects.
        return np.zeros(poseLandmarkCount, dtype=int)

    # Face detection
    def detectFace(faceDetector, frame, frameSize, draw):
        frame, faces = faceDetector.findFaces(frame, draw=draw)
        if not faces:
            return {
                'bbox': np.zeros(4, dtype=float),
                'center': np.zeros(2, dtype=float)
            }
        face = select_best_matching_face(faces, frameSize)
        face['bbox'] = preprocess_bbox(face['bbox'], frameSize)
        face['center'] = preprocess_center(face['center'], frameSize)
        return face

    frameSize = (frame.shape[1], frame.shape[0])
    with ThreadPoolExecutor() as executor:
        handsTask = executor.submit(detectHands, handDetector, frame, frameSize, draw)
        poseTask = executor.submit(detectPose, poseDetector, frame, draw)
        faceTask = executor.submit(detectFace, faceDetector, frame, frameSize, draw)

        # result() waits for the task and re-raises any exception it raised.
        hands = handsTask.result()
        pose = poseTask.result()
        face = faceTask.result()

    leftHand, rightHand = hands[0], hands[1]

    # Convert results into a 1D-array
    detectionResults = flatten2dList([
        flattenDetectionResult(leftHand),
        flattenDetectionResult(rightHand),
        pose,
        face['bbox'],
        face['center'],
        face['center'] - leftHand['center'],
        face['center'] - rightHand['center']
    ], dataType=float)

    return detectionResults, frame

In [13]:
import tensorflow as tf
import keras

# Load the model from the H5 file.
# The path is relative, so it resolves against the current working directory.
model = tf.keras.models.load_model('../static-recognition/models/static_model.h5')

from static_files_io import readActionLabels

# Label list; the real-time loop indexes it with the argmax of the model output.
static_labels = readActionLabels()
# Bare expression so the notebook displays the labels as the cell output.
static_labels

['A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'X',
 'Y',
 'Z',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 '10',
 'airplane']

In [14]:
# Real-time recognition loop: grab webcam frames, extract features, classify
# them and overlay the predicted label until 'Esc' is pressed.

# cv2.CAP_DSHOW selects the DirectShow capture backend.
cam = cv2.VideoCapture(0, cv2.CAP_DSHOW)
cam.set(cv2.CAP_PROP_FRAME_WIDTH, 1280)
cam.set(cv2.CAP_PROP_FRAME_HEIGHT, 720)

# Detectors
handDetector = HandDetector(detectionCon=0.5, maxHands=2)
faceDetector = FaceDetector(minDetectionCon=0.5)
poseDetector = PoseDetector(detectionCon=0.5)

fpsReader = FPS()

# Seconds spent on each processed frame (capture through display)
timeStats = []

# Not referenced anywhere else in this cell; kept in case other cells use them.
keypointsHistory = deque()
predictionHistory = deque()
detectionThreshold = 1.0

lastPredictionTime = time()
predictionCooldown = 1

try:
    while True:
        startTime = time()

        # Read from camera
        success, frame = cam.read()
        if not success:
            print("No Frames Read")
            break
        # Flip around the vertical axis (mirror view)
        frame = cv2.flip(frame, 1)

        # Hand + pose + face feature extraction
        detectionResults, frame = featureExtractionV3(
            handDetector, faceDetector, poseDetector, frame)

        # Add a leading batch axis: (n_features,) -> (1, n_features)
        detectionResults = np.expand_dims(detectionResults, axis=0)

        # use_multiprocessing/workers were dropped: they only apply to
        # generator / Sequence input and newer Keras releases reject them.
        predictionResults = model.predict(
            x=detectionResults,
            verbose=0
        )[0]

        bestIndex = np.argmax(predictionResults)
        predCharacter = static_labels[bestIndex]
        predAccuracy = predictionResults[bestIndex]
        print(predictionResults)

        # The label is already a string; joining it would insert ', '
        # between its characters (e.g. 'a, i, r, ...').
        cv2.putText(frame, predCharacter, (15, 70),
                    cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 3)

        fps, frame = fpsReader.update(frame, pos=(
            950, 80), color=(0, 255, 0), scale=5, thickness=5)

        # Show resulting frame
        cv2.imshow("Sign Language Recognition Prototype", frame)

        timeStats.append(time() - startTime)

        keyPressed = cv2.waitKey(1)
        # Stop Program when pressed 'Esc'
        if keyPressed == 27:
            print("Finished")
            break
except Exception:
    # Top-level boundary: report the full traceback instead of only the
    # message, but do not let the error escape the cell.
    traceback.print_exc()
finally:
    # Runs on normal exit, on errors and on KeyboardInterrupt, so the camera
    # and the preview window are always released.
    cam.release()
    cv2.destroyAllWindows()


[1.49873362e-04 8.58889073e-02 1.93236500e-03 7.89822298e-06
 1.37935684e-04 2.22837850e-01 4.36499342e-02 5.18707782e-02
 2.47855589e-01 1.28796382e-04 7.06685823e-05 4.82412637e-04
 1.42565684e-03 1.31758535e-02 5.08558842e-05 2.00384911e-02
 7.78185553e-04 7.75650392e-07 1.49790617e-03 2.20970213e-02
 2.10568251e-05 1.41264359e-02 4.36666800e-04 1.04353130e-04
 4.62472206e-03 2.22815648e-01 9.18076374e-04 4.31670062e-03
 2.45841639e-03 5.74950827e-03 2.42023627e-04 3.69405886e-03
 1.11872526e-02 2.49286206e-03 2.52805068e-03 9.32435039e-03
 8.82195018e-04]
[1.8832854e-03 6.8680099e-03 5.1663519e-04 1.5812760e-05 1.1437189e-05
 1.9837317e-01 5.4817256e-02 1.1002652e-02 3.7368584e-01 1.2277541e-04
 5.9945625e-04 6.5079826e-04 3.4358837e-03 3.0266792e-03 3.0850299e-04
 3.2575015e-02 8.8063025e-05 2.9251291e-07 2.2116473e-03 6.2278241e-02
 2.1139420e-05 1.3818842e-02 5.2421805e-05 1.1007827e-03 2.2623966e-02
 8.6944818e-02 1.1180557e-03 7.9583703e-03 7.9872757e-03 3.0664774e-03
 1.42799

In [15]:
timeStats[10:]

[0.1284773349761963,
 0.12379097938537598,
 0.11055850982666016,
 0.14420270919799805,
 0.11686515808105469,
 0.0971975326538086,
 0.11056661605834961,
 0.11314582824707031,
 0.11255717277526855,
 0.10884308815002441,
 0.1055138111114502,
 0.10604381561279297,
 0.14863133430480957,
 0.17254352569580078,
 0.1064903736114502,
 0.12843847274780273,
 0.10104227066040039,
 0.12800979614257812,
 0.10358858108520508,
 0.1358189582824707,
 0.09929800033569336,
 0.13472604751586914,
 0.10358524322509766,
 0.12552404403686523,
 0.10854601860046387,
 0.13220691680908203,
 0.09871673583984375,
 0.14682435989379883,
 0.10359001159667969,
 0.12778043746948242,
 0.10230088233947754,
 0.12905049324035645,
 0.09916877746582031,
 0.12583541870117188,
 0.09760069847106934,
 0.12525177001953125,
 0.09857344627380371,
 0.11234784126281738,
 0.11196064949035645,
 0.12619519233703613,
 0.09420609474182129,
 0.12706923484802246,
 0.09586405754089355,
 0.1250460147857666,
 0.09631109237670898,
 0.1262655258178