## 1. Import Libraries / Dependencies

In [2]:
import cv2
import numpy as np
from itertools import chain
import traceback

In [3]:
from cvzone.HandTrackingModule import HandDetector
from cvzone.FaceDetectionModule import FaceDetector
from cvzone.PoseModule import PoseDetector

## 2. Feature Extraction (Hand+Face+Pose Detection)

In [4]:
def flatten2dList(arr, dataType=int):
    """Flatten a 2-D sequence (an iterable of iterables) into a 1-D numpy array.

    Args:
        arr: Iterable whose items are themselves iterable (the "rows").
        dataType: Element type of the resulting array (defaults to ``int``).

    Returns:
        A 1-D ``np.ndarray`` containing every element of every row, in order.
    """
    # Walk the rows lazily so no intermediate Python list is built
    flattened = (value for row in arr for value in row)
    return np.fromiter(flattened, dataType)

In [5]:
def getAbsLargestVal(arr):
    """Return the largest magnitude (absolute value) found anywhere in ``arr``."""
    magnitudes = np.absolute(arr)
    return magnitudes.max()

In [6]:
def preprocess_landmarks(landmark_list):
    """Offset landmarks relative to the first one and scale them into [-1, 1].

    Every point is translated so that the first landmark becomes the origin,
    the result is flattened to 1-D, and all values are divided by the largest
    absolute value so the output lies in the range [-1, 1].

    Args:
        landmark_list: Sequence (list or ndarray) of landmark coordinate rows.

    Returns:
        A 1-D float ``np.ndarray``. Empty input yields an empty array. If all
        landmarks coincide with the first one, an all-zero array is returned
        (instead of NaNs from a division by zero).
    """
    # `not ndarray` is ambiguous for arrays with more than one element, so
    # ndarray input is checked by size; anything else keeps the falsy check.
    if isinstance(landmark_list, np.ndarray):
        is_empty = landmark_list.size == 0
    else:
        is_empty = not landmark_list
    if is_empty:
        return np.zeros(0, dtype=float)

    landmarks = np.asarray(landmark_list, dtype=float)

    # Offset every point with respect to the first point, then flatten to 1-D
    offsets = (landmarks - landmarks[0]).ravel()

    # Largest magnitude is the normalisation factor
    largest_value = np.max(np.abs(offsets))
    if largest_value == 0:
        # All points equal the origin: nothing to scale, avoid 0/0 -> NaN
        return offsets

    return offsets / largest_value

In [7]:
def preprocess_bbox(bbox, frameSize):
    """Convert an ``(x, y, width, height)`` bounding box to frame-relative corners.

    Args:
        bbox: Four numbers ``(x, y, width, height)`` (used for face and hand
            detection results). The input itself is not modified.
        frameSize: ``(frame_width, frame_height)`` used as divisors.

    Returns:
        A 1-D float ``np.ndarray`` ``[x1, y1, x2, y2]`` where ``(x2, y2)`` is
        ``(x + width, y + height)``, x-values divided by the frame width and
        y-values divided by the frame height.
    """
    corners = np.array(bbox, dtype=float)

    # Opposite corner replaces the width/height entries
    left, top = corners[0], corners[1]
    right, bottom = left + corners[2], top + corners[3]

    # Normalise against the frame size (x by width, y by height)
    corners[0] = left / frameSize[0]
    corners[1] = top / frameSize[1]
    corners[2] = right / frameSize[0]
    corners[3] = bottom / frameSize[1]
    return corners

In [8]:
def preprocess_center(center, frameSize):
    """Scale a 2-element center point by the frame size.

    Args:
        center: ``(x, y)`` point. The input itself is not modified.
        frameSize: ``(frame_width, frame_height)`` used as divisors.

    Returns:
        A 1-D float ``np.ndarray`` ``[x / frame_width, y / frame_height]``.
    """
    frame_w, frame_h = frameSize[0], frameSize[1]
    point = np.array(center, dtype=float)
    point[0], point[1] = point[0] / frame_w, point[1] / frame_h
    return point

In [9]:
def preprocess_body_part(bodyPart, frameSize):
    """Normalise a detection dict's landmarks, bounding box and center in place.

    Args:
        bodyPart: Dict with the keys ``'lmList'``, ``'bbox'`` and ``'center'``.
            The dict is modified: each of those keys is overwritten with its
            preprocessed value.
        frameSize: ``(frame_width, frame_height)`` passed to the bbox/center
            normalisers.

    Returns:
        The same ``bodyPart`` dict that was passed in.
    """
    bodyPart['lmList'] = preprocess_landmarks(bodyPart['lmList'])

    # bbox and center share the (value, frameSize) normaliser signature
    frame_scaled = (('bbox', preprocess_bbox), ('center', preprocess_center))
    for key, normalise in frame_scaled:
        bodyPart[key] = normalise(bodyPart[key], frameSize)

    return bodyPart

In [10]:
def generate_empty_hand(type):
    """Build an all-zero placeholder for a hand that was not detected in the frame.

    Args:
        type: Label stored under the ``'type'`` key (the callers in this
            notebook pass ``'Left'`` or ``'Right'``).

    Returns:
        Dict with ``'lmList'`` (63 zeros, i.e. 21 * 3, integer dtype),
        ``'bbox'`` (4 float zeros), ``'center'`` (2 float zeros) and ``'type'``.
    """
    placeholder = dict(
        lmList=np.zeros(21 * 3, dtype=int),
        bbox=np.zeros(4, dtype=float),
        center=np.zeros(2, dtype=float),
    )
    placeholder['type'] = type
    return placeholder

In [11]:
def select_best_matching_face(faces, frameSize):
    """Pick the single face to use, since the neural network accepts only one.

    The face whose center is closest to the center of the frame is preferred,
    provided its first score entry is above 0.5; otherwise the face with the
    highest first score entry is returned.

    Args:
        faces: List of face dicts, each with a ``'center'`` (x, y) and a
            ``'score'`` sequence whose first element is compared.
        frameSize: ``(frame_width, frame_height)``.

    Returns:
        One of the dicts from ``faces``, or ``False`` when ``faces`` is empty.
    """
    if not faces:
        return False
    if len(faces) == 1:
        return faces[0]

    center_x, center_y = frameSize[0] / 2, frameSize[1] / 2

    def squared_distance_to_center(face):
        # Squared distance is enough for ranking; no sqrt needed
        x, y = face['center'][0], face['center'][1]
        return (x - center_x) ** 2 + (y - center_y) ** 2

    # min()/max() keep the first candidate on ties. Unlike a manual loop that
    # compares only against the first face's distance, this yields the face
    # that is actually closest to the frame center.
    most_central = min(faces, key=squared_distance_to_center)
    if most_central['score'][0] > 0.5:
        return most_central

    return max(faces, key=lambda face: face['score'][0])

In [12]:
def flattenDetectionResult(obj):
    """Concatenate a detection's landmarks, bbox and center into one float array.

    Args:
        obj: Dict with ``'lmList'``, ``'bbox'`` and ``'center'``; each value
            must be a flat iterable of numbers. Other keys are ignored.

    Returns:
        A 1-D float ``np.ndarray``: landmarks, then bbox, then center.
    """
    ordered_parts = chain(obj['lmList'], obj['bbox'], obj['center'])
    return np.fromiter(ordered_parts, float)

## 3. Preparation for Data Collection

In [13]:
from constants import TRAININGS_PER_LABEL, FRAMES_PER_TRAINING, KEYPOINTS_PER_FRAME

In [14]:
from files_io import readActionLabels, initActionLabelFolders

# Load the action (sign) labels and prepare their folders before recording.
# NOTE(review): both helpers come from files_io, which is not visible here;
# presumably initActionLabelFolders creates one folder per label - confirm.
action_labels = readActionLabels()
initActionLabelFolders(action_labels)
# Display the loaded labels as the cell output
action_labels

['hello', 'thank you', 'help']

## 4. Data Collection

### 4.1 Feature Extraction Functions

In [15]:
from time import time

In [16]:
# Shared camera handle: starts unset, is bound to a cv2.VideoCapture by the
# recording/testing cells below, and is read as a global by readFrame().
cam = None

In [17]:
# Serial/Unparallelised version (Old version)
def featureExtraction(handDetector, faceDetector, poseDetector, frame):
    """Detect hands, pose and face in ``frame`` and flatten them to one feature vector.

    The returned vector is laid out as: left-hand slot, right-hand slot, pose
    (each as landmarks + bbox + center), face bbox, face center, followed by
    three 2-D offsets of the face center from hand slot 0, hand slot 1 and the
    pose center. Anything that is not detected is filled with zeros.

    Args:
        handDetector: Object whose ``findHands(frame, draw=True)`` returns
            ``(hands, frame)``; each hand is a dict with ``'lmList'``,
            ``'bbox'``, ``'center'`` and ``'type'``.
        faceDetector: Object whose ``findFaces(frame, draw=True)`` returns
            ``(frame, faces)``.
        poseDetector: Object providing ``findPose`` and ``findPosition``.
        frame: Image array; ``frame.shape`` is read as (height, width, ...).

    Returns:
        Tuple ``(detectionResults, frame, tempResults)``:
        the 1-D float feature vector, the frame returned by the detectors
        (called with ``draw=True``), and a dict of the un-normalised detections
        (keys ``'hands'``, and ``'pose'`` / ``'face'`` when detected).
    """
    results = {}
    tempResults = {}
    frameSize = (frame.shape[1], frame.shape[0])  # (width, height)

    # ---- Hand Detection ----
    hands, frame = handDetector.findHands(frame, draw=True)
    # preprocess_body_part overwrites keys of the dict it receives, so the raw
    # snapshot must copy each hand dict (a copy of the list alone would alias
    # the very dicts that get normalised below).
    tempResults['hands'] = [dict(hand) for hand in hands]
    hands = [preprocess_body_part(hand, frameSize) for hand in hands[:2]]

    if not hands:
        hands = [generate_empty_hand('Left'), generate_empty_hand('Right')]
    elif len(hands) == 1:
        # Slot 0 is the left hand, slot 1 the right; pad the missing side
        if hands[0]['type'] == 'Left':
            hands.append(generate_empty_hand('Right'))
        else:
            hands.insert(0, generate_empty_hand('Left'))
    else:
        # Keep the same Left-first slot order when both hands are present;
        # the sort is stable, so two hands of the same type keep their order.
        hands.sort(key=lambda hand: hand['type'] != 'Left')
    results['hands'] = hands

    # ---- Pose Detection ----
    # Only the first 23 of the 33 pose landmarks are kept: the remaining ones
    # cover the lower body and are irrelevant to sign language interpretation.
    frame = poseDetector.findPose(frame, draw=True)
    poseLandmarks, poseBox = poseDetector.findPosition(frame, bboxWithHands=False)
    if poseLandmarks and poseBox:
        results['pose'] = {
            'lmList': poseLandmarks[:23],
            'bbox': poseBox['bbox'],
            'center': poseBox['center'],
        }
        # Dict copy keeps the raw values; preprocessing rebinds keys on the original
        tempResults['pose'] = results['pose'].copy()
        results['pose'] = preprocess_body_part(results['pose'], frameSize)
    else:
        results['pose'] = {
            'lmList': np.zeros(23 * 3, dtype=int),
            'bbox': np.zeros(4, dtype=float),
            'center': np.zeros(2, dtype=float),
        }

    # ---- Face Detection ----
    frame, faces = faceDetector.findFaces(frame, draw=True)
    if faces:
        face = select_best_matching_face(faces, frameSize)
        tempResults['face'] = face.copy()
        face['bbox'] = preprocess_bbox(face['bbox'], frameSize)
        face['center'] = preprocess_center(face['center'], frameSize)
        results['face'] = face
    else:
        results['face'] = {
            'bbox': np.zeros(4, dtype=float),
            'center': np.zeros(2, dtype=float)
        }

    # ---- Relative offsets between body parts (in normalised coordinates) ----
    faceCenter = results['face']['center']
    results['relative'] = {
        'faceHand0': faceCenter - results['hands'][0]['center'],
        'faceHand1': faceCenter - results['hands'][1]['center'],
        'facePose': faceCenter - results['pose']['center'],
    }

    # ---- Convert results into a 1D-array ----
    detectionResults = flatten2dList([
        flattenDetectionResult(results['hands'][0]),
        flattenDetectionResult(results['hands'][1]),
        flattenDetectionResult(results['pose']),
        results['face']['bbox'],
        results['face']['center'],
        results['relative']['faceHand0'],
        results['relative']['faceHand1'],
        results['relative']['facePose']
    ], dataType=float)

    return detectionResults, frame, tempResults

### 4.2 UI Functions

In [18]:
def readFrame():
    """Grab one frame from the global ``cam`` and return it mirrored horizontally.

    Raises:
        Exception: ``"No Frames Read"`` when the camera reports a failed read.
    """
    grabbed, image = cam.read()
    if grabbed:
        # flipCode 1 mirrors around the vertical axis (selfie view)
        return cv2.flip(image, 1)
    raise Exception("No Frames Read")

In [19]:
def pauseWhenSpace(trainingNum, actionStr):
    """Show a "Pausing..." screen until Space is pressed, then count down and return.

    While paused, the live camera image is shown at roughly 10 fps (one key
    poll every 100 ms). Pressing Space starts a 3-second "Resuming in N"
    countdown; pressing Space again during that countdown cancels it and goes
    back to the paused screen. The function returns only when a countdown
    completes.

    Args:
        trainingNum: Zero-based training index (displayed as ``trainingNum + 1``).
        actionStr: Label of the action being recorded, shown in the banner.

    Raises:
        Exception: ``"Finished"`` when Esc is pressed at any point.
    """
    windowTitle = "Sign Language Recognition Prototype"
    font = cv2.FONT_HERSHEY_SIMPLEX
    banner = f"Training #{trainingNum + 1} for '{actionStr}'"

    while True:
        pausedFrame = readFrame()
        cv2.putText(pausedFrame, banner, (15, 30), font, 1, (0, 0, 255), 3)
        cv2.putText(pausedFrame, 'Pausing...', (40, 100), font, 1, (20, 255, 125), 3)
        cv2.imshow(windowTitle, pausedFrame)

        key = cv2.waitKey(100)   # Poll the keyboard every 100ms, i.e. ~10fps
        if key == 27:            # 27 == Esc
            raise Exception("Finished")
        if key != 32:            # 32 == Space; anything else keeps pausing
            continue

        # Resume countdown: 30 ticks of 100ms = 3 seconds, 10 ticks per number
        for tick in range(30):
            countdownFrame = readFrame()
            cv2.putText(countdownFrame, banner, (15, 30), font, 1, (0, 0, 255), 3)
            cv2.putText(countdownFrame, f'Resuming in {3 - tick // 10}', (20, 70), font, 1, (0, 255, 120), 3)
            cv2.imshow(windowTitle, countdownFrame)

            key = cv2.waitKey(100)
            if key == 27:
                raise Exception("Finished")
            if key == 32:
                # Space pressed again: abort the countdown and keep pausing
                break
        else:
            # Countdown ran to completion without interruption
            return

In [20]:
# Module-level flag; it is not referenced by any code visible in this notebook
pausing = False
def countdownFromThree(trainingNum, actionStr):
    """Show a 3-second "Next Training in N" countdown (3, 2, 1) before a training.

    The live camera image keeps updating at roughly 10 fps during the
    countdown. Pressing Space hands over to ``pauseWhenSpace`` and returns as
    soon as that function returns.

    Args:
        trainingNum: Zero-based training index (displayed as ``trainingNum + 1``).
        actionStr: Label of the action being recorded, shown in the banner.

    Raises:
        Exception: ``"Finished"`` when Esc is pressed.
    """
    windowTitle = "Sign Language Recognition Prototype"
    font = cv2.FONT_HERSHEY_SIMPLEX

    # 30 ticks of 100ms each = 3 seconds; the displayed number drops every 10 ticks
    for tick in range(30):
        secondsLeft = 3 - tick // 10
        preview = readFrame()

        cv2.putText(preview, f"Training #{trainingNum + 1} for '{actionStr}'", (15, 30), font, 1, (0, 0, 255), 3)
        cv2.putText(preview, f'Next Training in {secondsLeft}', (20, 70), font, 1, (0, 255, 0), 3)
        cv2.imshow(windowTitle, preview)

        key = cv2.waitKey(100)
        if key == 27:       # Pressed Esc
            raise Exception("Finished")
        if key == 32:       # Pressed Space
            pauseWhenSpace(trainingNum, actionStr)
            return


### 4.3 Recording Label (Create Training Data)

In [21]:
# Specify which action to record.
# Index 2 picks the third entry of action_labels ('help' in the recorded output).
action = action_labels[2]
# Display the chosen label as the cell output
action

'help'

In [24]:

# Open the default webcam (DirectShow backend) and request a 1280x720 stream
cam = cv2.VideoCapture(0, cv2.CAP_DSHOW)
cam.set(cv2.CAP_PROP_FRAME_WIDTH, 1280)
cam.set(cv2.CAP_PROP_FRAME_HEIGHT, 720)

# Detectors
handDetector = HandDetector(detectionCon=0.5, maxHands=2)
faceDetector = FaceDetector(minDetectionCon=0.5)
poseDetector = PoseDetector(detectionCon=0.5)

try:
    startTime = time()

    # One feature vector per frame, per training sample of the chosen action
    trainingResults = np.zeros((TRAININGS_PER_LABEL, FRAMES_PER_TRAINING, KEYPOINTS_PER_FRAME))

    for training_num in range(TRAININGS_PER_LABEL):
        for frame_num in range(FRAMES_PER_TRAINING):

            # Countdown before the first frame of every training
            if frame_num == 0:
                countdownFromThree(training_num, action)
                startTime = time()

            # Read from camera
            frame = readFrame()

            detectionResults, frame, ogResults = featureExtraction(
                handDetector, faceDetector, poseDetector, frame)

            # Show resulting frame
            cv2.putText(frame, f'Training #{training_num + 1} for \'{action}\'', (15, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 3)
            cv2.imshow("Sign Language Recognition Prototype", frame)

            # Save the results
            trainingResults[training_num][frame_num] = detectionResults

            keyPressed = cv2.waitKey(1)
            # Stop Program when pressed 'Esc'
            if (keyPressed == 27):
                raise Exception("Finished")

    # After all frames are finished for each training:
    # save as .npy

    # IMPORTANT: THIS LINE IS DISABLED IN CASE OF ACCIDENTALLY OVERWRITING DATA
    # Enable it ONLY during data collection
    # NOTE: `os` and KEYPOINTS_PATH are not imported in the visible notebook and
    # must be in scope before enabling it.
    # np.save(os.path.join(KEYPOINTS_PATH, action), trainingResults)

except Exception as e:
    # Esc ("Finished") and real errors both end up here; report them
    print(e)
    traceback.print_exc()
finally:
    # Always free the camera and windows, including on KeyboardInterrupt
    # (which is not an Exception subclass and would otherwise skip cleanup)
    cam.release()
    cv2.destroyAllWindows()

Finished


Traceback (most recent call last):
  File "C:\Users\USER\AppData\Local\Temp\ipykernel_2896\4034106895.py", line 20, in <module>
    countdownFromThree(training_num, action)
  File "C:\Users\USER\AppData\Local\Temp\ipykernel_2896\3147981487.py", line 17, in countdownFromThree
    raise Exception("Finished")
Exception: Finished


In [27]:
# Plain camera preview (no detection): shows mirrored frames until Esc is pressed
cam = cv2.VideoCapture(0, cv2.CAP_DSHOW)
cam.set(cv2.CAP_PROP_FRAME_WIDTH, 1280)
cam.set(cv2.CAP_PROP_FRAME_HEIGHT, 720)
try:
    while True:
        frame = readFrame()
        cv2.imshow("Sign Language Recognition Prototype", frame)

        keyPressed = cv2.waitKey(1)
        # Stop the preview when pressed 'Esc'
        if (keyPressed == 27):
            break
finally:
    # Free the camera and window even if the kernel is interrupted or
    # readFrame() fails, so later cells can reopen the device
    cam.release()
    cv2.destroyAllWindows()

KeyboardInterrupt: 

In [28]:

# Release the webcam handle and close all OpenCV windows
# (the preview loop in the cell above has no cleanup of its own)
cam.release()
cv2.destroyAllWindows()

In [67]:
# Debug: print the shape of each element of detectionResults.
# NOTE(review): the recorded output lists per-part shapes, which suggests it was
# produced when detectionResults was still a list of arrays rather than the flat
# vector featureExtraction returns now - confirm before relying on it.
print(list(i.shape for i in detectionResults))

[(69,), (69,), (75,), (4,), (2,), (2,), (2,), (2,)]


In [1]:
# Debug: shape of the most recent feature vector.
# NOTE(review): the recorded output is a NameError from execution count In [1],
# i.e. this cell was run on a fresh kernel before detectionResults existed.
detectionResults.shape

NameError: name 'detectionResults' is not defined

In [75]:
# Debug: first entry of the 'hands' list in ogResults (the third value returned
# by featureExtraction in the recording cell)
ogResults['hands'][0]

{'lmList': [[896, 328, 0],
  [878, 304, -12],
  [869, 270, -22],
  [862, 245, -32],
  [853, 224, -42],
  [908, 260, -30],
  [924, 223, -45],
  [932, 199, -55],
  [942, 178, -61],
  [927, 272, -33],
  [950, 255, -52],
  [970, 244, -61],
  [986, 234, -66],
  [940, 289, -37],
  [932, 296, -54],
  [909, 310, -51],
  [891, 319, -46],
  [947, 307, -41],
  [939, 313, -54],
  [920, 323, -50],
  [905, 330, -45]],
 'bbox': (853, 178, 133, 152),
 'center': (919, 254),
 'type': 'Left'}

In [None]:
# Release the webcam handle and close all OpenCV windows
cam.release()
cv2.destroyAllWindows()

In [None]:
# Shape of the saved keypoint file '0.npy' for the current action.
# NOTE(review): neither `os` nor KEYPOINTS_PATH is imported anywhere in the
# visible notebook, so this cell fails on a fresh kernel unless they are
# brought into scope elsewhere - confirm.
np.load(os.path.join(KEYPOINTS_PATH, action, '0.npy')).shape

(15, 219)

## 5. Real Time Testing

In [None]:
from collections import deque
from concurrent.futures import ThreadPoolExecutor
from time import time
from cvzone import FPS

In [None]:
# Open the default webcam (DirectShow backend) and request a 1280x720 stream
cam = cv2.VideoCapture(0, cv2.CAP_DSHOW)
cam.set(cv2.CAP_PROP_FRAME_WIDTH, 1280)
cam.set(cv2.CAP_PROP_FRAME_HEIGHT, 720)

# Detectors
handDetector = HandDetector(detectionCon=0.5, maxHands=2)
faceDetector = FaceDetector(minDetectionCon=0.5)
poseDetector = PoseDetector(detectionCon=0.5)

fpsReader = FPS()

# Wall-clock seconds spent per loop iteration (read + detection + display)
timeStats = []

try:

    keypointsHistory = deque()
    predictionHistory = deque()
    detectionThreshold = 1.0

    lastPredictionTime = time()
    predictionCooldown = 1

    while True:
        startTime = time()

        # Read from camera (mirrored, raises "No Frames Read" on failure)
        frame = readFrame()

        # Pose Detection
        # featureExtraction returns three values; the raw detections are not
        # needed for real-time testing.
        detectionResults, frame, rawResults = featureExtraction(
            handDetector, faceDetector, poseDetector, frame)

        # Semantic Prediction
        # Keep a sliding window of the most recent FRAMES_PER_TRAINING vectors
        keypointsHistory.append(detectionResults)
        if len(keypointsHistory) > FRAMES_PER_TRAINING:
            keypointsHistory.popleft()


            # if time() > lastPredictionTime + predictionCooldown:
            #     predictionResults = model.predict(
            #         np.expand_dims(keypointsHistory, axis=0), 
            #         verbose=0, 
            #         use_multiprocessing=True, 
            #         workers=4
            #         )[0]
            #     predWord = action_labels[np.argmax(predictionResults)]
            #     predAccuracy = predictionResults[np.argmax(predictionResults)]

            #     if predAccuracy >= detectionThreshold:
            #         lastPredictionTime = time()

            #         predictionHistory.append(predWord)
            #         if len(predictionHistory) > 5:
            #             predictionHistory.popleft()

        cv2.putText(frame, ', '.join(predictionHistory), (15, 70), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 3)

        fps, frame = fpsReader.update(frame,pos=(950,80),color=(0,255,0),scale=5,thickness=5)


        # Show resulting frame
        # cv2.putText(frame, f'Training #{training + 1} for \'{action}\'', (15, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 3)
        cv2.imshow("Sign Language Recognition Prototype", frame)

        timeStats.append(time() - startTime)

        keyPressed = cv2.waitKey(15)
        # Stop Program when pressed 'Esc'
        if (keyPressed == 27):
            raise Exception("Finished")


except Exception as e:
    # Esc ("Finished") and real errors both end up here; report the message
    print(e)
finally:
    # Always free the camera and windows, including on KeyboardInterrupt
    cam.release()
    cv2.destroyAllWindows()

name 'featureExtraction' is not defined


In [None]:
# Per-iteration loop durations in seconds (time() differences), skipping the first 10 samples
timeStats[10:]

[0.07875680923461914,
 0.07371330261230469,
 0.0756688117980957,
 0.07382822036743164,
 0.07182097434997559,
 0.14654779434204102,
 0.13663458824157715,
 0.13762927055358887,
 0.13364148139953613,
 0.173537015914917,
 0.16253042221069336,
 0.16656756401062012,
 0.1545863151550293,
 0.1565384864807129,
 0.16048026084899902,
 0.1515054702758789,
 0.1635282039642334,
 0.17353558540344238,
 0.15356874465942383,
 0.15557503700256348,
 0.15255475044250488,
 0.1715106964111328,
 0.15854263305664062,
 0.15158939361572266,
 0.1595776081085205,
 0.16945433616638184]

In [None]:
from concurrent.futures import ThreadPoolExecutor


def featureExtractionV2(handDetector, faceDetector, poseDetector, frame):
    """Threaded variant: run hand, pose and face detection concurrently on one frame.

    Detects hands, face & pose and converts them into normalized
    landmark/keypoint coordinates in a 1D-array. Unlike ``featureExtraction``
    this version adds no face-relative offsets and returns two values.

    Args:
        handDetector: Object whose ``findHands(frame, draw=True)`` returns
            ``(hands, frame)``.
        faceDetector: Object whose ``findFaces(frame, draw=True)`` returns
            ``(frame, faces)``.
        poseDetector: Object providing ``findPose`` and ``findPosition``.
        frame: Image array; ``frame.shape`` is read as (height, width, ...).

    Returns:
        Tuple ``(detectionResults, frame)``: the concatenated 1-D feature
        vector (left-hand slot, right-hand slot, pose, face bbox, face center)
        and the ``frame`` object that was passed in. The helpers only rebind
        their local ``frame``, so drawings are visible in the returned frame
        only if the detectors draw in place - presumably they do; confirm.

    NOTE(review): all three detectors work on the same frame array at the same
    time with ``draw=True``; confirm this is acceptable for the detectors used.
    """
    frameSize = (frame.shape[1], frame.shape[0])  # (width, height)

    def detectHands(frame, handDetector, frameSize):
        """Return [left slot, right slot], zero-filled for any missing hand."""
        hands, frame = handDetector.findHands(frame, draw=True)
        hands = [preprocess_body_part(hand, frameSize) for hand in hands[:2]]
        if not hands:
            return [generate_empty_hand('Left'), generate_empty_hand('Right')]
        if len(hands) == 1:
            if hands[0]['type'] == 'Left':
                hands.append(generate_empty_hand('Right'))
            else:
                hands.insert(0, generate_empty_hand('Left'))
        else:
            # Keep the Left-first slot order used by the one-hand branch
            # (stable sort: two hands of the same type keep their order)
            hands.sort(key=lambda hand: hand['type'] != 'Left')
        return hands

    def detectPose(frame, poseDetector, frameSize):
        """Return the normalised pose dict, or zeros when no pose is found."""
        # Only the first 23 of the 33 pose landmarks are kept: the remaining
        # ones cover the lower body and are irrelevant here.
        frame = poseDetector.findPose(frame, draw=True)
        poseLandmarks, poseBox = poseDetector.findPosition(frame, bboxWithHands=False)
        if poseLandmarks and poseBox:
            return preprocess_body_part({
                'lmList': poseLandmarks[:23],
                'bbox': poseBox['bbox'],
                'center': poseBox['center'],
            }, frameSize)
        return {
            'lmList': np.zeros(23 * 3, dtype=int),
            'bbox': np.zeros(4, dtype=float),
            'center': np.zeros(2, dtype=float),
        }

    def detectFace(frame, faceDetector, frameSize):
        """Return the normalised bbox/center of the selected face, or zeros."""
        frame, faces = faceDetector.findFaces(frame, draw=True)
        if faces:
            face = select_best_matching_face(faces, frameSize)
            face['bbox'] = preprocess_bbox(face['bbox'], frameSize)
            face['center'] = preprocess_center(face['center'], frameSize)
            return face
        return {
            'bbox': np.zeros(4, dtype=float),
            'center': np.zeros(2, dtype=float)
        }

    with ThreadPoolExecutor() as executor:
        # All three tasks must be submitted: their results are consumed below
        handsTask = executor.submit(detectHands, frame, handDetector, frameSize)
        poseTask = executor.submit(detectPose, frame, poseDetector, frameSize)
        faceTask = executor.submit(detectFace, frame, faceDetector, frameSize)

        # .result() blocks until the task finishes and re-raises its exception
        hands = handsTask.result()
        pose = poseTask.result()
        face = faceTask.result()

    # Convert results into 1D-array
    detectionResults = np.concatenate([
        flattenDetectionResult(hands[0]),
        flattenDetectionResult(hands[1]),
        flattenDetectionResult(pose),
        face['bbox'],
        face['center']
    ])

    return detectionResults, frame

In [None]:
# Timing run for featureExtractionV2.
# Relies on handDetector / faceDetector / poseDetector created in an earlier cell.
cam = cv2.VideoCapture(0, cv2.CAP_DSHOW)
cam.set(cv2.CAP_PROP_FRAME_WIDTH, 1280)
cam.set(cv2.CAP_PROP_FRAME_HEIGHT, 720)

fpsReader = FPS()

# Wall-clock seconds spent per loop iteration (read + detection + display)
timeStats = []

try:
    initialTime = time()
    while True:
        startTime = time()

        # Read from camera: readFrame() raises "No Frames Read" on a failed
        # grab and mirrors the image, matching how training data is recorded
        frame = readFrame()

        # Pose Detection
        detectionResults, frame = featureExtractionV2(
            handDetector, faceDetector, poseDetector, frame)

        # fps, frame = fpsReader.update(frame,pos=(50,80),color=(0,255,0),scale=5,thickness=5)


        # Show resulting frame
        # cv2.putText(frame, f'Training #{training + 1} for \'{action}\'', (15, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 3)
        cv2.imshow("Sign Language Recognition Prototype", frame)

        timeStats.append(time() - startTime)

        keyPressed = cv2.waitKey(15)
        # Stop Program when pressed 'Esc'
        if (keyPressed == 27):
            raise Exception("Finished")

        # if time() - initialTime > 10:
        #     raise Exception()


except Exception as e:
    # Esc ("Finished") and real errors both end up here; report the message
    print(e)
finally:
    # Always free the camera and windows, including on KeyboardInterrupt
    cam.release()
    cv2.destroyAllWindows()

Finished


In [None]:
# Mean loop duration in seconds over the recorded timeStats, ignoring the first sample
np.array(timeStats[1:]).mean()

0.05556477281384002

In [None]:
# Report how many GPUs TensorFlow can see (the recorded run found 0)
import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))


Num GPUs Available:  0


In [None]:
# Mean loop duration in seconds, ignoring the first sample.
# NOTE(review): presumably re-run after a different timing configuration; the
# notebook does not record which run produced this value.
np.array(timeStats[1:]).mean()

0.016924326368373075

In [None]:
# Mean loop duration in seconds, ignoring the first sample.
# NOTE(review): presumably re-run after a different timing configuration; the
# notebook does not record which run produced this value.
np.array(timeStats[1:]).mean()

0.09001387300945464

In [None]:
# Mean loop duration in seconds, ignoring the first sample.
# NOTE(review): presumably re-run after a different timing configuration; the
# notebook does not record which run produced this value.
np.array(timeStats[1:]).mean()

0.06750456676926724

In [None]:
# Open the default webcam, request a 1280x720 stream and grab one frame for inspection
cam = cv2.VideoCapture(0, cv2.CAP_DSHOW)
cam.set(cv2.CAP_PROP_FRAME_WIDTH, 1280)
cam.set(cv2.CAP_PROP_FRAME_HEIGHT, 720)

# NOTE(review): `success` is not checked here, so a failed grab goes unnoticed
success, frame = cam.read()

In [None]:
# Check the Python type of the grabbed frame (numpy.ndarray in the recorded output)
type(frame)

numpy.ndarray

In [None]:

# Release the webcam handle and close all OpenCV windows
cam.release()
cv2.destroyAllWindows()