## 1. Import Libraries / Dependencies

In [1]:
import cv2
import numpy as np
from itertools import chain
import traceback
from time import time

In [2]:
from cvzone.HandTrackingModule import HandDetector
from cvzone.FaceDetectionModule import FaceDetector
from cvzone.PoseModule import PoseDetector
from cvzone import FPS

In [3]:
# Real Time Testing 
from collections import deque
from concurrent.futures import ThreadPoolExecutor

## 2. Feature Extraction (Hand+Face+Pose Detection)

In [4]:
# Flatten a sequence of sequences into one 1D numpy array
def flatten2dList(arr, dataType=int):
    """Concatenate the items of every row of `arr` into a single 1D array.

    Args:
        arr: iterable whose elements are themselves iterable (lists, arrays, ...).
        dataType: dtype of the resulting array (defaults to int).

    Returns:
        1D numpy array holding all items in row order.
    """
    every_item = (item for row in arr for item in row)
    return np.fromiter(every_item, dtype=dataType)

In [5]:
# Largest magnitude (absolute value) found in an array
def getAbsLargestVal(arr):
    """Return the maximum of the element-wise absolute values of `arr`."""
    magnitudes = np.absolute(arr)
    return magnitudes.max()

In [6]:
# Make landmarks relative to the first one and scale them by the largest magnitude
# Returns a 1d numpy array
def preprocess_landmarks(landmark_list):
    """Offset and normalize a list of landmark points.

    Every point is expressed relative to the first point, the result is
    flattened to 1D and then divided by its largest absolute value.
    If all offsets are zero the (all-zero) offsets are returned unscaled.

    Args:
        landmark_list: sequence of equally sized points (converted to a float array).

    Returns:
        1D numpy float array of the shifted, scaled coordinates.
    """
    points = np.array(landmark_list, dtype=float)

    # Shift so that the first point becomes the origin, then flatten to 1D
    shifted = np.subtract(points, points[0]).reshape(-1)

    # Scale factor: the highest absolute value among the shifted coordinates
    scale = getAbsLargestVal(shifted)

    # Nothing to scale when every coordinate is zero (avoids dividing by zero)
    if scale == 0:
        return shifted
    return shifted / scale

In [7]:
# Convert a BBOX (Bounding Box, used in face and hand detection) to normalized corner coordinates
# Returns a 1d numpy array
def preprocess_bbox(bbox, frameSize):
    """Turn an (x, y, width, height) box into frame-relative corner coordinates.

    Args:
        bbox: sequence whose first four items are x, y, width, height.
        frameSize: (frame width, frame height) used as the divisors.

    Returns:
        New float array (x1, y1, x2, y2) with x values divided by
        frameSize[0] and y values divided by frameSize[1]. The input is not modified.
    """
    box = np.array(bbox, dtype=float)

    # width/height -> coordinates of the opposite corner
    box[[2, 3]] += box[[0, 1]]

    # x values against the frame width, y values against the frame height
    box[[0, 2]] /= frameSize[0]
    box[[1, 3]] /= frameSize[1]

    return box

In [8]:
# Scale a center vertex (x, y) by the frame size
# Returns a 1d numpy array
def preprocess_center(center, frameSize):
    """Divide a center point's x by frameSize[0] and its y by frameSize[1].

    Args:
        center: sequence whose first two items are x and y.
        frameSize: (frame width, frame height).

    Returns:
        New float array with the scaled point. The input is not modified.
    """
    point = np.array(center, dtype=float)
    frame_width, frame_height = frameSize[0], frameSize[1]
    point[0] = point[0] / frame_width
    point[1] = point[1] / frame_height
    return point

In [9]:
# Normalize a detected body part: its landmark list, bounding box and center
def preprocess_body_part(bodyPart, frameSize):
    """Replace the 'lmList', 'bbox' and 'center' entries of `bodyPart` with
    their preprocessed versions.

    The dict is updated in place and also returned.

    Args:
        bodyPart: dict with 'lmList', 'bbox' and 'center' entries.
        frameSize: (frame width, frame height) used for bbox/center scaling.

    Returns:
        The same `bodyPart` dict.
    """
    bodyPart['lmList'] = preprocess_landmarks(bodyPart['lmList'])

    # bbox and center are both scaled against the frame size
    for key, normalise in (('bbox', preprocess_bbox), ('center', preprocess_center)):
        bodyPart[key] = normalise(bodyPart[key], frameSize)

    return bodyPart

In [10]:
# Build all-zero placeholder data for a hand
# Used when a hand is not detected in frame
def generate_empty_hand(type):
    """Return a zero-filled stand-in with the same keys as a detected hand.

    Args:
        type: value stored under the 'type' key (the callers in this
            notebook pass 'Left' or 'Right').

    Returns:
        dict with 'lmList' (21 * 3 int zeros), 'bbox' (4 float zeros),
        'center' (2 float zeros) and 'type'.
    """
    landmark_count, values_per_landmark = 21, 3

    placeholder = {}
    placeholder['lmList'] = np.zeros(landmark_count * values_per_landmark, dtype=int)
    placeholder['bbox'] = np.zeros(4, dtype=float)
    placeholder['center'] = np.zeros(2, dtype=float)
    placeholder['type'] = type
    return placeholder

In [11]:
# Select the best matching face: the one closest to the center of the frame,
# provided its score (clarity) is good enough, otherwise the best-scoring one.
# Needed because the neural network is designed to accept only one face.
def select_best_matching_face(faces, frameSize):
    """Pick the single face to use out of all detected faces.

    Args:
        faces: list of face dicts, each with a 'center' (x, y) and a
            'score' sequence whose first item is the detection score.
        frameSize: (frame width, frame height).

    Returns:
        False when `faces` is empty or None; the only face when there is
        exactly one; otherwise the face closest to the frame center if its
        score is above 0.5, else the face with the highest score.
        Ties are resolved in favour of the earliest face in the list.
    """
    if not faces:
        return False
    if len(faces) == 1:
        return faces[0]

    frame_cx, frame_cy = frameSize[0] / 2, frameSize[1] / 2

    def squared_distance_to_frame_center(face):
        # Squared distance is enough for comparison; no need for the square root
        return (face['center'][0] - frame_cx) ** 2 + (face['center'][1] - frame_cy) ** 2

    # Take the true minimum over all faces. Comparing every face against the
    # first face's distance only would pick the *last* face that beats the
    # first one rather than the closest one.
    most_central = min(faces, key=squared_distance_to_frame_center)
    if most_central['score'][0] > 0.5:
        return most_central

    # The most central face is too unclear: fall back to the clearest face
    return max(faces, key=lambda face: face['score'][0])

In [12]:
# Join a detection result's landmarks, bbox and center into one 1D array
def flattenDetectionResult(obj):
    """Concatenate obj['lmList'], obj['bbox'] and obj['center'], in that order.

    Args:
        obj: dict holding array-likes under 'lmList', 'bbox' and 'center'.

    Returns:
        1D numpy array with the three parts back to back.
    """
    parts = [obj[key] for key in ('lmList', 'bbox', 'center')]
    return np.concatenate(parts)

## 3. Preparation for Data Collection

In [13]:
# Paths
from constants import TRAININGS_PER_LABEL, FRAMES_PER_TRAINING, KEYPOINTS_PER_FRAME

In [14]:
from files_io import readActionLabels, initActionLabelFolders

# Load the label table (label index as a string -> action name; see the cell output below)
action_labels = readActionLabels()
# NOTE(review): presumably prepares one data folder per label -- confirm in files_io
initActionLabelFolders(action_labels)
# Show the loaded labels as the cell output
action_labels

{'0': 'hello',
 '1': 'good/thank you',
 '2': 'help',
 '3': 'I/me',
 '4': 'please',
 '5': 'sorry',
 '6': 'welcome',
 '7': 'welcome',
 '8': 'ok',
 '9': 'what',
 '10': 'what',
 '11': 'can',
 '12': 'thank you very much',
 '13': 'deaf',
 '14': 'do not',
 '15': 'feel',
 '16': 'eat/food',
 '17': 'eat a lot',
 '18': 'tired',
 '19': 'because',
 '20': 'sick',
 '21': 'drink',
 '22': 'drink',
 '23': 'apple',
 '24': 'banana',
 '25': 'drive',
 '26': 'again',
 '27': 'also',
 '28': 'ask',
 '29': 'yes',
 '30': 'no',
 '31': 'man',
 '32': 'man',
 '33': 'woman',
 '34': 'woman',
 '35': 'he/she',
 '36': 'bad',
 '37': 'have/has/had',
 '38': 'have/has/had',
 '39': 'when',
 '40': 'where',
 '41': 'which',
 '42': 'who',
 '43': 'why',
 '44': 'how',
 '45': 'you',
 '46': 'boy',
 '47': 'girl',
 '48': 'friend',
 '49': 'finish/complete',
 '50': 'find',
 '51': 'other',
 '52': 'forget',
 '53': 'give',
 '54': 'give you',
 '55': 'give me',
 '56': 'go',
 '57': 'get',
 '58': 'understand/comprehend',
 '59': 'use',
 '60': 'wi

## 4. Data Collection

### 4.1 Feature Extraction Functions

In [15]:
# Initialize cam as global object
# readFrame() reads from this module-level handle; the recording cell
# assigns the actual cv2.VideoCapture to it before any frame is read.
cam = None

In [17]:
# Detects hands, face & pose,
# converts them into normalized landmark/keypoint coordinates in a 1D-array,
# and also returns the frame that was handed to the detectors

# Improved/Parallelised version
def featureExtraction(handDetector, faceDetector, poseDetector, frame, draw=True):
    """Run hand, pose and face detection on one frame and build the feature vector.

    The three detections are submitted to a ThreadPoolExecutor and their
    results are flattened into one 1D float array laid out as:

        left-hand slot   : 63 landmark values + 4 bbox + 2 center  (69)
        right-hand slot  : 63 landmark values + 4 bbox + 2 center  (69)
        pose             : 23 landmarks x 4 values                 (92)
        face             : 4 bbox + 2 center                       (6)
        face - hand      : face center minus each hand center      (4)
                                                            total = 240

    Anything that is not detected is represented by zeros of the same
    length, so the vector always has the same size.

    Args:
        handDetector: object exposing findHands(frame, draw=..., flipType=...).
        faceDetector: object exposing findFaces(frame, draw=...).
        poseDetector: object exposing findPose(...) and findPosition(...).
        frame: image array; frame.shape[1] / frame.shape[0] are used as
            width / height.
        draw: forwarded to the detectors' `draw` argument.

    Returns:
        (detectionResults, frame): the 1D feature array and the `frame`
        object that was passed in.
        NOTE(review): annotations show up on the returned frame only if the
        detectors draw in place -- confirm against the cvzone version in use.
    """
    # Only the first 23 of the 33 pose landmarks are kept: the remaining ones
    # cover the lower half of the body and are irrelevant to sign language
    # interpretation. Each kept landmark contributes 4 values.
    POSE_LANDMARK_COUNT = 23
    POSE_VALUES_PER_LANDMARK = 4

    # Hand Detection
    def detectHands(handDetector, frame, frameSize, draw):
        # findHands returns (hands, frame) when drawing and just hands otherwise
        if draw:
            results, frame = handDetector.findHands(frame, draw=draw, flipType=False)
        else:
            results = handDetector.findHands(frame, draw=draw, flipType=False)

        if not results:
            return [generate_empty_hand('Left'), generate_empty_hand('Right')]

        # Only two hand slots exist in the feature vector
        results = [preprocess_body_part(hand, frameSize) for hand in results[:2]]

        if len(results) == 1:
            # Fill the missing side with a placeholder, keeping [Left, Right] order
            if results[0]['type'] == 'Left':
                results.append(generate_empty_hand('Right'))
            else:
                results.insert(0, generate_empty_hand('Left'))
        else:
            # Keep the same [Left, Right] slot order as the one-hand case.
            # The sort is stable, so two hands of the same type keep their order.
            results.sort(key=lambda hand: hand['type'] != 'Left')
        return results

    # Pose Detection
    def detectPose(poseDetector, frame, draw):
        frame = poseDetector.findPose(frame, draw=draw)
        results, _ = poseDetector.findPosition(frame, bboxWithHands=False)
        if results:
            return preprocess_landmarks(results[:POSE_LANDMARK_COUNT])
        # The placeholder must be as long as a real pose result (23 x 4 = 92);
        # a shorter one changes the length of the feature vector, which then
        # no longer fits the fixed-size training buffer.
        return np.zeros(POSE_LANDMARK_COUNT * POSE_VALUES_PER_LANDMARK, dtype=float)

    # Face Detection
    def detectFace(faceDetector, frame, frameSize, draw):
        frame, results = faceDetector.findFaces(frame, draw=draw)
        if not results:
            return {
                'bbox': np.zeros(4, dtype=float),
                'center': np.zeros(2, dtype=float)
            }
        face = select_best_matching_face(results, frameSize)
        face['bbox'] = preprocess_bbox(face['bbox'], frameSize)
        face['center'] = preprocess_center(face['center'], frameSize)
        return face

    frameSize = (frame.shape[1], frame.shape[0])
    with ThreadPoolExecutor() as executor:
        handsTask = executor.submit(detectHands, handDetector, frame, frameSize, draw)
        poseTask = executor.submit(detectPose, poseDetector, frame, draw)
        faceTask = executor.submit(detectFace, faceDetector, frame, frameSize, draw)

        # .result() blocks until the task is done and re-raises its exception, if any
        hands = handsTask.result()
        pose = poseTask.result()
        face = faceTask.result()

        # Convert results into 1D-array
        detectionResults = flatten2dList([
            flattenDetectionResult(hands[0]),
            flattenDetectionResult(hands[1]),
            pose,
            face['bbox'],
            face['center'],
            face['center'] - hands[0]['center'],
            face['center'] - hands[1]['center']
        ], dataType=float)

        return detectionResults, frame

#### **Details of Keypoints collected**

detectHands => {
    lmList: array(21, 3), 
    bbox: array(4,), 
    center: array(2,)
}
(Total 69 for each hand)

detectPose => [
    array(23, 4)
]
(Total 92)

detectFace => {
    bbox: array(4,),
    center: array(2,)
}
(Total 6)

Extra: Offset (x/y difference) between the face center and each hand center => (
    face.center - hands[0].center = array(2,),
    face.center - hands[1].center = array(2,)
)
(Total 4)

#### **Final Total Number of Keypoints:**
69 + 69 + 92 + 6 + 4 = 240 keypoints


### 4.2  UI Functions

In [18]:
# Grab a single frame from the global camera
def readFrame():
    """Read one frame from `cam` and return it flipped around the vertical axis.

    Raises:
        Exception: with the message "No Frames Read" when the camera
            reports that no frame could be read.
    """
    grabbed, image = cam.read()
    if grabbed:
        # flipCode 1 = horizontal flip
        return cv2.flip(image, 1)
    raise Exception("No Frames Read")

In [19]:
# Keep the recording paused until "Space" is pressed, then count down and resume
def pauseWhenSpace(trainingNum, actionStr):
    """Show a "Pausing..." screen until Space is pressed, then a 3-step resume countdown.

    Keys handled on both the pause screen and the countdown:
        Esc   -> raise Exception("Finished")
        z / x -> decrease / increase the training number by one
        Space -> start the countdown (pause screen) or cancel it (countdown)
    Pressing Space, z or x during the countdown returns to the pause screen.

    Args:
        trainingNum: zero-based training index (displayed as trainingNum + 1).
        actionStr: action name shown in the overlay text.

    Returns:
        The (possibly adjusted) training number once a countdown has run
        to the end without interruption.

    Raises:
        Exception: "Finished" when Esc is pressed.
    """
    window_title = "Sign Language Recognition Prototype"
    font = cv2.FONT_HERSHEY_SIMPLEX
    key_esc, key_space = 27, 32
    index_shift = {122: -1, 120: 1}    # z -> previous training, x -> next training

    while True:
        paused_frame = readFrame()
        cv2.putText(paused_frame, f'Training #{trainingNum + 1} for \'{actionStr}\'', (15, 30), font, 1, (0, 0, 255), 3)
        cv2.putText(paused_frame, f'Pausing...', (40, 100), font, 1, (20, 255, 125), 3)
        cv2.imshow(window_title, paused_frame)

        key = cv2.waitKey(10)
        if key == key_esc:
            raise Exception("Finished")
        if key in index_shift:
            trainingNum += index_shift[key]
            continue
        if key != key_space:
            # Nothing relevant pressed: stay on the pause screen
            continue

        # Space pressed: 3 countdown steps of 10 frames each (100 ms wait per frame)
        for tick in range(3 * 10):
            i = tick // 10
            countdown_frame = readFrame()
            cv2.putText(countdown_frame, f'Training #{trainingNum + 1} for \'{actionStr}\'', (15, 30), font, 1, (0, 0, 255), 3)
            cv2.putText(countdown_frame, f'Resuming in {3 - i}', (20, 70), font, 1, (0, 255, 120), 3)
            cv2.imshow(window_title, countdown_frame)

            key = cv2.waitKey(100)
            if key == key_esc:
                raise Exception("Finished")
            if key == key_space or key in index_shift:
                # Interrupted: apply the z/x shift (if any) and go back to pausing
                trainingNum += index_shift.get(key, 0)
                break
        else:
            # Countdown completed without interruption: resume recording
            return trainingNum

In [24]:
# Display the "Next Training in ..." countdown before a training starts
def countdownFromThree(trainingNum, actionStr):
    """Show a short countdown overlay and handle key presses while it runs.

    The overlay runs for two steps of six frames each (100 ms key wait per
    frame), so despite the name only "3" and "2" are displayed.

    Keys:
        Esc   -> raise Exception("Finished")
        Space -> hand over to pauseWhenSpace() and return its result
        z / x -> decrease / increase the training number by one

    Args:
        trainingNum: zero-based training index (displayed as trainingNum + 1).
        actionStr: action name shown in the overlay text.

    Returns:
        The (possibly adjusted) training number.

    Raises:
        Exception: "Finished" when Esc is pressed.
    """
    window_title = "Sign Language Recognition Prototype"
    font = cv2.FONT_HERSHEY_SIMPLEX
    index_shift = {122: -1, 120: 1}    # z -> previous training, x -> next training

    for tick in range(2 * 6):
        i = tick // 6
        frame = readFrame()

        cv2.putText(frame, f'Training #{trainingNum + 1} for \'{actionStr}\'', (15, 30), font, 1, (0, 0, 255), 3)
        cv2.putText(frame, f'Next Training in {3 - i}', (20, 70), font, 1, (0, 255, 0), 3)
        cv2.imshow(window_title, frame)

        key = cv2.waitKey(100)
        if key == 27:       # Pressed Esc
            raise Exception("Finished")
        if key == 32:       # Pressed Space
            return pauseWhenSpace(trainingNum, actionStr)
        # z / x move the training index; any other key leaves it untouched
        trainingNum += index_shift.get(key, 0)

    return trainingNum


### 4.3 Recording Label (Create Training Data)

In [25]:
from files_io import saveKeypoints

In [26]:
# Specify which action to record
# `selected_i` is the label index; action_labels is keyed by that index as a string
selected_i = 121
action = action_labels[f"{selected_i}"]
# Show the selected action name as the cell output
action

'NONE'

In [27]:
# Detectors
# Created once here and passed to featureExtraction() for every recorded frame.
# NOTE(review): 0.5 is presumably the minimum detection confidence and
# maxHands=2 the maximum number of hands reported -- confirm against the cvzone docs.
handDetector = HandDetector(detectionCon=0.5, maxHands=2)
faceDetector = FaceDetector(minDetectionCon=0.5)
poseDetector = PoseDetector(detectionCon=0.5)

In [28]:

# Open camera 0 with the DirectShow backend and request a 1280x720 frame size
cam = cv2.VideoCapture(0, cv2.CAP_DSHOW)
cam.set(cv2.CAP_PROP_FRAME_WIDTH, 1280)
cam.set(cv2.CAP_PROP_FRAME_HEIGHT, 720)

try:
    # One 240-value keypoint vector per frame, per training
    # NOTE(review): 240 presumably equals KEYPOINTS_PER_FRAME from constants -- confirm
    trainingResults = np.zeros((TRAININGS_PER_LABEL, FRAMES_PER_TRAINING, 240))
    
    training_num = 0
    while training_num < TRAININGS_PER_LABEL:
        
        frame_num = 0
        while frame_num < FRAMES_PER_TRAINING:

            # Countdown before the first frame of every training
            if frame_num == 0:
                training_num = countdownFromThree(training_num, action)
                # 'z' / 'x' during the countdown can move the index outside the
                # buffer. A negative index would silently write to a slot counted
                # from the end, and an index past the end would raise IndexError
                # and abort the whole session, so keep it inside the valid range.
                training_num = min(max(training_num, 0), TRAININGS_PER_LABEL - 1)
        
            # Read from camera
            frame = readFrame()

            detectionResults, frame = featureExtraction(
                handDetector, faceDetector, poseDetector, frame)
            
            # Show resulting frame
            cv2.putText(frame, f'Training #{training_num + 1} for \'{action}\'', (15, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 3)
            cv2.imshow("Sign Language Recognition Prototype", frame)     

            # Save the results
            trainingResults[training_num][frame_num] = detectionResults

            keyPressed = cv2.waitKey(10)
            # Stop Program when pressed 'Esc'
            if (keyPressed == 27):
                raise Exception("Finished")
            
            frame_num += 1
        
        training_num += 1

        # After the last training, run one more countdown: pressing 'z' there
        # lowers training_num again, so the loop continues and the last
        # training(s) can be re-recorded before the session ends.
        if training_num >= TRAININGS_PER_LABEL:
            training_num = countdownFromThree(training_num, action)

    # After all frames are finished for each training:
    # save as .npy
    
    # IMPORTANT: 
    # Enable it ONLY during data collection or it may OVERWRITE EXISTING DATA
    # saveKeypoints(f"{selected_i},{action}", "0-99", trainingResults)

except Exception as e:
    # Also reached on a normal exit: the UI helpers raise Exception("Finished") on Esc
    print(e)
    traceback.print_exc()

finally:
    # Always free the camera and close the preview window
    cam.release()
    cv2.destroyAllWindows()