## 1. Import Libraries / Dependencies

In [35]:
import cv2
import numpy as np
from itertools import chain
import traceback
from time import time
import os

In [36]:
from cvzone.HandTrackingModule import HandDetector
from cvzone.FaceDetectionModule import FaceDetector
from cvzone.PoseModule import PoseDetector
from cvzone import FPS

In [37]:
# Real Time Testing 
from collections import deque
from concurrent.futures import ThreadPoolExecutor

## 2. Feature Extraction (Hand+Face+Pose Detection)

In [38]:
# Flatten a 2d np array into 1d array
def flatten2dList(arr, dataType=int):
    return np.fromiter(chain.from_iterable(arr), dataType)

In [39]:
# Get the largest absolute value in an np array
def getAbsLargestVal(arr):
    return np.max(np.abs(arr))

In [40]:
# Offset and normalize the landmark list
# Returns a 1d numpy array
def preprocess_landmarks(landmark_list):    
    landmark_list = np.array(landmark_list, dtype=float)
    origin = landmark_list[0]
    
    # Offset every point with respect to the first point
    # Convert to 1D-array
    new_landmark_list = (landmark_list - origin).ravel()
    
    # Get highest absolute value
    largest_value = getAbsLargestVal(new_landmark_list)
    
    # Normalization
    if largest_value != 0:
        return new_landmark_list / largest_value
    return new_landmark_list

In [41]:
# Offset and normalize a BBOX list (BBOX = Bounding Box, used in face and hand detection)
# Returns a 1d numpy array
def preprocess_bbox(bbox, frameSize):
    bbox = np.array(bbox, dtype=float)
    # Convert 3rd and 4th element into coordinates instead of width/height
    bbox[2] = bbox[0] + bbox[2]
    bbox[3] = bbox[1] + bbox[3]

    # Normalize against frame size
    bbox[0] /= frameSize[0]
    bbox[1] /= frameSize[1]
    bbox[2] /= frameSize[0]
    bbox[3] /= frameSize[1]

    return bbox

In [42]:
# Normalize a center vertex (a list of 2 elements)
# Returns a 1d numpy array
def preprocess_center(center, frameSize):
    center = np.array(center, dtype=float)
    center[0] /= frameSize[0]
    center[1] /= frameSize[1]
    return center

In [43]:
# Preprocess (Offset and normalize) the body's landmark list, bbox and center
def preprocess_body_part(bodyPart, frameSize):
    bodyPart['lmList'] = preprocess_landmarks(bodyPart['lmList'])
    bodyPart['bbox'] = preprocess_bbox(bodyPart['bbox'], frameSize)
    bodyPart['center'] = preprocess_center(bodyPart['center'], frameSize)
    return bodyPart

In [44]:
# Function to generate empty/placeholder data for a hand 
# Used when a hand is not detected in frame
def generate_empty_hand(type):
    return {
        'lmList': np.zeros(21 * 3, dtype=int), 
        'bbox': np.zeros(4, dtype=float), 
        'center': np.zeros(2, dtype=float), 
        'type': type
    }

In [45]:
# Select the best matching face, aka the one with the best score (clarity)
# and closest to the center of the screen
# Since the Neural network will be design to only accept one face
def select_best_matching_face(faces, frameSize):
    if not faces or len(faces) == 0:
        return False
    elif len(faces) == 1:
        return faces[0]
    
    def difference(a, b):
        return ((a[0] - b[0])**2) + ((a[1] - b[1])**2)
    
    frameCenter = (frameSize[0] / 2, frameSize[1] / 2)

    best_score = faces[0]
    best_center = faces[0]
    center_diff = difference(faces[0]['center'], frameCenter)

    for each in faces:
        if difference(each['center'], frameCenter) < center_diff:
            best_center = each
        if each['score'][0] > best_score['score'][0]:
            best_score = each
    
    if best_center['score'][0] > 0.5:
        return best_center
    return best_score

In [46]:
# Flatten everything
def flattenDetectionResult(obj):
    # return np.fromiter(chain.from_iterable([obj['lmList'], obj['bbox'], obj['center']]), float)
    return np.concatenate([obj['lmList'], obj['bbox'], obj['center']])

## 3. Preparation for Data Collection

In [47]:
# Paths
from static_constants import TRAININGS_PER_LABEL, FRAMES_PER_TRAINING, KEYPOINTS_PER_FRAME, KEYPOINTS_PATH

In [48]:
from static_files_io import readActionLabels

static_labels = readActionLabels()
static_labels

['A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'X',
 'Y',
 'Z',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 '10',
 'airplane',
 'ship',
 'car',
 'telephone']

## 4. Data Collection

### 4.1 Feature Extraction Functions

In [49]:
# Initialize cam as global object
cam = None

In [50]:
# Detects hands, face & pose, 
# convert them into normalized landmark/keypoint coordinates in a 1D-array, 
# and also returns the frame with the landmark connections drawn onto it

# Serial/Unparallelised version (Old version)
def featureExtractionV1(handDetector, faceDetector, poseDetector, frame):
    results = {}
    frameSize = (frame.shape[1], frame.shape[0])

    times = [time()]

    # Hand Detection
    results['hands'], frame = handDetector.findHands(frame, draw=True)
    times.append(time())
    if not results['hands']:
        results['hands'] = [generate_empty_hand('Left'), generate_empty_hand('Right')]
    elif len(results['hands']) == 1:
        if (results['hands'][0]['type'] == 'Left'):
            results['hands'][0] = preprocess_body_part(results['hands'][0], frameSize)
            results['hands'].append(generate_empty_hand('Right'))
        else:
            results['hands'][0] = preprocess_body_part(results['hands'][0], frameSize)
            results['hands'].insert(0, generate_empty_hand('Left'))                         
    else:
        results['hands'][0] = preprocess_body_part(results['hands'][0], frameSize)
        results['hands'][1] = preprocess_body_part(results['hands'][1], frameSize)
    times.append(time())

    # Pose Detection
    # **We only use the first 23 out of the total 33 landmark points 
    #   as those represent the lower half body and are irrelevant to sign language interpretation
    frame = poseDetector.findPose(frame, draw=True)
    times.append(time())
    results['pose'] = {}
    results['pose']['lmList'], tempPoseBbox = poseDetector.findPosition(frame, bboxWithHands=False)
    if results['pose']['lmList'] and tempPoseBbox:
        results['pose']['lmList'] = results['pose']['lmList'][:23]
        results['pose']['bbox'] = tempPoseBbox['bbox']
        results['pose']['center'] = tempPoseBbox['center']
        results['pose'] = preprocess_body_part(results['pose'], frameSize)
    else:
        results['pose']['lmList'] = np.zeros(23 * 3, dtype=int)
        results['pose']['bbox'] = np.zeros(4, dtype=float)
        results['pose']['center'] = np.zeros(2, dtype=float)
        
    times.append(time())

    
    # Face Detection
    frame, results['face'] = faceDetector.findFaces(frame, draw=True)
    times.append(time())
    if results['face']:
        results['face'] = select_best_matching_face(results['face'], frameSize)
        results['face']['bbox'] = preprocess_bbox(results['face']['bbox'], frameSize)
        results['face']['center'] = preprocess_center(results['face']['center'], frameSize)
    else:
        results['face'] = {
            'bbox': np.zeros(4, dtype=float), 
            'center': np.zeros(2, dtype=float)
        }
    times.append(time())

    # Calculate relative distance between body parts
    results['relative'] = {}
    results['relative']['faceHand0'] = results['face']['center'] - results['hands'][0]['center']
    results['relative']['faceHand1'] = results['face']['center'] - results['hands'][1]['center']

    # Convert results into 1D-array
    detectionResults = flatten2dList([
        flattenDetectionResult(results['hands'][0]), 
        flattenDetectionResult(results['hands'][1]), 
        flattenDetectionResult(results['pose']), 
        results['face']['bbox'], 
        results['face']['center'],
        results['relative']['faceHand0'],
        results['relative']['faceHand1']
    ], dataType=float)

    return detectionResults, frame, times

In [51]:
# Detects hands, face & pose, 
# convert them into normalized landmark/keypoint coordinates in a 1D-array, 
# and also returns the frame with the landmark connections drawn onto it

# Improved/Parallelised version
def featureExtractionV3(handDetector, faceDetector, poseDetector, frame, draw=True):
    def detectHands(handDetector, frame, frameSize, draw):
        results = None
        # Hand Detection
        if (draw):
            results, frame = handDetector.findHands(frame, draw=draw)
        else:
            results = handDetector.findHands(frame, draw=draw)

        if not results:
            results = [generate_empty_hand('Left'), generate_empty_hand('Right')]
        elif len(results) == 1:
            if (results[0]['type'] == 'Left'):
                results[0] = preprocess_body_part(results[0], frameSize)
                results.append(generate_empty_hand('Right'))
            else:
                results[0] = preprocess_body_part(results[0], frameSize)
                results.insert(0, generate_empty_hand('Left'))                         
        else:
            results[0] = preprocess_body_part(results[0], frameSize)
            results[1] = preprocess_body_part(results[1], frameSize)
        return results

    # Pose Detection
    # **We only use the first 23 out of the total 33 landmark points 
    #   as those represent the lower half body and are irrelevant to sign language interpretation
    def detectPose(poseDetector, frame, draw):
        frame = poseDetector.findPose(frame, draw=draw)
        results, _ = poseDetector.findPosition(frame, bboxWithHands=False)
        if results:
            results = preprocess_landmarks(results[:23])
        else:
            results = np.zeros(23, dtype=int)
        return results
    
    # Face Detection
    def detectFace(faceDetector, frame, frameSize, draw):
        frame, results = faceDetector.findFaces(frame, draw=draw)
        if results:
            results = select_best_matching_face(results, frameSize)
            results['bbox'] = preprocess_bbox(results['bbox'], frameSize)
            results['center'] = preprocess_center(results['center'], frameSize)
        else:
            results = {
                'bbox': np.zeros(4, dtype=float), 
                'center': np.zeros(2, dtype=float)
            }
        return results

    frameSize = (frame.shape[1], frame.shape[0])
    with ThreadPoolExecutor() as executor:
        t1 = executor.submit(detectHands, handDetector, frame, frameSize, draw)
        t2 = executor.submit(detectPose, poseDetector, frame, draw)
        t3 = executor.submit(detectFace, faceDetector, frame, frameSize, draw)
        
        # Convert results into 1D-array
        detectionResults = flatten2dList([
            flattenDetectionResult(t1.result()[0]), 
            flattenDetectionResult(t1.result()[1]), 
            t2.result(), 
            t3.result()['bbox'],
            t3.result()['center'],
            t3.result()['center'] - t1.result()[0]['center'],
            t3.result()['center'] - t1.result()[1]['center']
        ], dataType=float)

        return detectionResults, frame

### 4.2  UI Functions

In [52]:
# Read one frame from camera
def readFrame():
    success, frame = cam.read()
    if not success: 
        raise Exception("No Frames Read")
    return cv2.flip(frame, 1)

### 4.3 Recording Label (Create Training Data)

In [53]:
# Detectors
handDetector = HandDetector(detectionCon=0.5, maxHands=2)
faceDetector = FaceDetector(minDetectionCon=0.5)
poseDetector = PoseDetector(detectionCon=0.5)

In [54]:
from static_files_io import saveKeypoints

In [82]:
# Specify which action to record
static = static_labels[37]
static

'ship'

In [83]:
cam = cv2.VideoCapture(0, cv2.CAP_DSHOW)
cam.set(cv2.CAP_PROP_FRAME_WIDTH, 1280)
cam.set(cv2.CAP_PROP_FRAME_HEIGHT, 720)

try:
    save_path=os.path.join(KEYPOINTS_PATH, f"{static}.npy")
    if(os.path.exists(save_path)):
        trainingResults = np.load(save_path)  
        training_num = len(trainingResults)  
    else:
        trainingResults = np.zeros((0, 240)) 
        training_num=0
        
    
    while True:
        # Read from camera
        frame = readFrame()

        detectionResults, frame = featureExtractionV3(
            handDetector, faceDetector, poseDetector, frame)
        
        # Show resulting frame
        cv2.putText(frame, f'Training #{training_num + 1} for \'{static}\'', (15, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 3)
        cv2.imshow("Sign Language Recognition Prototype", frame)

        keyPressed = cv2.waitKey(10)
        # Stop Program when pressed 'Esc'
        if (training_num >= 100 or keyPressed == 27):
            np.save(os.path.join(KEYPOINTS_PATH, f"{static}.npy"), trainingResults)
            print(len(trainingResults))
            raise Exception("Finished")
        # space key
        elif (keyPressed == 32):
        # elif (keyPressed == ord('s')):
            # Append the detectionResults as a new row to trainingResults
            trainingResults = np.vstack((trainingResults, detectionResults))
            training_num += 1
            print(trainingResults)  
        
    # After all frames are finished for each training:
    # save as .npy
        
    # IMPORTANT: THIS LINE IS DISABLED IN CASE OF ACCIDENTALLY OVERWRITING DATA
    # Enable it ONLY during data collection
    # saveKeypoints(action, "0-99", trainingResults)

except Exception as e:
    print(e)
    traceback.print_exc()

finally:
    cam.release()
    cv2.destroyAllWindows()

0
Finished


Traceback (most recent call last):
  File "C:\Users\Eng Lip\AppData\Local\Temp\ipykernel_13216\261187169.py", line 31, in <module>
    raise Exception("Finished")
Exception: Finished


In [35]:
print(len(trainingResults))
trainingResults.nbytes / 1024 / 1024

100


0.18310546875

In [36]:
for i, each in enumerate(trainingResults[:10]):
    print(i, np.sum(each))

0 9.62254434171681
1 9.450946207258873
2 12.891450670509357
3 10.26076285743612
4 7.264914140998189
5 5.3507210216213945
6 5.905700496039413
7 7.7291411356209165
8 7.3399289845876545
9 7.456041214583111
