1. Import Libraries / Dependencies

In [23]:
import cv2
import numpy as np
import mediapipe as mp
from itertools import chain

In [24]:
from cvzone.HandTrackingModule import HandDetector
from cvzone.FaceDetectionModule import FaceDetector
from cvzone.PoseModule import PoseDetector

2. Hand, Face and Pose Detection + Feature Extraction

In [25]:
def flatten2dList(arr, dataType=int):
    """Flatten an iterable of iterables into a 1-D NumPy array.

    Args:
        arr: Iterable whose items are themselves iterables of scalars.
        dataType: dtype of the resulting array (defaults to ``int``).

    Returns:
        1-D ``np.ndarray`` holding the items of ``arr`` in row order.
    """
    # Lazily walk the rows so no intermediate flat list is materialised.
    flat_values = (value for row in arr for value in row)
    return np.fromiter(flat_values, dataType)

In [26]:
def getAbsLargestVal(arr):
    """Return the larger of ``max(arr)`` and ``abs(min(arr))``.

    Args:
        arr: Non-empty array-like of numbers.

    Returns:
        The largest magnitude found at either end of the value range.
    """
    highest = np.max(arr)
    lowest_magnitude = abs(np.min(arr))
    # Prefer the maximum on ties, exactly like ``max(highest, lowest_magnitude)``.
    return lowest_magnitude if lowest_magnitude > highest else highest

In [27]:
def preprocess_landmarks(landmark_list):
    """Express landmarks relative to the first point and scale them to [-1, 1].

    Args:
        landmark_list: Sequence of points. Only components 0, 1 and 2 of
            every point are read.

    Returns:
        ``[]`` when ``landmark_list`` is empty. Otherwise a flat 1-D float
        array of length ``3 * len(landmark_list)`` containing the offsets
        from the first point divided by the largest absolute offset. When
        every point coincides with the first one (largest offset is 0) an
        all-zero array is returned instead of dividing by zero.
    """
    if not landmark_list:
        return []

    # Offset every point with respect to the first point
    origin_x = landmark_list[0][0]
    origin_y = landmark_list[0][1]
    origin_z = landmark_list[0][2]
    offsets = [
        [point[0] - origin_x, point[1] - origin_y, point[2] - origin_z]
        for point in landmark_list
    ]

    # Convert to 1D-array (uses flatten2dList's default dtype)
    flat_offsets = flatten2dList(offsets)

    # Get highest absolute value
    largest_value = getAbsLargestVal(flat_offsets)

    # All points identical: dividing would yield NaNs and a RuntimeWarning,
    # so return the (already all-zero) offsets as floats instead.
    if largest_value == 0:
        return np.zeros(flat_offsets.shape, dtype=float)

    # Normalization
    return flat_offsets / largest_value


In [28]:
def preprocess_bbox(bbox, frameSize):
    """Convert an ``[x, y, w, h]`` box to normalized corner coordinates.

    Args:
        bbox: Box whose elements 0/1 are the top-left corner and whose
            elements 2/3 are the width and height.
        frameSize: ``(width, height)`` used as the normalization divisor.

    Returns:
        New float array ``[x1, y1, x2, y2]`` with x values divided by
        ``frameSize[0]`` and y values divided by ``frameSize[1]``.
    """
    box = np.array(bbox, dtype=float)

    # Width/height -> absolute coordinates of the opposite corner
    box[2] += box[0]
    box[3] += box[1]

    # Scale horizontal values by frame width, then vertical ones by frame height
    for x_index in (0, 2):
        box[x_index] /= frameSize[0]
    for y_index in (1, 3):
        box[y_index] /= frameSize[1]

    return box

In [29]:
def preprocess_center(center, frameSize):
    """Normalize a center point against the frame size.

    Args:
        center: ``(x, y)`` point.
        frameSize: ``(width, height)`` used as the normalization divisor.

    Returns:
        New float array ``[x / frameSize[0], y / frameSize[1]]``.
    """
    # Force a float array: with integer input the array would be
    # integer-typed and the divided values would be truncated back to
    # integers when stored.
    center = np.array(center, dtype=float)
    center[0] /= frameSize[0]
    center[1] /= frameSize[1]
    return center

In [30]:
def preprocess_body_part(bodyPart, frameSize):
    """Normalize the landmark, bbox and center entries of a detection dict.

    The dict is updated in place and also returned.

    Args:
        bodyPart: Dict with ``'lmList'``, ``'bbox'`` and ``'center'`` keys.
        frameSize: ``(width, height)`` forwarded to the bbox/center helpers.

    Returns:
        The same ``bodyPart`` dict with the three entries replaced.
    """
    bodyPart['lmList'] = preprocess_landmarks(bodyPart['lmList'])
    # The two frame-relative entries share the same (value, frameSize) signature.
    for key, normalize in (('bbox', preprocess_bbox), ('center', preprocess_center)):
        bodyPart[key] = normalize(bodyPart[key], frameSize)
    return bodyPart

In [31]:
def generate_empty_hand(type):
    """Build an all-zero placeholder for a hand that was not detected.

    Args:
        type: Label stored under the ``'type'`` key.

    Returns:
        Dict with a zero ``'lmList'`` (21 landmarks x 3 values, int),
        zero ``'bbox'`` (4 floats), zero ``'center'`` (2 floats) and the
        given ``'type'``.
    """
    landmark_values = 21 * 3
    return dict(
        lmList=np.zeros(landmark_values, dtype=int),
        bbox=np.zeros(4, dtype=float),
        center=np.zeros(2, dtype=float),
        type=type,
    )

In [32]:
def select_best_matching_face(faces, frameSize, minScore=0.5):
    """Pick the face to use when several faces are detected.

    The face whose center is closest to the frame center is preferred as
    long as its score exceeds ``minScore``; otherwise the face with the
    highest score is returned.

    Args:
        faces: List of dicts with a ``'center'`` (x, y) entry and a
            ``'score'`` sequence whose first element is the score.
        frameSize: ``(width, height)`` of the frame.
        minScore: Score the most central face must exceed to be chosen.

    Returns:
        ``False`` when ``faces`` is empty, otherwise one of the dicts in
        ``faces``.
    """
    if not faces:
        return False
    if len(faces) == 1:
        return faces[0]

    frameCenter = (frameSize[0] / 2, frameSize[1] / 2)

    def distance_to_center(face):
        # Squared Euclidean distance is enough for ranking.
        center = face['center']
        return (center[0] - frameCenter[0]) ** 2 + (center[1] - frameCenter[1]) ** 2

    # min/max keep the first candidate on ties. The previous loop never
    # updated its running minimum, so it returned the last face that was
    # closer than faces[0] rather than the closest one.
    best_center = min(faces, key=distance_to_center)
    best_score = max(faces, key=lambda face: face['score'][0])

    if best_center['score'][0] > minScore:
        return best_center
    return best_score

In [56]:
def flattenDetectionResult(obj):
    """Join a detection's landmarks, bbox and center into one 1-D array.

    Args:
        obj: Dict with ``'lmList'``, ``'bbox'`` and ``'center'`` entries,
            each a 1-D sequence.

    Returns:
        ``np.ndarray`` with the three entries concatenated in that order.
    """
    ordered_keys = ('lmList', 'bbox', 'center')
    parts = [obj[key] for key in ordered_keys]
    return np.concatenate(parts)

In [59]:
cam = cv2.VideoCapture(0)

# Detectors
handDetector = HandDetector(detectionCon=0.5, maxHands=2)
faceDetector = FaceDetector(minDetectionCon=0.5)
poseDetector = PoseDetector(detectionCon=0.5)

results = {}

# try/finally releases the camera and closes the window on every exit path
# (normal exit, errors, and a kernel interrupt), without duplicating cleanup.
try:
    while True:
        # Read from camera
        success, frame = cam.read()
        if not success:
            break

        frameSize = (frame.shape[1], frame.shape[0])

        # Hand Detection
        # The feature vector always holds two hand slots: Left first, then Right.
        detectedHands, frame = handDetector.findHands(frame, draw=True)
        hands = [preprocess_body_part(hand, frameSize) for hand in detectedHands]
        if not hands:
            hands = [generate_empty_hand('Left'), generate_empty_hand('Right')]
        elif len(hands) == 1:
            # Pad the missing side with an all-zero placeholder
            if hands[0]['type'] == 'Left':
                hands.append(generate_empty_hand('Right'))
            else:
                hands.insert(0, generate_empty_hand('Left'))
        else:
            # Keep the same Left-then-Right slot order as the one-hand case.
            # The sort is stable, so two hands with the same label keep
            # their detection order.
            hands.sort(key=lambda hand: hand['type'] != 'Left')
        results['hands'] = hands

        # Pose Detection
        # * Only the first 23 of the 33 landmark points are kept; the
        #   remaining ones represent the lower half body and are irrelevant
        frame = poseDetector.findPose(frame, draw=True)
        poseLandmarks, tempPoseBbox = poseDetector.findPosition(frame, bboxWithHands=False)
        if poseLandmarks and tempPoseBbox:
            results['pose'] = preprocess_body_part(
                {
                    'lmList': poseLandmarks[:23],
                    'bbox': tempPoseBbox['bbox'],
                    'center': tempPoseBbox['center'],
                },
                frameSize,
            )
        else:
            # No pose found: zero placeholder with the same layout
            results['pose'] = {
                'lmList': np.zeros(23 * 3, dtype=int),
                'bbox': np.zeros(4, dtype=float),
                'center': np.zeros(2, dtype=float),
            }

        # Face Detection
        frame, detectedFaces = faceDetector.findFaces(frame, draw=True)
        if detectedFaces:
            results['face'] = select_best_matching_face(detectedFaces, frameSize)
            results['face']['bbox'] = preprocess_bbox(results['face']['bbox'], frameSize)
            results['face']['center'] = preprocess_center(results['face']['center'], frameSize)
        else:
            results['face'] = {
                'bbox': np.zeros(4, dtype=float),
                'center': np.zeros(2, dtype=float)
            }

        # Convert results into 1D-array
        finalResults = np.concatenate([
            flattenDetectionResult(results['hands'][0]),
            flattenDetectionResult(results['hands'][1]),
            flattenDetectionResult(results['pose']),
            results['face']['bbox'],
            results['face']['center']
        ])

        # Show frame
        cv2.imshow("Sign Language Recognition Prototype", frame)

        keyPressed = cv2.waitKey(10)

        # Pressed 'q': debug dump. results['face'] is a single dict at this
        # point, so it is read by key rather than by list index.
        if keyPressed == ord('q'):
            print(results['face']['center'])
            print(frame.shape)

        # Pressed 'Esc'
        if keyPressed == 27:
            break
finally:
    cam.release()
    cv2.destroyAllWindows()


In [63]:
# Feature vector length: 2 hands x (21*3 + 4 + 2) + pose (23*3 + 4 + 2) + face (4 + 2)
finalResults.shape[0]

219

3. Data Collection