In [1]:
import numpy as np
from cvzone.HandTrackingModule import HandDetector
from cvzone.FaceDetectionModule import FaceDetector
from cvzone.PoseModule import PoseDetector
from concurrent.futures import ThreadPoolExecutor
from itertools import chain

In [2]:


LIST_PATH = "action_recognition/action_labels.csv"

import os
import csv
action_labels = []
with open(os.path.join("../", LIST_PATH)) as f:
    csv_reader = csv.reader(f, delimiter=",")
    action_labels = [i[1] for i in csv_reader]
action_labels

['hello',
 'good/thank you',
 'help',
 'I/me',
 'please',
 'sorry',
 'welcome',
 'welcome',
 'ok',
 'what',
 'what',
 'can',
 'thank you very much',
 'deaf',
 'do not',
 'feel',
 'eat/food',
 'eat a lot',
 'tired',
 'because',
 'sick',
 'drink',
 'drink',
 'apple',
 'banana',
 'drive',
 'again',
 'also',
 'ask',
 'yes',
 'no',
 'man',
 'man',
 'woman',
 'woman',
 'he/she',
 'bad',
 'have/has/had',
 'have/has/had',
 'when',
 'where',
 'which',
 'who',
 'why',
 'how',
 'you',
 'boy',
 'girl',
 'friend',
 'finish/complete',
 'find',
 'other',
 'forget',
 'give',
 'give you',
 'give me',
 'go',
 'get',
 'understand/comprehend',
 'use',
 'will',
 'with',
 'wait',
 'work',
 'they',
 'their',
 'school',
 'write',
 'send text/message',
 'email',
 'email',
 'home',
 'but',
 'should',
 'not',
 'my',
 'name',
 'like',
 'say',
 'cold',
 'hot',
 'family',
 'mother',
 'father',
 'many',
 'few',
 'now',
 'later',
 'time',
 'tomorrow',
 'yesterday',
 'same/also',
 'remember',
 'your',
 'more',
 'meet'

In [3]:
import numpy as np
DATA_PATH = "action_recognition/keypoints_data"

current_i = 0
rawNpy = np.load(os.path.join("../", DATA_PATH, f"{current_i},{action_labels[current_i]}", "0-99.npy"))

In [4]:

class FeatureExtractionModule():
    def __init__(self, **kwargs):
        # Detectors
        self.handDetector = HandDetector(detectionCon=0.5, maxHands=2)
        self.faceDetector = FaceDetector(minDetectionCon=0.5)
        self.poseDetector = PoseDetector(detectionCon=0.5)

    def detectHands(self, handDetector, frame, frameSize, draw):
        results = [0, 0]
        tempResults = []
        # Hand Detection
        if draw:
            tempResults, frame = handDetector.findHands(frame, draw=draw, flipType=False)
        else:
            tempResults = handDetector.findHands(frame, draw=draw, flipType=False)

        if not tempResults:
            results = [self.generate_empty_hand("Left"), self.generate_empty_hand("Right")]
        elif len(tempResults) == 1:
            if tempResults[0]["type"] == "Left":
                results = [self.preprocess_body_part(tempResults[0], frameSize), self.generate_empty_hand("Right")]
            else:
                results = [self.generate_empty_hand("Left"), self.preprocess_body_part(tempResults[0], frameSize)]
        else:
            if tempResults[0]['type'] == 'Right' and tempResults[1]['type'] == 'Left':
                results[0] = tempResults[1]
                results[1] = tempResults[0]
            elif tempResults[0]['type'] == 'Left' and tempResults[1]['type'] == 'Right':
                results[0] = tempResults[0]
                results[1] = tempResults[1]

            # If both detected hands are both left or both right
            elif tempResults[0]['center'][0] > tempResults[1]['center'][0]:
                results[0] = tempResults[1]
                results[1] = tempResults[0]
            else:
                results[0] = tempResults[0]
                results[1] = tempResults[1]

            results[0] = self.preprocess_body_part(results[0], frameSize)
            results[1] = self.preprocess_body_part(results[1], frameSize)

        return results

    # Pose Detection
    # **We only use the first 23 out of the total 33 landmark points
    #   as those represent the lower half body and are irrelevant to sign language interpretation
    def detectPose(self, poseDetector, frame, frameSize, draw):
        frame = poseDetector.findPose(frame, draw=draw)
        if poseDetector.results.pose_landmarks:
            results = np.array([[i.x, i.y, i.z, i.visibility] for i in poseDetector.results.pose_landmarks.landmark[:23]])
            return results.ravel()

        # frame = poseDetector.findPose(frame, draw=draw)
        # results, _ = poseDetector.findPosition(frame, bboxWithHands=False)
        # print('---------------')
        # print('e1', np.array(results)[:, -1])
        # if results:
        #     return np.array(results).flatten()
        #     # return self.preprocess_landmarks(results[:23], frameSize)
        # print('e2', results)
        return np.zeros(92, dtype=float)
        

    # Face Detection
    def detectFace(self, faceDetector, frame, frameSize, draw):
        frame, results = faceDetector.findFaces(frame, draw=draw)
        if results:
            results = self.select_best_matching_face(results, frameSize)
            results["bbox"] = self.preprocess_bbox(results["bbox"], frameSize)
            results["center"] = self.preprocess_center(results["center"], frameSize)
            return results

        return {
            "bbox": np.zeros(4, dtype=float),
            "center": np.zeros(2, dtype=float),
        }

    # Detects hands, face & pose,
    # convert them into normalized landmark/keypoint coordinates in a 1D-array,
    # and also returns the frame with the landmark connections drawn onto it
    def parallelFeatureExtraction(
        self, handDetector, faceDetector, poseDetector, frame, draw=True
    ):
        frameSize = (frame.shape[1], frame.shape[0])
        with ThreadPoolExecutor() as executor:
            t1 = executor.submit(self.detectHands, handDetector, frame, frameSize, draw)
            t2 = executor.submit(self.detectPose, poseDetector, frame, frameSize, draw)
            t3 = executor.submit(self.detectFace, faceDetector, frame, frameSize, draw)

            # Convert results into 1D-array
            detectionResults = self.flatten2dList(
                [
                    self.flattenDetectionResult(t1.result()[0]),
                    self.flattenDetectionResult(t1.result()[1]),
                    t2.result(),
                    t3.result()["bbox"],
                    t3.result()["center"],
                    t3.result()["center"] - t1.result()[0]["center"],
                    t3.result()["center"] - t1.result()[1]["center"],
                ],
                dataType=float,
            )

            return detectionResults, frame

    # Offset and normalize the landmark list
    # Returns a 1d numpy array
    def preprocess_landmarks(self, landmark_list, frameSize):
        np_landmark_list = np.array(landmark_list, dtype=float)
        np_frameSize = np.array([frameSize[0], frameSize[1], frameSize[0]])
        return (np_landmark_list / np_frameSize).ravel()


    # Offset and normalize a BBOX list (BBOX = Bounding Box, used in face and hand detection)
    # Returns a 1d numpy array
    def preprocess_bbox(self, bbox, frameSize):
        bbox = np.array(bbox, dtype=float)
        # Convert 3rd and 4th element into coordinates instead of width/height
        bbox[2] = bbox[0] + bbox[2]
        bbox[3] = bbox[1] + bbox[3]

        # Normalize against frame size
        bbox[0] /= frameSize[0]
        bbox[1] /= frameSize[1]
        bbox[2] /= frameSize[0]
        bbox[3] /= frameSize[1]

        return bbox


    # Normalize a center vertex (a list of 2 elements)
    # Returns a 1d numpy array
    def preprocess_center(self, center, frameSize):
        center = np.array(center, dtype=float)
        center[0] /= frameSize[0]
        center[1] /= frameSize[1]
        return center


    # Preprocess (Offset and normalize) the body's landmark list, bbox and center
    def preprocess_body_part(self, bodyPart, frameSize):
        bodyPart["lmList"] = self.preprocess_landmarks(bodyPart["lmList"], frameSize)
        bodyPart["bbox"] = self.preprocess_bbox(bodyPart["bbox"], frameSize)
        bodyPart["center"] = self.preprocess_center(bodyPart["center"], frameSize)
        return bodyPart


    # Function to generate empty/placeholder data for a hand
    # Used when a hand is not detected in frame
    def generate_empty_hand(self, type):
        return {
            "lmList": np.zeros(63, dtype=float),
            "bbox": np.zeros(4, dtype=float),
            "center": np.zeros(2, dtype=float),
            "type": type,
        }


    # Select the best matching face, aka the one with the best score (clarity)
    # and closest to the center of the screen
    # Since the Neural network will be design to only accept one face
    def select_best_matching_face(self, faces, frameSize):
        if not faces or len(faces) == 0:
            return False
        elif len(faces) == 1:
            return faces[0]

        def difference(a, b):
            return ((a[0] - b[0]) ** 2) + ((a[1] - b[1]) ** 2)

        frameCenter = (frameSize[0] / 2, frameSize[1] / 2)

        best_score = faces[0]
        best_center = faces[0]
        center_diff = difference(faces[0]["center"], frameCenter)

        for each in faces[1:]:
            if difference(each["center"], frameCenter) < center_diff:
                best_center = each
            if each["score"][0] > best_score["score"][0]:
                best_score = each

        if best_center["score"][0] > 0.5:
            return best_center
        return best_score

    # Flatten a 2d np array into 1d array
    def flatten2dList(self, arr, dataType=float):
        return np.fromiter(chain.from_iterable(arr), dataType)

    # Flatten everything
    def flattenDetectionResult(self, obj):
        return np.concatenate([obj["lmList"], obj["bbox"], obj["center"]])


    def extractFeatures(self, frame):
        detectionResults, frame = self.parallelFeatureExtraction(
            self.handDetector, self.faceDetector, self.poseDetector, frame
        )

        return detectionResults, frame


In [14]:
import cv2
import traceback

In [5]:


cam = None
# Read one frame from camera

def readFrame():
    success, frame = cam.read()
    if not success: 
        raise Exception("No Frames Read")
    return cv2.flip(frame, 1)


cam = cv2.VideoCapture(0, cv2.CAP_DSHOW)
cam.set(cv2.CAP_PROP_FRAME_WIDTH, 1280)
cam.set(cv2.CAP_PROP_FRAME_HEIGHT, 720)

try:
    femodule = FeatureExtractionModule()
    
    while True:
        # Read from camera
        rawframe = readFrame()

        frame = np.copy(rawframe)

        detectionResults, frame = femodule.extractFeatures(frame)
        
        # Show resulting frame
        cv2.imshow("Sign Language Recognition Prototype", frame)     

        keyPressed = cv2.waitKey(10)
        # Stop Program when pressed 'Esc'
        if (keyPressed == 27):
            raise Exception("Finished")

        break

except Exception as e:
    print(e)
    traceback.print_exc()

finally:
    cam.release()
    cv2.destroyAllWindows()
    del femodule

In [107]:
detectionResults.shape

(240,)

In [108]:
np.argmax(detectionResults)

173

In [109]:
np.max(detectionResults)

0.9999569654464722

In [53]:
detectionResults[138:207].reshape(23, 3)

array([[ 0.55139923,  0.73399454, -0.74569803],
       [ 0.56648535,  0.68524396, -0.70646536],
       [ 0.57641351,  0.68570024, -0.70679051],
       [ 0.58612144,  0.68697423, -0.70683694],
       [ 0.53418308,  0.68727589, -0.7128588 ],
       [ 0.52298796,  0.68843925, -0.71283257],
       [ 0.51305228,  0.69001383, -0.71318281],
       [ 0.60445964,  0.71692556, -0.40919787],
       [ 0.5005421 ,  0.71843433, -0.43512335],
       [ 0.57334667,  0.78898698, -0.62870717],
       [ 0.53335941,  0.79032856, -0.63620675],
       [ 0.68944788,  0.9606145 , -0.20965974],
       [ 0.43566662,  0.96137846, -0.25402993],
       [ 0.75933588,  1.21941566, -0.30540067],
       [ 0.39738399,  1.2676121 , -0.33917922],
       [ 0.72616225,  1.33728361, -0.68819159],
       [ 0.42404801,  1.39239919, -0.77118653],
       [ 0.7201255 ,  1.39245105, -0.80496472],
       [ 0.42453066,  1.45391953, -0.86561853],
       [ 0.7049216 ,  1.36649776, -0.79739189],
       [ 0.43603382,  1.42126441, -0.894

In [9]:
def convertX(normalized_x):
    return int(normalized_x * 1280)
def convertY(normalized_y):
    return int(normalized_y * 720)

In [10]:
dr = detectionResults
print((convertX(dr[63]), convertY(dr[64])))
print((convertX(dr[65]), convertY(dr[66])))
print(dr[65], dr[66])

(0, 0)
(0, 0)
0.0 0.0


In [16]:
modFrame = np.copy(rawframe)
dr = detectionResults

# Draw left hand center
cv2.circle(modFrame, (convertX(dr[67]), convertY(dr[68])), 5, (0, 0, 255), 5)

# Draw left hand bbox
cv2.rectangle(modFrame, 
              (convertX(dr[63]), convertY(dr[64])),
              (convertX(dr[65]), convertY(dr[66])),
              (0, 0, 255), 3
              )

cv2.circle(modFrame, (1200, 700), 5, (0, 255, 255), 5)

try:
    while True:
        cv2.imshow("dsa", modFrame)
        # cv2.imshow("dsa", frame)
        
        keyPressed = cv2.waitKey(10)
        # Stop Program when pressed 'Esc'
        if (keyPressed == 27):
            raise Exception("Finished")

except Exception as e:
    print(e)
    traceback.print_exc()

finally:
    cam.release()
    cv2.destroyAllWindows()

Finished


Traceback (most recent call last):
  File "C:\Users\USER\AppData\Local\Temp\ipykernel_45200\1325288025.py", line 24, in <module>
    raise Exception("Finished")
Exception: Finished


# Training

In [5]:
from files_io import readActionLabels, initActionLabelFolders

action_labels = readActionLabels()
initActionLabelFolders(action_labels)
action_labels

{'0': 'hello',
 '1': 'good/thank you',
 '2': 'help',
 '3': 'I/me',
 '4': 'please',
 '5': 'sorry',
 '6': 'welcome',
 '7': 'welcome',
 '8': 'ok',
 '9': 'what',
 '10': 'what',
 '11': 'can',
 '12': 'thank you very much',
 '13': 'deaf',
 '14': 'do not',
 '15': 'feel',
 '16': 'eat/food',
 '17': 'eat a lot',
 '18': 'tired',
 '19': 'because',
 '20': 'sick',
 '21': 'drink',
 '22': 'drink',
 '23': 'apple',
 '24': 'banana',
 '25': 'drive',
 '26': 'again',
 '27': 'also',
 '28': 'ask',
 '29': 'yes',
 '30': 'no',
 '31': 'man',
 '32': 'man',
 '33': 'woman',
 '34': 'woman',
 '35': 'he/she',
 '36': 'bad',
 '37': 'have/has/had',
 '38': 'have/has/had',
 '39': 'when',
 '40': 'where',
 '41': 'which',
 '42': 'who',
 '43': 'why',
 '44': 'how',
 '45': 'you',
 '46': 'boy',
 '47': 'girl',
 '48': 'friend',
 '49': 'finish/complete',
 '50': 'find',
 '51': 'other',
 '52': 'forget',
 '53': 'give',
 '54': 'give you',
 '55': 'give me',
 '56': 'go',
 '57': 'get',
 '58': 'understand/comprehend',
 '59': 'use',
 '60': 'wi

In [6]:
# Read one frame from camera
def readFrame():
    success, frame = cam.read()
    if not success: 
        raise Exception("No Frames Read")
    return cv2.flip(frame, 1)

In [7]:
# Pause recording upon "Space"
def pauseWhenSpace(trainingNum, actionStr):
    while True:
        frame = readFrame()
        cv2.putText(frame, f'Training #{trainingNum + 1} for \'{actionStr}\'', (15, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 3)
        cv2.putText(frame, f'Pausing...', (40, 100), cv2.FONT_HERSHEY_SIMPLEX, 1, (20, 255, 125), 3)
        cv2.imshow("Sign Language Recognition Prototype", frame)

        # If pressed resume, do countdown
        keyPressed = cv2.waitKey(10)
        if keyPressed == 32:    # 32 == Space
            pause_again = False
            
            for i in range(3):
                for _ in range(10):
                    temp_frame = readFrame()
                    cv2.putText(temp_frame, f'Training #{trainingNum + 1} for \'{actionStr}\'', (15, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 3)
                    cv2.putText(temp_frame, f'Resuming in {3 - i}', (20, 70), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 120), 3)
                    cv2.imshow("Sign Language Recognition Prototype", temp_frame)
                    tempKey = cv2.waitKey(100)

                    if (tempKey == 27):
                        raise Exception("Finished")
                    # If pressed paused again, stop resuming and continue pausing
                    elif tempKey == 32:
                        pause_again = True
                        break
                    elif tempKey == 122:    # Pressed z
                        trainingNum -= 1
                        pause_again = True
                        break
                    elif tempKey == 120:    # Pressed x
                        trainingNum += 1
                        pause_again = True
                        break
                if pause_again:
                    break

            if not pause_again:
                return trainingNum
            
        elif keyPressed == 27:
            raise Exception("Finished")
        elif keyPressed == 122:    # Pressed z
            trainingNum -= 1
        elif keyPressed == 120:    # Pressed x
            trainingNum += 1

In [12]:
# Display countdown (1, 2, 3)
def countdownFromThree(trainingNum, actionStr):
    # Count down 3 seconds on every new training
    for i in range(1):
        for _ in range(6):
            frame = readFrame()
            
            cv2.putText(frame, f'Training #{trainingNum + 1} for \'{actionStr}\'', (15, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 3)
            cv2.putText(frame, f'Next Training in {3 - i}', (20, 70), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 3)
            cv2.imshow("Sign Language Recognition Prototype", frame)
            
            tempKey = cv2.waitKey(100)
            if (tempKey == 27):     # Pressed Esc
                raise Exception("Finished")
            elif tempKey == 32:     # Pressed Space
                return pauseWhenSpace(trainingNum, actionStr)
            elif tempKey == 122:    # Pressed z
                trainingNum -= 1
            elif tempKey == 120:    # Pressed x
                trainingNum += 1
                
    return trainingNum


### 4.3 Recording Label (Create Training Data)

In [9]:
from files_io import saveKeypoints

In [10]:
fem = FeatureExtractionModule()

In [11]:
# Specify which action to record
selected_i = 121
action = action_labels[f"{selected_i}"]
action

'NONE'

In [15]:
TRAININGS_PER_LABEL = 100
FRAMES_PER_TRAINING = 20

cam = cv2.VideoCapture(0, cv2.CAP_DSHOW)
cam.set(cv2.CAP_PROP_FRAME_WIDTH, 1280)
cam.set(cv2.CAP_PROP_FRAME_HEIGHT, 720)

try:
    trainingResults = np.zeros((TRAININGS_PER_LABEL, FRAMES_PER_TRAINING, 240))
    
    training_num = 0
    while training_num < TRAININGS_PER_LABEL:
        
        frame_num = 0
        while frame_num < FRAMES_PER_TRAINING:

            # Countdown
            if frame_num == 0:
                training_num = countdownFromThree(training_num, action)
        
            # Read from camera
            frame = readFrame()

            detectionResults, frame = fem.extractFeatures(frame)
            
            # Show resulting frame
            cv2.putText(frame, f'Training #{training_num + 1} for \'{action}\'', (15, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 3)
            cv2.imshow("Sign Language Recognition Prototype", frame)     

            # Save the results
            trainingResults[training_num][frame_num] = detectionResults

            keyPressed = cv2.waitKey(10)
            # Stop Program when pressed 'Esc'
            if (keyPressed == 27):
                raise Exception("Finished")
            
            frame_num += 1
        
        training_num += 1

        if training_num >= TRAININGS_PER_LABEL:
            training_num = countdownFromThree(training_num, action)

    # After all frames are finished for each training:
    # save as .npy
    
    # IMPORTANT: 
    # Enable it ONLY during data collection or it may OVERWRITE EXISTING DATA
    saveKeypoints(f"{selected_i},{action}", "0-99", trainingResults)

except Exception as e:
    print(e)
    traceback.print_exc()

finally:
    cam.release()
    cv2.destroyAllWindows()