In [3]:
import numpy as np
from cvzone.HandTrackingModule import HandDetector
from cvzone.FaceDetectionModule import FaceDetector
from cvzone.PoseModule import PoseDetector
from concurrent.futures import ThreadPoolExecutor
from itertools import chain

In [4]:


LIST_PATH = "action_recognition/action_labels.csv"

import os
import csv
action_labels = []
with open(os.path.join("../", LIST_PATH)) as f:
    csv_reader = csv.reader(f, delimiter=",")
    action_labels = [i[1] for i in csv_reader]
action_labels

['hello',
 'good/thank you',
 'help',
 'I/me',
 'please',
 'sorry',
 'welcome',
 'welcome',
 'ok',
 'what',
 'what',
 'can',
 'thank you very much',
 'deaf',
 'do not',
 'feel',
 'eat/food',
 'eat a lot',
 'tired',
 'because',
 'sick',
 'drink',
 'drink',
 'apple',
 'banana',
 'drive',
 'again',
 'also',
 'ask',
 'yes',
 'no',
 'man',
 'man',
 'woman',
 'woman',
 'he/she',
 'bad',
 'have/has/had',
 'have/has/had',
 'when',
 'where',
 'which',
 'who',
 'why',
 'how',
 'you',
 'boy',
 'girl',
 'friend',
 'finish/complete',
 'find',
 'other',
 'forget',
 'give',
 'give you',
 'give me',
 'go',
 'get',
 'understand/comprehend',
 'use',
 'will',
 'with',
 'wait',
 'work',
 'they',
 'their',
 'school',
 'write',
 'send text/message',
 'email',
 'email',
 'home',
 'but',
 'should',
 'not',
 'my',
 'name',
 'like',
 'say',
 'cold',
 'hot',
 'family',
 'mother',
 'father',
 'many',
 'few',
 'now',
 'later',
 'time',
 'tomorrow',
 'yesterday',
 'same/also',
 'remember',
 'your',
 'more',
 'meet'

In [5]:
import numpy as np
DATA_PATH = "action_recognition/keypoints_data"

current_i = 0
rawNpy = np.load(os.path.join("../", DATA_PATH, f"{current_i},{action_labels[current_i]}", "0-99.npy"))

In [30]:

class FeatureExtractionModule():
    def __init__(self, **kwargs):
        # Detectors
        self.handDetector = HandDetector(detectionCon=0.5, maxHands=2)
        self.faceDetector = FaceDetector(minDetectionCon=0.5)
        self.poseDetector = PoseDetector(detectionCon=0.5)

    def detectHands(self, handDetector, frame, frameSize, draw):
        results = [0, 0]
        tempResults = []
        # Hand Detection
        if draw:
            tempResults, frame = handDetector.findHands(frame, draw=draw, flipType=False)
        else:
            tempResults = handDetector.findHands(frame, draw=draw, flipType=False)

        if not tempResults:
            results = [self.generate_empty_hand("Left"), self.generate_empty_hand("Right")]
        elif len(tempResults) == 1:
            if tempResults[0]["type"] == "Left":
                results = [self.preprocess_body_part(tempResults[0], frameSize), self.generate_empty_hand("Right")]
            else:
                results = [self.generate_empty_hand("Left"), self.preprocess_body_part(tempResults[0], frameSize)]
        else:
            if tempResults[0]['type'] == 'Right' and tempResults[1]['type'] == 'Left':
                results[0] = tempResults[1]
                results[1] = tempResults[0]
            elif tempResults[0]['type'] == 'Left' and tempResults[1]['type'] == 'Right':
                results[0] = tempResults[0]
                results[1] = tempResults[1]

            # If both detected hands are both left or both right
            elif tempResults[0]['center'][0] > tempResults[1]['center'][0]:
                results[0] = tempResults[1]
                results[1] = tempResults[0]
            else:
                results[0] = tempResults[0]
                results[1] = tempResults[1]

            results[0] = self.preprocess_body_part(results[0], frameSize)
            results[1] = self.preprocess_body_part(results[1], frameSize)

        return results

    # Pose Detection
    # **We only use the first 23 out of the total 33 landmark points
    #   as those represent the lower half body and are irrelevant to sign language interpretation
    def detectPose(self, poseDetector, frame, frameSize, draw):
        frame = poseDetector.findPose(frame, draw=draw)
        if poseDetector.results.pose_landmarks:
            results = np.array([[i.x, i.y, i.z] for i in poseDetector.results.pose_landmarks.landmark[:23]])
            return results.ravel()
        return np.zeros(69, dtype=float)
        

    # Face Detection
    def detectFace(self, faceDetector, frame, frameSize, draw):
        frame, results = faceDetector.findFaces(frame, draw=draw)
        if results:
            results = self.select_best_matching_face(results, frameSize)
            results["bbox"] = self.preprocess_bbox(results["bbox"], frameSize)
            results["center"] = self.preprocess_center(results["center"], frameSize)
            return results

        return {
            "bbox": np.zeros(4, dtype=float),
            "center": np.zeros(2, dtype=float),
        }

    # Detects hands, face & pose,
    # convert them into normalized landmark/keypoint coordinates in a 1D-array,
    # and also returns the frame with the landmark connections drawn onto it
    def parallelFeatureExtraction(
        self, handDetector, faceDetector, poseDetector, frame, draw=True
    ):
        frameSize = (frame.shape[1], frame.shape[0])
        with ThreadPoolExecutor() as executor:
            t1 = executor.submit(self.detectHands, handDetector, frame, frameSize, draw)
            t2 = executor.submit(self.detectPose, poseDetector, frame, frameSize, draw)
            t3 = executor.submit(self.detectFace, faceDetector, frame, frameSize, draw)

            # Convert results into 1D-array
            detectionResults = self.flatten2dList(
                [
                    self.flattenDetectionResult(t1.result()[0]),
                    self.flattenDetectionResult(t1.result()[1]),
                    t2.result(),
                    t3.result()["bbox"],
                    t3.result()["center"],
                    t3.result()["center"] - t1.result()[0]["center"],
                    t3.result()["center"] - t1.result()[1]["center"],
                ],
                dataType=float,
            )

            return detectionResults, frame

    # Offset and normalize the landmark list
    # Returns a 1d numpy array
    def preprocess_landmarks(self, landmark_list, frameSize):
        np_landmark_list = np.array(landmark_list, dtype=float)
        np_frameSize = np.array([frameSize[0], frameSize[1], frameSize[0]])
        return (np_landmark_list / np_frameSize).ravel()


    # Offset and normalize a BBOX list (BBOX = Bounding Box, used in face and hand detection)
    # Returns a 1d numpy array
    def preprocess_bbox(self, bbox, frameSize):
        bbox = np.array(bbox, dtype=float)
        # Convert 3rd and 4th element into coordinates instead of width/height
        bbox[2] = bbox[0] + bbox[2]
        bbox[3] = bbox[1] + bbox[3]

        # Normalize against frame size
        bbox[0] /= frameSize[0]
        bbox[1] /= frameSize[1]
        bbox[2] /= frameSize[0]
        bbox[3] /= frameSize[1]

        return bbox


    # Normalize a center vertex (a list of 2 elements)
    # Returns a 1d numpy array
    def preprocess_center(self, center, frameSize):
        center = np.array(center, dtype=float)
        center[0] /= frameSize[0]
        center[1] /= frameSize[1]
        return center


    # Preprocess (Offset and normalize) the body's landmark list, bbox and center
    def preprocess_body_part(self, bodyPart, frameSize):
        bodyPart["lmList"] = self.preprocess_landmarks(bodyPart["lmList"], frameSize)
        bodyPart["bbox"] = self.preprocess_bbox(bodyPart["bbox"], frameSize)
        bodyPart["center"] = self.preprocess_center(bodyPart["center"], frameSize)
        return bodyPart


    # Function to generate empty/placeholder data for a hand
    # Used when a hand is not detected in frame
    def generate_empty_hand(self, type):
        return {
            "lmList": np.zeros(63, dtype=float),
            "bbox": np.zeros(4, dtype=float),
            "center": np.zeros(2, dtype=float),
            "type": type,
        }


    # Select the best matching face, aka the one with the best score (clarity)
    # and closest to the center of the screen
    # Since the Neural network will be design to only accept one face
    def select_best_matching_face(self, faces, frameSize):
        if not faces or len(faces) == 0:
            return False
        elif len(faces) == 1:
            return faces[0]

        def difference(a, b):
            return ((a[0] - b[0]) ** 2) + ((a[1] - b[1]) ** 2)

        frameCenter = (frameSize[0] / 2, frameSize[1] / 2)

        best_score = faces[0]
        best_center = faces[0]
        center_diff = difference(faces[0]["center"], frameCenter)

        for each in faces[1:]:
            if difference(each["center"], frameCenter) < center_diff:
                best_center = each
            if each["score"][0] > best_score["score"][0]:
                best_score = each

        if best_center["score"][0] > 0.5:
            return best_center
        return best_score

    # Flatten a 2d np array into 1d array
    def flatten2dList(self, arr, dataType=float):
        return np.fromiter(chain.from_iterable(arr), dataType)

    # Flatten everything
    def flattenDetectionResult(self, obj):
        print(obj)
        return np.concatenate([obj["lmList"], obj["bbox"], obj["center"]])


    def extractFeatures(self, frame):
        detectionResults, frame = self.parallelFeatureExtraction(
            self.handDetector, self.faceDetector, self.poseDetector, frame
        )

        return detectionResults, frame


In [32]:

femodule = FeatureExtractionModule()

In [33]:
import cv2
import traceback

cam = None
# Read one frame from camera
def readFrame():
    success, frame = cam.read()
    if not success: 
        raise Exception("No Frames Read")
    return cv2.flip(frame, 1)


cam = cv2.VideoCapture(0, cv2.CAP_DSHOW)
cam.set(cv2.CAP_PROP_FRAME_WIDTH, 1280)
cam.set(cv2.CAP_PROP_FRAME_HEIGHT, 720)

try:
    
    while True:
        # Read from camera
        rawframe = readFrame()

        frame = np.copy(rawframe)

        detectionResults, frame = femodule.extractFeatures(frame)
        
        # Show resulting frame
        cv2.imshow("Sign Language Recognition Prototype", frame)     

        keyPressed = cv2.waitKey(10)
        # Stop Program when pressed 'Esc'
        if (keyPressed == 27):
            raise Exception("Finished")

except Exception as e:
    print(e)
    traceback.print_exc()

finally:
    cam.release()
    cv2.destroyAllWindows()
    # del femodule

{'lmList': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), 'bbox': array([0., 0., 0., 0.]), 'center': array([0., 0.]), 'type': 'Left'}
{'lmList': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), 'bbox': array([0., 0., 0., 0.]), 'center': array([0., 0.]), 'type': 'Right'}
{'lmList': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

Traceback (most recent call last):
  File "C:\Users\USER\AppData\Local\Temp\ipykernel_45016\876110533.py", line 33, in <module>
    raise Exception("Finished")
Exception: Finished


In [11]:
detectionResults

array([ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.  

In [9]:
def convertX(normalized_x):
    return int(normalized_x * 1280)
def convertY(normalized_y):
    return int(normalized_y * 720)

In [10]:
dr = detectionResults
print((convertX(dr[63]), convertY(dr[64])))
print((convertX(dr[65]), convertY(dr[66])))
print(dr[65], dr[66])

(0, 0)
(0, 0)
0.0 0.0


In [16]:
modFrame = np.copy(rawframe)
dr = detectionResults

# Draw left hand center
cv2.circle(modFrame, (convertX(dr[67]), convertY(dr[68])), 5, (0, 0, 255), 5)

# Draw left hand bbox
cv2.rectangle(modFrame, 
              (convertX(dr[63]), convertY(dr[64])),
              (convertX(dr[65]), convertY(dr[66])),
              (0, 0, 255), 3
              )

cv2.circle(modFrame, (1200, 700), 5, (0, 255, 255), 5)

try:
    while True:
        cv2.imshow("dsa", modFrame)
        # cv2.imshow("dsa", frame)
        
        keyPressed = cv2.waitKey(10)
        # Stop Program when pressed 'Esc'
        if (keyPressed == 27):
            raise Exception("Finished")

except Exception as e:
    print(e)
    traceback.print_exc()

finally:
    cam.release()
    cv2.destroyAllWindows()

Finished


Traceback (most recent call last):
  File "C:\Users\USER\AppData\Local\Temp\ipykernel_45200\1325288025.py", line 24, in <module>
    raise Exception("Finished")
Exception: Finished


In [22]:

cam.release()
cv2.destroyAllWindows()

In [63]:

mf = np.copy(rawframe)
fem = FeatureExtractionModule()
fem.detectPose(fem.poseDetector, mf, (1280, 720), False)

array([ 5.03531983e-04,  1.88927253e-03, -6.35702629e-05,  3.80028854e-04,
        1.93832931e-03,  6.46637927e-05,  4.97945677e-04,  2.53801528e-03,
        4.99524933e-05,  3.78504442e-04,  2.56094701e-03,  2.35871039e-04,
        4.99892421e-04,  3.11529769e-03,  3.51638114e-04,  3.72882606e-04,
        3.13743684e-03,  4.90312185e-04,  5.06585138e-04,  3.20153369e-03,
        3.66787123e-04,  3.68840550e-04,  3.21989391e-03,  5.10521932e-04,
        4.83982125e-04,  3.31390103e-03,  1.10713672e-04,  3.94875929e-04,
        3.34577362e-03,  2.33860523e-04])

In [31]:
(np.array(thisn)[:, 1:] / np.array([1280, 720, 1280]))

array([[ 0.64296875,  0.81805556, -0.0296875 ],
       [ 0.6484375 ,  0.81805556, -0.01875   ],
       [ 0.6484375 ,  0.82083333, -0.01875   ],
       [ 0.6484375 ,  0.82361111, -0.01875   ],
       [ 0.64921875,  0.80833333, -0.03359375],
       [ 0.64921875,  0.80416667, -0.03359375],
       [ 0.64921875,  0.79861111, -0.03359375],
       [ 0.64375   ,  0.81805556,  0.04296875],
       [ 0.64453125,  0.78611111, -0.02109375],
       [ 0.634375  ,  0.82222222, -0.00390625],
       [ 0.63515625,  0.80833333, -0.02265625],
       [ 0.60703125,  0.84166667,  0.08203125],
       [ 0.6125    ,  0.75138889, -0.046875  ],
       [ 0.57734375,  0.87222222,  0.0421875 ],
       [ 0.5640625 ,  0.73472222, -0.078125  ],
       [ 0.59453125,  0.82222222, -0.0375    ],
       [ 0.53203125,  0.79166667, -0.0578125 ],
       [ 0.59375   ,  0.81111111, -0.053125  ],
       [ 0.5171875 ,  0.80138889, -0.06640625],
       [ 0.6       ,  0.80277778, -0.04453125],
       [ 0.52109375,  0.81388889, -0.068

In [32]:
np.array([[i.x, i.y, i.z] for i in pd.results.pose_landmarks.landmark])

array([[ 0.64309895,  0.81921208, -0.02971784],
       [ 0.64914638,  0.81839812, -0.01926368],
       [ 0.64897245,  0.82129753, -0.01930911],
       [ 0.64877284,  0.82423294, -0.01935484],
       [ 0.64952326,  0.80892742, -0.03362121],
       [ 0.64965653,  0.80432618, -0.03364923],
       [ 0.64978689,  0.79971641, -0.033786  ],
       [ 0.64440036,  0.81944329,  0.04302012],
       [ 0.64520204,  0.78625327, -0.02152   ],
       [ 0.63484842,  0.82259595, -0.00439513],
       [ 0.63522363,  0.80940622, -0.02327977],
       [ 0.6076892 ,  0.84276557,  0.08261181],
       [ 0.61270493,  0.75246572, -0.04764055],
       [ 0.57774049,  0.87272847,  0.04224254],
       [ 0.56469619,  0.73545492, -0.07852706],
       [ 0.5946523 ,  0.82358366, -0.03812525],
       [ 0.53232223,  0.79264587, -0.05852298],
       [ 0.59378326,  0.811351  , -0.05325757],
       [ 0.51767898,  0.80220228, -0.06679534],
       [ 0.60058713,  0.80354351, -0.04516131],
       [ 0.52152324,  0.81429291, -0.068

In [27]:

pd = PoseDetector(detectionCon=0.5)
def f1():
    pd.results = None
    mf2 = pd.findPose(mf, draw=False)
    thisn, thisn2 = pd.findPosition(mf2, bboxWithHands=False)
    return (np.array(thisn)[:, 1:] / np.array([1280, 720, 1280]))

def f2():
    pd.results = None
    mf2 = pd.findPose(mf, draw=False)
    return np.array([[i.x, i.y, i.z] for i in pd.results.pose_landmarks.landmark])

In [28]:
from time import time
st = time()
f1()
print(time() - st)

st = time()
f2()
print(time() - st)


IndexError: too many indices for array: array is 1-dimensional, but 2 were indexed