## 1. Import Libraries / Dependencies

In [1]:
import cv2
import numpy as np
from itertools import chain
import traceback
from time import time

In [2]:
from cvzone.HandTrackingModule import HandDetector
from cvzone.FaceDetectionModule import FaceDetector
from cvzone.PoseModule import PoseDetector
from cvzone import FPS

In [3]:
# Real Time Testing 
from collections import deque
from concurrent.futures import ThreadPoolExecutor

## 2. Feature Extraction (Hand+Face+Pose Detection)

In [4]:
# Flatten a 2d np array into 1d array
def flatten2dList(arr, dataType=int):
    return np.fromiter(chain.from_iterable(arr), dataType)

In [5]:
# Get the largest absolute value in an np array
def getAbsLargestVal(arr):
    return np.max(np.abs(arr))

In [6]:
# Offset and normalize the landmark list
# Returns a 1d numpy array
def preprocess_landmarks(landmark_list):    
    landmark_list = np.array(landmark_list, dtype=float)
    origin = landmark_list[0]
    
    # Offset every point with respect to the first point
    # Convert to 1D-array
    new_landmark_list = (landmark_list - origin).ravel()
    
    # Get highest absolute value
    largest_value = getAbsLargestVal(new_landmark_list)
    
    # Normalization
    if largest_value != 0:
        return new_landmark_list / largest_value
    return new_landmark_list

In [7]:
# Offset and normalize a BBOX list (BBOX = Bounding Box, used in face and hand detection)
# Returns a 1d numpy array
def preprocess_bbox(bbox, frameSize):
    bbox = np.array(bbox, dtype=float)
    # Convert 3rd and 4th element into coordinates instead of width/height
    bbox[2] = bbox[0] + bbox[2]
    bbox[3] = bbox[1] + bbox[3]

    # Normalize against frame size
    bbox[0] /= frameSize[0]
    bbox[1] /= frameSize[1]
    bbox[2] /= frameSize[0]
    bbox[3] /= frameSize[1]

    return bbox

In [8]:
# Normalize a center vertex (a list of 2 elements)
# Returns a 1d numpy array
def preprocess_center(center, frameSize):
    center = np.array(center, dtype=float)
    center[0] /= frameSize[0]
    center[1] /= frameSize[1]
    return center

In [9]:
# Preprocess (Offset and normalize) the body's landmark list, bbox and center
def preprocess_body_part(bodyPart, frameSize):
    bodyPart['lmList'] = preprocess_landmarks(bodyPart['lmList'])
    bodyPart['bbox'] = preprocess_bbox(bodyPart['bbox'], frameSize)
    bodyPart['center'] = preprocess_center(bodyPart['center'], frameSize)
    return bodyPart

In [10]:
# Function to generate empty/placeholder data for a hand 
# Used when a hand is not detected in frame
def generate_empty_hand(type):
    return {
        'lmList': np.zeros(21 * 3, dtype=int), 
        'bbox': np.zeros(4, dtype=float), 
        'center': np.zeros(2, dtype=float), 
        'type': type
    }

In [11]:
# Select the best matching face, aka the one with the best score (clarity)
# and closest to the center of the screen
# Since the Neural network will be design to only accept one face
def select_best_matching_face(faces, frameSize):
    if not faces or len(faces) == 0:
        return False
    elif len(faces) == 1:
        return faces[0]
    
    def difference(a, b):
        return ((a[0] - b[0])**2) + ((a[1] - b[1])**2)
    
    frameCenter = (frameSize[0] / 2, frameSize[1] / 2)

    best_score = faces[0]
    best_center = faces[0]
    center_diff = difference(faces[0]['center'], frameCenter)

    for each in faces:
        if difference(each['center'], frameCenter) < center_diff:
            best_center = each
        if each['score'][0] > best_score['score'][0]:
            best_score = each
    
    if best_center['score'][0] > 0.5:
        return best_center
    return best_score

In [12]:
# Flatten everything
def flattenDetectionResult(obj):
    # return np.fromiter(chain.from_iterable([obj['lmList'], obj['bbox'], obj['center']]), float)
    return np.concatenate([obj['lmList'], obj['bbox'], obj['center']])

## 3. Preparation for Data Collection

In [13]:
# Paths
from constants import TRAININGS_PER_LABEL, FRAMES_PER_TRAINING, KEYPOINTS_PER_FRAME

In [14]:
from files_io import readActionLabels, initActionLabelFolders

action_labels = readActionLabels()
initActionLabelFolders(action_labels)
action_labels

['hello', 'thank you', 'help']

## 4. Data Collection

### 4.1 Feature Extraction Functions

In [15]:
# Initialize cam as global object
cam = None

In [16]:
# Detects hands, face & pose, 
# convert them into normalized landmark/keypoint coordinates in a 1D-array, 
# and also returns the frame with the landmark connections drawn onto it

# Serial/Unparallelised version (Old version)
def featureExtractionV1(handDetector, faceDetector, poseDetector, frame):
    results = {}
    frameSize = (frame.shape[1], frame.shape[0])

    times = [time()]

    # Hand Detection
    results['hands'], frame = handDetector.findHands(frame, draw=True)
    times.append(time())
    if not results['hands']:
        results['hands'] = [generate_empty_hand('Left'), generate_empty_hand('Right')]
    elif len(results['hands']) == 1:
        if (results['hands'][0]['type'] == 'Left'):
            results['hands'][0] = preprocess_body_part(results['hands'][0], frameSize)
            results['hands'].append(generate_empty_hand('Right'))
        else:
            results['hands'][0] = preprocess_body_part(results['hands'][0], frameSize)
            results['hands'].insert(0, generate_empty_hand('Left'))                         
    else:
        results['hands'][0] = preprocess_body_part(results['hands'][0], frameSize)
        results['hands'][1] = preprocess_body_part(results['hands'][1], frameSize)
    times.append(time())

    # Pose Detection
    # **We only use the first 23 out of the total 33 landmark points 
    #   as those represent the lower half body and are irrelevant to sign language interpretation
    frame = poseDetector.findPose(frame, draw=True)
    times.append(time())
    results['pose'] = {}
    results['pose']['lmList'], tempPoseBbox = poseDetector.findPosition(frame, bboxWithHands=False)
    if results['pose']['lmList'] and tempPoseBbox:
        results['pose']['lmList'] = results['pose']['lmList'][:23]
        results['pose']['bbox'] = tempPoseBbox['bbox']
        results['pose']['center'] = tempPoseBbox['center']
        results['pose'] = preprocess_body_part(results['pose'], frameSize)
    else:
        results['pose']['lmList'] = np.zeros(23 * 3, dtype=int)
        results['pose']['bbox'] = np.zeros(4, dtype=float)
        results['pose']['center'] = np.zeros(2, dtype=float)
        
    times.append(time())

    
    # Face Detection
    frame, results['face'] = faceDetector.findFaces(frame, draw=True)
    times.append(time())
    if results['face']:
        results['face'] = select_best_matching_face(results['face'], frameSize)
        results['face']['bbox'] = preprocess_bbox(results['face']['bbox'], frameSize)
        results['face']['center'] = preprocess_center(results['face']['center'], frameSize)
    else:
        results['face'] = {
            'bbox': np.zeros(4, dtype=float), 
            'center': np.zeros(2, dtype=float)
        }
    times.append(time())

    # Calculate relative distance between body parts
    results['relative'] = {}
    results['relative']['faceHand0'] = results['face']['center'] - results['hands'][0]['center']
    results['relative']['faceHand1'] = results['face']['center'] - results['hands'][1]['center']

    # Convert results into 1D-array
    detectionResults = flatten2dList([
        flattenDetectionResult(results['hands'][0]), 
        flattenDetectionResult(results['hands'][1]), 
        flattenDetectionResult(results['pose']), 
        results['face']['bbox'], 
        results['face']['center'],
        results['relative']['faceHand0'],
        results['relative']['faceHand1']
    ], dataType=float)

    return detectionResults, frame, times

In [17]:


# Detects hands, face & pose, 
# convert them into normalized landmark/keypoint coordinates in a 1D-array, 
# and also returns the frame with the landmark connections drawn onto it

# Improved/Parallelised version
def featureExtractionV3(handDetector, faceDetector, poseDetector, frame, draw=True):
    def detectHands(handDetector, frame, frameSize, draw):
        results = None
        # Hand Detection
        if (draw):
            results, frame = handDetector.findHands(frame, draw=draw)
        else:
            results = handDetector.findHands(frame, draw=draw)

        if not results:
            results = [generate_empty_hand('Left'), generate_empty_hand('Right')]
        elif len(results) == 1:
            if (results[0]['type'] == 'Left'):
                results[0] = preprocess_body_part(results[0], frameSize)
                results.append(generate_empty_hand('Right'))
            else:
                results[0] = preprocess_body_part(results[0], frameSize)
                results.insert(0, generate_empty_hand('Left'))                         
        else:
            results[0] = preprocess_body_part(results[0], frameSize)
            results[1] = preprocess_body_part(results[1], frameSize)
        return results

    # Pose Detection
    # **We only use the first 23 out of the total 33 landmark points 
    #   as those represent the lower half body and are irrelevant to sign language interpretation
    def detectPose(poseDetector, frame, draw):
        frame = poseDetector.findPose(frame, draw=draw)
        results, _ = poseDetector.findPosition(frame, bboxWithHands=False)
        if results:
            results = preprocess_landmarks(results[:23])
        else:
            results = np.zeros(23, dtype=int)
        return results
    
    # Face Detection
    def detectFace(faceDetector, frame, frameSize, draw):
        frame, results = faceDetector.findFaces(frame, draw=draw)
        if results:
            results = select_best_matching_face(results, frameSize)
            results['bbox'] = preprocess_bbox(results['bbox'], frameSize)
            results['center'] = preprocess_center(results['center'], frameSize)
        else:
            results = {
                'bbox': np.zeros(4, dtype=float), 
                'center': np.zeros(2, dtype=float)
            }
        return results

    frameSize = (frame.shape[1], frame.shape[0])
    with ThreadPoolExecutor() as executor:
        t1 = executor.submit(detectHands, handDetector, frame, frameSize, draw)
        t2 = executor.submit(detectPose, poseDetector, frame, draw)
        t3 = executor.submit(detectFace, faceDetector, frame, frameSize, draw)
        
        # Convert results into 1D-array
        detectionResults = flatten2dList([
            flattenDetectionResult(t1.result()[0]), 
            flattenDetectionResult(t1.result()[1]), 
            t2.result(), 
            t3.result()['bbox'],
            t3.result()['center'],
            t3.result()['center'] - t1.result()[0]['center'],
            t3.result()['center'] - t1.result()[1]['center']
        ], dataType=float)

        return detectionResults, frame

### 4.2  UI Functions

In [18]:
# Read one frame from camera
def readFrame():
    success, frame = cam.read()
    if not success: 
        raise Exception("No Frames Read")
    return cv2.flip(frame, 1)

In [38]:
# Pause recording upon "Space"
def pauseWhenSpace(trainingNum, actionStr):
    while True:
        frame = readFrame()
        cv2.putText(frame, f'Training #{trainingNum + 1} for \'{actionStr}\'', (15, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 3)
        cv2.putText(frame, f'Pausing...', (40, 100), cv2.FONT_HERSHEY_SIMPLEX, 1, (20, 255, 125), 3)
        cv2.imshow("Sign Language Recognition Prototype", frame)

        # If pressed resume, do countdown
        keyPressed = cv2.waitKey(10)
        if keyPressed == 32:    # 32 == Space
            pause_again = False
            
            for i in range(3):
                for _ in range(10):
                    temp_frame = readFrame()
                    cv2.putText(temp_frame, f'Training #{trainingNum + 1} for \'{actionStr}\'', (15, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 3)
                    cv2.putText(temp_frame, f'Resuming in {3 - i}', (20, 70), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 120), 3)
                    cv2.imshow("Sign Language Recognition Prototype", temp_frame)
                    tempKey = cv2.waitKey(100)

                    if (tempKey == 27):
                        raise Exception("Finished")
                    # If pressed paused again, stop resuming and continue pausing
                    elif tempKey == 32:
                        pause_again = True
                        break
                    elif tempKey == 122:    # Pressed z
                        trainingNum -= 1
                        pause_again = True
                        break
                    elif tempKey == 120:    # Pressed x
                        trainingNum += 1
                        pause_again = True
                        break
                if pause_again:
                    break

            if not pause_again:
                return trainingNum
            
        elif keyPressed == 27:
            raise Exception("Finished")
        elif keyPressed == 122:    # Pressed z
            trainingNum -= 1
        elif keyPressed == 120:    # Pressed x
            trainingNum += 1

In [39]:
# Display countdown (1, 2, 3)
def countdownFromThree(trainingNum, actionStr):
    # Count down 3 seconds on every new training
    for i in range(3):
        for _ in range(10):
            frame = readFrame()
            
            cv2.putText(frame, f'Training #{trainingNum + 1} for \'{actionStr}\'', (15, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 3)
            cv2.putText(frame, f'Next Training in {3 - i}', (20, 70), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 3)
            cv2.imshow("Sign Language Recognition Prototype", frame)
            
            tempKey = cv2.waitKey(100)
            if (tempKey == 27):     # Pressed Esc
                raise Exception("Finished")
            elif tempKey == 32:     # Pressed Space
                return pauseWhenSpace(trainingNum, actionStr)
            elif tempKey == 122:    # Pressed z
                trainingNum -= 1
            elif tempKey == 120:    # Pressed x
                trainingNum += 1
                
    return trainingNum


### 4.3 Recording Label (Create Training Data)

In [40]:
# Specify which action to record
action = action_labels[2]
action

'help'

In [41]:
# Detectors
handDetector = HandDetector(detectionCon=0.5, maxHands=2)
faceDetector = FaceDetector(minDetectionCon=0.5)
poseDetector = PoseDetector(detectionCon=0.5)

In [None]:
from files_io import saveKeypoints

In [57]:

cam = cv2.VideoCapture(0, cv2.CAP_DSHOW)
cam.set(cv2.CAP_PROP_FRAME_WIDTH, 1280)
cam.set(cv2.CAP_PROP_FRAME_HEIGHT, 720)

try:
    trainingResults = np.zeros((TRAININGS_PER_LABEL, FRAMES_PER_TRAINING, 240))
    
    training_num = 0
    while training_num < TRAININGS_PER_LABEL:
        
        frame_num = 0
        while frame_num < FRAMES_PER_TRAINING:

            # Countdown
            if frame_num == 0:
                training_num = countdownFromThree(training_num, action)
        
            # Read from camera
            frame = readFrame()

            detectionResults, frame = featureExtractionV3(
                handDetector, faceDetector, poseDetector, frame)
            
            # Show resulting frame
            cv2.putText(frame, f'Training #{training_num + 1} for \'{action}\'', (15, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 3)
            cv2.imshow("Sign Language Recognition Prototype", frame)     

            # Save the results
            trainingResults[training_num][frame_num] = detectionResults

            keyPressed = cv2.waitKey(10)
            # Stop Program when pressed 'Esc'
            if (keyPressed == 27):
                raise Exception("Finished")
            
            frame_num += 1
        
        training_num += 1

    # After all frames are finished for each training:
    # save as .npy
        
    # IMPORTANT: THIS LINE IS DISABLED IN CASE OF ACCIDENTALLY OVERWRITING DATA
    # Enable it ONLY during data collection
    # saveKeypoints(action, "0-99", trainingResults)

except Exception as e:
    print(e)
    traceback.print_exc()

finally:
    cam.release()
    cv2.destroyAllWindows()

index 15 is out of bounds for axis 0 with size 15


Traceback (most recent call last):
  File "C:\Users\USER\AppData\Local\Temp\ipykernel_21972\1803852624.py", line 29, in <module>
    trainingResults[training_num][frame_num] = detectionResults
IndexError: index 15 is out of bounds for axis 0 with size 15


In [55]:
trainingResults.nbytes / 1024 / 1024

2.74658203125

In [51]:
for i, each in enumerate(trainingResults[:10]):
    print(i, np.sum(each))

0 219.4406939479077
1 86.07244479420265
2 -52.84934236784753
3 -78.00540148650464
4 0.0
5 0.0
6 0.0
7 -210.5758248798489
8 0.0
9 0.0


In [None]:
cam = cv2.VideoCapture(0, cv2.CAP_DSHOW)
cam.set(cv2.CAP_PROP_FRAME_WIDTH, 1280)
cam.set(cv2.CAP_PROP_FRAME_HEIGHT, 720)
while True:
    frame = readFrame()
    cv2.imshow("Sign Language Recognition Prototype", frame) 
    
    keyPressed = cv2.waitKey(1)
    # Stop Program when pressed 'Esc'
    if (keyPressed == 27):
        raise Exception("Finished")

In [30]:

cam.release()
cv2.destroyAllWindows()

In [67]:
print(list(i.shape for i in detectionResults))

[(69,), (69,), (75,), (4,), (2,), (2,), (2,), (2,)]


In [1]:
detectionResults.shape

NameError: name 'detectionResults' is not defined

In [75]:
ogResults['hands'][0]

{'lmList': [[896, 328, 0],
  [878, 304, -12],
  [869, 270, -22],
  [862, 245, -32],
  [853, 224, -42],
  [908, 260, -30],
  [924, 223, -45],
  [932, 199, -55],
  [942, 178, -61],
  [927, 272, -33],
  [950, 255, -52],
  [970, 244, -61],
  [986, 234, -66],
  [940, 289, -37],
  [932, 296, -54],
  [909, 310, -51],
  [891, 319, -46],
  [947, 307, -41],
  [939, 313, -54],
  [920, 323, -50],
  [905, 330, -45]],
 'bbox': (853, 178, 133, 152),
 'center': (919, 254),
 'type': 'Left'}

In [None]:
cam.release()
cv2.destroyAllWindows()

In [None]:
np.load(os.path.join(KEYPOINTS_PATH, action, '0.npy')).shape

(15, 219)

## 5. Real Time Testing

In [None]:
from collections import deque
from concurrent.futures import ThreadPoolExecutor
from time import time
from cvzone import FPS

In [None]:
cam = cv2.VideoCapture(0, cv2.CAP_DSHOW)
cam.set(cv2.CAP_PROP_FRAME_WIDTH, 1280)
cam.set(cv2.CAP_PROP_FRAME_HEIGHT, 720)

# Detectors
handDetector = HandDetector(detectionCon=0.5, maxHands=2)
faceDetector = FaceDetector(minDetectionCon=0.5)
poseDetector = PoseDetector(detectionCon=0.5)

fpsReader = FPS()

timeStats = []

try:

    keypointsHistory = deque()
    predictionHistory = deque()
    detectionThreshold = 1.0

    lastPredictionTime = time()
    predictionCooldown = 1

    while True:
        startTime = time()

        # Read from camera
        success, frame = cam.read()
        if not success: 
            raise Exception("No Frames Read")
        frame = cv2.flip(frame, 1)

        # Pose Detection
        detectionResults, frame = featureExtraction(
            handDetector, faceDetector, poseDetector, frame)
        
        # Semantic Prediction
        keypointsHistory.append(detectionResults)
        if len(keypointsHistory) > FRAMES_PER_TRAINING:
            keypointsHistory.popleft()

            
            # if time() > lastPredictionTime + predictionCooldown:
            #     predictionResults = model.predict(
            #         np.expand_dims(keypointsHistory, axis=0), 
            #         verbose=0, 
            #         use_multiprocessing=True, 
            #         workers=4
            #         )[0]
            #     predWord = action_labels[np.argmax(predictionResults)]
            #     predAccuracy = predictionResults[np.argmax(predictionResults)]

            #     if predAccuracy >= detectionThreshold:
            #         lastPredictionTime = time()
                    
            #         predictionHistory.append(predWord)
            #         if len(predictionHistory) > 5:
            #             predictionHistory.popleft()
        
        cv2.putText(frame, ', '.join(predictionHistory), (15, 70), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 3)
        
        fps, frame = fpsReader.update(frame,pos=(950,80),color=(0,255,0),scale=5,thickness=5)


        # Show resulting frame
        # cv2.putText(frame, f'Training #{training + 1} for \'{action}\'', (15, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 3)
        cv2.imshow("Sign Language Recognition Prototype", frame)     

        timeStats.append(time() - startTime)

        keyPressed = cv2.waitKey(15)
        # Stop Program when pressed 'Esc'
        if (keyPressed == 27):
            raise Exception("Finished")


except Exception as e:
    cam.release()
    cv2.destroyAllWindows()
    print(e)

cam.release()
cv2.destroyAllWindows()

name 'featureExtraction' is not defined


In [None]:
timeStats[10:]

[0.07875680923461914,
 0.07371330261230469,
 0.0756688117980957,
 0.07382822036743164,
 0.07182097434997559,
 0.14654779434204102,
 0.13663458824157715,
 0.13762927055358887,
 0.13364148139953613,
 0.173537015914917,
 0.16253042221069336,
 0.16656756401062012,
 0.1545863151550293,
 0.1565384864807129,
 0.16048026084899902,
 0.1515054702758789,
 0.1635282039642334,
 0.17353558540344238,
 0.15356874465942383,
 0.15557503700256348,
 0.15255475044250488,
 0.1715106964111328,
 0.15854263305664062,
 0.15158939361572266,
 0.1595776081085205,
 0.16945433616638184]

In [None]:
from concurrent.futures import ThreadPoolExecutor


# Detects hands, face & pose, 
# convert them into normalized landmark/keypoint coordinates in a 1D-array, 
# and also returns the frame with the landmark connections drawn onto it
def featureExtractionV2(handDetector, faceDetector, poseDetector, frame):
    results = {}
    frameSize = (frame.shape[1], frame.shape[0])

    def detectHands(frame, handDetector, frameSize):
        results = {}
        # Hand Detection
        results['hands'], frame = handDetector.findHands(frame, draw=True)
        if not results['hands']:
            results['hands'] = [generate_empty_hand('Left'), generate_empty_hand('Right')]
        elif len(results['hands']) == 1:
            results['hands'][0] = preprocess_body_part(results['hands'][0], frameSize)
            
            if (results['hands'][0]['type'] == 'Left'):
                results['hands'].append(generate_empty_hand('Right'))
            else:
                results['hands'].insert(0, generate_empty_hand('Left'))                         
        else:
            results['hands'][0] = preprocess_body_part(results['hands'][0], frameSize)
            results['hands'][1] = preprocess_body_part(results['hands'][1], frameSize)
        return results['hands']

    def detectPose(frame, poseDetector, frameSize):
        results = {}
        # Pose Detection
        # * We only use the first 23 out of the total 33 landmark points 
        #   as those represent the lower half body and are irrelevant
        frame = poseDetector.findPose(frame, draw=True)
        results['pose'] = {}
        results['pose']['lmList'], tempPoseBbox = poseDetector.findPosition(frame, bboxWithHands=False)
        if results['pose']['lmList'] and tempPoseBbox:
            results['pose']['lmList'] = results['pose']['lmList'][:23]
            results['pose']['bbox'] = tempPoseBbox['bbox']
            results['pose']['center'] = tempPoseBbox['center']
            results['pose'] = preprocess_body_part(results['pose'], frameSize)
        else:
            results['pose']['lmList'] = np.zeros(23 * 3, dtype=int)
            results['pose']['bbox'] = np.zeros(4, dtype=float)
            results['pose']['center'] = np.zeros(2, dtype=float)
        return results['pose']
            

    def detectFace(frame, faceDetector, frameSize):
        results = {}
        # Face Detection
        frame, results['face'] = faceDetector.findFaces(frame, draw=True)
        if results['face']:
            results['face'] = select_best_matching_face(results['face'], frameSize)
            results['face']['bbox'] = preprocess_bbox(results['face']['bbox'], frameSize)
            results['face']['center'] = preprocess_center(results['face']['center'], frameSize)
        else:
            results['face'] = {
                'bbox': np.zeros(4, dtype=float), 
                'center': np.zeros(2, dtype=float)
            }
        return results['face']
    
    with ThreadPoolExecutor() as executor:
        # t1 = executor.submit(detectHands, frame, handDetector, frameSize)
        # t2 = executor.submit(detectPose, frame, poseDetector, frameSize)
        t3 = executor.submit(detectFace, frame, faceDetector, frameSize)


        # Convert results into 1D-array
        detectionResults = np.concatenate([
            flattenDetectionResult(t1.result()[0]), 
            flattenDetectionResult(t1.result()[1]), 
            flattenDetectionResult(t2.result()), 
            t3.result()['bbox'], 
            t3.result()['center']
        ])

        return detectionResults, frame

In [None]:
cam = cv2.VideoCapture(0, cv2.CAP_DSHOW)
cam.set(cv2.CAP_PROP_FRAME_WIDTH, 1280)
cam.set(cv2.CAP_PROP_FRAME_HEIGHT, 720)

fpsReader = FPS()

timeStats = []

try:
    initialTime = time()
    while True:
        startTime = time()

        # Read from camera
        success, frame = cam.read()
        
        # Pose Detection
        detectionResults, frame = featureExtractionV2(
            handDetector, faceDetector, poseDetector, frame)
        
        # fps, frame = fpsReader.update(frame,pos=(50,80),color=(0,255,0),scale=5,thickness=5)


        # Show resulting frame
        # cv2.putText(frame, f'Training #{training + 1} for \'{action}\'', (15, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 3)
        cv2.imshow("Sign Language Recognition Prototype", frame)     

        timeStats.append(time() - startTime)

        keyPressed = cv2.waitKey(15)
        # Stop Program when pressed 'Esc'
        if (keyPressed == 27):
            raise Exception("Finished")
        
        # if time() - initialTime > 10:
        #     raise Exception()


except Exception as e:
    cam.release()
    cv2.destroyAllWindows()
    print(e)

cam.release()
cv2.destroyAllWindows()

Finished


In [None]:
np.array(timeStats[1:]).mean()

0.05556477281384002

In [None]:
import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))


Num GPUs Available:  0


In [None]:
np.array(timeStats[1:]).mean()

0.016924326368373075

In [None]:
np.array(timeStats[1:]).mean()

0.09001387300945464

In [None]:
np.array(timeStats[1:]).mean()

0.06750456676926724

In [None]:
cam = cv2.VideoCapture(0, cv2.CAP_DSHOW)
cam.set(cv2.CAP_PROP_FRAME_WIDTH, 1280)
cam.set(cv2.CAP_PROP_FRAME_HEIGHT, 720)

success, frame = cam.read()

In [None]:
type(frame)

numpy.ndarray

In [None]:

cam.release()
cv2.destroyAllWindows()

In [58]:
from collections import deque
from concurrent.futures import ThreadPoolExecutor
from time import time
from cvzone import FPS

# Detects hands, face & pose, 
# convert them into normalized landmark/keypoint coordinates in a 1D-array, 
# and also returns the frame with the landmark connections drawn onto it

# Serial/Unparallelised version (Old version)
def featureExtractionV3(handDetector, faceDetector, poseDetector, frame, draw=True):
    def detectHands(handDetector, frame, frameSize, draw):
        results = {}
        # Hand Detection
        if (draw):
            results['hands'], frame = handDetector.findHands(frame, draw=draw)
        else:
            results['hands'] = handDetector.findHands(frame, draw=draw)

        if not results['hands']:
            results['hands'] = [generate_empty_hand('Left'), generate_empty_hand('Right')]
        elif len(results['hands']) == 1:
            if (results['hands'][0]['type'] == 'Left'):
                # results['hands'][0] = preprocess_body_part(results['hands'][0], frameSize)
                results['hands'].append(generate_empty_hand('Right'))
            else:
                # results['hands'][0] = preprocess_body_part(results['hands'][0], frameSize)
                results['hands'].insert(0, generate_empty_hand('Left'))                         
        # else:
        #     results['hands'][0] = preprocess_body_part(results['hands'][0], frameSize)
        #     results['hands'][1] = preprocess_body_part(results['hands'][1], frameSize)
        return results['hands']

    # Pose Detection
    # **We only use the first 23 out of the total 33 landmark points 
    #   as those represent the lower half body and are irrelevant to sign language interpretation
    def detectPose(poseDetector, frame, frameSize, draw):
        results = {}
        frame = poseDetector.findPose(frame, draw=draw)
        results['pose'] = {}
        results['pose']['lmList'], tempPoseBbox = poseDetector.findPosition(frame, bboxWithHands=False)
        if results['pose']['lmList'] and tempPoseBbox:
            results['pose']['lmList'] = results['pose']['lmList'][:23]
            results['pose']['bbox'] = tempPoseBbox['bbox']
            results['pose']['center'] = tempPoseBbox['center']
            results['pose'] = preprocess_body_part(results['pose'], frameSize)
        else:
            results['pose']['lmList'] = np.zeros(23 * 3, dtype=int)
            results['pose']['bbox'] = np.zeros(4, dtype=float)
            results['pose']['center'] = np.zeros(2, dtype=float)
        return results['pose']
    
    # Face Detection
    def detectFace(faceDetector, frame, frameSize, draw):
        results = {}
        frame, results['face'] = faceDetector.findFaces(frame, draw=draw)
        if results['face']:
            results['face'] = select_best_matching_face(results['face'], frameSize)
            results['face']['bbox'] = preprocess_bbox(results['face']['bbox'], frameSize)
            results['face']['center'] = preprocess_center(results['face']['center'], frameSize)
        else:
            results['face'] = {
                'bbox': np.zeros(4, dtype=float), 
                'center': np.zeros(2, dtype=float)
            }
        return results['face']

    frameSize = (frame.shape[1], frame.shape[0])
    with ThreadPoolExecutor() as executor:
        t1 = executor.submit(detectHands, handDetector, frame, frameSize, draw)
        t2 = executor.submit(detectPose, poseDetector, frame, frameSize, draw)
        t3 = executor.submit(detectFace, faceDetector, frame, frameSize, draw)

        th0_lmList = executor.submit(preprocess_landmarks, t1.result()[0]['lmList'])
        th0_bbox = executor.submit(preprocess_bbox, t1.result()[0]['bbox'], frameSize)
        th0_center = executor.submit(preprocess_center, t1.result()[0]['center'], frameSize)
        
        th1_lmList = executor.submit(preprocess_landmarks, t1.result()[1]['lmList'])
        th1_bbox = executor.submit(preprocess_bbox, t1.result()[1]['bbox'], frameSize)
        th1_center = executor.submit(preprocess_center, t1.result()[1]['center'], frameSize)
        
        t_flattenHand0 = executor.submit(np.concatenate((th0_lmList.result(), th0_bbox.result(), th0_center.result())))
        t_flattenHand1 = executor.submit(np.concatenate((th1_lmList.result(), th1_bbox.result(), th1_center.result())))

        # Convert results into 1D-array
        detectionResults = flatten2dList([
            t_flattenHand0.result(),
            t_flattenHand1.result()
            flattenDetectionResult(t2.result()), 
            t3.result()['bbox'], 
            t3.result()['center'],
            t3.result()['center'] - t1.result()[0]['center'],
            t3.result()['center'] - t1.result()[1]['center']
        ], dataType=float)

        return detectionResults, frame

SyntaxError: invalid syntax (6389559.py, line 89)

In [65]:
cam = cv2.VideoCapture(0, cv2.CAP_DSHOW)
cam.set(cv2.CAP_PROP_FRAME_WIDTH, 1280)
cam.set(cv2.CAP_PROP_FRAME_HEIGHT, 720)

fpsReader = FPS()

timeStats = []

try:
    initialTime = time()
    while True:
        startTime = time()

        # Read from camera
        success, frame = cam.read()
        
        # Pose Detection
        detectionResults, frame, times = featureExtraction(
            handDetector, faceDetector, poseDetector, frame)
        
        fps, frame = fpsReader.update(frame,pos=(50,80),color=(0,255,0),scale=5,thickness=5)

        cv2.imshow("Sign Language Recognition Prototype", frame)     

        timeStats.append(time() - startTime)

        keyPressed = cv2.waitKey(10)
        if (keyPressed == 27):
            raise Exception("Finished")
        
        # if time() - initialTime > 10:
        #     raise Exception()


except Exception as e:
    print(e)
    traceback.print_exc()
finally:
    cam.release()
    cv2.destroyAllWindows()

# cam.release()
# cv2.destroyAllWindows()

Finished


Traceback (most recent call last):
  File "C:\Users\USER\AppData\Local\Temp\ipykernel_2896\2453371629.py", line 29, in <module>
    raise Exception("Finished")
Exception: Finished


In [66]:
times

[1698841322.1600137,
 1698841322.217857,
 1698841322.218854,
 1698841322.2657285,
 1698841322.267724,
 1698841322.2727106,
 1698841322.2727106]

In [67]:
for i in range(len(times) - 1):
    print(times[i + 1] - times[i])

0.05784320831298828
0.0009970664978027344
0.0468745231628418
0.001995563507080078
0.00498652458190918
0.0
