In [1]:
import cv2
import mediapipe as mp
import time
import numpy as np
from tensorflow.keras.models import load_model
#queue to find the right gesture
from collections import deque




In [2]:
def IdentifyGesture(prediction):
    print(prediction)
    if prediction == 0:
        return "Peace sign"
    elif prediction == 1:
        return "Tilted finger gun with thumb up"
    elif prediction == 2:
        return "Upward fist"
    elif prediction == 3:
        return "Three fingers up"
    elif prediction == 4:
        return "Crossed fingers"
    elif prediction == 5:
        return "O with fingers"
    elif prediction == 6:
        return "Upward fist with fingers forward"
    elif prediction == 7:
        return "One finger pointed up"
    elif prediction == 8:
        return "Two fingers pointing in a direction"
    else:
        return "None"

def createSquare(results, img):
    h, w, c = img.shape
    min_x, min_y = w, h
    max_x, max_y = 0, 0
    for lm in results.multi_hand_landmarks[0].landmark:
        # Convert the normalized position to pixel coordinates
        cx, cy = int(lm.x * w), int(lm.y * h)

        # Update min and max coordinates based on current landmark
        min_x, min_y = min(min_x, cx), min(min_y, cy)
        max_x, max_y = max(max_x, cx), max(max_y, cy)
    
    center_x, center_y = (min_x + max_x) // 2, (min_y + max_y) // 2

    # Determine the side length of the square by finding the maximum dimension
    side_length = max(max_x - min_x, max_y - min_y)

    # Ensure the square doesn't go outside the image boundaries
    square_half_length = side_length // 2
    new_min_x = max(center_x - square_half_length, 0)
    new_max_x = min(center_x + square_half_length, w)
    new_min_y = max(center_y - square_half_length, 0)
    new_max_y = min(center_y + square_half_length, h)

    # Adjust the square dimensions if they go beyond the image's height or width
    if new_min_x < 0:
        new_max_x -= new_min_x  # Adjust the max_x accordingly
        new_min_x = 0
    if new_min_y < 0:
        new_max_y -= new_min_y  # Adjust the max_y accordingly
        new_min_y = 0
    if new_max_x > w:
        new_min_x -= (new_max_x - w)  # Adjust the min_x accordingly
        new_max_x = w
    if new_max_y > h:
        new_min_y -= (new_max_y - h)  # Adjust the min_y accordingly
        new_max_y = h
    return new_min_x, new_min_y, new_max_x, new_max_y

def preprocessHandRegion(handRegion):
    #resize the image to the same resolution used in the dataset
    resized_hand = cv2.resize(handRegion, (64,64))
    normalized_hand = resized_hand / 255.0
    
    reshaped_hand = np.reshape(normalized_hand, (64,64, 3))
    batch_hand = np.expand_dims(reshaped_hand, axis=0)
    return batch_hand

def getHandFromImage(img,hands):
    results = hands.process(img)
    
    if results.multi_hand_landmarks:
        minX, minY, maxX, maxY = createSquare(results, img)
        if minX < maxX and minY < maxY:
            cv2.rectangle(img, (minX, minY), (maxX, maxY), (255, 255, 25), 2)
            handRegion = img[minY:maxY, minX:maxX]
        else:
            print("error in getHandFromImage")
            return None, img
        return handRegion, img

In [4]:
cap = cv2.VideoCapture(0)
mpHands = mp.solutions.hands
hands = mpHands.Hands(static_image_mode=False,
                      max_num_hands=1,
                      min_detection_confidence=0.5,
                      min_tracking_confidence=0.5)
mpDraw = mp.solutions.drawing_utils

pTime = 0
cTime = 0
ASLModel=load_model('ASLModelV3.h5')
GestureModel=load_model('ASLModelV1.h5')
queue=deque(maxlen=10)
instructionName, instructionPredicted = "", ""
while True:
    gestureName=""
    success, img = cap.read()
    if not success:
        print("empty camera frame!!!!!")
        continue
    #put the image into the hand detection model
    #if there are hands, process the coordinates to find the min and max regions
    
    
    results = hands.process(img)
    if results.multi_hand_landmarks:
        #get the dimensions for the cropped image
        minX,minY,maxX,maxY=createSquare(results,img)
        # Draw the square bounding box
        cv2.rectangle(img, (minX, minY), (maxX, maxY), (255, 255, 25), 2)
        if minX < maxX and minY < maxY:
            
            
            handRegion = img[minY:maxY, minX:maxX]
            #Preprocess the hand region for the ASL model
            preprocessedHand = preprocessHandRegion(handRegion)
            #Predict the ASL gesture given by user
            asl_prediction = ASLModel.predict(preprocessedHand) 
            
            #turning the gesture from clas number to real name and adding to video feed
            gestureName = "Detected Gesture: " + IdentifyGesture(np.argmax(asl_prediction)) 
            #If a user has held the same gesture fpr 10 frames, then add the gesture to the command
            #for the next model to be applied to see the instruction from the user
            
            queue.append(np.argmax(asl_prediction))
            if len(queue)==10:
                if len(set(queue))==1:
                    instructionName= IdentifyGesture(queue[0])
                    #cv2.imshow("Image", img)
                    while True:
                        success,img = cap.read()
                        if not success:
                            print("empty camera frame!!!!!")
                            continue
                        results = hands.process(img)
                        if results.multi_hand_landmarks:    
                            minX,minY,maxX,maxY=createSquare(results,img)
                            cv2.rectangle(img, (minX, minY), (maxX, maxY), (15, 15, 15), 2)
                            if minX < maxX and minY < maxY:
                                handRegion = img[minY:maxY, minX:maxX]
                                #Preprocess the hand region for the ASL model
                                preprocessedHand = preprocessHandRegion(handRegion)
                                #Predict the ASL gesture given by user
                                asl_prediction = ASLModel.predict(preprocessedHand) 
                                
                                #turning the gesture from clas number to real name and adding to video feed
                                instructionPredicted = "Detected Gesture: " + IdentifyGesture(np.argmax(asl_prediction))
                                print(instructionPredicted+'asdf')
                        break
                    queue.clear()
                    
    #calculations for displaying the FPS
    cTime = time.time()
    fps = 1/(cTime-pTime)
    pTime = cTime
    
    #adding all the text before displaying the image
    cv2.putText(img, gestureName, (10, 130), cv2.FONT_HERSHEY_PLAIN, 2, (100, 25, 220), 2)
    cv2.putText(img,str(int(fps)), (10,70), cv2.FONT_HERSHEY_PLAIN, 3, (255,0,255), 3)
    if instructionName!="":
        cv2.putText(img, "Final Gesture: " + instructionName+ '\nadd command', (10, 220), cv2.FONT_HERSHEY_PLAIN, 2, (100, 25, 220), 2)
        cv2.putText(img, "Final Gesture: " + instructionPredicted+ '\nadd command', (10, 230), cv2.FONT_HERSHEY_PLAIN, 1, (100, 25, 220), 2)
    
    cv2.imshow("Image", img)
    cv2.waitKey(1)
    time.sleep(0.1)

8
8
8
8
8
8
8
8
8
8
8
8
Detected Gesture: Two fingers pointing in a directionasdf
8
8
8
1
1
1
1
1
1
1
8
8
8
8
8
8
8
8
8
8
8
8
Detected Gesture: Two fingers pointing in a directionasdf
8
8
8
8
8
8
8
8
8
8
8
8
Detected Gesture: Two fingers pointing in a directionasdf
8
8
8
8
8
8
8
8
8
8
8
8
Detected Gesture: Two fingers pointing in a directionasdf
8
8
8
8
8
6
6
8
8
8
8
8
6
8
4
8
6
6
8
6
6
6
6
6
6
6
6
6
6
6
6
Detected Gesture: Upward fist with fingers forwardasdf
6
6
6
6
6
6
6
6
8
4
4
4
4
4
4
4
4
4
4
4
4
Detected Gesture: Crossed fingersasdf
4
4
4
4
4
4
4
4


KeyboardInterrupt: 

In [26]:
cap = cv2.VideoCapture(0)
pTime = 0
cTime = 0
firstDetected,secondDetected=None, None
ASLModel=load_model('ASLModelV3.h5')
queue=deque(maxlen=10)
while True:
    pTime,cTime, detected = detectHand(cap, cTime, pTime, ASLModel, 155)
    queue.append(detected)
    if len(queue)==10 and len(set(queue))==1:
        queue.clear()
        firstCommand=detected
        firstetected=firstCommand
        print(firstDetected)
        while True:
            pTime,cTime, detected = detectHand(cap, cTime, pTime, ASLModel, 0)
            queue.append(detected)
            if detected==7:
                break
            if len(queue)==10 and len(set(queue))==1:
                
                queue.clear()
                seconddetected=detected
                pTime,cTime, detected = detectHand(cap, cTime, pTime, ASLModel, 55)
                queue.append(detected)

                
                
                
                

1
1
1
1
1
1
1
1
1
1
None
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
8
6
8
6
8
6
8
8
8
6
8
7
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
6
6
4
6


KeyboardInterrupt: 

In [23]:
def detectHand(cap, cTime, pTime, ASLModel, colors):
    gestureName=""
    success, img = cap.read()
    if not success:
        print("empty camera frame!!!!!")
        
    results = hands.process(img)
    if results.multi_hand_landmarks:
        #get the dimensions for the cropped image
        minX,minY,maxX,maxY=createSquare(results,img)
        # Draw the square bounding box
        cv2.rectangle(img, (minX, minY), (maxX, maxY), (colors, colors, colors), 2)
        if minX < maxX and minY < maxY:
            handRegion = img[minY:maxY, minX:maxX]
            #Preprocess the hand region for the ASL model
            preprocessedHand = preprocessHandRegion(handRegion) 
            #Predict the ASL gesture given by user
            asl_prediction = ASLModel.predict(preprocessedHand) 

            #turning the gesture from clas number to real name and adding to video feed
            gestureName = "Detected Gesture: " + IdentifyGesture(np.argmax(asl_prediction)) 
    cTime = time.time()
    fps = 1/(cTime-pTime)
    pTime = cTime
    
    #adding all the text before displaying the image
    cv2.putText(img, gestureName, (10, 130), cv2.FONT_HERSHEY_PLAIN, 2, (colors, colors, colors), 2)
    cv2.putText(img,str(int(fps)), (10,70), cv2.FONT_HERSHEY_PLAIN, 3, (colors,50,colors), 3)
    cv2.imshow("Image", img)
    
    return pTime, cTime, gestureName