## Sign Language Detection using **Actoin Recognition** & LSTM DL Model
* Youtube Video - https://www.youtube.com/watch?v=doDUihpj6ro
* Github Repo - https://github.com/nicknochnack/ActionDetectionforSignLanguage
* Google Mediapipe - https://developers.google.com/mediapipe/solutions
* Mediapipe Github - https://github.com/google/mediapipe/tree/master

In [2]:
import cv2 as cv
import numpy as np
import os
import time
import mediapipe as mp
import matplotlib.pyplot as plt

### Keypoints using MP

In [3]:
mp_holistic = mp.solutions.holistic  # Holistic model
mp_drawing = mp.solutions.drawing_utils  # Drawing Utilities

In [11]:
def mediapipeDetection(image , model):
    image = cv.cvtColor(image , cv.COLOR_BGR2RGB)  # Color Conversion
    image.flags.writeable = False  # Image is not writeable
    
    results = model.process(image)  # Applying mediapipe model
    
    image.flags.writeable = True
    image = cv.cvtColor(image , cv.COLOR_RGB2BGR)  # Color Conversion
    
    return image , results

In [13]:
def drawLandmarks(image,results):
    mp_drawing.draw_landmarks(image,results.face_landmarks , mp_holistic.FACEMESH_CONTOURS)  # Face
    mp_drawing.draw_landmarks(image,results.pose_landmarks , mp_holistic.POSE_CONNECTIONS)  # Pose
    mp_drawing.draw_landmarks(image,results.left_hand_landmarks , mp_holistic.HAND_CONNECTIONS)  # Left Hand
    mp_drawing.draw_landmarks(image,results.right_hand_landmarks , mp_holistic.HAND_CONNECTIONS)  # Right Hand

In [14]:
def drawStyledLandmarks(image , results):
    mp_drawing.draw_landmarks(image,results.face_landmarks ,
                              mp_holistic.FACEMESH_CONTOURS,
                              mp_drawing.DrawingSpec(color=(80,110,10), thickness=  1 ,circle_radius = 1),
                              mp_drawing.DrawingSpec(color=(80,265,121), thickness = 1 ,circle_radius = 1)
                              )
    
    mp_drawing.draw_landmarks(image,results.pose_landmarks ,
                              mp_holistic.POSE_CONNECTIONS,
                              mp_drawing.DrawingSpec(color=(80,22,10), thickness=  2 ,circle_radius = 4),
                              mp_drawing.DrawingSpec(color=(80,44,121), thickness = 2 ,circle_radius = 2)
                              )
    
    mp_drawing.draw_landmarks(image,results.left_hand_landmarks ,
                              mp_holistic.HAND_CONNECTIONS,
                              mp_drawing.DrawingSpec(color=(121,22,76), thickness=  2 ,circle_radius = 4),
                              mp_drawing.DrawingSpec(color=(121,44,250), thickness = 2 ,circle_radius = 2)
                              )
    
    mp_drawing.draw_landmarks(image,results.right_hand_landmarks ,
                              mp_holistic.HAND_CONNECTIONS,
                              mp_drawing.DrawingSpec(color=(245,117,66), thickness=  2 ,circle_radius = 4),
                              mp_drawing.DrawingSpec(color=(245,66,230), thickness = 2 ,circle_radius = 2),
                              )

In [50]:
cap = cv.VideoCapture(0)

# Access mediapipe model
with mp_holistic.Holistic(min_detection_confidence=0.5 , min_tracking_confidence=0.5) as holistic:
    while cap.isOpened():
        
        # Read Feed
        ret , frame = cap.read()
        
        # Make detections
        image , results = mediapipeDetection(frame , holistic)
        
        # Draw Landmarks
        drawStyledLandmarks(image, results)
        
        # Show to screen
        cv.imshow("OpenCV Feed" , image)
        
        # Breaking the loop
        if cv.waitKey(10) & 0xFF == ord('q'):
            break
    
    cap.release()
    cv.destroyAllWindows()

### Extract Keypoint Values

In [57]:
# # Test
# pose= []
# for res in results.pose_landmarks.landmark:
#     test = np.array([res.x , res.y , res.z , res.visibility])
#     pose.append(test)

In [58]:
# # Explaiation
# np.zeros(21*3)

# pose = np.array([[res.x , res.y , res.z , res.visibility] for res in results.pose_landmarks.landmark]).flatten() if results.pose_landmarks.landmark else np.zeros(132)
# lh = np.array([[res.x , res.y , res.z] for res in results.left_hand_landmarks.landmark]).flatten() if results.left_hand_landmarks.landmark else np.zeros(21*3)
# rh = np.array([[res.x , res.y , res.z] for res in results.right_hand_landmarks.landmark]).flatten() if results.right_hand_landmarks.landmark else np.zeros(21*3)
# face = np.array([[res.x , res.y , res.z] for res in results.face_landmarks.landmark]).flatten() if results.face_landmarks.landmark else np.zeros(1404)

# print("Pose Length : ",len(pose))
# print("Pose Shape : ", pose.shape)
# print("Left Hand " , lh)
# print("Right Hand " , rh)
# print("Face " , face)


In [16]:
def extract_keypoints(results):
    pose = np.array([[res.x , res.y , res.z , res.visibility] for res in results.pose_landmarks.landmark]).flatten() if results.pose_landmarks else np.zeros(33*4)
    lh = np.array([[res.x , res.y , res.z] for res in results.left_hand_landmarks.landmark]).flatten() if results.left_hand_landmarks else np.zeros(21*3)
    rh = np.array([[res.x , res.y , res.z] for res in results.right_hand_landmarks.landmark]).flatten() if results.right_hand_landmarks else np.zeros(21*3)
    face = np.array([[res.x , res.y , res.z] for res in results.face_landmarks.landmark]).flatten() if results.face_landmarks else np.zeros(468*3)
    return np.concatenate([pose , face, lh , rh])

### Setup Folders for Collection

In [81]:
# result_test = extract_keypoints(results)
# np.save("0" , result_test)

In [6]:
# Path for explored data , np arrays
DATA_PATH = os.path.join("MP_Data")

# Actions we wanna detect
actions = np.array(['hello' , 'thanks' , 'i love you'])

# NO. of worth Videos
no_sequence = 30

# NO. of Frames
sequence_length = 10

In [88]:
# Creating Folders
for action in actions:
    for sequence in range(no_sequence):
        try:
            os.makedirs(os.path.join(DATA_PATH , action , str(sequence)))
        except: pass
        

### Collect Keypoints Sequences

In [89]:
cap = cv.VideoCapture(0)

# Access mediapipe model
with mp_holistic.Holistic(min_detection_confidence=0.5 , min_tracking_confidence=0.5) as holistic:
    
    # Loop through actions:
    for action in actions:
        # Loop through Sequences aka Videos
        for sequence in range(no_sequence):
            # Loop Through video length aka sequence length
            for frame_num in range(sequence_length):
        
                # Read Feed
                ret , frame = cap.read()
                
                # Make detections
                image , results = mediapipeDetection(frame , holistic)
                
                # Draw Landmarks
                drawStyledLandmarks(image, results)
                
                # Applying collection logic
                if frame_num == 0:
                    cv.putText(image , 'Starting Collecting' , (120,200), cv.FONT_HERSHEY_SIMPLEX , 1,(0,255,0) , 4 ,cv.LINE_AA)
                    cv.putText(image , f'Collecting Frames For {action} Sequence NO. {sequence}' , (15,30), cv.FONT_HERSHEY_SIMPLEX , 0.7,(0,0,255) , 4 ,cv.LINE_AA)
                    cv.waitKey(2000)
                else:
                    cv.putText(image , f'Collecting Frames For {action} Sequence NO. {sequence}' , (15,30), cv.FONT_HERSHEY_SIMPLEX , 0.7,(0,0,255) , 4 ,cv.LINE_AA)
                
                # Show to screen
                cv.imshow("OpenCV Feed" , image)
                
                keypoints = extract_keypoints(results)
                npy_path = os.path.join(DATA_PATH , action , str(sequence) , str(frame_num))
                np.save(npy_path , keypoints)
                
                # Breaking the loop
                if cv.waitKey(10) & 0xFF == ord('q'):
                    break
    
    cap.release()
    cv.destroyAllWindows()

### Preprocess Data and Create Labels and Features
> Note : I changed the kernal from Python 3.11 to Conda 3.8 to run the following code

In [1]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

In [6]:
# Run action cell above again
label_map = {
    label:num for num,label in enumerate(actions)
}
print(label_map)

{'hello': 0, 'thanks': 1, 'i love you': 2}


In [None]:
# Loading Stored Data
sequences , labels = [] , []
for action in actions:
    for sequence in range(no_sequence):
        window = []
        for frame_num in range(sequence_length):
            res = np.load(os.path.join(DATA_PATH , action , str(sequence) , f'{frame_num}.npy'))
            window.append(res)
            
        sequences.append(window)
        labels.append(label_map[action])

In [37]:
X = np.array(sequences)
y = to_categorical(labels).astype(int)

NameError: name 'sequences' is not defined

In [15]:
X[0]

array([[ 0.50149149,  0.5719136 , -0.95673114, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.49982572,  0.57538861, -1.0137049 , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.49951634,  0.57645965, -1.0249747 , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.49818   ,  0.58204305, -1.16991329, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.4981989 ,  0.58384401, -1.1466701 , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.49813646,  0.58457911, -1.03997016, ...,  0.        ,
         0.        ,  0.        ]])

In [13]:
y[:5]

array([[1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0]])

In [36]:
# Splitting the data
X_train , X_test , y_train , y_test = train_test_split(X , y , test_size = 0.05)

NameError: name 'X' is not defined

### Build & Train LSTM Neural Network

In [4]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM , Dense
from tensorflow.keras.callbacks import TensorBoard

In [27]:
log_dir = os.path.join('Logs')
tb_callback = TensorBoard(log_dir) # monetoring training

In [7]:
model = Sequential()
model.add(LSTM(64 , return_sequences = True , activation = "relu" , input_shape = (10,1662)))
model.add(LSTM(128 , return_sequences = True , activation = "relu" ))
model.add(LSTM(64 , return_sequences = False , activation = "relu" ))
model.add(Dense(64,activation = 'relu'))
model.add(Dense(32,activation = 'relu'))
model.add(Dense(actions.shape[0],activation = 'softmax'))

In [71]:
model.compile(optimizer = 'Adam' , loss = 'categorical_crossentropy' , metrics = ['categorical_accuracy'])

In [72]:
model.fit(X_train, y_train, epochs=200 , callbacks=tb_callback)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

<keras.callbacks.History at 0x1e11fd59520>

In [73]:
model.summary()

Model: "sequential_11"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_33 (LSTM)              (None, 10, 64)            442112    
                                                                 
 lstm_34 (LSTM)              (None, 10, 128)           98816     
                                                                 
 lstm_35 (LSTM)              (None, 64)                49408     
                                                                 
 dense_33 (Dense)            (None, 64)                4160      
                                                                 
 dense_34 (Dense)            (None, 32)                2080      
                                                                 
 dense_35 (Dense)            (None, 3)                 99        
                                                                 
Total params: 596,675
Trainable params: 596,675
Non-t

In [49]:
print(y_train.shape)

(85, 3)


### Make Predictions

In [83]:
res = model.predict(X_test)
print("Predicted Value : ",actions[np.argmax(res)])
print("True Value : ",actions[np.argmax(y_test)])
print(actions)

Predicted Value :  hello
True Value :  hello
['hello' 'thanks' 'i love you']


### Saving the Model

In [84]:
model.save("actions_predition_model.h5")

In [85]:
# del model

In [8]:
model.load_weights("actions_predition_model.h5")

### Evaluating using Confusion Matrix & Accuracy

In [9]:
from sklearn.metrics import multilabel_confusion_matrix , accuracy_score

In [34]:
y_hat = model.predict(X_test)

NameError: name 'X_test' is not defined

In [35]:
y_true = np.argmax(y_test , axis = 1).tolist()
y_hat = np.argmax(y_hat , axis = 1).tolist()

NameError: name 'y_test' is not defined

In [94]:
multilabel_confusion_matrix(y_true,y_hat)

array([[[3, 0],
        [0, 2]],

       [[2, 0],
        [0, 3]]], dtype=int64)

In [33]:
accuracy_score(y_true,y_hat)

NameError: name 'y_true' is not defined

### Testing in Real Time

In [52]:
from scipy import stats

In [53]:
colors = [(245,117,16), (117,245,16), (16,117,245)]
def prob_viz(res, actions, input_frame, colors):
    output_frame = input_frame.copy()
    for num, prob in enumerate(res):
        cv.rectangle(output_frame, (0,60+num*40), (int(prob*100), 90+num*40), colors[num], -1)
        cv.putText(output_frame, actions[num], (0, 85+num*40), cv.FONT_HERSHEY_SIMPLEX, 1, (255,255,255), 2, cv.LINE_AA)
        
    return output_frame

In [55]:
# 1. New detection variables
sequence = []
sentence = []
predictions = []
threshold = 0.5

cap = cv.VideoCapture(0)
# Set mediapipe model 
with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
    while cap.isOpened():

        # Read feed
        ret, frame = cap.read()

        # Make detections
        image, results = mediapipeDetection(frame, holistic)
        
        # Draw landmarks
        drawStyledLandmarks(image, results)
        
        # 2. Prediction logic
        keypoints = extract_keypoints(results)
        sequence.append(keypoints)
        sequence = sequence[-30:]
        
        if len(sequence) == 30:
            res = model.predict(np.expand_dims(sequence, axis=0))[0]
            predictions.append(np.argmax(res))
            
            
        #3. Viz logic
            if np.unique(predictions[-10:])[0]==np.argmax(res): 
                if res[np.argmax(res)] > threshold: 
                    
                    if len(sentence) > 0: 
                        if actions[np.argmax(res)] != sentence[-1]:
                            sentence.append(actions[np.argmax(res)])
                    else:
                        sentence.append(actions[np.argmax(res)])

            if len(sentence) > 5: 
                sentence = sentence[-5:]

            # Viz probabilities
            image = prob_viz(res, actions, image, colors)
            
        cv.rectangle(image, (0,0), (640, 40), (245, 117, 16), -1)
        cv.putText(image, ' '.join(sentence), (3,30), cv.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv.LINE_AA)
        
        # Show to screen
        cv.imshow('OpenCV Feed', image)

        # Break gracefully
        if cv.waitKey(10) & 0xFF == ord('q'):
            break
    cap.release()
    cv.destroyAllWindows()

<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.soluti

KeyboardInterrupt: 

: 