# Sign Language Detection Project



In [167]:
# %pip uninstall protobuf -y
# %pip install protobuf

In [168]:
# %pip install mediapipe==0.8.8.1
# versions: 0.9.1.0, 0.9.2.1, 0.9.3.0, 0.10.0, 0.10.1, 0.10.2, 0.10.3, 0.10.5, 0.10.7, 0.10.8, 0.10.9, 0.10.10, 0.10.11, 0.10.13, 0.10.14)

In [169]:
# tensorflow-gpu 
# %pip install mediapipe sklearn matplotlib
# %pip install tensorflow==2.10.0 mediapipe scikit-learn matplotlib
# %pip install mediapipe scikit-learn matplotlib

In [172]:
# %pip install tensorflow[and-cuda]

In [170]:
import cv2
import numpy as np
import os
from matplotlib import pyplot as plt
import time
import mediapipe as mp
# need to launch VS Code as Administrator to use mediapipe?
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import TensorBoard
# from tensorflow.keras.applications import ResNet50 
from sklearn.model_selection import train_test_split


tf.config.list_physical_devices('GPU')

[]

In [171]:
print(tf.__version__)

2.13.1


In [173]:
# ========= INPUT 1 =========

# Actions that we try to detect
# actions = np.array(['yes', 'no', 'PeaceAmongWorlds'])
actions = np.array(['Hi', 'Yes', 'No', 'ThankYou', 'ILoveYou', 'background', 'NoHands'])

# number of videos/sequences worth of data
no_sequences = 30

# Videos are going to be 30 frames in length
sequence_length = 30

DATA_PATH = os.path.join('MP_Data')

# training with facemesh or not
facemesh_included = False

# 6. Preprocess Data and Create Labels and Features

In [174]:
label_map = {label:num for num, label in enumerate(actions)}
label_map

{'Hi': 0,
 'Yes': 1,
 'No': 2,
 'ThankYou': 3,
 'ILoveYou': 4,
 'background': 5,
 'NoHands': 6}

In [175]:
sequences, labels = [], []
for action in actions:
    for sequence in range(no_sequences):
        window = []
        for frame_num in range(sequence_length):
            frame = np.load(os.path.join(DATA_PATH, action, str(sequence), "{}.npy".format(frame_num)))
            if facemesh_included == False:
                frame = frame[:258] # get only the first 258 points, ie delete facemesh data
            window.append(frame)
        sequences.append(window)
        labels.append(label_map[action])

In [176]:
X = np.array(sequences)
y = to_categorical(labels).astype(int)

(210, 30, 258)

In [179]:
# not being used now - mounting google drive to be able to access files in the folder
# from google.colab import drive
# drive.mount('/content/drive/MyDrive')

In [178]:
np.save(os.path.join(DATA_PATH,'X'), X)
np.save(os.path.join(DATA_PATH,'y'), y)

In [180]:
# the paths need to be updated accordingly
X = np.load(os.path.join(DATA_PATH,'X.npy'))
y = np.load(os.path.join(DATA_PATH,'y.npy'))

In [181]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05)

# 7. Build and Train LSTM Neural Network

In [182]:
log_dir = os.path.join('Logs')

In [184]:
# ========= INPUT 2 =========

neural_multiplicator = 8
activation_function = 'tanh'
camera_index = 0
number_of_classes = actions.shape[0]

if facemesh_included:
  number_of_keypoints = 1662 # 258 if no facemesh, 1662 if facemesh is included
  coefficient = 1 # if coefficient is 1, the model structure is the same as it is in the original model by Nick
  weights_name = "weights_w_facemesh_NM" + str(neural_multiplicator)
else:
  number_of_keypoints = 258 # 258 if no facemesh, 1662 if facemesh is included
  coefficient = 0.5 # coefficient 0.5 means that the number of neurons in 2nd, the 3rd and the 4th layer will be half - so as to account for the smaller input shape WHEN FACEMESH is REMOVED
  weights_name = "weights_wo_facemesh_NM" + str(neural_multiplicator)


# simplest kind of a Keras model for neural networks that are just composed of a stack of layers where each layer has exactly one input tensor and one output tensor.
model = Sequential()
# Long Short-Term Memory (LSTM) layers which are a type of recurrent neural network (RNN) suitable for sequence prediction problems
model.add(LSTM(64, return_sequences=True, activation=activation_function, input_shape=(30, number_of_keypoints)))
model.add(LSTM(int(128*coefficient*neural_multiplicator), return_sequences=True, activation=activation_function))
model.add(LSTM(int(64*coefficient*neural_multiplicator), return_sequences=False, activation=activation_function))
# These layers further process the data into a more abstract representation. They are typical feed-forward layers.
model.add(Dense(int(64*coefficient*neural_multiplicator), activation=activation_function))
model.add(Dense(32*neural_multiplicator, activation=activation_function))
 #output layer that predicts the keypoints; each node in this layer will output a value between 0 and 1, indicating the presence of each keypoint/action
model.add(Dense(number_of_classes, activation='softmax')) 

# reLU = Rectified Linear Activation Function: returns 0 if it receives any negative input, but for any positive value x it returns that value back.  It helps the model to account for non-linearity.
# Sigmoid: Outputs a value between 0 and 1, making it suitable for binary classification or multilabel classification tasks where each label is predicted independently.

In [185]:
model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['categorical_accuracy'])

early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',  
    patience=10,          
    restore_best_weights=True  
)

tb_callback = tf.keras.callbacks.TensorBoard(
    log_dir=log_dir,
    histogram_freq=0,
    write_graph=True,
    write_images=False,
    write_steps_per_second=False,
    update_freq="epoch",
    profile_batch=0,
    embeddings_freq=0,
    embeddings_metadata=None,
)

In [186]:
"""
tb_callback = TensorBoard(log_dir=log_dir)
log_dir is the path of the directory where to save the log files to be parsed by TensorBoard.
TensorBoard is a visualization tool provided with TensorFlow. This callback logs events for TensorBoard, including:
Metrics summary plots, Training graph visualization, Activation histograms, Sampled profiling
"""
# hist = model.fit(X_train, y_train, epochs=2000, validation_data=val_data, callbacks=[early_stopping])
model.fit(X_train, y_train, epochs=2000, callbacks=[tb_callback])

Epoch 1/2000
Epoch 2/2000
Epoch 3/2000
Epoch 4/2000
Epoch 5/2000
Epoch 6/2000
Epoch 7/2000
Epoch 8/2000
Epoch 9/2000
Epoch 10/2000
Epoch 11/2000
Epoch 12/2000
Epoch 13/2000
Epoch 14/2000
Epoch 15/2000
Epoch 16/2000
Epoch 17/2000
Epoch 18/2000
Epoch 19/2000
Epoch 20/2000
Epoch 21/2000
Epoch 22/2000
Epoch 23/2000
Epoch 24/2000
Epoch 25/2000
Epoch 26/2000
Epoch 27/2000
Epoch 28/2000
Epoch 29/2000
Epoch 30/2000
Epoch 31/2000
Epoch 32/2000
Epoch 33/2000
Epoch 34/2000
Epoch 35/2000
Epoch 36/2000
Epoch 37/2000
Epoch 38/2000
Epoch 39/2000
Epoch 40/2000
Epoch 41/2000
Epoch 42/2000
Epoch 43/2000
Epoch 44/2000
Epoch 45/2000
Epoch 46/2000
Epoch 47/2000
Epoch 48/2000
Epoch 49/2000
Epoch 50/2000
Epoch 51/2000
Epoch 52/2000
Epoch 53/2000
Epoch 54/2000
Epoch 55/2000
Epoch 56/2000
Epoch 57/2000
Epoch 58/2000
Epoch 59/2000
Epoch 60/2000
Epoch 61/2000
Epoch 62/2000
Epoch 63/2000
Epoch 64/2000
Epoch 65/2000
Epoch 66/2000
Epoch 67/2000
Epoch 68/2000
Epoch 69/2000
Epoch 70/2000
Epoch 71/2000
Epoch 72/2000
E

KeyboardInterrupt: 

In [187]:
model.summary()

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_21 (LSTM)              (None, 30, 64)            82688     
                                                                 
 lstm_22 (LSTM)              (None, 30, 512)           1181696   
                                                                 
 lstm_23 (LSTM)              (None, 256)               787456    
                                                                 
 dense_21 (Dense)            (None, 256)               65792     
                                                                 
 dense_22 (Dense)            (None, 256)               65792     
                                                                 
 dense_23 (Dense)            (None, 7)                 1799      
                                                                 
Total params: 2185223 (8.34 MB)
Trainable params: 2185

## 8. Make predictions

In [188]:
res = model.predict(X_test)
correct_count = 0
for i in range(len(res)):
  prediction = actions[np.argmax(res[i])]
  actual = actions[np.argmax(res[i])]
  if prediction == actual:
    correct_count += 1
  print("prediction: ", prediction)
  print("actual    : ", actual)
  print("~"*30)
print("~"*30)
print("accuracy on x_test: ", correct_count/len(res)*100, "%")



In [189]:
actions[np.argmax(res[0])]

'Yes'

## 9. Save weights

In [190]:
model.save(weights_name + ".keras")

## 10. Evaluation using confusion matrix and accuracy

In [191]:
from sklearn.metrics import multilabel_confusion_matrix, accuracy_score

In [192]:
yhat = model.predict(X_train)



In [193]:
ytrue = np.argmax(y_train, axis=1).tolist()
yhat = np.argmax(yhat, axis=1).tolist()

In [194]:
multilabel_confusion_matrix(ytrue, yhat)
# Confusion Matrix = True Positive, True Negative, False Positive, False Negative
# identify which gestures are frequently misclassified as others, providing insights into how the gestures overlap or are confused by the model

array([[[169,   0],
        [  0,  30]],

       [[172,   0],
        [  0,  27]],

       [[170,   0],
        [  0,  29]],

       [[169,   0],
        [  0,  30]],

       [[173,   0],
        [  0,  26]],

       [[170,   0],
        [  0,  29]],

       [[171,   0],
        [  0,  28]]], dtype=int64)

In [195]:
accuracy_score(ytrue, yhat)

1.0

# ToDo  
1. ~~Improve Accuracy~~
    - ~~train for more epochs (often: more accuracy, but might lead to overfitting)~~
    - ~~fine-tune hyperparameters~~
2. Integrate validation data set
3. implement image manipulations
4. put model parameters like neural multiplicator into config file (because used in B and C)
5. Integrate model visualizations from Nick's tutorial
6. Integrate TensorBoard visualization
7. integrate "if facemesh" into def extract_keypoints(results)