In [157]:
#%pip install tensorflow opencv-python mediapipe scikit-learn sklearn gTTS googletrans==4.0.0-rc1 pygame matplotlib

#if you're using a virtual env please make sure you install the libraries above. Else if want to install globally uncomment the code above and run it to install the libraries






In [158]:
import pygame

print("Pygame version:", pygame.__version__)

Pygame version: 2.3.0


In [159]:
import numpy as np
import cv2
import os
import mediapipe as mp
import random
import matplotlib.pyplot as plt
import copy


In [160]:
mp_holistic = mp.solutions.holistic # Holistic model
mp_drawing = mp.solutions.drawing_utils # Drawing utilities

In [161]:
def mediapipe_detection(input_image, model):
    # Convert image from BGR to RGB
    input_image = cv2.cvtColor(input_image, cv2.COLOR_BGR2RGB)
    # Make prediction
    detection_results = model.process(input_image)
    # Convert image back to BGR
    input_image = cv2.cvtColor(input_image, cv2.COLOR_RGB2BGR)

    return input_image, detection_results

In [162]:
def draw_landmarks(input_image, detection_results):
    # Draw face connections
    mp_drawing.draw_landmarks(input_image, detection_results.face_landmarks, mp_holistic.FACEMESH_TESSELATION)
    
    # Draw pose connections
    mp_drawing.draw_landmarks(input_image, detection_results.pose_landmarks, mp_holistic.POSE_CONNECTIONS)
    
    # Draw left hand connections
    mp_drawing.draw_landmarks(input_image, detection_results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS)
    
    # Draw right hand connections
    mp_drawing.draw_landmarks(input_image, detection_results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS)

In [163]:
def draw_styled_landmarks(input_image, detection_results):
    # Set drawing specifications for face connections
    face_points = mp_drawing.DrawingSpec(color=(0,0,0), thickness=1, circle_radius=1) 
    face_lines = mp_drawing.DrawingSpec(color=(255,255,255), thickness=1, circle_radius=1)

    # Set drawing specifications for pose connections
    pose_points = mp_drawing.DrawingSpec(color=(0,0,0), thickness=1, circle_radius=2)
    pose_lines = mp_drawing.DrawingSpec(color=(255,255,255), thickness=1, circle_radius=2)

    # Set drawing specifications for left hand connections
    left_hand_points = mp_drawing.DrawingSpec(color=(0,0,0), thickness=1, circle_radius=2)
    left_hand_lines = mp_drawing.DrawingSpec(color=(255,255,255), thickness=1, circle_radius=2)

    # Set drawing specifications for right hand connections
    right_hand_points = mp_drawing.DrawingSpec(color=(0,0,0), thickness=1, circle_radius=2)
    right_hand_lines = mp_drawing.DrawingSpec(color=(255,255,255), thickness=1, circle_radius=2)

    # Draw face connections
    mp_drawing.draw_landmarks(input_image, detection_results.face_landmarks, mp_holistic.FACEMESH_TESSELATION, face_points, face_lines)

    # Draw pose connections
    mp_drawing.draw_landmarks(input_image, detection_results.pose_landmarks, mp_holistic.POSE_CONNECTIONS, pose_points, pose_lines)

    # Draw left hand connections
    mp_drawing.draw_landmarks(input_image, detection_results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS, left_hand_points, left_hand_lines)

    # Draw right hand connections
    mp_drawing.draw_landmarks(input_image, detection_results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS, right_hand_points, right_hand_lines)


In [164]:
# Extract keypoints into flattened arrays
def extract_keypoints (detection_results):
    pose = np.array([[results.x, results.y, results.z, results.visibility]
    for results in detection_results.pose_landmarks.landmark]).flatten() if detection_results.pose_landmarks else np.zeros(33*4)

    face = np.array([[results.x, results.y, results.z]
    for results in detection_results.face_landmarks.landmark]).flatten() if detection_results.face_landmarks else np.zeros(468*3)

    left_hand = np.array([[results.x, results.y, results.z]
    for results in detection_results.left_hand_landmarks.landmark]).flatten() if detection_results.left_hand_landmarks else np.zeros(21*3)

    right_hand = np.array([[results.x, results.y, results.z]
    for results in detection_results.right_hand_landmarks.landmark]).flatten() if detection_results.right_hand_landmarks else np.zeros(21*3)

    return np.concatenate([pose, face, left_hand, right_hand])

In [165]:
DATA_PATH = os.path.join('GestureData')

# Gesture that will be detected 
gestures = np.array(['Hello', 'Good', 'Morning', 'Afternoon', 'Night', 'Thanks', 'Sorry', 'How are you'])

# Number of sequences of data
n_frames = 100
# Length of each video is 20 frames
length_frames = 20

In [166]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

In [167]:
label_map = {label:num for num, label in enumerate(gestures)}

In [168]:
sequences, labels = [], []
for gesture in gestures:
    for frame in np.array(os.listdir(os.path.join(DATA_PATH, gesture))).astype(int):
        window = []
        for frame_num in range(length_frames):
            res = np.load(os.path.join(DATA_PATH, gesture, str(frame), f"{frame_num}.npy"))
            window.append(res)
        sequences.append(window)
        labels.append(label_map[gesture])

In [169]:
x = np.array(sequences)

In [170]:
y = to_categorical(labels).astype(int)

In [171]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.05)

In [172]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import TensorBoard, EarlyStopping
from tensorflow.keras.optimizers import Adam

In [173]:
import tensorflow

tensorflow.test.is_built_with_cuda()

True

In [174]:
from datetime import datetime

log_dir = os.path.join('Logs', datetime.now().strftime("%Y%m%d-%H%M%S"))
tb_callback = TensorBoard(log_dir=log_dir)
early_stopping = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)

In [175]:
model = Sequential()
model.add(LSTM(64, return_sequences=True, activation='relu', input_shape=(20,1662)))
model.add(LSTM(128, return_sequences=True, activation='relu'))
model.add(LSTM(64, return_sequences=False, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(gestures.shape[0], activation='softmax')) 

In [176]:
model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['categorical_accuracy'])

In [177]:
model.fit(x_train, y_train, validation_split=0.1, epochs=250, callbacks=[tb_callback])

Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 14/250
Epoch 15/250
Epoch 16/250
Epoch 17/250
Epoch 18/250
Epoch 19/250
Epoch 20/250
Epoch 21/250
Epoch 22/250
Epoch 23/250
Epoch 24/250
Epoch 25/250
Epoch 26/250
Epoch 27/250
Epoch 28/250
Epoch 29/250
Epoch 30/250
Epoch 31/250
Epoch 32/250
Epoch 33/250
Epoch 34/250
Epoch 35/250
Epoch 36/250
Epoch 37/250
Epoch 38/250
Epoch 39/250
Epoch 40/250
Epoch 41/250
Epoch 42/250
Epoch 43/250
Epoch 44/250
Epoch 45/250
Epoch 46/250
Epoch 47/250
Epoch 48/250
Epoch 49/250
Epoch 50/250
Epoch 51/250
Epoch 52/250
Epoch 53/250
Epoch 54/250
Epoch 55/250
Epoch 56/250
Epoch 57/250
Epoch 58/250
Epoch 59/250
Epoch 60/250
Epoch 61/250
Epoch 62/250
Epoch 63/250
Epoch 64/250
Epoch 65/250
Epoch 66/250
Epoch 67/250
Epoch 68/250
Epoch 69/250
Epoch 70/250
Epoch 71/250
Epoch 72/250
Epoch 73/250
Epoch 74/250
Epoch 75/250
Epoch 76/250
Epoch 77/250
Epoch 78

<keras.callbacks.History at 0x1b1166bd580>

In [178]:
model.save('my_model3.h5')

In [179]:
from tensorflow.keras.models import load_model

model = load_model('my_model3.h5')

In [180]:
import tensorflow as tf
physical_devices = tf.config.experimental.list_physical_devices('GPU')
assert len(physical_devices) > 0, "No GPU available!"
tf.config.experimental.set_visible_devices(physical_devices[0], 'GPU')

AssertionError: No GPU available!

In [181]:
import time
import os
import uuid
from gtts import gTTS
from pygame import mixer
import threading
import atexit
import tempfile
import shutil
import collections

In [184]:
mixer.init()
pygame.init()

def play_text(text):
    try:
        tts = gTTS(text=text, lang='en', slow=False)
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
        unique_filename = temp_file.name
        tts.save(unique_filename)
        temp_file.close()
        playback_thread = threading.Thread(target=play_and_delete_audio, args=(unique_filename,))
        playback_thread.start()
    except Exception as e:
        print(f"Error playing text: {e}")

def play_and_delete_audio(unique_filename):
    try:
        mixer.music.load(unique_filename)
        mixer.music.play()
        while mixer.music.get_busy():
            time.sleep(0.1)
        os.remove(unique_filename)
    except Exception as e:
        print(f"Error in play_and_delete_audio: {e}")

def moving_average(predictions, window_size=3):
    if len(predictions) < window_size:
        return predictions
    cumsum = np.cumsum(predictions, dtype=float)
    cumsum[window_size:] = cumsum[window_size:] - cumsum[:-window_size]
    cumsum[:window_size] /= np.arange(1, window_size + 1)
    return cumsum


sequence = []
current_gesture = []
predictions = []

threshold = 0.9
window_size = 5

cap = cv2.VideoCapture(0)

cap.set(cv2.CAP_PROP_FRAME_WIDTH, 640)
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 480)

with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
    prev_frame_time = 0
    while cap.isOpened():
        ret, input_frame = cap.read()

        input_image, detection_results = mediapipe_detection(input_frame, holistic)
        draw_styled_landmarks(input_image, detection_results)

        keypoints = extract_keypoints(detection_results)
        sequence.append(keypoints)
        sequence = sequence[-20:]

        if len(sequence) == 20:
            result = model.predict(np.expand_dims(sequence, axis=0))[0]
            predictions.append(np.argmax(result))

            if len(predictions) >= window_size:
                smoothed_predictions = moving_average(predictions[-window_size:], window_size)

                if len(current_gesture) > 0:
                    if (
                        np.unique(smoothed_predictions)[-1] == np.argmax(result)
                        and result[np.argmax(result)] > threshold
                    ) and gestures[np.argmax(result)] != current_gesture[-1]:
                        current_gesture.append(gestures[np.argmax(result)])
                        play_text(gestures[np.argmax(result)])
                elif (
                    np.unique(smoothed_predictions)[-1] == np.argmax(result)
                    and result[np.argmax(result)] > threshold
                ):
                    current_gesture.append(gestures[np.argmax(result)])
                    play_text(gestures[np.argmax(result)])

                if len(current_gesture) > 1: 
                    current_gesture = current_gesture[-1:]


        new_frame_time = time.time()
        fps = 1 / (new_frame_time - prev_frame_time)
        prev_frame_time = new_frame_time
        fps_text = f"FPS: {int(fps)}"
        cv2.putText(input_image, fps_text, (input_frame.shape[1] - 80, 15), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1, cv2.LINE_AA)
        # Display the recognized gesture with a background
        gesture_text = ' '.join(current_gesture)
        (text_width, text_height), _ = cv2.getTextSize(gesture_text, cv2.FONT_HERSHEY_SIMPLEX, 1, 2)
        cv2.rectangle(input_image, (5, 35 - text_height - 10), (5 + text_width + 10, 35 + 10), (0, 20, 0), -1)
        cv2.putText(input_image, gesture_text, (10, 35), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)

        # Show
        cv2.imshow('Sign Language Recognition', input_image)
        
        if cv2.waitKey(10) & 0xFF == ord('q'):
            break

    mixer.quit()
    pygame.quit()

    cap.release()
    cv2.destroyAllWindows()

Error in play_and_delete_audio: [WinError 32] The process cannot access the file because it is being used by another process: 'C:\\Users\\dangl\\AppData\\Local\\Temp\\tmp9nut4ieh.mp3'
Error in play_and_delete_audio: [WinError 32] The process cannot access the file because it is being used by another process: 'C:\\Users\\dangl\\AppData\\Local\\Temp\\tmpkfcqwclv.mp3'
Error in play_and_delete_audio: [WinError 32] The process cannot access the file because it is being used by another process: 'C:\\Users\\dangl\\AppData\\Local\\Temp\\tmpvt6204nf.mp3'
Error in play_and_delete_audio: [WinError 32] The process cannot access the file because it is being used by another process: 'C:\\Users\\dangl\\AppData\\Local\\Temp\\tmps38f4k5r.mp3'
Error in play_and_delete_audio: [WinError 32] The process cannot access the file because it is being used by another process: 'C:\\Users\\dangl\\AppData\\Local\\Temp\\tmpxgwzcg8z.mp3'
Error in play_and_delete_audio: [WinError 32] The process cannot access the file