In [None]:
#%pip install opencv-python mediapipe gTTS googletrans==4.0.0-rc1 pygame pyttsx3

# If you are using a virtual environment, make sure the libraries above are installed in it. Otherwise, to install them globally, uncomment the line above and run this cell.

# This file is used to test the model in real time

In [None]:
import numpy as np
import cv2
import os
import mediapipe as mp
import time
import os
from gtts import gTTS
import pygame
from pygame import mixer
import threading
import pyttsx3


In [None]:
# Short aliases for the MediaPipe solution modules used by the cells below:
# `mp_holistic` supplies the Holistic model class and the *_CONNECTIONS /
# FACEMESH_TESSELATION constants, `mp_drawing` supplies draw_landmarks and
# DrawingSpec.
mp_holistic = mp.solutions.holistic # Holistic model
mp_drawing = mp.solutions.drawing_utils # Drawing utilities

In [None]:
def mediapipe_detection(input_image, model):
    """Run a MediaPipe model on a single frame.

    Args:
        input_image: Frame that is interpreted as BGR and converted to RGB
            before being handed to the model.
        model: Object exposing ``process(image)`` (the notebook passes a
            ``Holistic`` instance).

    Returns:
        Tuple ``(image, detection_results)``: the frame converted back to BGR
        after processing, and whatever ``model.process`` returned.
    """
    # The model is fed the RGB version of the frame.
    rgb_frame = cv2.cvtColor(input_image, cv2.COLOR_BGR2RGB)
    detection_results = model.process(rgb_frame)

    # Hand a BGR frame back so it can be drawn on and displayed with OpenCV.
    bgr_frame = cv2.cvtColor(rgb_frame, cv2.COLOR_RGB2BGR)
    return bgr_frame, detection_results

In [None]:
def draw_landmarks(input_image, detection_results):
    """Draw face, pose and both hand landmark sets with default styling.

    Args:
        input_image: Image passed to ``mp_drawing.draw_landmarks`` as the
            drawing target.
        detection_results: Object exposing ``face_landmarks``,
            ``pose_landmarks``, ``left_hand_landmarks`` and
            ``right_hand_landmarks``.
    """
    # (attribute on detection_results, connection constant on mp_holistic),
    # listed in drawing order: face, pose, left hand, right hand.
    layers = (
        ("face_landmarks", "FACEMESH_TESSELATION"),
        ("pose_landmarks", "POSE_CONNECTIONS"),
        ("left_hand_landmarks", "HAND_CONNECTIONS"),
        ("right_hand_landmarks", "HAND_CONNECTIONS"),
    )
    for landmark_attr, connection_attr in layers:
        mp_drawing.draw_landmarks(
            input_image,
            getattr(detection_results, landmark_attr),
            getattr(mp_holistic, connection_attr),
        )

In [None]:
def draw_styled_landmarks(input_image, detection_results):
    """Draw face, pose and hand landmarks with black points and white lines.

    Face landmarks use a circle radius of 1; pose and both hands use a radius
    of 2. All specs use a thickness of 1.

    Args:
        input_image: Image passed to ``mp_drawing.draw_landmarks`` as the
            drawing target.
        detection_results: Object exposing ``face_landmarks``,
            ``pose_landmarks``, ``left_hand_landmarks`` and
            ``right_hand_landmarks``.
    """
    black = (0, 0, 0)
    white = (255, 255, 255)

    def make_specs(radius):
        # Returns (landmark/point spec, connection/line spec) for one layer.
        return (
            mp_drawing.DrawingSpec(color=black, thickness=1, circle_radius=radius),
            mp_drawing.DrawingSpec(color=white, thickness=1, circle_radius=radius),
        )

    # All drawing specs are built up front, then the layers are drawn in order:
    # face, pose, left hand, right hand.
    layers = [
        ("face_landmarks", "FACEMESH_TESSELATION", make_specs(1)),
        ("pose_landmarks", "POSE_CONNECTIONS", make_specs(2)),
        ("left_hand_landmarks", "HAND_CONNECTIONS", make_specs(2)),
        ("right_hand_landmarks", "HAND_CONNECTIONS", make_specs(2)),
    ]

    for landmark_attr, connection_attr, (point_spec, line_spec) in layers:
        mp_drawing.draw_landmarks(
            input_image,
            getattr(detection_results, landmark_attr),
            getattr(mp_holistic, connection_attr),
            point_spec,
            line_spec,
        )

In [None]:
def extract_keypoints(detection_results):
    """Flatten all landmark groups of one detection into a single 1-D array.

    The output is the concatenation, in this order, of:
      * pose:       x, y, z, visibility per landmark (zeros(33*4) if missing)
      * face:       x, y, z per landmark             (zeros(468*3) if missing)
      * left hand:  x, y, z per landmark             (zeros(21*3) if missing)
      * right hand: x, y, z per landmark             (zeros(21*3) if missing)

    Args:
        detection_results: Object exposing ``pose_landmarks``,
            ``face_landmarks``, ``left_hand_landmarks`` and
            ``right_hand_landmarks``; each is either falsy or has a
            ``landmark`` iterable.

    Returns:
        1-D ``numpy`` array with the concatenated values.
    """
    def flatten_group(group, fields, landmark_count):
        # A group that was not detected is replaced by a zero block of the
        # size the group would normally occupy.
        if not group:
            return np.zeros(landmark_count * len(fields))
        rows = [[getattr(point, field) for field in fields] for point in group.landmark]
        return np.array(rows).flatten()

    xyz = ("x", "y", "z")
    parts = [
        flatten_group(detection_results.pose_landmarks, xyz + ("visibility",), 33),
        flatten_group(detection_results.face_landmarks, xyz, 468),
        flatten_group(detection_results.left_hand_landmarks, xyz, 21),
        flatten_group(detection_results.right_hand_landmarks, xyz, 21),
    ]
    return np.concatenate(parts)

In [None]:
# Folder that holds the gesture data, relative to the working directory.
DATA_PATH = os.path.join('GestureData')

# Labels the model output is mapped to. This is the label set for the new
# dataset; use it when testing the new dataset in real time.
gestures = np.array([
    'Hello',
    'Good',
    'Morning',
    'Afternoon',
    'Night',
    'Thanks',
    'Sorry',
    'How are you',
    'Sure',
])

# Label set for the original dataset: when testing that dataset, uncomment the
# line below and comment out the array above.
# gestures = np.array(['Hello', 'Good', 'Morning', 'Afternoon', 'Night', 'Thanks', 'Sorry'])

# Each video (input sequence) is 20 frames long.
length_frames = 20

In [None]:
from tensorflow.keras.models import load_model

# Load the trained Keras model used for real-time prediction; the path is
# relative to the working directory.
# NOTE(review): the recognition loop indexes `gestures` with the argmax of the
# model output, so the loaded model must have been trained on the same label
# set as the active `gestures` array. Presumably 'origData.h5' pairs with the
# 7-label array and the other two with the 9-label array — confirm before
# switching.
model = load_model('newDataWithDropout.h5')
# Alternative model files (keep exactly one load_model line active):
# model = load_model('origData.h5')
# model = load_model('newDataset.h5')

In [None]:
# Start pygame's audio mixer (used by play_and_delete_audio via mixer.music)
# and then initialise pygame itself. Both are shut down again with
# mixer.quit() / pygame.quit() once the capture loop below has finished.
mixer.init()
pygame.init()

def play_text(text):
    """Speak ``text`` with pyttsx3 on a background thread.

    The call returns as soon as the thread has been started, so the caller is
    not blocked while the speech engine runs. Any exception raised while
    speaking is caught inside the thread and printed.

    Args:
        text: The phrase handed to the pyttsx3 engine's ``say``.
    """
    def _speak_in_background(phrase):
        # A fresh engine is created for every utterance, inside the worker.
        try:
            tts_engine = pyttsx3.init()
            tts_engine.say(phrase)
            tts_engine.runAndWait()
        except Exception as e:
            print(f"Error speaking text: {e}")

    worker = threading.Thread(target=_speak_in_background, args=(text,))
    worker.start()

def play_and_delete_audio(unique_filename):
    """Play an audio file through ``mixer.music`` and then delete it.

    Blocks (polling every 0.1 s) until playback has finished. The file is
    removed even when loading or playback fails, so temporary audio files are
    not left behind. Errors are reported with ``print`` rather than raised.

    Args:
        unique_filename: Path of the audio file to play and remove.
    """
    try:
        mixer.music.load(unique_filename)
        mixer.music.play()
        while mixer.music.get_busy():
            time.sleep(0.1)
    except Exception as e:
        print(f"Error in play_and_delete_audio: {e}")
    finally:
        # Release the file before deleting it: while a track is still loaded
        # the mixer can keep the file open, which makes os.remove fail on some
        # platforms. `unload` only exists in pygame >= 2.0, hence the getattr.
        try:
            unload = getattr(mixer.music, "unload", None)
            if callable(unload):
                unload()
        except Exception as e:
            print(f"Error in play_and_delete_audio: {e}")

        try:
            os.remove(unique_filename)
        except FileNotFoundError:
            # Nothing to clean up (e.g. the file never existed).
            pass
        except Exception as e:
            print(f"Error in play_and_delete_audio: {e}")

def moving_average(predictions, window_size=3):
    """Trailing moving average over a sequence of numbers.

    Element ``k`` of the result is the mean of the last ``window_size`` values
    ending at position ``k``; the first ``window_size - 1`` elements average
    over the shorter prefix that is available.

    Args:
        predictions: Sequence of numeric values (must support ``len`` and
            slicing).
        window_size: Number of trailing values per average; must be >= 1.

    Returns:
        ``predictions`` itself, unchanged, when it holds fewer than
        ``window_size`` values; otherwise a new list of means with the same
        length as ``predictions``.

    Raises:
        ValueError: If ``window_size`` is smaller than 1.
    """
    # A non-positive window would average empty or wrapped-around slices and
    # silently yield NaN / meaningless values, so reject it up front.
    if window_size < 1:
        raise ValueError(f"window_size must be >= 1, got {window_size}")

    if len(predictions) < window_size:
        return predictions

    # `end` is the exclusive end of each window; max(0, ...) makes the first
    # windows shrink to the available prefix instead of wrapping around.
    return [
        np.mean(predictions[max(0, end - window_size):end])
        for end in range(1, len(predictions) + 1)
    ]


# --- Real-time recognition loop ---------------------------------------------
# Captures webcam frames, extracts MediaPipe keypoints, feeds the most recent
# `length_frames` keypoint vectors to the model and announces a gesture (text
# overlay + speech) once the prediction is confident and stable.
# Press 'q' in the video window to stop.

sequence = []          # most recent keypoint vectors, capped at `length_frames`
current_gesture = []   # last announced gesture label (trimmed to one entry)
predictions = []       # most recent predicted class indices, capped at `window_size`

threshold = 0.95       # minimum model score of the top class to accept it
window_size = 5        # number of recent predictions used for smoothing

cap = cv2.VideoCapture(0)

cap.set(cv2.CAP_PROP_FRAME_WIDTH, 320)
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 240)

try:
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        prev_frame_time = 0
        prediction_frequency = 1  # run the model every N-th frame
        frame_counter = 0
        while cap.isOpened():
            ret, input_frame = cap.read()
            if not ret or input_frame is None:
                # No frame delivered (camera busy, unplugged, stream ended):
                # stop instead of passing None into the detection pipeline.
                print("Failed to read a frame from the camera; stopping.")
                break

            input_image, detection_results = mediapipe_detection(input_frame, holistic)
            draw_styled_landmarks(input_image, detection_results)

            keypoints = extract_keypoints(detection_results)
            sequence.append(keypoints)
            sequence = sequence[-length_frames:]

            if len(sequence) == length_frames and frame_counter % prediction_frequency == 0:
                # verbose=0 keeps Keras from printing a progress bar per frame.
                result = model.predict(np.expand_dims(sequence, axis=0), verbose=0)[0]
                top_class = np.argmax(result)
                predictions.append(top_class)
                # Only the last `window_size` predictions are ever consulted,
                # so keep the history bounded.
                predictions = predictions[-window_size:]

                if len(predictions) >= window_size:
                    smoothed_predictions = moving_average(predictions, window_size)

                    # NOTE(review): this averages class *indices*, which are
                    # categorical, and compares the largest smoothed value with
                    # the current class. Kept as-is to preserve recognition
                    # behaviour; a check that the last `window_size`
                    # predictions all agree may be what was intended — confirm.
                    is_stable = np.unique(smoothed_predictions)[-1] == top_class
                    is_confident = result[top_class] > threshold

                    if is_stable and is_confident:
                        label = gestures[top_class]
                        # Announce only when the gesture differs from the one
                        # currently shown, so it is not spoken every frame.
                        if not current_gesture or label != current_gesture[-1]:
                            current_gesture.append(label)
                            play_text(label)

                    if len(current_gesture) > 1:
                        current_gesture = current_gesture[-1:]
            frame_counter += 1

            new_frame_time = time.time()
            elapsed = new_frame_time - prev_frame_time
            # Guard against a zero interval on coarse clocks.
            fps = 1 / elapsed if elapsed > 0 else 0
            prev_frame_time = new_frame_time
            fps_text = f"FPS: {int(fps)}"
            cv2.putText(input_image, fps_text, (input_frame.shape[1] - 80, 15), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1, cv2.LINE_AA)

            # Display the recognised gesture on a dark background rectangle
            gesture_text = ' '.join(current_gesture)
            (text_width, text_height), _ = cv2.getTextSize(gesture_text, cv2.FONT_HERSHEY_SIMPLEX, 1, 2)
            cv2.rectangle(input_image, (5, 35 - text_height - 10), (5 + text_width + 10, 35 + 10), (0, 20, 0), -1)
            cv2.putText(input_image, gesture_text, (10, 35), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)

            # Show
            cv2.imshow('Sign Language Recognition', input_image)

            if cv2.waitKey(10) & 0xFF == ord('q'):
                break
finally:
    # Always release the camera and close the window, even when the loop is
    # left through an exception or a kernel interrupt.
    cap.release()
    cv2.destroyAllWindows()

    mixer.quit()
    pygame.quit()