In [1]:
import mediapipe as mp
import cv2
import numpy as np
import os
from scipy import stats
from collections import Counter
import random

In [2]:
# Short aliases for the two MediaPipe sub-modules used by the cells below.
mp_drawing = mp.solutions.drawing_utils  # landmark rendering helpers
mp_holistic = mp.solutions.holistic      # holistic solution (face, pose and hand landmarks)

In [3]:
def mediapipe_detection(image, model):
    """Run ``model`` on a BGR frame and return the frame together with the results.

    Parameters
    ----------
    image
        Frame in BGR channel order (it is converted with ``cv2.COLOR_BGR2RGB``).
    model
        Object exposing ``process(rgb_image)``; the notebook passes a
        ``mp_holistic.Holistic`` instance.

    Returns
    -------
    tuple
        ``(bgr_image, results)`` where ``bgr_image`` is the frame converted back
        to BGR and ``results`` is whatever ``model.process`` returned.
    """
    rgb_frame = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # The array is flagged read-only only for the duration of the model call.
    rgb_frame.flags.writeable = False
    results = model.process(rgb_frame)
    rgb_frame.flags.writeable = True

    bgr_frame = cv2.cvtColor(rgb_frame, cv2.COLOR_RGB2BGR)
    return bgr_frame, results

In [4]:
def draw_styled_landmarks(image, results):
    """Draw the face, pose and hand landmarks from ``results`` onto ``image``.

    Nothing is returned; callers keep using the same ``image`` object after the
    call, so the drawing is expected to happen in place.

    Parameters
    ----------
    image
        Frame to draw on.
    results
        Object with ``face_landmarks``, ``pose_landmarks``,
        ``left_hand_landmarks`` and ``right_hand_landmarks`` attributes.
    """
    spec = mp_drawing.DrawingSpec

    # One row per layer, drawn in this order:
    # (landmarks, connections, landmark style, connection style)
    layers = (
        # Face
        (results.face_landmarks, mp_holistic.FACEMESH_CONTOURS,
         spec(color=(80, 110, 10), thickness=1, circle_radius=1),
         # The green channel used to be 256, which is outside the 8-bit 0-255
         # range; 255 is the largest valid value.
         spec(color=(80, 255, 121), thickness=1, circle_radius=1)),
        # Pose
        (results.pose_landmarks, mp_holistic.POSE_CONNECTIONS,
         spec(color=(80, 22, 10), thickness=2, circle_radius=4),
         spec(color=(80, 44, 121), thickness=2, circle_radius=2)),
        # Left hand
        (results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
         spec(color=(121, 22, 76), thickness=2, circle_radius=2),
         spec(color=(121, 44, 250), thickness=2, circle_radius=1)),
        # Right hand
        (results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
         spec(color=(245, 117, 66), thickness=2, circle_radius=2),
         spec(color=(245, 66, 230), thickness=2, circle_radius=1)),
    )

    for landmarks, connections, landmark_style, connection_style in layers:
        mp_drawing.draw_landmarks(image, landmarks, connections,
                                  landmark_style, connection_style)

In [5]:
def extract_keypoints(results):
    """Flatten the landmarks in ``results`` into a single 1-D feature vector.

    The vector is the concatenation, in this order, of:

    * pose: ``x, y, z, visibility`` per landmark (zeros of length 33*4 if absent)
    * face: ``x, y, z`` per landmark (zeros of length 468*3 if absent)
    * left hand: ``x, y, z`` per landmark (zeros of length 21*3 if absent)
    * right hand: ``x, y, z`` per landmark (zeros of length 21*3 if absent)

    A group counts as absent when its ``*_landmarks`` attribute is falsy.
    """
    def _flatten(landmark_list, fields, missing_count):
        # Absent group -> zero block so the overall layout stays fixed.
        if not landmark_list:
            return np.zeros(missing_count * len(fields))
        rows = [[getattr(point, field) for field in fields]
                for point in landmark_list.landmark]
        return np.array(rows).flatten()

    xyz = ("x", "y", "z")
    parts = [
        _flatten(results.pose_landmarks, xyz + ("visibility",), 33),
        _flatten(results.face_landmarks, xyz, 468),
        _flatten(results.left_hand_landmarks, xyz, 21),
        _flatten(results.right_hand_landmarks, xyz, 21),
    ]
    return np.concatenate(parts)

In [6]:
# Project root folder.
# NOTE(review): this is an absolute, machine-specific Windows path; adjust it when
# running the notebook on another machine.
desired_path = "C:\\Users\\erwin\\Desktop\\ASL_Translation_FYP"

# Dataset folder: the 'ASL_Dataset' subfolder of the project root.
DATA_PATH = os.path.join(desired_path, 'ASL_Dataset')

# Sign labels to detect. The model outputs are indexed into this array, so the
# order must not change.
actions = np.array([
    "Family", "Friends", "Work", "School", "Home",
    "Car", "Happy", "Sad", "Play", "Help",
    "Eat", "Drink", "Sleep", "Sorry", "Computer",
    "Money", "Phone", "Clothe", "Me", "Stop",
])

no_sequences = 30      # number of videos (sequences) worth of data
sequence_length = 30   # frames per video
start_folder = 1       # first folder index

In [7]:
# Bar colours for prob_viz, one 3-tuple per entry of `actions` (20 in total).
colors = [
    (245, 117, 16),
    (117, 245, 16),
    (16, 117, 245),
    (245, 16, 117),
    (117, 16, 245),
    (16, 245, 117),
    (128, 0, 0),
    (0, 128, 0),
    (0, 0, 128),
    (128, 128, 0),
    (128, 0, 128),
    (0, 128, 128),
    (64, 0, 0),
    (0, 64, 0),
    (0, 0, 64),
    (64, 64, 0),
    (64, 0, 64),
    (0, 64, 64),
    (192, 64, 64),
    (64, 192, 64),
]

def prob_viz(res, actions, input_frame, colors):
    """Return a copy of ``input_frame`` with one probability bar per action.

    Parameters
    ----------
    res
        Iterable of per-action scores; each bar is ``int(score * 100)`` pixels wide.
    actions
        Labels, indexed in parallel with ``res``.
    input_frame
        Frame to annotate; it is copied, not modified.
    colors
        Fill colours, indexed in parallel with ``res``.

    Returns
    -------
    The annotated copy of ``input_frame``.
    """
    canvas = input_frame.copy()

    row_height = 20               # vertical space per action
    text_scale = 0.5
    text_thickness = 1
    text_colour = (255, 255, 255)

    for idx, probability in enumerate(res):
        row_top = idx * row_height
        bar_bottom = row_top + 20

        # Filled bar, then the label drawn 3 px above the bar's bottom edge.
        cv2.rectangle(canvas, (0, row_top + 5), (int(probability * 100), bar_bottom),
                      colors[idx], -1)
        cv2.putText(canvas, actions[idx], (0, bar_bottom - 3),
                    cv2.FONT_HERSHEY_SIMPLEX, text_scale, text_colour, text_thickness)

    return canvas

## Load LSTM Model

In [8]:
# TensorFlow is imported in this cell rather than in the notebook's first import cell.
import tensorflow as tf
# Load the saved Keras model from "LSTM_Model.h5" (a relative path, so it is resolved
# against the kernel's working directory). The LSTM test cell uses it as LSTM_model.
LSTM_model = tf.keras.models.load_model("LSTM_Model.h5")



### Test LSTM Model

In [13]:
# Interactive LSTM test on the webcam feed.
#   'k' -> show "Start Translating" for one second, then record `sequence_length` frames
#   'q' -> quit
# After a recording, the predicted action is appended to the on-screen sentence when
# its probability exceeds `threshold`.
sequence = []              # keypoint vectors of the sign currently being recorded
sentence = []              # accepted words; only the latest five are kept
predictions = []           # class index predicted for each complete window
threshold = 0.5            # minimum probability for a word to be accepted
res = [0] * len(actions)   # latest per-action probabilities (zeros until a prediction exists)

# Recording state
predicting = False
start_translating = False
frame_count = 0

cap = cv2.VideoCapture(0)

try:
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                # No frame was delivered; stop instead of passing None to OpenCV.
                break

            image, results = mediapipe_detection(frame, holistic)
            draw_styled_landmarks(image, results)

            # A 'k' press on the previous frame starts a new recording here.
            if start_translating:
                cv2.putText(image, "Start Translating", (200, 200),
                            cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2, cv2.LINE_AA)
                cv2.imshow('OpenCV Feed', image)
                cv2.waitKey(1000)  # Wait for 1 second
                start_translating = False
                predicting = True
                frame_count = 0

            if predicting:
                if frame_count < sequence_length:
                    # Collect keypoints; predict once a full window is available.
                    sequence.append(extract_keypoints(results))
                    sequence = sequence[-sequence_length:]

                    if len(sequence) == sequence_length:
                        res = LSTM_model.predict(np.expand_dims(sequence, axis=0), verbose=0)[0]
                        predictions.append(np.argmax(res))

                    frame_count += 1
                else:
                    # Recording finished: take the most frequent predicted action.
                    if predictions:
                        action_index, _ = Counter(predictions).most_common(1)[0]
                        # Threshold the probability of the voted action itself;
                        # max(res) could belong to a different action.
                        action_probability = res[action_index]

                        if action_probability > threshold:
                            action = actions[action_index]
                            if not sentence or sentence[-1] != action:
                                sentence.append(action)
                                print(action)
                                print(action_probability)

                    # Reset prediction mode
                    predicting = False
                    frame_count = 0
                    predictions = []

                    # Limit sentence length
                    if len(sentence) > 5:
                        sentence = sentence[-5:]

            # Probability bars are shown only while recording.
            image = prob_viz(res if predicting else [0] * len(actions), actions, image, colors)

            # Sentence banner across the full width of the frame.
            cv2.rectangle(image, (0, 0), (image.shape[1], 40), (245, 117, 16), -1)
            cv2.putText(image, ' '.join(sentence), (3, 30),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)

            cv2.imshow('OpenCV Feed', image)

            # Poll the keyboard once per frame: a second waitKey call would consume
            # (and drop) key presses meant for the other check.
            key = cv2.waitKey(10) & 0xFF
            if key == ord('q'):
                break
            if key == ord('k'):
                start_translating = True
                # Start from a clean slate so frames and scores from the previous
                # sign cannot leak into this recording.
                sequence.clear()
                predictions.clear()
                res = [0] * len(actions)
finally:
    # Always free the camera and close the window, even if a frame raised.
    cap.release()
    cv2.destroyAllWindows()

## Load CNN Model

In [8]:
# TensorFlow is imported again here; presumably so this cell can be run without the
# LSTM loading cell — TODO confirm.
import tensorflow as tf
# Load the saved Keras model from "CNN_Model.h5" (a relative path, so it is resolved
# against the kernel's working directory). The CNN test cell and the ASL test cell
# use it as CNN_model.
CNN_model = tf.keras.models.load_model("CNN_Model.h5")



### Test CNN Model

In [13]:
# Interactive CNN test on the webcam feed.
#   'k' -> show "Start Translating" for one second, then record `sequence_length` frames
#   'q' -> quit
# After a recording, the predicted action is appended to the on-screen sentence when
# its probability exceeds `threshold`.
sequence = []              # keypoint vectors of the sign currently being recorded
sentence = []              # accepted words; only the latest five are kept
predictions = []           # class index predicted for each complete window
threshold = 0.7            # minimum probability for a word to be accepted
res = [0] * len(actions)   # latest per-action probabilities (zeros until a prediction exists)

# Recording state
predicting = False
start_translating = False
frame_count = 0

cap = cv2.VideoCapture(0)

try:
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                # No frame was delivered; stop instead of passing None to OpenCV.
                break

            image, results = mediapipe_detection(frame, holistic)
            draw_styled_landmarks(image, results)

            # A 'k' press on the previous frame starts a new recording here.
            if start_translating:
                cv2.putText(image, "Start Translating", (200, 200),
                            cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2, cv2.LINE_AA)
                cv2.imshow('OpenCV Feed', image)
                cv2.waitKey(1000)  # Wait for 1 second
                start_translating = False
                predicting = True
                frame_count = 0

            if predicting:
                if frame_count < sequence_length:
                    # Collect keypoints; predict once a full window is available.
                    sequence.append(extract_keypoints(results))
                    sequence = sequence[-sequence_length:]

                    if len(sequence) == sequence_length:
                        res = CNN_model.predict(np.expand_dims(sequence, axis=0), verbose=0)[0]
                        predictions.append(np.argmax(res))

                    frame_count += 1
                else:
                    # Recording finished: take the most frequent predicted action.
                    if predictions:
                        action_index, _ = Counter(predictions).most_common(1)[0]
                        # Threshold the probability of the voted action itself;
                        # max(res) could belong to a different action.
                        action_probability = res[action_index]

                        if action_probability > threshold:
                            action = actions[action_index]
                            if not sentence or sentence[-1] != action:
                                sentence.append(action)
                                print(action)
                                print(action_probability)

                    # Reset prediction mode
                    predicting = False
                    frame_count = 0
                    predictions = []

                    # Limit sentence length
                    if len(sentence) > 5:
                        sentence = sentence[-5:]

            # Probability bars are shown only while recording.
            image = prob_viz(res if predicting else [0] * len(actions), actions, image, colors)

            # Sentence banner across the full width of the frame.
            cv2.rectangle(image, (0, 0), (image.shape[1], 40), (245, 117, 16), -1)
            cv2.putText(image, ' '.join(sentence), (3, 30),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)

            cv2.imshow('OpenCV Feed', image)

            # Poll the keyboard once per frame: a second waitKey call would consume
            # (and drop) key presses meant for the other check.
            key = cv2.waitKey(10) & 0xFF
            if key == ord('q'):
                break
            if key == ord('k'):
                start_translating = True
                # Start from a clean slate so frames and scores from the previous
                # sign cannot leak into this recording.
                sequence.clear()
                predictions.clear()
                res = [0] * len(actions)
finally:
    # Always free the camera and close the window, even if a frame raised.
    cap.release()
    cv2.destroyAllWindows()

Clothe
0.8900825


## ASL Test Function

In [14]:
# ASL quiz on the webcam feed.
# A random word from `actions` is shown in the banner; the user presses 'k', signs it
# for `sequence_length` frames, and the CNN prediction is compared with the word
# ("Correct" / "Wrong"). 'q' quits.
sequence = []            # keypoint vectors of the sign currently being recorded
sentence = []            # not used by this cell; kept so the notebook state is reset as before
predictions = []         # class index predicted for each complete window
predicting = False
start_translating = False
frame_count = 0
selected_word = None     # word the user is asked to sign
display_word = ""        # banner text for the current word
result_display = ""      # "Correct" / "Wrong" for the previous attempt

cap = cv2.VideoCapture(0)

try:
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                # No frame was delivered; stop instead of passing None to OpenCV.
                break

            image, results = mediapipe_detection(frame, holistic)
            draw_styled_landmarks(image, results)

            # Pick a new target word whenever none is active.
            if not selected_word:
                selected_word = random.choice(actions)
                display_word = f"Sign this word: {selected_word}"

            # Banner with the target word, across the full width of the frame.
            cv2.rectangle(image, (0, 0), (image.shape[1], 40), (245, 117, 16), -1)
            cv2.putText(image, display_word, (3, 30),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 0), 2, cv2.LINE_AA)

            # A 'k' press on the previous frame starts a new recording here.
            if start_translating:
                cv2.putText(image, "Start Translating", (200, 200),
                            cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2, cv2.LINE_AA)
                cv2.imshow('OpenCV Feed', image)
                cv2.waitKey(1000)  # Wait for 1 second
                start_translating = False
                predicting = True
                frame_count = 0

            if predicting:
                if frame_count < sequence_length:
                    # Collect keypoints; predict once a full window is available.
                    sequence.append(extract_keypoints(results))
                    sequence = sequence[-sequence_length:]

                    if len(sequence) == sequence_length:
                        res = CNN_model.predict(np.expand_dims(sequence, axis=0), verbose=0)[0]
                        predictions.append(np.argmax(res))
                    frame_count += 1
                else:
                    # Recording finished: compare the most frequent prediction
                    # with the target word.
                    if predictions:
                        winner = Counter(predictions).most_common(1)[0][0]
                        predicted_action = actions[winner]
                        result_display = "Correct" if predicted_action == selected_word else "Wrong"

                    # Reset; a new word is picked on the next frame.
                    predicting = False
                    selected_word = None
                    display_word = ""

            # Result of the previous attempt stays visible until 'k' is pressed again.
            if result_display:
                cv2.putText(image, result_display, (10, 100),
                            cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2, cv2.LINE_AA)

            cv2.imshow('OpenCV Feed', image)

            # Poll the keyboard once per frame: a second waitKey call would consume
            # (and drop) key presses meant for the other check.
            key = cv2.waitKey(10) & 0xFF
            if key == ord('q'):
                break
            if key == ord('k'):
                result_display = ""
                start_translating = True
                # Start from a clean slate so frames from the previous attempt
                # cannot leak into this recording.
                sequence.clear()
                predictions.clear()
finally:
    # Always free the camera and close the window, even if a frame raised.
    cap.release()
    cv2.destroyAllWindows()