In [1]:
## Cell 1: Imports and Setup
import cv2
import mediapipe as mp
import numpy as np
import torch
import torch.nn as nn
from collections import deque
import time
import os
from corpus_engine import CorpusEngine
from audiofication import generate_audio

# Initialize MediaPipe
mp_hands = mp.solutions.hands
mp_pose = mp.solutions.pose
hands = mp_hands.Hands(max_num_hands=2, min_detection_confidence=0.7)
pose = mp_pose.Pose(min_detection_confidence=0.7)

# Paths and parameters
MODEL_PATH = 'models/sign_lstm.pth'
LABELS_PATH = 'processed_data/labels.npy'
FRAMES_PER_VIDEO = 30
EXPECTED_LANDMARKS = 225  # 21*3*2 (hands) + 33*3 (pose)
MOVEMENT_FRAMES = 10  # Number of frames to capture movement (1-2 seconds at 5-10 fps)

# Load labels
try:
    labels = np.load(LABELS_PATH)
    print(f"Loaded labels: {labels}")
except FileNotFoundError:
    print(f"Error: {LABELS_PATH} not found. Run preprocess_data.py.")
    raise

# Initialize corpus engine
from corpus_engine import SENTENCES
corpus_engine = CorpusEngine(SENTENCES)

# Verify 10 signs
if len(labels) != 6:
    print(f"Error: Expected 9 signs, found {len(labels)}. Update dataset.")
    raise ValueError("Incorrect number of signs")

# Define LSTM Model (matching train_model.py)
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size1, hidden_size2, hidden_size3, output_size):
        super(LSTMModel, self).__init__()
        self.lstm1 = nn.LSTM(input_size, hidden_size1, batch_first=True)
        self.lstm2 = nn.LSTM(hidden_size1, hidden_size2, batch_first=True)
        self.lstm3 = nn.LSTM(hidden_size2, hidden_size3, batch_first=True)
        self.fc1 = nn.Linear(hidden_size3, 64)
        self.fc2 = nn.Linear(64, output_size)
        self.relu = nn.ReLU()

    def forward(self, x):
        x, _ = self.lstm1(x)
        x, _ = self.lstm2(x)
        x, _ = self.lstm3(x)
        if len(x.shape) == 3:
            x = x[:, -1, :]
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# Load model
input_size = EXPECTED_LANDMARKS
hidden_size1 = 128
hidden_size2 = 64
hidden_size3 = 32
output_size = len(labels)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if not torch.cuda.is_available():
    print("Error: CUDA is not available. Ensure NVIDIA drivers and CUDA are installed.")
    raise RuntimeError("CUDA unavailable")
print(f"Using GPU: {torch.cuda.get_device_name(0)}")
model = LSTMModel(input_size, hidden_size1, hidden_size2, hidden_size3, output_size).to(device)
if os.path.exists(MODEL_PATH):
    try:
        model.load_state_dict(torch.load(MODEL_PATH))
    except Exception as e:
        print(f"Error: Failed to load model. {str(e)}")
        raise
else:
    print(f"Error: Model file {MODEL_PATH} not found. Run train_model.py.")
    raise FileNotFoundError(MODEL_PATH)
model.eval()

# Landmark Extraction with Movement
def extract_landmarks_with_movement(frames):
    """Extract hand and pose landmarks from a sequence of frames to capture movement.
    Returns a fixed-size array of landmarks, including motion features."""
    landmarks_list = []
    for frame in frames:
        image_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        hand_results = hands.process(image_rgb)
        pose_results = pose.process(image_rgb)
        
        landmarks = []
        if hand_results.multi_hand_landmarks:
            for hand_landmarks in hand_results.multi_hand_landmarks[:2]:
                for lm in hand_landmarks.landmark:
                    landmarks.extend([lm.x, lm.y, lm.z])
            landmarks.extend([0] * (21 * 3 * (2 - len(hand_results.multi_hand_landmarks))))
        else:
            landmarks.extend([0] * (21 * 3 * 2))
        
        if pose_results.pose_landmarks:
            for i, lm in enumerate(pose_results.pose_landmarks.landmark):
                if i < 33:
                    landmarks.extend([lm.x, lm.y, lm.z])
        else:
            landmarks.extend([0] * (33 * 3))
        
        landmarks = np.array(landmarks)
        if len(landmarks) != EXPECTED_LANDMARKS:
            print(f"Warning: Landmark array size {len(landmarks)} != {EXPECTED_LANDMARKS}. Padding/truncating.")
            if len(landmarks) < EXPECTED_LANDMARKS:
                landmarks = np.pad(landmarks, (0, EXPECTED_LANDMARKS - len(landmarks)), mode='constant')
            else:
                landmarks = landmarks[:EXPECTED_LANDMARKS]
        landmarks_list.append(landmarks)
    
    # Pad or truncate to FRAMES_PER_VIDEO
    landmarks_array = np.array(landmarks_list)
    if len(landmarks_list) < FRAMES_PER_VIDEO:
        padding = np.zeros((FRAMES_PER_VIDEO - len(landmarks_list), EXPECTED_LANDMARKS))
        landmarks_array = np.vstack((landmarks_array, padding))
    elif len(landmarks_list) > FRAMES_PER_VIDEO:
        landmarks_array = landmarks_array[:FRAMES_PER_VIDEO]
    
    return landmarks_array

Loaded labels: ['One' 'Great' 'Young' 'Open' 'Strong' 'Water']
Using GPU: NVIDIA GeForce GTX 1650


  model.load_state_dict(torch.load(MODEL_PATH))


In [2]:

def reset_state():
    global sentence, predicted_words, sequence, movement_buffer, current_word, is_paused, sign_start_time, awaiting_second_scan
    print("Resetting: Clearing sentence and resuming sign collection")
    sentence = None
    predicted_words = []
    sequence.clear()
    movement_buffer.clear()
    current_word = None
    is_paused = False
    sign_start_time = None
    awaiting_second_scan = False

def real_time_recognition():
    cap = cv2.VideoCapture(0)
    if not cap.isOpened():
        print("Error: Could not open webcam")
        return
    
    cap.set(cv2.CAP_PROP_FRAME_WIDTH, 640)
    cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 480)
    
    # Initialize state variables
    global sentence, predicted_words, sequence, movement_buffer, current_word, is_paused, sign_start_time, awaiting_second_scan
    movement_buffer = deque(maxlen=MOVEMENT_FRAMES)
    sequence = deque(maxlen=FRAMES_PER_VIDEO)
    predicted_words = []
    sentence = None
    current_word = None
    is_paused = False
    sign_start_time = None
    awaiting_second_scan = False
    
    window_name = 'Sign Language Recognition'
    cv2.namedWindow(window_name)
    
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                print("Error: Failed to capture frame")
                break
            
            # Check for keys
            key = cv2.waitKey(1) & 0xFF
            if key == ord('q'):
                break
            elif key == ord('r'):  # Reset key changed from 's' to 'r'
                # Reset when paused or after a sentence is framed
                if is_paused:
                    reset_state()
                    print("Reset triggered by 'r' key (paused state)!")
                # Reset at any other point
                elif len(predicted_words) > 0:
                    reset_state()
                    print("Reset triggered by 'r' key!")
            elif key == ord('s'):  # Recapture first sign when asking for second sign
                if len(predicted_words) == 1:
                    first_word = predicted_words[0].lower()
                    matching_sentences = [s for s in corpus_engine.sentences if s.lower().startswith(first_word)]
                    if len(matching_sentences) > 1:
                        predicted_words.clear()
                        current_word = None
                        movement_buffer.clear()
                        sign_start_time = None
                        print("Recapturing first sign triggered by 's' key!")
            elif key == ord('a') and awaiting_second_scan and len(predicted_words) == 1:
                print("Rescanning for 2nd word...")
                awaiting_second_scan = False
                movement_buffer.clear()
                start_time = time.time()
                while len(movement_buffer) < MOVEMENT_FRAMES and (time.time() - start_time) < 2:
                    ret, frame = cap.read()
                    if not ret:
                        break
                    movement_buffer.append(frame)
                    cv2.waitKey(100)
                if len(movement_buffer) == MOVEMENT_FRAMES:
                    landmarks_array = extract_landmarks_with_movement(movement_buffer)
                    sequence_array = np.array([landmarks_array])
                    sequence_tensor = torch.tensor(sequence_array, dtype=torch.float32).to(device)
                    with torch.no_grad():
                        output = model(sequence_tensor)
                        pred_idx = torch.argmax(output, dim=1).item()
                        confidence = torch.softmax(output, dim=1)[0, pred_idx].item()
                        # Debug: Print softmax scores for all classes
                        softmax_scores = torch.softmax(output, dim=1)[0].cpu().numpy()
                        print(f"Softmax scores for all classes: {dict(zip(labels, softmax_scores))}")
                        if confidence > 0.4:
                            new_word = labels[pred_idx]
                            predicted_words[1:] = [new_word]
                            print(f"Rescanned 2nd sign: {new_word} (Confidence: {confidence:.2f})")
                            sentence = corpus_engine.frame_sentence_from_sequence(predicted_words)
                            if sentence:
                                print(f"Framed sentence: {sentence} (Triggering audio)")
                                success = generate_audio(sentence)
                                if not success:
                                    print("Warning: Audio playback failed. Check audiofication.py logs.")
                                is_paused = True
                            else:
                                first_word = predicted_words[0].lower()
                                second_word = predicted_words[1].lower()
                                matching_sentences = [s for s in corpus_engine.sentences if s.lower().startswith(f"{first_word} {second_word}")]
                                if len(matching_sentences) > 1:
                                    print("Multiple sentences with same first and second words, please show third sign")
                                    cv2.putText(frame, "Multiple sentences, show 3rd sign", 
                                                (10, 160), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 255), 2, cv2.LINE_AA)
                                    cv2.imshow(window_name, frame)
                                    cv2.waitKey(1)
                                    
                                    movement_buffer.clear()
                                    start_time = time.time()
                                    while len(movement_buffer) < MOVEMENT_FRAMES and (time.time() - start_time) < 2:
                                        ret, frame = cap.read()
                                        if not ret:
                                            break
                                        movement_buffer.append(frame)
                                        cv2.waitKey(100)
                                    
                                    if len(movement_buffer) == MOVEMENT_FRAMES:
                                        landmarks_array = extract_landmarks_with_movement(movement_buffer)
                                        sequence_array = np.array([landmarks_array])
                                        sequence_tensor = torch.tensor(sequence_array, dtype=torch.float32).to(device)
                                        with torch.no_grad():
                                            output = model(sequence_tensor)
                                            pred_idx = torch.argmax(output, dim=1).item()
                                            confidence = torch.softmax(output, dim=1)[0, pred_idx].item()
                                            softmax_scores = torch.softmax(output, dim=1)[0].cpu().numpy()
                                            print(f"Softmax scores for all classes: {dict(zip(labels, softmax_scores))}")
                                            if confidence > 0.4:
                                                new_word = labels[pred_idx]
                                                predicted_words.append(new_word)
                                                print(f"Detected third sign: {new_word} (Confidence: {confidence:.2f})")
                                                sentence = corpus_engine.frame_sentence_from_sequence(predicted_words)
                                                if sentence:
                                                    print(f"Framed sentence: {sentence} (Triggering audio)")
                                                    success = generate_audio(sentence)
                                                    if not success:
                                                        print("Warning: Audio playback failed. Check audiofication.py logs.")
                                                    is_paused = True
                                                else:
                                                    print("No unique sentence found after three words, resetting to first word")
                                                    predicted_words = predicted_words[:1]
                                                    current_word = predicted_words[-1] if predicted_words else None
                                                    awaiting_second_scan = True
                                        movement_buffer.clear()
                                    else:
                                        break
                                else:
                                    print("Rescanned 2nd word still ambiguous, awaiting further input")
                                    awaiting_second_scan = True
                        movement_buffer.clear()
                continue
            
            if is_paused:
                sentence_text = f"Sentence: {sentence if sentence else 'Detecting...'}"
                word_text = f"Current Word: {current_word if current_word else 'None'}"
                cv2.putText(frame, sentence_text, (10, 40), cv2.FONT_HERSHEY_SIMPLEX, 1.2, (0, 255, 0), 2, cv2.LINE_AA)
                cv2.putText(frame, word_text, (10, 80), cv2.FONT_HERSHEY_SIMPLEX, 1.0, (255, 0, 0), 2, cv2.LINE_AA)
                cv2.putText(frame, "Press 'r' to reset", (10, 120), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 0, 255), 2, cv2.LINE_AA)
                cv2.imshow(window_name, frame)
                continue
            
            movement_buffer.append(frame)
            
            if len(movement_buffer) == 1 and sign_start_time is None:
                sign_start_time = time.time()
            
            if len(movement_buffer) == MOVEMENT_FRAMES and sign_start_time and (time.time() - sign_start_time) >= 3:
                try:
                    landmarks_array = extract_landmarks_with_movement(movement_buffer)
                    if landmarks_array.shape != (FRAMES_PER_VIDEO, EXPECTED_LANDMARKS):
                        print(f"Warning: Invalid landmarks array shape {landmarks_array.shape}. Skipping.")
                        movement_buffer.clear()
                        current_word = None
                        sign_start_time = None
                        continue
                    
                    # Debug: Check if landmarks have valid values
                    if np.isnan(landmarks_array).any() or np.isinf(landmarks_array).any():
                        print("Warning: Landmarks contain NaN or Inf values. Skipping.")
                        movement_buffer.clear()
                        current_word = None
                        sign_start_time = None
                        continue
                    
                    sequence_array = np.array([landmarks_array])
                    sequence_tensor = torch.tensor(sequence_array, dtype=torch.float32).to(device)
                    with torch.no_grad():
                        output = model(sequence_tensor)
                        pred_idx = torch.argmax(output, dim=1).item()
                        confidence = torch.softmax(output, dim=1)[0, pred_idx].item()
                        # Debug: Print softmax scores for all classes
                        softmax_scores = torch.softmax(output, dim=1)[0].cpu().numpy()
                        print(f"Softmax scores for all classes: {dict(zip(labels, softmax_scores))}")
                        
                        if confidence > 0.4:
                            new_word = labels[pred_idx]
                            current_word = new_word
                            movement_buffer.clear()
                            sign_start_time = None
                            print(f"Detected sign: {new_word} (Confidence: {confidence:.2f})")
                            
                            predicted_words.append(new_word)
                            
                            # Check for matching sentences based on the current predicted words
                            if len(predicted_words) == 1:
                                first_word = predicted_words[0].lower()
                                matching_sentences = [s for s in corpus_engine.sentences if s.lower().startswith(first_word)]
                                if len(matching_sentences) > 1:
                                    print("Multiple sentences with same first word, please show second sign")
                                    cv2.putText(frame, "Multiple sentences, show 2nd sign", 
                                                (10, 160), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 255), 2, cv2.LINE_AA)
                                    cv2.putText(frame, "First sign incorrect? Press 's' to recapture", 
                                                (10, 190), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 255), 2, cv2.LINE_AA)
                                    cv2.putText(frame, "Press 'r' to reset", 
                                                (10, 220), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 255), 2, cv2.LINE_AA)
                                    cv2.imshow(window_name, frame)
                                    cv2.waitKey(1)
                                    continue
                                elif len(matching_sentences) == 1:
                                    sentence = matching_sentences[0]
                                    print(f"Unique sentence found: {sentence} (Triggering audio)")
                                    success = generate_audio(sentence)
                                    if not success:
                                        print("Warning: Audio playback failed. Check audiofication.py logs.")
                                    is_paused = True
                                    continue
                            elif len(predicted_words) == 2:
                                first_word = predicted_words[0].lower()
                                second_word = predicted_words[1].lower()
                                matching_sentences = [s for s in corpus_engine.sentences if s.lower().startswith(f"{first_word} {second_word}")]
                                if len(matching_sentences) > 1:
                                    print("Multiple sentences with same first and second words, please show third sign")
                                    cv2.putText(frame, "Multiple sentences, show 3rd sign", 
                                                (10, 160), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 255), 2, cv2.LINE_AA)
                                    cv2.imshow(window_name, frame)
                                    cv2.waitKey(1)
                                    continue
                                elif len(matching_sentences) == 1:
                                    sentence = matching_sentences[0]
                                    print(f"Unique sentence found: {sentence} (Triggering audio)")
                                    success = generate_audio(sentence)
                                    if not success:
                                        print("Warning: Audio playback failed. Check audiofication.py logs.")
                                    is_paused = True
                                    continue
                                else:
                                    print("2nd word may be incorrect, press 'a' to scan again for 2nd word")
                                    cv2.putText(frame, "2nd word incorrect? Press 'a' to rescan", 
                                                (10, 160), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 255), 2, cv2.LINE_AA)
                                    awaiting_second_scan = True
                                    cv2.imshow(window_name, frame)
                                    cv2.waitKey(1)
                                    continue
                            elif len(predicted_words) == 3:
                                first_word = predicted_words[0].lower()
                                second_word = predicted_words[1].lower()
                                third_word = predicted_words[2].lower()
                                matching_sentences = [s for s in corpus_engine.sentences if s.lower().startswith(f"{first_word} {second_word} {third_word}")]
                                if len(matching_sentences) == 1:
                                    sentence = matching_sentences[0]
                                    print(f"Unique sentence found: {sentence} (Triggering audio)")
                                    success = generate_audio(sentence)
                                    if not success:
                                        print("Warning: Audio playback failed. Check audiofication.py logs.")
                                    is_paused = True
                                    continue
                                else:
                                    print("No unique sentence found after three words, resetting to first word")
                                    predicted_words = predicted_words[:1]
                                    current_word = predicted_words[-1] if predicted_words else None
                                    awaiting_second_scan = True
                                    cv2.putText(frame, "2nd word incorrect? Press 'a' to rescan", 
                                                (10, 160), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 255), 2, cv2.LINE_AA)
                                    cv2.imshow(window_name, frame)
                                    cv2.waitKey(1)
                                    continue
                        else:
                            print(f"Low confidence: {confidence:.2f}. Skipping prediction.")
                            current_word = None
                            movement_buffer.clear()
                            sign_start_time = None
                except ValueError as e:
                    print(f"Error: Failed to process sequence. {str(e)}")
                    movement_buffer.clear()
                    current_word = None
                    sign_start_time = None
                    continue
            
            sentence_text = f"Sentence: {sentence if sentence else 'Detecting...'}"
            word_text = f"Current Word: {current_word if current_word else 'None'}"
            cv2.putText(frame, sentence_text, (10, 40), cv2.FONT_HERSHEY_SIMPLEX, 1.2, (0, 255, 0), 2, cv2.LINE_AA)
            cv2.putText(frame, word_text, (10, 80), cv2.FONT_HERSHEY_SIMPLEX, 1.0, (255, 0, 0), 2, cv2.LINE_AA)
            
            if awaiting_second_scan:
                cv2.putText(frame, "2nd word incorrect? Press 'a' to rescan", 
                            (10, 160), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 255), 2, cv2.LINE_AA)
            
            cv2.imshow(window_name, frame)
    
    except KeyboardInterrupt:
        print("\nKeyboard interrupt detected. Releasing resources...")
    finally:
        cap.release()
        cv2.destroyAllWindows()
        hands.close()
        pose.close()
        print("Resources released successfully.")

real_time_recognition()

Softmax scores for all classes: {'One': 0.5513083, 'Great': 0.15210553, 'Young': 0.073507376, 'Open': 0.15018074, 'Strong': 0.04798989, 'Water': 0.02490818}
Detected sign: One (Confidence: 0.55)
Multiple sentences with same first word, please show second sign
Resetting: Clearing sentence and resuming sign collection
Reset triggered by 'r' key!
Softmax scores for all classes: {'One': 0.12568732, 'Great': 0.025080798, 'Young': 0.42948085, 'Open': 0.40009362, 'Strong': 0.012019877, 'Water': 0.007637632}
Detected sign: Young (Confidence: 0.43)
Unique sentence found: Young child learns fast (Triggering audio)



    Error 305 for command:
        open "C:\Users\dhanu\AppData\Local\Temp\tmpq0vd40am.mp3"
    Cannot specify extra characters after a string enclosed in quotation marks.

    Error 305 for command:
        close "C:\Users\dhanu\AppData\Local\Temp\tmpq0vd40am.mp3"
    Cannot specify extra characters after a string enclosed in quotation marks.
Failed to close the file: "C:\Users\dhanu\AppData\Local\Temp\tmpq0vd40am.mp3"


Audio file saved to C:\Users\dhanu\AppData\Local\Temp\tmpq0vd40am.mp3
Error generating audio: 
    Error 305 for command:
        open "C:\Users\dhanu\AppData\Local\Temp\tmpq0vd40am.mp3"
    Cannot specify extra characters after a string enclosed in quotation marks.
Resetting: Clearing sentence and resuming sign collection
Reset triggered by 'r' key (paused state)!
Softmax scores for all classes: {'One': 0.13362835, 'Great': 0.02719393, 'Young': 0.42146125, 'Open': 0.39710543, 'Strong': 0.012615479, 'Water': 0.00799553}
Detected sign: Young (Confidence: 0.42)
Unique sentence found: Young child learns fast (Triggering audio)
Audio file saved to C:\Users\dhanu\AppData\Local\Temp\tmpt8vs1r6a.mp3
Audio played successfully for: Young child learns fast
Resetting: Clearing sentence and resuming sign collection
Reset triggered by 'r' key (paused state)!
Softmax scores for all classes: {'One': 0.3139079, 'Great': 0.6399801, 'Young': 0.007873458, 'Open': 0.00825719, 'Strong': 0.0143539645, 'Wate


    Error 259 for command:
        play "C:\Users\dhanu\AppData\Local\Temp\tmp230fh34z.mp3" wait
    The driver cannot recognize the specified command parameter.

    Error 305 for command:
        close "C:\Users\dhanu\AppData\Local\Temp\tmp230fh34z.mp3"
    Cannot specify extra characters after a string enclosed in quotation marks.
Failed to close the file: "C:\Users\dhanu\AppData\Local\Temp\tmp230fh34z.mp3"


Audio file saved to C:\Users\dhanu\AppData\Local\Temp\tmp230fh34z.mp3
Error generating audio: 
    Error 259 for command:
        play "C:\Users\dhanu\AppData\Local\Temp\tmp230fh34z.mp3" wait
    The driver cannot recognize the specified command parameter.
Resetting: Clearing sentence and resuming sign collection
Reset triggered by 'r' key (paused state)!
Softmax scores for all classes: {'One': 0.5987637, 'Great': 0.22224902, 'Young': 0.042913027, 'Open': 0.08030811, 'Strong': 0.03590873, 'Water': 0.019857407}
Detected sign: One (Confidence: 0.60)
Multiple sentences with same first word, please show second sign
Resources released successfully.
