In [1]:
import pickle
import cv2
import mediapipe as mp
import numpy as np

# Load the trained model
try:
    with open('./model.p', 'rb') as f:
        model_dict = pickle.load(f)
    model = model_dict['model']
except Exception as e:
    print(f"❌ Failed to load model: {e}")
    exit()

# Map model output to letters (adjust if your classes are fewer)
labels_dict = {0: '0', 1: '1', 2: '2', 3: '3', 4: '4', 5: '5', 6: '6', 7: '7', 8: '8', 9: '9', 
             10: 'A', 11: 'B', 12: 'C', 13: 'D', 14: 'E', 15: 'F', 16: 'G', 17: 'H', 18: 'I', 
             19: 'J', 20: 'K', 21: 'L', 22: 'M', 23: 'N', 24: 'O', 25: 'P', 26: 'Q', 27: 'R', 
             28: 'S', 29: 'T', 30: 'U', 31: 'V', 32: 'W', 33: 'X', 34: 'Y', 35: 'Z'}

# Initialize MediaPipe Hands
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils
mp_drawing_styles = mp.solutions.drawing_styles
hands = mp_hands.Hands(static_image_mode=False, max_num_hands=1, min_detection_confidence=0.5)

# Open webcam
cap = cv2.VideoCapture(0)
if not cap.isOpened():
    print("❌ Error: Could not access webcam.")
    exit()

print("🎥 Press ESC to quit...")

while True:
    data_aux = []
    x_ = []
    y_ = []

    ret, frame = cap.read()
    if not ret or frame is None:
        print("⚠️ Unable to read from camera.")
        break

    H, W, _ = frame.shape
    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    results = hands.process(frame_rgb)

    if results.multi_hand_landmarks:
        for hand_landmarks in results.multi_hand_landmarks:
            mp_drawing.draw_landmarks(
                frame,
                hand_landmarks,
                mp_hands.HAND_CONNECTIONS,
                mp_drawing_styles.get_default_hand_landmarks_style(),
                mp_drawing_styles.get_default_hand_connections_style()
            )

            for lm in hand_landmarks.landmark:
                x_.append(lm.x)
                y_.append(lm.y)

            # Handle empty detection error
            if not x_ or not y_ or (max(x_) - min(x_) < 0.01) or (max(y_) - min(y_) < 0.01):
                continue

            # Normalize landmarks
            for lm in hand_landmarks.landmark:
                norm_x = (lm.x - min(x_)) / (max(x_) - min(x_))
                norm_y = (lm.y - min(y_)) / (max(y_) - min(y_))
                data_aux.extend([norm_x, norm_y])

            # Bounding box
            x1 = int(min(x_) * W) - 20
            y1 = int(min(y_) * H) - 20
            x2 = int(max(x_) * W) + 20
            y2 = int(max(y_) * H) + 20

            # Predict
            try:
                prediction = model.predict([np.asarray(data_aux)])
                predicted_class = int(prediction[0])
                predicted_character = labels_dict.get(predicted_class, '?')

                # Draw results
                cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 3)
                cv2.putText(frame, predicted_character, (x1, y1 - 10),
                            cv2.FONT_HERSHEY_SIMPLEX, 1.3, (0, 255, 0), 3, cv2.LINE_AA)
            except Exception as e:
                print(f"❌ Prediction error: {e}")
                continue

    cv2.imshow('Sign Language Detection', frame)

    if cv2.waitKey(1) & 0xFF == 27:  # Press ESC to exit
        break

cap.release()
cv2.destroyAllWindows()


🎥 Press ESC to quit...


# Inference_ Classifier

In [1]:
import pickle
import cv2
import mediapipe as mp
import numpy as np
import tkinter as tk
from tkinter import Label
from threading import Thread
import time

# Load the trained model
try:
    with open('./model.p', 'rb') as f:
        model_dict = pickle.load(f)
    model = model_dict['model']
except Exception as e:
    print(f"❌ Failed to load model: {e}")
    exit()

# Map model output to letters (adjust if your classes are fewer)
labels_dict = {0: '0', 1: '1', 2: '2', 3: '3', 4: '4', 5: '5', 6: '6', 7: '7', 8: '8', 9: '9', 
             10: 'A', 11: 'B', 12: 'C', 13: 'D', 14: 'E', 15: 'F', 16: 'G', 17: 'H', 18: 'I', 
             19: 'J', 20: 'K', 21: 'L', 22: 'M', 23: 'N', 24: 'O', 25: 'P', 26: 'Q', 27: 'R', 
             28: 'S', 29: 'T', 30: 'U', 31: 'V', 32: 'W', 33: 'X', 34: 'Y', 35: 'Z'}

# Initialize MediaPipe Hands
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils
mp_drawing_styles = mp.solutions.drawing_styles
hands = mp_hands.Hands(static_image_mode=False, max_num_hands=1, min_detection_confidence=0.5)

# Open webcam
cap = cv2.VideoCapture(0)
if not cap.isOpened():
    print("❌ Error: Could not access webcam.")
    exit()

print("🎥 Press ESC to quit...")

# Initialize word, sentence, and time tracking
current_word = ""
sentence = ""
last_hand_time = time.time()  # Time when the hand was last detected
last_predicted_character = None  # To store the last predicted character

# Tkinter window for displaying word and sentence
def create_popup():
    global current_word, sentence

    root = tk.Tk()
    root.title("Sign Language Detection - Word & Sentence")

    word_label = Label(root, text="Word: ", font=("Helvetica", 16), width=50, anchor='w')
    word_label.pack(padx=10, pady=10)

    sentence_label = Label(root, text="Sentence: ", font=("Helvetica", 16), width=50, anchor='w')
    sentence_label.pack(padx=10, pady=10)

    def update_popup():
        word_label.config(text="Word: " + current_word)
        sentence_label.config(text="Sentence: " + sentence)
        root.after(100, update_popup)

    # Start the update thread for the popup
    root.after(100, update_popup)
    root.mainloop()

# Run the Tkinter popup in a separate thread
popup_thread = Thread(target=create_popup, daemon=True)
popup_thread.start()

# Function to store the sentence when exiting
def store_sentence_to_file():
    global sentence
    try:
        with open('sentence_output.txt', 'w') as f:
            f.write(sentence.strip())
        print("✅ Sentence saved to 'sentence_output.txt'")
    except Exception as e:
        print(f"❌ Failed to save sentence: {e}")

while True:
    data_aux = []
    x_ = []
    y_ = []

    ret, frame = cap.read()
    if not ret or frame is None:
        print("⚠️ Unable to read from camera.")
        break

    H, W, _ = frame.shape
    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    # Process hand landmarks using MediaPipe
    results = hands.process(frame_rgb)

    if results.multi_hand_landmarks:
        for hand_landmarks in results.multi_hand_landmarks:
            # Draw landmarks on the frame
            mp_drawing.draw_landmarks(
                frame,
                hand_landmarks,
                mp_hands.HAND_CONNECTIONS,
                mp_drawing_styles.get_default_hand_landmarks_style(),
                mp_drawing_styles.get_default_hand_connections_style()
            )

            # Extract landmarks coordinates
            for lm in hand_landmarks.landmark:
                x_.append(lm.x)
                y_.append(lm.y)

            # Handle empty or poor hand detection
            if not x_ or not y_ or (max(x_) - min(x_) < 0.01) or (max(y_) - min(y_) < 0.01):
                continue

            # Normalize landmarks between 0 and 1
            for lm in hand_landmarks.landmark:
                norm_x = (lm.x - min(x_)) / (max(x_) - min(x_))
                norm_y = (lm.y - min(y_)) / (max(y_) - min(y_))
                data_aux.extend([norm_x, norm_y])

            # Bounding box around the hand
            x1 = int(min(x_) * W) - 20
            y1 = int(min(y_) * H) - 20
            x2 = int(max(x_) * W) + 20
            y2 = int(max(y_) * H) + 20

            # Prediction logic
            try:
                # Run prediction using the trained model
                prediction = model.predict([np.asarray(data_aux)])
                predicted_class = int(prediction[0])
                predicted_character = labels_dict.get(predicted_class, '?')

                # Draw bounding box and predicted character on the frame
                cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 3)
                cv2.putText(frame, predicted_character, (x1, y1 - 10),
                            cv2.FONT_HERSHEY_SIMPLEX, 1.3, (0, 255, 0), 3, cv2.LINE_AA)

                # Check if the predicted character is different from the last one
                if predicted_character != last_predicted_character:
                    # Update word only after 5 seconds of detecting the hand gesture
                    if time.time() - last_hand_time > 5:
                        current_word += predicted_character
                        last_predicted_character = predicted_character  # Store last predicted character
                        last_hand_time = time.time()  # Reset the time after updating the word

            except Exception as e:
                print(f"❌ Prediction error: {e}")
                continue

    # Check if 6 seconds have passed since last hand detection
    if time.time() - last_hand_time > 6:
        # If no new hand gesture detected for 6 seconds, move word to sentence section
        sentence += current_word + " "
        current_word = ""  # Clear the word section
        last_hand_time = time.time()  # Reset the timer to prevent infinite empty sentences

    # Display the frame with predictions
    cv2.imshow('Sign Language Detection', frame)

    # Exit loop on pressing ESC
    if cv2.waitKey(1) & 0xFF == 27:  # Press ESC to exit
        store_sentence_to_file()  # Save the sentence to a file when exiting
        break

# Release resources
cap.release()
cv2.destroyAllWindows()


🎥 Press ESC to quit...
✅ Sentence saved to 'sentence_output.txt'


In [None]:
!pip install opencv-python


In [2]:
import cv2
import numpy as np
from tensorflow.keras.models import load_model

# Load the trained CNN model
model = load_model('asl_model_l.keras')

# Labels dictionary (adjust if your labels are different)
labels_dict = {
    0: '0', 1: '1', 2: '2', 3: '3', 4: '4', 5: '5', 6: '6', 7: '7', 8: '8', 9: '9',
    10: 'A', 11: 'B', 12: 'C', 13: 'D', 14: 'E', 15: 'F', 16: 'G', 17: 'H', 18: 'I',
    19: 'J', 20: 'K', 21: 'L', 22: 'M', 23: 'N', 24: 'O', 25: 'P', 26: 'Q', 27: 'R',
    28: 'S', 29: 'T', 30: 'U', 31: 'V', 32: 'W', 33: 'X', 34: 'Y', 35: 'Z'
}

# Initialize webcam
cap = cv2.VideoCapture(0)
if not cap.isOpened():
    print("❌ Error: Cannot access webcam.")
    exit()

print("🎥 Show your hand in the green box. Press ESC to quit.")

# Confidence threshold (below this, do not predict)
confidence_threshold = 0.7

while True:
    ret, frame = cap.read()
    if not ret:
        break

    # Define ROI (Region of Interest)
    x1, y1, x2, y2 = 100, 100, 364, 364
    roi = frame[y1:y2, x1:x2]

    # Preprocess: grayscale, resize, normalize
    gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
    resized = cv2.resize(gray, (64, 64))
    normalized = resized / 255.0
    reshaped = normalized.reshape(1, 64, 64, 1)

    # Debugging: print the shape of the reshaped input
    print(f"Input shape: {reshaped.shape}")

    # Predict
    prediction = model.predict(reshaped)
    class_id = np.argmax(prediction)
    confidence = np.max(prediction)
    
    # Check confidence
    if confidence >= confidence_threshold:
        predicted_char = labels_dict.get(class_id, '?')
    else:
        predicted_char = '?'  # If confidence is too low, show unknown

    # Show result
    cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
    cv2.putText(frame, f"{predicted_char} ({confidence:.2f})", (x1, y1 - 10),
                cv2.FONT_HERSHEY_SIMPLEX, 1.2, (255, 255, 0), 2)

    cv2.imshow("ASL Real-Time Prediction", frame)

    if cv2.waitKey(1) & 0xFF == 27:  # ESC key
        break

cap.release()
cv2.destroyAllWindows()


🎥 Show your hand in the green box. Press ESC to quit.
Input shape: (1, 64, 64, 1)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 172ms/step
Input shape: (1, 64, 64, 1)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
Input shape: (1, 64, 64, 1)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
Input shape: (1, 64, 64, 1)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
Input shape: (1, 64, 64, 1)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
Input shape: (1, 64, 64, 1)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
Input shape: (1, 64, 64, 1)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
Input shape: (1, 64, 64, 1)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
Input shape: (1, 64, 64, 1)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
Input shape: (1, 64, 64, 1)
[1m1/1[0m [32m