## Import Libraries

In [3]:
import numpy as np 
import cv2
import mediapipe as mp
import tensorflow as tf
from tensorflow.keras import layers, models
import matplotlib.pyplot as plt
import os
from tensorflow.keras.preprocessing import image

## Load Dataset and Preprocess

In [4]:
# Path to dataset
data_dir = "asl_alphabet/train"

# Image size
img_height, img_width = 200, 200
batch_size = 32

# Load dataset
train_ds = tf.keras.preprocessing.image_dataset_from_directory(
    data_dir,
    validation_split=0.2,
    subset="training",
    seed=123,
    image_size=(img_height, img_width),
    batch_size=batch_size
)

val_ds = tf.keras.preprocessing.image_dataset_from_directory(
    data_dir,
    validation_split=0.2,
    subset="validation",
    seed=123,
    image_size=(img_height, img_width),
    batch_size=batch_size
)

class_names = train_ds.class_names
print("Classes:", class_names)

# Improve performance with caching
AUTOTUNE = tf.data.AUTOTUNE
train_ds = train_ds.cache().shuffle(1000).prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)

Found 84000 files belonging to 28 classes.
Using 67200 files for training.
Found 84000 files belonging to 28 classes.
Using 16800 files for validation.
Classes: ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'nothing', 'space']


## Build and Train CNN

In [5]:
model = models.Sequential([
    layers.Rescaling(1./255, input_shape=(img_height, img_width, 3)),
    
    layers.Conv2D(32, (3, 3), activation='relu'),
    layers.MaxPooling2D(),

    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D(),

    layers.Conv2D(128, (3, 3), activation='relu'),
    layers.MaxPooling2D(),

    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(len(class_names), activation='softmax')
])

model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

history = model.fit(train_ds, validation_data=val_ds, epochs=10)


Epoch 1/10


  super().__init__(**kwargs)


[1m  18/2100[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m11:47[0m 340ms/step - accuracy: 0.0366 - loss: 3.5786

KeyboardInterrupt: 

## Save Model

In [None]:
model.evaluate(val_ds)
model.save('asl_cnn_model.keras')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step - accuracy: 0.9286 - loss: 3.0502


## Real-Time Script

In [None]:
model = tf.keras.models.load_model('asl_cnn_model.keras')
labels = list("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
img_size = 64

# Initialize MediaPipe Hands
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(static_image_mode=False, max_num_hands=1, min_detection_confidence=0.7)
mp_draw = mp.solutions.drawing_utils

# Start webcam
cap = cv2.VideoCapture(0)
print("📷 Press 'q' to quit.")

while True:
    ret, frame = cap.read()
    if not ret:
        break

    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    results = hands.process(frame_rgb)

    if results.multi_hand_landmarks:
        for hand_landmarks in results.multi_hand_landmarks:
            # Get bounding box of hand
            h, w, _ = frame.shape
            x_coords = [lm.x for lm in hand_landmarks.landmark]
            y_coords = [lm.y for lm in hand_landmarks.landmark]
            padding = 70
            x_min = max(int(min(x_coords) * w) - padding, 0)
            x_max = min(int(max(x_coords) * w) + padding, w)
            y_min = max(int(min(y_coords) * h) - padding, 0)
            y_max = min(int(max(y_coords) * h) + padding, h)

            # Clamp values
            x_min = max(x_min, 0)
            y_min = max(y_min, 0)
            x_max = min(x_max, w)
            y_max = min(y_max, h)

            # Extract hand ROI
            hand_img = frame[y_min:y_max, x_min:x_max]
            if hand_img.size == 0:
                continue  # skip if invalid crop

            # Preprocess for model
            hand_resized = cv2.resize(hand_img, (img_size, img_size))
            hand_input = hand_resized.astype("float32") / 255.0
            hand_input = np.expand_dims(hand_input, axis=0)

            cv2.imshow("Hand Input", hand_resized)


            # Predict
            prediction = model.predict(hand_input)
            pred_idx = np.argmax(prediction[0])
            pred_label = labels[pred_idx]
            confidence = prediction[0][pred_idx]

            print("Prediction vector:", prediction[0])
            print("Predicted vector size:", prediction.shape)


            # Draw landmarks & label
            mp_draw.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)
            cv2.putText(frame, f'{pred_label} ({confidence:.3f})', (x_min, y_min - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
            cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), (255, 255, 0), 2)

    cv2.imshow("ASL Recognizer", frame)

    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()

I0000 00:00:1747506146.182545 2933896 gl_context.cc:369] GL version: 2.1 (2.1 Metal - 89.4), renderer: Apple M1 Pro
W0000 00:00:1747506146.189530 3040858 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1747506146.194368 3040858 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


📷 Press 'q' to quit.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
Prediction vector: [4.3271273e-05 2.6539783e-11 5.7072118e-11 1.6165095e-11 1.0910273e-03
 8.5204687e-05 1.2942136e-03 1.0957890e-05 5.4210452e-03 2.8607893e-01
 1.3490508e-07 9.9398656e-11 3.5337283e-04 1.9069981e-03 1.6991702e-05
 8.1713956e-08 1.4513165e-12 7.3006372e-11 4.0591435e-06 3.4377903e-09
 2.4491246e-13 2.8908500e-08 1.1833234e-04 1.4831914e-03 7.0186466e-01
 9.7106101e-07 1.1474699e-06 5.2817421e-11 2.2536650e-04]
Predicted vector size: (1, 29)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
Prediction vector: [4.3140564e-05 2.6937006e-11 5.6678644e-11 1.5920853e-11 1.0891942e-03
 8.2931256e-05 1.3105884e-03 1.1124772e-05 5.4706060e-03 2.9090893e-01
 1.3820991e-07 9.9651676e-11 3.6357355e-04 1.9473761e-03 1.6809474e-05
 8.1575969e-08 1.4460634e-12 7.5277430e-11 4.0050168e-06 3.4022258e-09
 2.5003174e-13 2.9164527e-08 1.1943171e-04 1.4966906e-03 6.9690484e-0