In [3]:
import cv2
import os
import numpy as np
import mediapipe as mp
from scipy.ndimage import gaussian_filter1d
from tensorflow.keras.models import load_model
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers, models


In [4]:
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(static_image_mode=True, max_num_hands=1, min_detection_confidence=0.7)
mp_drawing = mp.solutions.drawing_utils

I0000 00:00:1724587932.346999  255471 gl_context.cc:357] GL version: 2.1 (2.1 Metal - 88), renderer: Apple M1


In [5]:
# Paths
dataset_path = '/Users/mantri/Desktop/HandGesture_Recognition/American'
processed_dataset_path = '/Users/mantri/Desktop/HandGesture_Recognition/American_processed_dataset'

In [6]:
# Create a directory for processed data
if not os.path.exists(processed_dataset_path):
    os.makedirs(processed_dataset_path)

INFO: Created TensorFlow Lite XNNPACK delegate for CPU.


In [7]:
classes = list('abcdefghijklmnopqrstuvwxyz') 

In [8]:
def extract_landmarks(image):
    # Process the image to detect hand landmarks
    results = hands.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    
    if results.multi_hand_landmarks:
        hand_landmarks = results.multi_hand_landmarks[0]
        
        # Get the bounding box around the hand
        h, w, _ = image.shape
        x_min, y_min = w, h
        x_max, y_max = 0, 0
        for lm in hand_landmarks.landmark:
            x, y = int(lm.x * w), int(lm.y * h)
            if x < x_min: x_min = x
            if y < y_min: y_min = y
            if x > x_max: x_max = x
            if y > y_max: y_max = y

        # Add padding to the bounding box
        padding = 20  # You can adjust this padding value
        x_min = max(0, x_min - padding)
        y_min = max(0, y_min - padding)
        x_max = min(w, x_max + padding)
        y_max = min(h, y_max + padding)

        # Crop the image to the bounding box
        cropped_image = image[y_min:y_max, x_min:x_max]

        # Calculate landmarks on the cropped image
        cropped_results = hands.process(cv2.cvtColor(cropped_image, cv2.COLOR_BGR2RGB))
        if cropped_results.multi_hand_landmarks:
            cropped_hand_landmarks = cropped_results.multi_hand_landmarks[0]
            landmarks = []
            for lm in cropped_hand_landmarks.landmark:
                landmarks.append(lm.x)
                landmarks.append(lm.y)
            return landmarks
    return None



W0000 00:00:1724587932.377989  258713 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


In [9]:
# # Process the dataset
# for label in classes:
#     label_path = os.path.join(dataset_path, label)
#     processed_label_path = os.path.join(processed_dataset_path, label)

#     if not os.path.exists(processed_label_path):
#         os.makedirs(processed_label_path)

#     for image_name in os.listdir(label_path):
#         img_path = os.path.join(label_path, image_name)
#         img = cv2.imread(img_path)

#         # Extract landmarks
#         landmarks = extract_landmarks(img)
#         if landmarks:
#             # Save the landmarks as a numpy array
#             landmark_path = os.path.join(processed_label_path, image_name.split('.')[0] + '.npy')
#             np.save(landmark_path, np.array(landmarks))

# print("Dataset converted successfully. Landmarks are saved.")

W0000 00:00:1724587932.388072  258715 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


In [10]:
# Paths

classes = list('abcdefghijklmnopqrstuvwxyz')  # Include  alphabets

In [11]:
# Load processed data
landmarks = []
labels = []

for label in classes:
    label_path = os.path.join(processed_dataset_path, label)
    for landmark_file in os.listdir(label_path):
        landmark_path = os.path.join(label_path, landmark_file)
        landmarks.append(np.load(landmark_path))
        labels.append(classes.index(label))

In [12]:
# Convert to numpy arrays
landmarks = np.array(landmarks)
labels = np.array(labels)

# Normalize the landmarks
landmarks = landmarks / np.max(landmarks)

# Reshape landmarks for CNN input
num_landmarks = landmarks.shape[1] // 2
landmarks = landmarks.reshape(-1, num_landmarks, 2, 1)  # Reshape to (samples, num_landmarks, 2, 1) for CNN

In [13]:
# Split the dataset
train_landmarks, val_landmarks, train_labels, val_labels = train_test_split(landmarks, labels, test_size=0.2,
                                                                            random_state=42)

In [14]:
model = models.Sequential([
    layers.Conv2D(32, (3, 3), activation='relu', padding='same', input_shape=(num_landmarks, 2, 1)),
    layers.MaxPooling2D((2, 1), padding='valid'),
    layers.Conv2D(64, (3, 3), activation='relu', padding='same'),
    layers.MaxPooling2D((2, 1), padding='valid'),
    layers.Conv2D(128, (3, 3), activation='relu', padding='same'),
    layers.MaxPooling2D((2, 1), padding='valid'),
    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.5), 
    layers.Dense(len(classes), activation='softmax')
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [15]:
# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(train_landmarks, train_labels, epochs=100, validation_data=(val_landmarks, val_labels))

# Evaluate the model
val_loss, val_acc = model.evaluate(val_landmarks, val_labels)
print(f"Validation accuracy: {val_acc:.4f}")

# Save the trained model
model.save('sign_lang.keras')

Epoch 1/100
[1m1670/1670[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 5ms/step - accuracy: 0.6428 - loss: 1.2006 - val_accuracy: 0.9897 - val_loss: 0.0457
Epoch 2/100
[1m1670/1670[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 5ms/step - accuracy: 0.9779 - loss: 0.0883 - val_accuracy: 0.9924 - val_loss: 0.0277
Epoch 3/100
[1m1670/1670[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 5ms/step - accuracy: 0.9847 - loss: 0.0601 - val_accuracy: 0.9960 - val_loss: 0.0146
Epoch 4/100
[1m1670/1670[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 5ms/step - accuracy: 0.9887 - loss: 0.0434 - val_accuracy: 0.9955 - val_loss: 0.0151
Epoch 5/100
[1m1670/1670[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 4ms/step - accuracy: 0.9925 - loss: 0.0279 - val_accuracy: 0.9972 - val_loss: 0.0103
Epoch 6/100
[1m1670/1670[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 4ms/step - accuracy: 0.9933 - loss: 0.0266 - val_accuracy: 0.9978 - val_loss: 0.0097
Epoch 7/10

In [16]:
# Load the model
model = load_model('sign_lang.keras')

# Initialize MediaPipe Hands
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(static_image_mode=True, max_num_hands=1, min_detection_confidence=0.7)
mp_drawing = mp.solutions.drawing_utils

# Parameters
classes = list('abcdefghijklmnopqrstuvwxyz') 

I0000 00:00:1724588852.974918  255471 gl_context.cc:357] GL version: 2.1 (2.1 Metal - 88), renderer: Apple M1


In [17]:

def smooth_landmarks(landmarks):
    return gaussian_filter1d(landmarks, sigma=2)

In [18]:
# Function to extract hand landmarks
def extract_landmarks(image):
    results = hands.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    if results.multi_hand_landmarks:
        hand_landmarks = results.multi_hand_landmarks[0]
        landmarks = []
        for lm in hand_landmarks.landmark:
            landmarks.append(lm.x)
            landmarks.append(lm.y)
        return np.array(landmarks).flatten()
    return None

In [19]:
def predict_sign(landmarks):
    if landmarks is None:
        return 'Unknown'

    # Smooth the landmarks
    landmarks = smooth_landmarks(landmarks)

    # Reshape landmarks for CNN input
    num_landmarks = 21  # MediaPipe hands model provides 21 landmarks
    landmarks = landmarks.reshape(1, num_landmarks, 2, 1)  # Adjust this to (1, num_landmarks, 2, 1)

    # Ensure input shape matches model's input shape
    if landmarks.shape[1:] != (num_landmarks, 2, 1):
        raise ValueError(f"Expected input shape (1, {num_landmarks}, 2, 1), but got {landmarks.shape}")

    pred = model.predict(landmarks)
    return classes[np.argmax(pred)]


In [20]:
# Capture video from webcam
cap = cv2.VideoCapture(0)

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    # Flip the frame
    frame = cv2.flip(frame, 1)

    # Extract landmarks
    landmarks = extract_landmarks(frame)
    if landmarks is not None:
        # Draw landmarks
        mp_drawing.draw_landmarks(frame, hands.process(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)).multi_hand_landmarks[0],
                                  mp_hands.HAND_CONNECTIONS)
        # Predict the sign
        sign = predict_sign(landmarks)

        # Display the sign
        cv2.putText(frame, sign, (50, 50), cv2.FONT_HERSHEY_SIMPLEX, 2, (255, 0, 0), 3, cv2.LINE_AA)

    # Display the frame
    cv2.imshow('Sign Language Detection', frame)

    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

# Release resources
hands.close()
cap.release()
cv2.destroyAllWindows()

W0000 00:00:1724588853.002701  274443 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


W0000 00:00:1724588853.009975  274443 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12