In [1]:
import os
import pandas as pd
import numpy as np
import cv2
import mediapipe as mp

# Enable multi-threading in OpenCV for faster image processing
# (the thread count applies to the OpenCV calls made later in this cell).
cv2.setNumThreads(4)

# Initialize Mediapipe Hand Detector.
# `hands` is module-level state: process_gesture_data() below reads it as a global.
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(
    static_image_mode=True,  # inputs are unrelated still images, not consecutive video frames
    max_num_hands=1,  # at most one hand is requested per image
    min_detection_confidence=0.5,
    model_complexity=1
)

# Dataset directories
dataset_paths = [
    "extracted_folder_1/ASL_Alphabet_Dataset/asl_alphabet_train"
]

# Output file
output_csv = "landmarks.csv"

# Define output columns.
# Every stored row is the gesture label followed by the flattened (21, 2)
# landmark array, i.e. x0, y0, x1, y1, ..., x20, y20. The header has to be
# interleaved the same way, otherwise a column called "x1" would hold y0.
columns = ["gesture"] + [f"{axis}{i}" for i in range(21) for axis in ("x", "y")]

# Initialize list to store all processed landmarks
# (filled in place by process_gesture_data, one row per detected hand).
landmark_data = []

def normalize_landmarks(landmarks):
    """Normalize hand landmarks to be translation- and scale-independent.

    The first landmark (the wrist) is moved to the origin, then every point is
    divided by the largest distance from the wrist so the farthest point ends
    up at distance 1. If all points coincide the scale step is skipped.

    Args:
        landmarks: Sequence of (x, y) pairs; the first pair is the reference
            point. Integer coordinates are accepted as well as floats.

    Returns:
        Flat list of floats ordered [x0, y0, x1, y1, ...].
    """
    # Force a float copy: with integer input the array would keep an integer
    # dtype and the in-place division below would raise a casting error.
    points = np.array(landmarks, dtype=float)

    # Translate all points so the wrist becomes the origin. The wrist row is
    # copied first because it is itself overwritten by the subtraction.
    points -= points[0].copy()

    # Scale normalization (skipped when every point sits on the wrist,
    # which avoids a division by zero).
    max_dist = np.linalg.norm(points, axis=1).max()
    if max_dist > 0:
        points /= max_dist

    return points.flatten().tolist()

def process_gesture_data(dataset_paths):
    """Collect normalized hand landmarks from every image under the given dataset roots.

    Each root is expected to contain one sub-folder per gesture label; entries
    that are not folders are ignored, as are files OpenCV cannot decode.
    For every hand the module-level `hands` detector finds, one row
    ``[label, x0, y0, ..., x20, y20]`` is appended to the module-level
    `landmark_data` list. Nothing is returned.
    """
    for dataset_path in dataset_paths:
        for label in os.listdir(dataset_path):
            class_dir = os.path.join(dataset_path, label)
            if not os.path.isdir(class_dir):
                # Stray files next to the class folders carry no label.
                continue

            print(f"Processing label: {label} in dataset {dataset_path}...")
            for file_name in os.listdir(class_dir):
                bgr = cv2.imread(os.path.join(class_dir, file_name))
                if bgr is None:
                    # Unreadable / non-image file.
                    continue

                # Bring the image to a fixed 200x200 size, then hand Mediapipe an RGB copy.
                resized = cv2.resize(bgr, (200, 200), interpolation=cv2.INTER_LINEAR)
                detection = hands.process(cv2.cvtColor(resized, cv2.COLOR_BGR2RGB))
                if not detection.multi_hand_landmarks:
                    continue

                for hand in detection.multi_hand_landmarks:
                    # 21 (x, y) pairs per hand, normalized before being stored with the label.
                    xy_pairs = [(point.x, point.y) for point in hand.landmark]
                    landmark_data.append([label] + normalize_landmarks(xy_pairs))

# Process all datasets.
# process_gesture_data() returns nothing; it appends one row per detected hand
# to the module-level `landmark_data` list.
process_gesture_data(dataset_paths)

# Save the combined landmarks to a single CSV file.
# index=False keeps the pandas row index out of the file, so the first CSV
# column is the gesture label.
df_landmarks = pd.DataFrame(landmark_data, columns=columns)
df_landmarks.to_csv(output_csv, index=False)
print(f"✅ Combined landmark dataset saved to {output_csv}")


Processing label: A in dataset extracted_folder_1/ASL_Alphabet_Dataset/asl_alphabet_train...
Processing label: B in dataset extracted_folder_1/ASL_Alphabet_Dataset/asl_alphabet_train...
Processing label: C in dataset extracted_folder_1/ASL_Alphabet_Dataset/asl_alphabet_train...
Processing label: D in dataset extracted_folder_1/ASL_Alphabet_Dataset/asl_alphabet_train...
Processing label: del in dataset extracted_folder_1/ASL_Alphabet_Dataset/asl_alphabet_train...
Processing label: E in dataset extracted_folder_1/ASL_Alphabet_Dataset/asl_alphabet_train...
Processing label: F in dataset extracted_folder_1/ASL_Alphabet_Dataset/asl_alphabet_train...
Processing label: G in dataset extracted_folder_1/ASL_Alphabet_Dataset/asl_alphabet_train...
Processing label: H in dataset extracted_folder_1/ASL_Alphabet_Dataset/asl_alphabet_train...
Processing label: I in dataset extracted_folder_1/ASL_Alphabet_Dataset/asl_alphabet_train...
Processing label: J in dataset extracted_folder_1/ASL_Alphabet_Datas

In [3]:
import pandas as pd
import numpy as np

# Load the extracted landmark dataset.
# Layout relied on further down in this cell: a "gesture" label in the first
# column followed by 42 numeric landmark values per row.
input_file = "landmarks.csv"  # Update with your actual file
df = pd.read_csv(input_file)

# Normalize landmarks by centering and scaling
def normalize_landmarks(row):
    """Re-center and re-scale the landmarks stored in one row of the dataset.

    Args:
        row: Series whose first entry is the gesture label; the remaining
            42 entries are read as 21 consecutive (x, y) pairs.

    Returns:
        1-D array of 42 floats with the first point (wrist) at the origin and
        the farthest point at distance 1. When all points coincide the
        coordinates are only re-centered.
    """
    # Drop the label, convert to a fresh float array and view it as 21 points.
    coords = row[1:].values.astype(float).reshape(21, 2)

    # Shift so the wrist (point 0) sits at the origin.
    wrist = coords[0].copy()
    coords -= wrist

    # Divide by the largest wrist distance; leave untouched if that is zero.
    farthest = np.linalg.norm(coords, axis=1).max()
    if farthest > 0:
        coords /= farthest

    return coords.flatten()

# Apply normalization row by row; result_type="expand" spreads the returned
# 42-element array over 42 columns.
normalized_data = df.apply(normalize_landmarks, axis=1, result_type="expand")

# Reattach gesture labels as the first column
normalized_data.insert(0, "gesture", df["gesture"])

# Define column names for CSV output.
# normalize_landmarks returns the flattened (21, 2) array, i.e. the values are
# ordered x0, y0, x1, y1, ..., x20, y20. The names must be interleaved the same
# way, otherwise a column called "x1" would actually contain y0.
column_names = ["gesture"] + [f"{axis}{i}" for i in range(21) for axis in ("x", "y")]
normalized_data.columns = column_names

# Compute the average landmark positions for each gesture
gesture_templates = normalized_data.groupby("gesture").mean().reset_index()

# Save the gesture templates
output_file = "gesture_templates.csv"
gesture_templates.to_csv(output_file, index=False)

print(f"✅ Gesture templates saved to {output_file}")


✅ Gesture templates saved to gesture_templates.csv


In [6]:
import os
import numpy as np
import pandas as pd
import pickle

import tensorflow as tf
from tensorflow.keras import mixed_precision
from tensorflow.keras.layers import Input, LSTM, Dense, BatchNormalization, Dropout
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# -------------------------------------------------------------------
# 1) OPTIONAL: GPU & MIXED PRECISION SETUP
# -------------------------------------------------------------------
# Layers created after this call use the mixed_float16 policy.
mixed_precision.set_global_policy('mixed_float16')

# Turn on memory growth for every GPU TensorFlow reports. Nothing happens
# (and nothing is printed) when no GPU is present.
gpus = tf.config.experimental.list_physical_devices('GPU')
try:
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
except RuntimeError as e:
    print("❌ GPU Configuration Error:", e)
else:
    if gpus:
        print("✅ GPU is enabled and TensorFlow is using it.")

# -------------------------------------------------------------------
# 2) READ THE CSV
# -------------------------------------------------------------------
# NOTE(review): this file name differs from the "landmarks.csv" written by the
# extraction cell — confirm which file is meant to be trained on.
dataset_file = "combined_gesture_landmarks.csv"  # <-- Adjust path
df = pd.read_csv(dataset_file)

# -------------------------------------------------------------------
# 3) SPLIT DATA INTO:
#    - X (the 42 columns: x0..y20)
#    - gesture labels (y_gesture_raw)
#    - "ideal" coords (reusing the same 42 columns here)
# -------------------------------------------------------------------
# NOTE(review): the regression target is identical to the input, so the
# "landmark_coords" head is trained to reproduce its own input rather than a
# separate per-gesture reference pose — confirm this is intended.
X = df.iloc[:, 1:].values            # shape: (n_samples, 42)
y_gesture_raw = df.iloc[:, 0].values # shape: (n_samples,)
y_ideal_coords = df.iloc[:, 1:].values  # same as X; shape: (n_samples, 42)

# -------------------------------------------------------------------
# 4) ENCODE GESTURE LABELS (ONE-HOT)
# -------------------------------------------------------------------
# Two steps: raw labels -> integer ids (LabelEncoder), integer ids -> one-hot
# rows (OneHotEncoder). Both fitted encoders are pickled at the end of the cell.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(y_gesture_raw)

one_hot_encoder = OneHotEncoder(sparse_output=False)
y_class = one_hot_encoder.fit_transform(encoded_labels.reshape(-1, 1))

# One one-hot column per distinct gesture; used as the size of the softmax head.
num_gestures = y_class.shape[1]
print("Number of gesture classes:", num_gestures)

# -------------------------------------------------------------------
# 5) RESHAPE X FOR LSTM
#    LSTM expects (samples, time_steps, features) => we'll set time_steps=1
# -------------------------------------------------------------------
X = X.reshape((X.shape[0], 1, X.shape[1]))  # shape => (n_samples, 1, 42)

# -------------------------------------------------------------------
# 6) TRAIN / TEST SPLIT
# -------------------------------------------------------------------
# All three arrays are split together so inputs and both targets stay aligned;
# random_state fixes the shuffle so the 80/20 split is repeatable.
X_train, X_test, y_class_train, y_class_test, y_coords_train, y_coords_test = train_test_split(
    X, y_class, y_ideal_coords, test_size=0.2, random_state=42
)

# For multi-output, we pass dictionaries of labels to .fit()
# (keys must match the `name=` of the two output layers).
y_train_dict = {
    "gesture_class": y_class_train,
    "landmark_coords": y_coords_train
}
y_test_dict = {
    "gesture_class": y_class_test,
    "landmark_coords": y_coords_test
}

# -------------------------------------------------------------------
# 7) BUILD THE MULTI-OUTPUT MODEL (LSTM for both classification + regression)
# -------------------------------------------------------------------
# Input: one time step of 42 landmark values per sample.
input_layer = Input(shape=(1, 42))

x = LSTM(128, return_sequences=True)(input_layer)
x = BatchNormalization()(x)
x = Dropout(0.3)(x)

x = LSTM(64, return_sequences=False)(x)
x = BatchNormalization()(x)
x = Dropout(0.3)(x)

x = Dense(128, activation="relu")(x)
x = BatchNormalization()(x)
x = Dropout(0.3)(x)

x = Dense(64, activation="relu")(x)

# Both heads are pinned to float32. The global policy is mixed_float16, and
# computing the final softmax / regression output in float16 costs precision
# and can make the losses numerically unstable; the hidden layers above still
# run in mixed precision.

# -- Output 1: Classification
gesture_output = Dense(
    num_gestures, activation="softmax", dtype="float32", name="gesture_class"
)(x)

# -- Output 2: Landmark Regression (42 floats)
coords_output = Dense(
    42, activation="linear", dtype="float32", name="landmark_coords"
)(x)

model = Model(inputs=input_layer, outputs=[gesture_output, coords_output])

model.compile(
    optimizer="adam",
    loss={
        "gesture_class": "categorical_crossentropy",
        "landmark_coords": "mse"
    },
    loss_weights={
        "gesture_class": 1.0,       # Adjust these if you need to emphasize classification vs. regression
        "landmark_coords": 1.0
    },
    metrics={
        "gesture_class": "accuracy" # For the classification head
        # You could add custom metrics for coords here if desired
    }
)

print("✅ Built multi-output model.")
# -------------------------------------------------------------------
# 7b) OPTIONAL: RESUME FROM A PREVIOUSLY SAVED MODEL
# -------------------------------------------------------------------
# If a saved model exists, training continues from it instead of from the
# freshly built model above. On a fresh run (nothing saved yet) the new model
# is kept, so the cell no longer stops with a missing-file error before
# training has even started.
model_path = "my_saved_model.keras"  # Path to saved model
if os.path.exists(model_path):
    model = tf.keras.models.load_model(model_path)
    print("✅ Model loaded successfully.")
else:
    print(f"ℹ️ No saved model found at {model_path}; training the newly built model.")

# -------------------------------------------------------------------
# 7c) CHECK PREVIOUSLY SAVED LABEL ENCODERS
# -------------------------------------------------------------------
# The encoders fitted above produced y_class, so they are the ones that must
# stay bound to `label_encoder` / `one_hot_encoder` and be saved after
# training. Previously saved encoders are only loaded for comparison: a
# different class list means a resumed model was trained with another
# label -> index mapping.
# SECURITY: pickle.load can execute arbitrary code; only load pickle files
# that were produced by this notebook.
label_encoder_path = "label_encoder.pkl"
one_hot_encoder_path = "one_hot_encoder.pkl"
if os.path.exists(label_encoder_path) and os.path.exists(one_hot_encoder_path):
    with open(label_encoder_path, "rb") as f:
        saved_label_encoder = pickle.load(f)
    with open(one_hot_encoder_path, "rb") as f:
        saved_one_hot_encoder = pickle.load(f)
    print("✅ Label encoders loaded.")

    classes_differ = list(saved_label_encoder.classes_) != list(label_encoder.classes_)
    width_differs = len(saved_one_hot_encoder.categories_[0]) != num_gestures
    if classes_differ or width_differs:
        print(
            "⚠️ Saved label encoders do not match the current dataset; "
            "a resumed model may map output indices to different gestures."
        )
else:
    print("ℹ️ No saved label encoders found; using the encoders fitted on the current dataset.")


# -------------------------------------------------------------------
# 8) TRAIN THE MODEL
# -------------------------------------------------------------------
# The held-out split doubles as validation data during training.
history = model.fit(
    X_train,
    y_train_dict,
    batch_size=64,
    epochs=10,
    validation_data=(X_test, y_test_dict),
)

# -------------------------------------------------------------------
# 9) SAVE THE TRAINED MODEL AS .keras
# -------------------------------------------------------------------
model_save_path = "gesture_recognition_model.keras"  # Save in new Keras format
model.save(model_save_path)
print(f"✅ Multi-output model trained and saved at: {model_save_path}")

# -------------------------------------------------------------------
# 10) LOAD MODEL TO TEST
# -------------------------------------------------------------------
# Round-trip check: the file that was just written can be read back.
loaded_model = tf.keras.models.load_model(model_save_path)
print("✅ Model loaded successfully.")

# -------------------------------------------------------------------
# 11) SAVE LABEL ENCODERS
# -------------------------------------------------------------------
# Same two files, written in the same order, one pickle per encoder.
for pickle_name, encoder in (
    ("label_encoder.pkl", label_encoder),
    ("one_hot_encoder.pkl", one_hot_encoder),
):
    with open(pickle_name, "wb") as f:
        pickle.dump(encoder, f)
print("✅ Label encoders saved.")


Number of gesture classes: 37
✅ Built multi-output model.


  saveable.load_own_variables(weights_store.get(inner_path))
  saveable.load_own_variables(weights_store.get(inner_path))
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


✅ Model loaded successfully.
✅ Label encoders loaded.
Epoch 1/10
[1m346/346[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 30ms/step - gesture_class_accuracy: 0.9126 - gesture_class_loss: 0.2621 - landmark_coords_loss: 0.0201 - loss: 0.2821 - val_gesture_class_accuracy: 0.9356 - val_gesture_class_loss: 0.2056 - val_landmark_coords_loss: 0.0117 - val_loss: 0.2181
Epoch 2/10
[1m346/346[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 26ms/step - gesture_class_accuracy: 0.9142 - gesture_class_loss: 0.2632 - landmark_coords_loss: 0.0193 - loss: 0.2826 - val_gesture_class_accuracy: 0.9457 - val_gesture_class_loss: 0.1791 - val_landmark_coords_loss: 0.0138 - val_loss: 0.1940
Epoch 3/10
[1m346/346[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 26ms/step - gesture_class_accuracy: 0.9143 - gesture_class_loss: 0.2550 - landmark_coords_loss: 0.0187 - loss: 0.2737 - val_gesture_class_accuracy: 0.9477 - val_gesture_class_loss: 0.1813 - val_landmark_coords_loss: 0.0109 - val_l

In [7]:
import cv2
import mediapipe as mp
import numpy as np
import tensorflow as tf
import pickle
import time
import json

# -------------------------------------------------------------------
# 1) LOAD MODEL & ENCODERS
# -------------------------------------------------------------------
model_path = 'gesture_recognition_model.keras'
# Hide all GPUs from TensorFlow before the model is loaded.
# NOTE(review): TensorFlow may refuse to change visible devices once they have
# been initialized in the same kernel (e.g. after the training cell ran on a
# GPU) — confirm this cell is run on a fresh kernel.
tf.config.experimental.set_visible_devices([], "GPU")  # Force CPU if needed

model = tf.keras.models.load_model(model_path)

# Maps predicted class indices back to gesture names (see predict_gesture).
# SECURITY: pickle.load can execute arbitrary code; only load a pickle file
# produced by the training cell of this notebook.
with open('label_encoder.pkl', "rb") as file:
    label_encoder = pickle.load(file)

# -------------------------------------------------------------------
# 2) SET UP MEDIAPIPE (up to 2 hands)
# -------------------------------------------------------------------
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(
    static_image_mode=False,  # input is a continuous webcam stream, not independent stills
    max_num_hands=2,  # Detect up to 2 hands
    min_detection_confidence=0.5
)
# Drawing helper used to overlay the detected landmarks on each frame.
mp_drawing = mp.solutions.drawing_utils

# -------------------------------------------------------------------
# 3) NORMALIZATION, PREDICTION & SIMILARITY
# -------------------------------------------------------------------
def normalize_landmarks(landmarks):
    """Normalize 21 (x, y) points relative to the wrist.

    The wrist (point 0) is subtracted from every point and the result is
    divided by the largest wrist distance, so all values lie in [-1, 1].
    If every point coincides with the wrist, only the translation is applied.

    Args:
        landmarks: 21 (x, y) pairs (or any array-like with 42 values).
            Integer coordinates are accepted as well as floats. The input is
            not modified.

    Returns:
        1-D NumPy array of 42 floats ordered [x0, y0, x1, y1, ...].
    """
    # Force a float copy: an integer array would make the in-place division
    # below fail with a casting error.
    points = np.array(landmarks, dtype=float).reshape(21, 2)

    # The wrist row is copied first because the subtraction overwrites it.
    points -= points[0].copy()

    max_dist = np.linalg.norm(points, axis=1).max()
    if max_dist > 0:
        points /= max_dist
    return points.flatten()

def predict_gesture(normalized_landmarks):
    """Run the multi-output model on one normalized hand.

    Args:
        normalized_landmarks: 42 normalized landmark values
            ([x0, y0, ..., x20, y20]).

    Returns:
        Tuple ``(gesture_label, predicted_landmarks)``: the decoded class
        label and the model's second output reshaped to a (21, 2) array.
    """
    input_data = np.asarray(normalized_landmarks, dtype=np.float32).reshape((1, 1, 42))

    # Call the model directly instead of model.predict(): predict() is built
    # for large batches and adds noticeable per-call overhead, which matters
    # here because this runs once per hand per video frame. training=False
    # keeps BatchNormalization / Dropout in inference mode.
    predictions = model(input_data, training=False)

    gesture_probs = np.asarray(predictions[0])  # shape (1, num_gestures)
    coords = np.asarray(predictions[1])         # shape (1, 42)

    # Classification
    predicted_label_idx = int(np.argmax(gesture_probs, axis=1)[0])
    gesture_label = label_encoder.inverse_transform([predicted_label_idx])[0]

    # Landmark regression (21x2)
    predicted_landmarks = coords[0].reshape(21, 2)
    return gesture_label, predicted_landmarks

def compute_jointwise_deviation(user_points, predicted_points):
    """Euclidean distance between each user joint and its predicted counterpart.

    Both arguments are arrays with one (x, y) row per joint; the result is a
    1-D array holding one error value per row.
    """
    offsets = user_points - predicted_points
    return np.linalg.norm(offsets, axis=1)

# -------------------------------------------------------------------
# 4) REAL-TIME LOOP (PRINT PREDICTIONS EVERY 3 SECONDS FROM 0)
# -------------------------------------------------------------------
SIMILARITY_THRESHOLD = 80
PRINT_INTERVAL = 3  # Print every 3 seconds
JOINT_ERROR_THRESHOLD = 0.1  # Per-joint deviation above which a joint is reported as incorrect
MAX_AVG_DEVIATION = 0.5      # Mean joint deviation that maps to a similarity of 0 %


def _empty_hand_data():
    """Return the placeholder record used for a hand slot with no detection."""
    return {
        "gesture": "Not Detected",
        "similarity": "N/A",
        "incorrect_joints": [],
        "predicted_landmarks": []
    }


def _evaluate_hand(hand_landmarks):
    """Classify one detected hand and score it against the model's predicted landmarks.

    Returns a JSON-serializable dict with the keys "gesture", "similarity",
    "incorrect_joints" and "predicted_landmarks".
    """
    user_lms = [(lm.x, lm.y) for lm in hand_landmarks.landmark]

    # (A) Normalize & run model
    norm_user = normalize_landmarks(user_lms)
    gesture_label, predicted_lms = predict_gesture(norm_user)

    # (B) Compute joint-wise deviation
    user_points_21x2 = np.array(norm_user).reshape(21, 2)
    joint_errors = compute_jointwise_deviation(user_points_21x2, predicted_lms)

    # (C) Identify incorrect joints
    incorrect_joints = [i for i, err in enumerate(joint_errors) if err > JOINT_ERROR_THRESHOLD]

    # (D) Compute similarity score. float() turns the NumPy scalar into a
    # plain Python float so the record is always JSON-serializable.
    avg_distance = float(np.mean(joint_errors))
    sim_score = (1 - avg_distance / MAX_AVG_DEVIATION) * 100
    sim_score = max(0, min(100, sim_score))  # Clamp [0..100]

    return {
        "gesture": gesture_label,
        "similarity": round(sim_score, 1),
        "incorrect_joints": incorrect_joints,
        "predicted_landmarks": predicted_lms.tolist()
    }


# Track elapsed time and the time of the last log entry
start_time = time.time()  # Start timer at 0
last_print_time = start_time  # Track last print time

# JSON file to store errors
error_log_path = "gesture_errors.json"
error_logs = []  # Store all results

cap = cv2.VideoCapture(0)
if not cap.isOpened():
    print("❌ Error: Unable to access camera.")
    cap.release()
    # SystemExit stops a script exactly like exit(1) did and, unlike exit(),
    # also stops the cell reliably when run inside a notebook.
    raise SystemExit(1)

try:
    while True:
        ret, frame = cap.read()
        if not ret:
            print("❌ Frame read error.")
            break

        frame = cv2.flip(frame, 1)
        h, w = frame.shape[:2]

        # Convert to RGB for Mediapipe
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        result = hands.process(rgb_frame)

        current_time = time.time()
        elapsed_time = int(current_time - start_time)  # Time from 0 seconds

        # Default values if hands are not detected
        hand1_data = _empty_hand_data()
        hand2_data = _empty_hand_data()

        if result.multi_hand_landmarks:
            for hand_idx, hand_landmarks in enumerate(result.multi_hand_landmarks):
                hand_data = _evaluate_hand(hand_landmarks)

                # (E) Store data based on hand index
                if hand_idx == 0:
                    hand1_data = hand_data
                elif hand_idx == 1:
                    hand2_data = hand_data

                # (F) Draw the detected hand landmarks on the frame
                mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)

            # Show text for Hand 1 and Hand 2 in camera window
            cv2.putText(frame, f"Hand 1: {hand1_data['gesture']}, Sim: {hand1_data['similarity']}%", (50, 100),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
            cv2.putText(frame, f"Hand 2: {hand2_data['gesture']}, Sim: {hand2_data['similarity']}%", (50, 140),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

        else:
            cv2.putText(frame, "No hands detected", (50, 50),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)

        # **Save Error Data in JSON & Print Every 3 Seconds**
        if current_time - last_print_time >= PRINT_INTERVAL:
            error_entry = {
                "time": elapsed_time,
                "hand1": hand1_data,
                "hand2": hand2_data
            }
            error_logs.append(error_entry)

            # The whole log is rewritten each time so the file is always valid JSON.
            with open(error_log_path, "w") as f:
                json.dump(error_logs, f, indent=4)

            # Print results
            print(json.dumps(error_entry, indent=4))  # Print JSON structure

            last_print_time = current_time  # Reset timer after printing

        # Show camera window
        cv2.imshow("Two-Hand Gesture Recognition", frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Always give the camera back and close the preview window, including when
    # the loop is left through an exception or a kernel interrupt.
    cap.release()
    cv2.destroyAllWindows()


{
    "time": 3,
    "hand1": {
        "gesture": "W",
        "similarity": 53.6,
        "incorrect_joints": [
            1,
            2,
            3,
            4,
            5,
            6,
            7,
            8,
            10,
            11,
            12,
            13,
            14,
            15,
            16,
            17,
            18,
            19,
            20
        ],
        "predicted_landmarks": [
            [
                -0.0252838134765625,
                -0.0252227783203125
            ],
            [
                -0.03753662109375,
                -0.06378173828125
            ],
            [
                0.002593994140625,
                -0.2156982421875
            ],
            [
                0.05816650390625,
                -0.333251953125
            ],
            [
                0.1016845703125,
                -0.437744140625
            ],
            [
                0.0667724609375,
              