In [3]:
import os
import cv2
import numpy as np
from sklearn.model_selection import train_test_split
import mediapipe as mp
from tqdm import tqdm  # For progress bars
import pickle  # To save processed data

In [6]:
# Initialize MediaPipe Holistic Model
mp_holistic = mp.solutions.holistic
mp_drawing = mp.solutions.drawing_utils

holistic = mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5)

# Define dataset path
DATASET_PATH = r'C:\Users\SAHIL GUPTA\Downloads\GestureDataTemp\Gesture Image Pre-Processed Data'
actions = sorted(os.listdir(DATASET_PATH))  # List of gesture classes

In [7]:
actions

['0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'X',
 'Y',
 'Z',
 '_']

In [8]:
# Function to extract hand keypoints + wrist landmarks
def extract_keypoints(image):
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    results = holistic.process(image)

    # Extract landmarks
    left_hand = results.left_hand_landmarks.landmark if results.left_hand_landmarks else []
    right_hand = results.right_hand_landmarks.landmark if results.right_hand_landmarks else []

    def get_landmark_array(landmarks):
        return np.array([[lm.x, lm.y, lm.z] for lm in landmarks]) if landmarks else np.zeros((21, 3))

    # Full hand keypoints
    left_hand_keypoints = get_landmark_array(left_hand)
    right_hand_keypoints = get_landmark_array(right_hand)

    # Extract wrist landmarks (landmark 0) separately if available
    left_wrist = left_hand_keypoints[0] if left_hand else np.zeros(3)
    right_wrist = right_hand_keypoints[0] if right_hand else np.zeros(3)

    # Concatenate all: left_hand + right_hand + left_wrist + right_wrist
    return np.concatenate([
        left_hand_keypoints.flatten(),
        right_hand_keypoints.flatten(),
        left_wrist,  # (x, y, z)
        right_wrist  # (x, y, z)
    ])


In [9]:
image = cv2.imread("1.jpg")  # load your image
keypoints = extract_keypoints(image)
print(keypoints.shape)
print(keypoints)  # this prints the 132 keypoint values


(132,)
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [10]:
# Prepare dataset
X, y = [], []
label_map = {action: i for i, action in enumerate(actions)}

for action in actions:
    action_path = os.path.join(DATASET_PATH, action)
    image_files = sorted(os.listdir(action_path))  # Limit to 1500 images per class
    
    for img_file in tqdm(image_files, desc=f"Processing {action}"):
        img_path = os.path.join(action_path, img_file)
        image = cv2.imread(img_path)

        if image is None:
            continue  # Skip corrupted files

        keypoints = extract_keypoints(image)
        X.append(keypoints)
        y.append(label_map[action])

Processing 0:   0%|          | 0/1500 [00:00<?, ?it/s]

Processing 0: 100%|██████████| 1500/1500 [00:53<00:00, 28.25it/s]
Processing 1: 100%|██████████| 1500/1500 [00:47<00:00, 31.33it/s]
Processing 2: 100%|██████████| 1500/1500 [00:48<00:00, 31.15it/s]
Processing 3: 100%|██████████| 1500/1500 [00:47<00:00, 31.25it/s]
Processing 4: 100%|██████████| 1500/1500 [00:47<00:00, 31.31it/s]
Processing 5: 100%|██████████| 1500/1500 [00:49<00:00, 30.50it/s]
Processing 6: 100%|██████████| 1500/1500 [00:48<00:00, 30.75it/s]
Processing 7: 100%|██████████| 1500/1500 [00:49<00:00, 30.31it/s]
Processing 8: 100%|██████████| 1500/1500 [00:49<00:00, 30.26it/s]
Processing 9: 100%|██████████| 1500/1500 [00:48<00:00, 30.85it/s]
Processing A: 100%|██████████| 1500/1500 [05:23<00:00,  4.64it/s] 
Processing B: 100%|██████████| 1500/1500 [00:53<00:00, 27.97it/s]
Processing C: 100%|██████████| 1500/1500 [00:53<00:00, 27.92it/s]
Processing D: 100%|██████████| 1500/1500 [00:53<00:00, 27.95it/s]
Processing E: 100%|██████████| 1500/1500 [00:49<00:00, 30.55it/s]
Processin

In [11]:



# Convert to numpy arrays
X = np.array(X)
y = np.array(y)

# Save dataset
with open("gesture_data.pkl", "wb") as f:
    pickle.dump((X, y), f)

print("✅ Data extraction complete! Saved as gesture_data.pkl")


✅ Data extraction complete! Saved as gesture_data.pkl


In [13]:
X.shape

(55500, 132)

In [14]:
import numpy as np
import pickle
from sklearn.model_selection import train_test_split

# Load extracted keypoints
with open("gesture_data.pkl", "rb") as f:
    X, y = pickle.load(f)

sequence_length = 30  # Number of frames per sequence

# Reshape dataset into sequences
X_sequences, y_sequences = [], []

for i in range(len(X) - sequence_length):
    X_sequences.append(X[i:i+sequence_length])  # Get 30-frame sequence
    y_sequences.append(y[i+sequence_length-1])  # Label is the last frame's class

X_sequences = np.array(X_sequences)
y_sequences = np.array(y_sequences)

# Split into train/test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X_sequences, y_sequences, test_size=0.2, random_state=42)

# Save processed sequences
with open("gesture_sequences.pkl", "wb") as f:
    pickle.dump((X_train, X_test, y_train, y_test), f)

print(f"✅ Gesture sequences created! Shape: {X_sequences.shape}")
print(f"Training samples: {X_train.shape[0]}, Testing samples: {X_test.shape[0]}")


✅ Gesture sequences created! Shape: (55470, 30, 132)
Training samples: 44376, Testing samples: 11094


In [15]:
# Load the arrays
X = np.load('X_sequences.npy', allow_pickle=True)
y= np.load('y_labels.npy', allow_pickle=True)

print(f"Loaded data shapes - X: {X.shape}, y: {y.shape}")

Loaded data shapes - X: (1110, 30, 225), y: (1110,)


In [16]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

model = Sequential([
    LSTM(64, return_sequences=True, input_shape=(30, 225)),
    LSTM(128),
    Dense(64, activation='relu'),
    Dense(37, activation='softmax')  # 37 output classes
])


  super().__init__(**kwargs)


In [18]:
# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [19]:

# Train the model
history = model.fit(X_train, y_train, 
                    validation_data=(X_test, y_test),
                    epochs=100,
                    batch_size=32,)
model.save('my_model2.keras')

Epoch 1/100
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 36ms/step - accuracy: 0.0913 - loss: 3.5022 - val_accuracy: 0.1982 - val_loss: 2.9066
Epoch 2/100
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 22ms/step - accuracy: 0.3092 - loss: 2.6471 - val_accuracy: 0.4685 - val_loss: 2.0791
Epoch 3/100
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 22ms/step - accuracy: 0.4744 - loss: 1.9138 - val_accuracy: 0.4730 - val_loss: 1.8431
Epoch 4/100
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - accuracy: 0.5618 - loss: 1.5561 - val_accuracy: 0.5315 - val_loss: 1.5764
Epoch 5/100
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 22ms/step - accuracy: 0.5983 - loss: 1.3949 - val_accuracy: 0.5676 - val_loss: 1.3943
Epoch 6/100
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 22ms/step - accuracy: 0.6720 - loss: 1.1058 - val_accuracy: 0.6712 - val_loss: 1.3229
Epoch 7/100
[1m28/28[0m [

In [24]:
model = model.load_weights('my_model2.keras')

In [25]:
def predict_gesture(image_path, model, actions):
    """Predict gesture from a single image"""
    # Read and process image
    image = cv2.imread(image_path)
    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    
    with mp_holistic.Holistic(
        min_detection_confidence=0.5,
        min_tracking_confidence=0.5
    ) as holistic:
        # Make detection
        results = holistic.process(image_rgb)
        
        # Extract keypoints
        keypoints = extract_keypoints(results)
        
        # Reshape for model (1 sequence with 1 frame)
        keypoints = np.expand_dims(keypoints, axis=0)  # Shape becomes (1, 225)
        keypoints = np.expand_dims(keypoints, axis=0)  # Shape becomes (1, 1, 225)
        
        # Make prediction
        res = model.predict(keypoints)[0]
        predicted_idx = np.argmax(res)
        predicted_gesture = actions[predicted_idx]
        confidence = res[predicted_idx]
        
        # Visualize results
        annotated_image = image.copy()
        mp_drawing = mp.solutions.drawing_utils
        if results.pose_landmarks:
            mp_drawing.draw_landmarks(
                annotated_image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS)
        if results.left_hand_landmarks:
            mp_drawing.draw_landmarks(
                annotated_image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS)
        if results.right_hand_landmarks:
            mp_drawing.draw_landmarks(
                annotated_image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS)
        
        # Add prediction text
        cv2.putText(annotated_image, f"{predicted_gesture} ({confidence:.2f})", 
                   (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
        
        # Display results
        cv2.imshow("Prediction", annotated_image)
        cv2.waitKey(0)
        cv2.destroyAllWindows()
        
        return predicted_gesture, confidence

# Usage example
image_path = r"c:\VS code\ai\4.jpg"  # Replace with your image path
gesture, confidence = predict_gesture(image_path, model, actions)
print(f"Predicted Gesture: {gesture} with confidence {confidence:.2f}")

AttributeError: 'NoneType' object has no attribute 'predict'

In [None]:
import cv2
import numpy as np
import mediapipe as mp
from matplotlib import pyplot as plt

# Initialize MediaPipe drawing utilities
mp_drawing = mp.solutions.drawing_utils
mp_holistic = mp.solutions.holistic

def draw_landmarks(image, results):
    """Draw all detected landmarks on the image"""
    # Draw pose connections
    # mp_drawing.draw_landmarks(
    #     image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS,
    #     landmark_drawing_spec=mp_drawing.DrawingSpec(color=(80,22,10), thickness=2, circle_radius=4),
    #     connection_drawing_spec=mp_drawing.DrawingSpec(color=(80,44,121), thickness=2, circle_radius=2))
    
    # Draw left hand connections
    mp_drawing.draw_landmarks(
        image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
        landmark_drawing_spec=mp_drawing.DrawingSpec(color=(121,22,76), thickness=2, circle_radius=4),
        connection_drawing_spec=mp_drawing.DrawingSpec(color=(121,44,250), thickness=2, circle_radius=2))
    
    # Draw right hand connections
    mp_drawing.draw_landmarks(
        image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
        landmark_drawing_spec=mp_drawing.DrawingSpec(color=(245,117,66), thickness=2, circle_radius=4),
        connection_drawing_spec=mp_drawing.DrawingSpec(color=(245,66,230), thickness=2, circle_radius=2))
    
    return image

def visualize_prediction(image, results, prediction, confidence):
    """Combine landmarks and prediction visualization"""
    # Draw landmarks first
    annotated_image = draw_landmarks(image.copy(), results)
    
    # Add prediction text
    cv2.putText(annotated_image, f"{prediction} ({confidence:.2f})", 
               (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
    
    # Add confidence bar
    bar_width = int(confidence * 200)
    cv2.rectangle(annotated_image, (10, 40), (10 + bar_width, 60), (0, 255, 0), -1)
    
    return annotated_image

# Real-time prediction with visualization
cap = cv2.VideoCapture(0)
sequence = []
prediction_history = []

with mp_holistic.Holistic(
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5
) as holistic:
    
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        
        # Make detection
        image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = holistic.process(image)
        
        # Extract keypoints
        keypoints = extract_keypoints(results)
        sequence.append(keypoints)
        sequence = sequence[-30:]  # Maintain sequence length
        
        if len(sequence) == 30:
            # Make prediction
            res = model.predict(np.expand_dims(sequence, axis=0))[0]
            prediction = actions[np.argmax(res)]
            confidence = np.max(res)
            
            # Update prediction history for smoothing
            prediction_history.append(prediction)
            if len(prediction_history) > 5:
                prediction_history.pop(0)
            
            # Get most frequent recent prediction
            final_prediction = max(set(prediction_history), key=prediction_history.count)
            
            # Visualize
            frame = visualize_prediction(frame, results, final_prediction, confidence)
        
        # Display
        cv2.imshow('Gesture Recognition', frame)
        
        if cv2.waitKey(10) & 0xFF == ord('q'):
            break

cap.release()
cv2.destroyAllWindows()