In [None]:
Environment Setup

In [None]:
TensorFlow/Keras: For building deep learning models.
OpenCV: For image processing tasks.
MediaPipe: A library from Google that provides solutions for hands, face, and body tracking, which might simplify hand gesture recognition.

In [1]:
pip install tensorflow opencv-python mediapipe

Collecting mediapipe
  Downloading mediapipe-0.10.14-cp311-cp311-win_amd64.whl.metadata (9.9 kB)
Collecting jax (from mediapipe)
  Downloading jax-0.4.34-py3-none-any.whl.metadata (22 kB)
Collecting jaxlib (from mediapipe)
  Downloading jaxlib-0.4.34-cp311-cp311-win_amd64.whl.metadata (1.0 kB)
Collecting protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3 (from tensorflow-intel==2.17.0->tensorflow)
  Downloading protobuf-4.25.5-cp310-abi3-win_amd64.whl.metadata (541 bytes)
Collecting sounddevice>=0.4.4 (from mediapipe)
  Downloading sounddevice-0.5.0-py3-none-win_amd64.whl.metadata (1.4 kB)
Downloading mediapipe-0.10.14-cp311-cp311-win_amd64.whl (50.8 MB)
   ---------------------------------------- 0.0/50.8 MB ? eta -:--:--
   ---------------------------------------- 0.0/50.8 MB ? eta -:--:--
   ---------------------------------------- 0.0/50.8 MB 435.7 kB/s eta 0:01:57
   ---------------------------------------- 0.0/50.8 MB 435.7 kB/s eta 0:01:57
   ------

In [5]:
import os
import cv2
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

# Load the dataset from the provided folder structure
def load_dataset(dataset_path, img_size):
    images = []
    labels = []
    
    # Iterate through each gesture class folder
    for label, gesture in enumerate(os.listdir(dataset_path)):
        gesture_folder = os.path.join(dataset_path, gesture)
        
        for img_file in os.listdir(gesture_folder):
            img_path = os.path.join(gesture_folder, img_file)
            img = cv2.imread(img_path)
            
            if img is not None:
                # Resize image and normalize pixel values
                img = cv2.resize(img, (img_size, img_size))
                img = img / 255.0  # Normalize pixel values to [0, 1]
                images.append(img)
                labels.append(label)
    
    return np.array(images), np.array(labels)

# Paths to your dataset
train_dataset_path = 'Downloads/Hand Gesture dataset/train/train'
test_dataset_path = 'Downloads/Hand Gesture dataset/test/test'

img_size = 64  # Define image size (64x64)

# Load training and test datasets separately
X_train, y_train = load_dataset(train_dataset_path, img_size)
X_test, y_test = load_dataset(test_dataset_path, img_size)

# Convert labels to one-hot encoding
y_train = to_categorical(y_train, num_classes=len(np.unique(y_train)))
y_test = to_categorical(y_test, num_classes=len(np.unique(y_test)))

# Split training data into training and validation sets (80% training, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)


In [7]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout

# Define the CNN model architecture
def create_model(input_shape, num_classes):
    model = Sequential()

    # Convolutional Layer 1
    model.add(Conv2D(32, (3, 3), activation='relu', input_shape=input_shape))
    model.add(MaxPooling2D(pool_size=(2, 2)))

    # Convolutional Layer 2
    model.add(Conv2D(64, (3, 3), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))

    # Flatten the data
    model.add(Flatten())

    # Fully connected layer
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.5))  # Dropout for regularization

    # Output layer for classification
    model.add(Dense(num_classes, activation='softmax'))  # Ensure num_classes is 20 for 20 gesture classes

    return model

# Create the model with 20 classes
input_shape = (img_size, img_size, 3)  # Image size (64x64) and 3 channels (RGB)
num_classes = 20  # Update this to 20 as you have 20 classes
model = create_model(input_shape, num_classes)

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model using the training and validation datasets
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_val, y_val))


Epoch 1/10
[1m450/450[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 24ms/step - accuracy: 0.8788 - loss: 0.4241 - val_accuracy: 1.0000 - val_loss: 5.7283e-05
Epoch 2/10
[1m450/450[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 23ms/step - accuracy: 0.9984 - loss: 0.0065 - val_accuracy: 1.0000 - val_loss: 1.6757e-04
Epoch 3/10
[1m450/450[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 22ms/step - accuracy: 0.9973 - loss: 0.0140 - val_accuracy: 1.0000 - val_loss: 1.8395e-05
Epoch 4/10
[1m450/450[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 23ms/step - accuracy: 0.9984 - loss: 0.0049 - val_accuracy: 0.9997 - val_loss: 2.1233e-04
Epoch 5/10
[1m450/450[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 24ms/step - accuracy: 0.9984 - loss: 0.0054 - val_accuracy: 0.9997 - val_loss: 0.0013
Epoch 6/10
[1m450/450[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 23ms/step - accuracy: 0.9989 - loss: 0.0027 - val_accuracy: 0.9997 - val_loss: 6.9847e

In [8]:
# Evaluate the model on the test set
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_acc}")


[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.9997 - loss: 0.0031
Test Accuracy: 0.9978333115577698


In [11]:
import tensorflow as tf
# Save the model using the native Keras format
model.save('hand_gesture_model.keras')


In [None]:
# Real-time hand gesture detection using webcam
import cv2
import numpy as np

# Function to preprocess each frame
def preprocess_frame(frame, img_size):
    img = cv2.resize(frame, (img_size, img_size))
    img = img / 255.0  # Normalize pixel values
    img = np.expand_dims(img, axis=0)  # Add batch dimension
    return img

# Start the webcam feed
cap = cv2.VideoCapture(0)

while True:
    ret, frame = cap.read()
    
    if not ret:
        break
    
    # Preprocess the frame for prediction
    processed_frame = preprocess_frame(frame, img_size)
    
    # Predict the hand gesture
    prediction = model.predict(processed_frame)
    gesture = np.argmax(prediction, axis=1)[0]  # Get the predicted class index
    
    # Map the predicted index to gesture names (change this to match your classes)
    gesture_names = ['gesture1', 'gesture2', 'gesture3', 'gesture4', 'gesture5', 'gesture6', 'gesture7', 
                     'gesture8', 'gesture9', 'gesture10', 'gesture11', 'gesture12', 'gesture13', 
                     'gesture14', 'gesture15', 'gesture16', 'gesture17', 'gesture18', 'gesture19', 'gesture20']
    gesture_name = gesture_names[gesture]
    
    # Display the gesture on the frame
    cv2.putText(frame, f'Gesture: {gesture_name}', (10, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 0), 2)
    
    # Show the video feed with the detected gesture
    cv2.imshow('Hand Gesture Detection', frame)
    
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18