## Hand Gesture Recognition Models

### Imports and Utils

In [1]:
"""
Importing necessary libraries
"""
import os
import cv2

try:
    import mediapipe as mp
except ImportError:
    %pip install mediapipe
    import mediapipe as mp

import torch
import torch.nn as nn
import torch.nn.functional as F

from sklearn.model_selection import train_test_split

# Remove all the warnings
import warnings
warnings.filterwarnings('ignore')

# Set env CUDA_LAUNCH_BLOCKING=1
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize MediaPipe Imports
mp_drawings = mp.solutions.drawing_utils
mp_hands = mp.solutions.hands

In [2]:
"""
Defining Utiliy Functions
"""

def landmarks_to_list(image, multi_hand_landmarks) -> torch.Tensor:
    """
    image: The image on which the landmarks are detected
    multi_hand_landmarks: The landmarks of the hand

    Returns: torch.Tensor: The tensor of landmarks [x, y]
    """

    image_height, image_width, _ = image.shape

    landmarks_list = []
    if multi_hand_landmarks:
        for hand_landmarks in multi_hand_landmarks:
            for landmark in hand_landmarks.landmark:
                # orgin is top left corner (0, 0)
                landmarks_list.append([min(int(landmark.x * image_width), image_width - 1), -min(int(landmark.y * image_height), image_height - 1)])

    return torch.tensor(landmarks_list)

def normalize_landmarks(landmarks: torch.Tensor) -> torch.Tensor:
    """
    landmarks: torch.Tensor: The landmarks of the hand

    Returns: torch.Tensor: The normalized landmarks
    """

    landmarks = landmarks - landmarks[0]
    landmarks = landmarks / torch.max(landmarks)

    return landmarks

### Dataset Creation and Preprocessing

In [None]:
# Define the dataset
dataset = []

In [None]:
# Initialize MediaPipe Hands
HandLandmarker = mp_hands.Hands(
    static_image_mode=False,
    max_num_hands=2,
    min_detection_confidence=0.8,
    min_tracking_confidence=0.5
)

# Initialize the webcam
cap = cv2.VideoCapture(0)
label = 0
count = 0

with HandLandmarker as landmarker:
    while cap.isOpened():
        success, frame = cap.read()
        if not success:
            print("Ignoring empty camera frame.")
            continue

        # the BGR image to RGB.
        frame = cv2.flip(frame, 1)
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        # Dectect the hand landmarks
        frame.flags.writeable = False
        results = landmarker.process(frame)

        # Draw the hand annotations on the image.
        frame.flags.writeable = True
        frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
        key = cv2.waitKey(5) & 0xFF
        if results.multi_hand_landmarks:
            for hand_landmarks in results.multi_hand_landmarks:
                mp_drawings.draw_landmarks(
                    frame,
                    hand_landmarks,
                    mp_hands.HAND_CONNECTIONS,
                    mp_drawings.DrawingSpec(color=(97, 137, 48), thickness=2, circle_radius=4),
                    mp_drawings.DrawingSpec(color=(255, 255, 255), thickness=2, circle_radius=2),
                )
            
            if key == 13:
                # Convert the landmarks to a list wrt the image
                landmarks = landmarks_to_list(frame, results.multi_hand_landmarks)

                # Normalize the landmarks
                landmarks = normalize_landmarks(landmarks).flatten()

                # Append the label to the landmarks and store in the dataset
                dataset.append(torch.cat((landmarks, torch.tensor([label]))))
                print(f"{count} - Gesture Labelled: {label}")

        cv2.imshow('MediaPipe Hands', frame)
        if key == 27: # ESC
            break

cap.release()
# cv2.destroyAllWindows()

In [None]:
# Convert the dataset to a tensor
if dataset != []:
    dataset = torch.stack(dataset).to(device)
    torch.save(dataset, 'Dataset/dataset.pt')

In [12]:
# Load the tensor from a file
dataset = torch.load('/kaggle/input/dataset/dataset.pt').to(device)

# Increase the size of the dataset by 4 times
dataset = torch.cat([dataset, dataset, dataset, dataset])

# Separate the features and labels
features = dataset[:, :-1]
labels = dataset[:, -1]
labels = F.one_hot(labels.long())

# Split the dataset
X_train, X_test, Y_train, Y_test = train_test_split(features.cpu().numpy(), labels.cpu().numpy(), test_size=0.2, random_state=42)

# Convert back to tensors
X_train = torch.from_numpy(X_train).to(device)
X_test = torch.from_numpy(X_test).to(device)
Y_train = torch.from_numpy(Y_train).long().to(device)
Y_test = torch.from_numpy(Y_test).long().to(device)

# Print the shape of all tensors
print("X_train shape: ", X_train.shape)
print("X_test shape: ", X_test.shape)
print("Y_train shape: ", Y_train.shape)
print("Y_test shape: ", Y_test.shape)

X_train shape:  torch.Size([256, 42])
X_test shape:  torch.Size([64, 42])
Y_train shape:  torch.Size([256, 4])
Y_test shape:  torch.Size([64, 4])


### Model Creation and Training

In [3]:
class HGRModel(nn.Module):
    """
    A MLP Model for Hand Gesture Recognition
    """
    def __init__(self, in_features, out_features):
        super(HGRModel, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(in_features, 16),
            nn.ReLU(),
            nn.Linear(16, 32),
            nn.ReLU(),
            nn.Linear(32, out_features),
            nn.Softmax(dim=1)  # Add softmax layer
        )
        
    def forward(self, x):
        return self.model(x)
    
    def fit(self, X, Y, epochs=1000, lr=0.01):
        """
        X: torch.Tensor of shape (n_samples, n_features)
        Y: torch.Tensor of shape (n_samples, n_channels)
        epochs: int, the number of epochs
        """
        criteria = nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(self.parameters(), lr=lr)
        
        for epoch in range(epochs):
            # Forward pass
            preds = self.forward(X)
            loss = criteria(preds, Y)

            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Print the loss
            if (epoch+1) % 10 == 0:
                print(f'Epoch {epoch+1} Loss: {loss.item()}')
                print("\n----------------------------------------------------\n")

    def save_model(self, file_path):
        """
        Save the model to a file.
        """
        torch.save(self.state_dict(), file_path)

    def load_model(self, file_path):
        """
        Load the model from a file.
        """
        self.load_state_dict(torch.load(file_path))

In [26]:
# Initialize the model
model = HGRModel(X_train.shape[1], Y_train.shape[1]).to(device)

In [27]:
# Train the model
model.fit(X_train, torch.argmax(Y_train, axis=1), epochs=50, lr=0.01)

# Evaluate the model
model.eval()

with torch.no_grad():
    Y_pred = model(X_train)
    Y_pred = torch.argmax(Y_pred, axis=1)

    accuracy_train = (Y_pred == torch.argmax(Y_train, axis=1)).sum().item() / Y_train.shape[0]

print(f'Training Accuracy: {accuracy_train * 100}%')

Epoch 10 Loss: 1.132049322128296

----------------------------------------------------

Epoch 20 Loss: 0.8226261138916016

----------------------------------------------------

Epoch 30 Loss: 0.7468557953834534

----------------------------------------------------

Epoch 40 Loss: 0.7439714074134827

----------------------------------------------------

Epoch 50 Loss: 0.7437458634376526

----------------------------------------------------

Training Accuracy: 100.0%


### Testing and Saving

In [28]:
# Evaluate the model
model.eval()

with torch.no_grad():
    Y_pred = model(X_test)
    Y_pred = torch.argmax(Y_pred, axis=1)

    accuracy_test = (Y_pred == torch.argmax(Y_test, axis=1)).sum().item() / Y_test.shape[0]

print(f'Testing Accuracy: {accuracy_test * 100}%')

Testing Accuracy: 100.0%


In [30]:
# Models directory
directory = "Models"
os.makedirs(directory, exist_ok=True)

# Save the model
file_path = "Models/HGR_Model.pth"
model.save_model(file_path)

In [31]:
# Initialize the model
model = HGRModel(X_train.shape[1], Y_train.shape[1]).to(device)

# Load the model
file_path = "Models/HGR_Model.pth"
model.load_model(file_path)

In [4]:
# Load the model
model_path = "Models/HGR_Model.pth"
model = HGRModel(42, 4)
model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))
model.eval()

# Initialize MediaPipe Hands
HandLandmarker = mp_hands.Hands(
    static_image_mode=False,
    max_num_hands=1,
    min_detection_confidence=0.8,
    min_tracking_confidence=0.5
)
classes = {
    0: "Right hand open",
    1: "Left hand open",
    2: "Right hand close",
    3: "Left hand close"
}

# Initialize the webcam
cap = cv2.VideoCapture(0)

with HandLandmarker as landmarker:
    while cap.isOpened():
        success, frame = cap.read()
        if not success:
            print("Ignoring empty camera frame.")
            continue

        # Convert the BGR image to RGB.
        frame = cv2.flip(frame, 1)
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        # Detect the hand landmarks
        frame.flags.writeable = False
        results = landmarker.process(frame)

        # Draw the hand annotations on the image.
        frame.flags.writeable = True
        frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
        key = cv2.waitKey(5) & 0xFF
        if results.multi_hand_landmarks:
            for hand_landmarks in results.multi_hand_landmarks:
                mp_drawings.draw_landmarks(
                    frame,
                    hand_landmarks,
                    mp_hands.HAND_CONNECTIONS,
                    mp_drawings.DrawingSpec(color=(97, 137, 48), thickness=2, circle_radius=4),
                    mp_drawings.DrawingSpec(color=(255, 255, 255), thickness=2, circle_radius=2),
                )

                # Convert the landmarks to a list wrt the image
                landmarks = landmarks_to_list(frame, results.multi_hand_landmarks)

                # Normalize the landmarks
                landmarks = normalize_landmarks(landmarks).reshape(1, -1)
                Y_pred = model(landmarks)
                pred_class = torch.argmax(Y_pred, axis=1).item()

                # Annotate the predicted class on the screen
                cv2.putText(frame, classes[pred_class], (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)

        cv2.imshow('MediaPipe Hands', frame)
        if key == 27: # ESC
            break

cap.release()
cv2.destroyAllWindows()