In [32]:
import os
import cv2
import numpy as np
import mediapipe as mp

# Initialize MediaPipe Hands
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(min_detection_confidence=0.7, min_tracking_confidence=0.7)
mp_draw = mp.solutions.drawing_utils

# Create data storage directory
DATA_DIR = "gesture_data"
if not os.path.exists(DATA_DIR):
    os.makedirs(DATA_DIR)

# Start webcam
cap = cv2.VideoCapture(0)
print("Press 's' to save a gesture, 'q' to quit.")

while True:
    ret, frame = cap.read()
    if not ret:
        break
    
    # Flip the frame horizontally for a mirror effect
    frame = cv2.flip(frame, 1)
    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    results = hands.process(rgb_frame)
    
    # Draw landmarks if detected
    if results.multi_hand_landmarks:
        for hand_landmarks in results.multi_hand_landmarks:
            mp_draw.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)
            
            # Extract landmarks as a flat list
            landmarks = []
            for lm in hand_landmarks.landmark:
                landmarks.extend([lm.x, lm.y, lm.z])
            
            # Save landmarks when 's' is pressed
            key = cv2.waitKey(1) & 0xFF
            if key == ord('s'):
                label = input("Enter gesture label: ")  # Prompt for gesture label
                gesture_dir = os.path.join(DATA_DIR, label)
                
                # Create a directory for the label if it doesn't exist
                if not os.path.exists(gesture_dir):
                    os.makedirs(gesture_dir)
                
                # Save the landmarks as a numpy file
                file_path = os.path.join(gesture_dir, f"{len(os.listdir(gesture_dir))}.npy")
                np.save(file_path, np.array(landmarks))
                print(f"Saved gesture '{label}' at {file_path}")
    
    # Display the webcam feed
    cv2.imshow("Hand Gesture Capture", frame)
    
    # Quit when 'q' is pressed
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()


Press 's' to save a gesture, 'q' to quit.


Enter gesture label:  volume_up


Saved gesture 'volume_up' at gesture_data\volume_up\16.npy


Enter gesture label:  volume_up


Saved gesture 'volume_up' at gesture_data\volume_up\17.npy


Enter gesture label:  volume_up


Saved gesture 'volume_up' at gesture_data\volume_up\18.npy


Enter gesture label:  volume_up


Saved gesture 'volume_up' at gesture_data\volume_up\19.npy


Enter gesture label:  volume_up


Saved gesture 'volume_up' at gesture_data\volume_up\20.npy


Enter gesture label:  volume_up


Saved gesture 'volume_up' at gesture_data\volume_up\21.npy


Enter gesture label:  volume_up


Saved gesture 'volume_up' at gesture_data\volume_up\22.npy


Enter gesture label:  volume_down


Saved gesture 'volume_down' at gesture_data\volume_down\13.npy


Enter gesture label:  volume_down


Saved gesture 'volume_down' at gesture_data\volume_down\14.npy


Enter gesture label:  volume_down


Saved gesture 'volume_down' at gesture_data\volume_down\15.npy


Enter gesture label:  volume_down


Saved gesture 'volume_down' at gesture_data\volume_down\16.npy


Enter gesture label:  volume_down


Saved gesture 'volume_down' at gesture_data\volume_down\17.npy


Enter gesture label:  volume_down


Saved gesture 'volume_down' at gesture_data\volume_down\18.npy


Enter gesture label:  volume_down


Saved gesture 'volume_down' at gesture_data\volume_down\19.npy


Enter gesture label:  volume_down


Saved gesture 'volume_down' at gesture_data\volume_down\20.npy


In [33]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader


class GestureDataset(Dataset):
    def __init__(self, data_dir):
        self.data = []
        self.labels = []
        self.classes = sorted(os.listdir(data_dir))
        self.class_to_idx = {cls: idx for idx, cls in enumerate(self.classes)}

        for label in self.classes:
            label_dir = os.path.join(data_dir, label)
            for file in os.listdir(label_dir):
                self.data.append(np.load(os.path.join(label_dir, file)))
                self.labels.append(self.class_to_idx[label])
        
        self.data = np.array(self.data)
        self.labels = np.array(self.labels)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return torch.tensor(self.data[idx], dtype=torch.float32), torch.tensor(self.labels[idx], dtype=torch.long)

class GestureModel(nn.Module):
    def __init__(self, input_size, num_classes):
        super(GestureModel, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(input_size, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, num_classes)
        )

    def forward(self, x):
        return self.fc(x)

# Dataset and DataLoader
data_dir = "gesture_data"
dataset = GestureDataset(data_dir)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

# Model, Loss, Optimizer
model = GestureModel(input_size=DATA_POINTS, num_classes=len(dataset.classes)).cuda()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training Loop
for epoch in range(20):
    for inputs, labels in dataloader:
        inputs, labels = inputs.cuda(), labels.cuda()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    print(f"Epoch {epoch+1}/20, Loss: {loss.item():.4f}")


Epoch 1/20, Loss: 1.1024
Epoch 2/20, Loss: 1.0767
Epoch 3/20, Loss: 1.0152
Epoch 4/20, Loss: 1.0056
Epoch 5/20, Loss: 0.9848
Epoch 6/20, Loss: 0.9625
Epoch 7/20, Loss: 0.9285
Epoch 8/20, Loss: 0.9126
Epoch 9/20, Loss: 0.8792
Epoch 10/20, Loss: 0.7348
Epoch 11/20, Loss: 0.7312
Epoch 12/20, Loss: 0.7482
Epoch 13/20, Loss: 0.9606
Epoch 14/20, Loss: 0.9197
Epoch 15/20, Loss: 0.6987
Epoch 16/20, Loss: 0.7151
Epoch 17/20, Loss: 1.1034
Epoch 18/20, Loss: 0.9399
Epoch 19/20, Loss: 0.6480
Epoch 20/20, Loss: 0.6707


In [38]:
from pycaw.pycaw import AudioUtilities, IAudioEndpointVolume
from comtypes import CLSCTX_ALL
import torch.nn.functional as F

# Pycaw setup
devices = AudioUtilities.GetSpeakers()
interface = devices.Activate(IAudioEndpointVolume._iid_, CLSCTX_ALL, None)
volume = interface.QueryInterface(IAudioEndpointVolume)

# Real-time prediction
cap = cv2.VideoCapture(0)
model.eval()

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break
    
    frame = cv2.flip(frame, 1)
    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    results = hands.process(rgb_frame)

    if results.multi_hand_landmarks:
        for hand_landmarks in results.multi_hand_landmarks:
            landmarks = []
            for lm in hand_landmarks.landmark:
                landmarks.extend([lm.x, lm.y, lm.z])

            input_tensor = torch.tensor(landmarks, dtype=torch.float32).unsqueeze(0).cuda()
            output = model(input_tensor)
            prediction = torch.argmax(F.softmax(output, dim=1)).item()
            
            if dataset.classes[prediction] == "volume_up":
                volume.SetMasterVolumeLevelScalar(min(1.0, volume.GetMasterVolumeLevelScalar() + 0.1), None)
            elif dataset.classes[prediction] == "volume_down":
                volume.SetMasterVolumeLevelScalar(max(0.0, volume.GetMasterVolumeLevelScalar() - 0.1), None)

    cv2.imshow("Gesture Control", frame)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()
