In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import os
import json
import numpy as np
from PIL import Image

In [2]:
class GestureDataset(Dataset):
    """Image + landmark dataset stored as one sub-directory per gesture class.

    Layout read by this class::

        root_dir/
            <class_name>/
                <image files>
                LANDMARKS/
                    <landmark JSON files>

    Every file found in a class's ``LANDMARKS`` folder produces one sample.
    Classes and landmark files are enumerated in sorted order so that
    ``class_to_idx`` (and therefore the meaning of the model's output units)
    is identical on every run and every machine.

    Args:
        root_dir: Directory containing one sub-directory per class.
        transform: Optional callable applied to the loaded RGB PIL image.
    """

    def __init__(self, root_dir, transform=None):
        self.root_dir = root_dir
        self.transform = transform
        # Only directories are classes; stray files in the root (notes,
        # OS metadata files, ...) are ignored. Sorting pins label indices,
        # because os.listdir() returns entries in arbitrary order.
        self.classes = sorted(
            entry for entry in os.listdir(root_dir)
            if os.path.isdir(os.path.join(root_dir, entry))
        )
        self.class_to_idx = {cls: idx for idx, cls in enumerate(self.classes)}
        self.samples = self._load_samples()

    def _load_samples(self):
        """Return a list of ``(image_path, landmark_path, label)`` tuples.

        Raises:
            FileNotFoundError: If a class directory has no ``LANDMARKS`` folder.
        """
        samples = []
        for cls in self.classes:
            class_dir = os.path.join(self.root_dir, cls)
            landmarks_dir = os.path.join(class_dir, 'LANDMARKS')
            label = self.class_to_idx[cls]
            # Sorted for a reproducible sample order (index -> sample mapping).
            for landmark_file in sorted(os.listdir(landmarks_dir)):
                # The image name is the landmark file name with every
                # '_landmarks' occurrence removed; the extension is kept as-is.
                # NOTE(review): confirm this matches the on-disk naming of the
                # images (e.g. that image and landmark files share an extension).
                image_file = landmark_file.replace('_landmarks', '')
                image_path = os.path.join(class_dir, image_file)
                landmark_path = os.path.join(landmarks_dir, landmark_file)
                samples.append((image_path, landmark_path, label))
        return samples

    def __len__(self):
        """Number of (image, landmarks) pairs discovered under ``root_dir``."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Load one sample.

        Returns:
            tuple: ``(image, landmarks, label)`` where ``image`` is the RGB
            image (after ``transform`` if one was given), ``landmarks`` is a
            1-D ``float32`` tensor built from the JSON file's ``'landmarks'``
            entry, and ``label`` is the integer class index.
        """
        image_path, landmark_path, label = self.samples[idx]

        # Context manager closes the underlying file handle right away;
        # convert() returns an independent, fully loaded image.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert('RGB')

        with open(landmark_path, 'r') as f:
            landmarks = json.load(f)

        landmarks = np.array(landmarks['landmarks']).flatten()

        if self.transform:
            image = self.transform(image)

        return image, torch.tensor(landmarks, dtype=torch.float32), label

In [3]:
class GestureRecognitionModel(nn.Module):
    """CNN over the image whose features are concatenated with landmark features.

    Three conv + ReLU + 2x2 max-pool stages reduce a square
    ``image_size x image_size`` RGB image to a ``128 x (image_size // 8)^2``
    feature map. That map is flattened, concatenated with the landmark vector,
    and classified by two fully connected layers.

    Args:
        num_classes: Number of output logits.
        num_landmark_features: Length of the landmark vector per sample.
            Defaults to 42 (presumably 21 (x, y) points -- confirm against
            the landmark files).
        image_size: Height/width of the square input images. Defaults to 224,
            which gives the original 28x28 final feature map.

    Raises:
        ValueError: If ``image_size`` is too small to survive three poolings.
    """

    def __init__(self, num_classes, num_landmark_features=42, image_size=224):
        super().__init__()
        if image_size < 8:
            raise ValueError(
                f"image_size must be at least 8 (three 2x2 poolings), got {image_size}"
            )
        self.conv1 = nn.Conv2d(3, 32, 3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, 3, padding=1)
        self.conv3 = nn.Conv2d(64, 128, 3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        # 3x3 convs with padding=1 keep the spatial size; each of the three
        # pools halves it (flooring), so the final side is image_size // 8.
        feature_map_side = image_size // 8
        flat_image_features = 128 * feature_map_side * feature_map_side
        self.fc1 = nn.Linear(flat_image_features + num_landmark_features, 512)
        self.fc2 = nn.Linear(512, num_classes)
        self.dropout = nn.Dropout(0.5)

    def forward(self, image, landmarks):
        """Return class logits of shape ``(batch, num_classes)``.

        Args:
            image: Batch of images, ``(batch, 3, image_size, image_size)``.
            landmarks: Batch of landmark vectors,
                ``(batch, num_landmark_features)``.
        """
        x = self.pool(torch.relu(self.conv1(image)))
        x = self.pool(torch.relu(self.conv2(x)))
        x = self.pool(torch.relu(self.conv3(x)))
        # Flatten everything except the batch dimension. Unlike view(-1, N),
        # this can never fold a wrong-sized input into the batch dimension;
        # a size mismatch surfaces as a clear shape error at fc1 instead.
        x = torch.flatten(x, start_dim=1)
        x = torch.cat((x, landmarks), dim=1)
        x = torch.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        return x

In [4]:
def train_model(model, train_loader, criterion, optimizer, device, num_epochs=10):
    """Train ``model`` for ``num_epochs`` passes over ``train_loader``.

    Args:
        model: Module called as ``model(images, landmarks)``.
        train_loader: Iterable yielding ``(images, landmarks, labels)`` batches
            of tensors. It is re-iterated once per epoch.
        criterion: Loss callable taking ``(outputs, labels)``.
        optimizer: Optimizer over ``model``'s parameters.
        device: Device the batches are moved to before the forward pass.
        num_epochs: Number of passes over ``train_loader``.

    Returns:
        list[float]: Mean per-batch loss of each epoch, in order.

    Raises:
        ValueError: If ``train_loader`` yields no batches in an epoch.
    """
    model.train()
    epoch_losses = []
    for epoch in range(num_epochs):
        running_loss = 0.0
        # Counted rather than taken from len(train_loader), so loaders without
        # __len__ work and an empty loader is reported clearly instead of
        # failing with ZeroDivisionError.
        num_batches = 0
        for images, landmarks, labels in train_loader:
            images, landmarks, labels = images.to(device), landmarks.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(images, landmarks)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            num_batches += 1

        if num_batches == 0:
            raise ValueError("train_loader yielded no batches; nothing to train on")

        epoch_loss = running_loss / num_batches
        epoch_losses.append(epoch_loss)
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss}')

    return epoch_losses


In [None]:
# --- Configuration: everything a reader is likely to tune -------------------
DATA_DIR = 'DATASET'
MODEL_PATH = 'gesture_recognition_model.pth'
IMAGE_SIZE = (224, 224)   # the model's first linear layer is sized for 224x224 input
BATCH_SIZE = 32
# NOTE(review): with NUM_WORKERS > 0 the dataset is sent to worker processes.
# On platforms that start workers with "spawn" (Windows, macOS) a Dataset class
# defined in a notebook cell may fail to load in the workers -- set this to 0
# there, or move the class into an importable .py module.
NUM_WORKERS = 4
LEARNING_RATE = 0.001
SEED = 42

# Seed before building the model and the loader so that weight initialisation
# and shuffle order are repeatable under Restart & Run All.
torch.manual_seed(SEED)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Resize to the size the model expects, convert to a tensor, then normalise
# each channel (these are the commonly used ImageNet channel statistics).
transform = transforms.Compose([
    transforms.Resize(IMAGE_SIZE),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

dataset = GestureDataset(root_dir=DATA_DIR, transform=transform)
train_loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS)

# One output unit per class folder discovered by the dataset.
num_classes = len(dataset.classes)
model = GestureRecognitionModel(num_classes).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

train_model(model, train_loader, criterion, optimizer, device)

# Only the weights are saved; loading them back requires re-creating the model
# with the same num_classes.
torch.save(model.state_dict(), MODEL_PATH)