In [17]:
!pip install opencv-python



In [69]:
import torch
from torch.utils.data import Dataset, DataLoader
import json
import cv2
import os
import numpy as np

class HandGestureDataset(Dataset):
    def __init__(self, json_file, img_dir, transform=None):
        with open(json_file, 'r') as f:
            self.annotations = json.load(f)
        self.img_dir = img_dir
        self.transform = transform
        self.classes = list(set(item['gesture'] for item in self.annotations.values()))
        self.class_to_idx = {cls: idx for idx, cls in enumerate(self.classes)}

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, idx):
        img_name = list(self.annotations.keys())[idx]
        img_path = os.path.join(self.img_dir, img_name)
        image = cv2.imread(img_path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        annotation = self.annotations[img_name]
        gesture = annotation['gesture']
        bbox = annotation['bbox']

        label = self.class_to_idx[gesture]
        bbox = torch.tensor(bbox, dtype=torch.float32)

        if self.transform:
            image = self.transform(image)

        return image, label, bbox

In [73]:
from torchvision import transforms
from torch.utils.data import random_split

# Define transformations
transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Create dataset
dataset = HandGestureDataset('hand_gesture_data/augmented_annotations.json', 'hand_gesture_data', transform=transform)

# Split dataset into train and validation sets
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
print(len(train_dataset))
print(len(val_dataset))

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

1008
252


In [79]:
import torch.nn as nn
import torchvision.models as models

class HandGestureModel(nn.Module):
    def __init__(self, num_classes):
        super(HandGestureModel, self).__init__()
        self.resnet = models.resnet18(pretrained=True)
        num_features = self.resnet.fc.in_features
        self.resnet.fc = nn.Identity()
        self.fc1 = nn.Linear(num_features, 128)
        self.fc2 = nn.Linear(128, num_classes)
        self.fc3 = nn.Linear(128, 4)  # for bounding box regression

    def forward(self, x):
        features = self.resnet(x)
        features = torch.relu(self.fc1(features))
        class_output = self.fc2(features)
        bbox_output = self.fc3(features)
        return class_output, bbox_output

model = HandGestureModel(num_classes=len(dataset.classes))

In [80]:
import torch.optim as optim

def adjust_alpha(epoch, max_epochs, start_alpha=1.0, end_alpha=0.5):
    return start_alpha - (start_alpha - end_alpha) * (epoch / max_epochs)
def log_mse_loss(pred, target):
    return torch.log(1 + torch.mean((pred - target) ** 2))

loss_bbox = log_mse_loss(bbox_outputs, bboxes)
# Define loss functions and optimizer
criterion_cls = nn.CrossEntropyLoss()
criterion_bbox = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 50
device = torch.device("mps")
model.to(device)

for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    for images, labels, bboxes in train_loader:
        images, labels, bboxes = images.to(device), labels.to(device), bboxes.to(device)
        
        optimizer.zero_grad()
        class_outputs, bbox_outputs = model(images)
        current_alpha = 1
        # if (epoch > 10):
        #     current_alpha = 0
        loss_cls = current_alpha * criterion_cls(class_outputs, labels)
        loss_bbox = (current_alpha) * log_mse_loss(bbox_outputs, bboxes)
        print(str(current_alpha) + " " + str(loss_cls) + " " + str(loss_bbox))
        loss = loss_cls + loss_bbox
        loss.backward()
        optimizer.step()
        
        train_loss += loss.item()
    
    # Validation
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels, bboxes in val_loader:
            images, labels, bboxes = images.to(device), labels.to(device), bboxes.to(device)
            class_outputs, bbox_outputs = model(images)
            loss_cls = criterion_cls(class_outputs, labels)
            loss_bbox = criterion_bbox(bbox_outputs, bboxes)
            loss = loss_cls + loss_bbox
            val_loss += loss.item()
            _, predicted = class_outputs.max(1)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()
    
    print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss/len(train_loader):.4f}, '
          f'Val Loss: {val_loss/len(val_loader):.4f}, Val Acc: {100.*correct/total:.2f}%')

# Save the model
torch.save(model.state_dict(), 'hand_gesture_model.pth')

1 tensor(1.7308, device='mps:0', grad_fn=<MulBackward0>) tensor(12.9757, device='mps:0', grad_fn=<MulBackward0>)
1 tensor(1.7652, device='mps:0', grad_fn=<MulBackward0>) tensor(12.6893, device='mps:0', grad_fn=<MulBackward0>)
1 tensor(1.6451, device='mps:0', grad_fn=<MulBackward0>) tensor(12.8130, device='mps:0', grad_fn=<MulBackward0>)
1 tensor(1.4953, device='mps:0', grad_fn=<MulBackward0>) tensor(12.9512, device='mps:0', grad_fn=<MulBackward0>)
1 tensor(1.5320, device='mps:0', grad_fn=<MulBackward0>) tensor(12.8989, device='mps:0', grad_fn=<MulBackward0>)
1 tensor(1.6213, device='mps:0', grad_fn=<MulBackward0>) tensor(12.8608, device='mps:0', grad_fn=<MulBackward0>)
1 tensor(1.6673, device='mps:0', grad_fn=<MulBackward0>) tensor(13.0251, device='mps:0', grad_fn=<MulBackward0>)
1 tensor(1.7303, device='mps:0', grad_fn=<MulBackward0>) tensor(12.6914, device='mps:0', grad_fn=<MulBackward0>)
1 tensor(1.3494, device='mps:0', grad_fn=<MulBackward0>) tensor(12.7507, device='mps:0', grad_fn

In [49]:
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels, bboxes in val_loader:
            #print(type(images))
            #print(images)
            images, labels, bboxes = images.to(device), labels.to(device), bboxes.to(device)
            class_outputs, bbox_outputs = model(images)
            #print(class_outputs)
            loss_cls = criterion_cls(class_outputs, labels)
            loss_bbox = criterion_bbox(bbox_outputs, bboxes)
            print("hello")
            print(loss_cls)
            print(loss_bbox)
            loss = loss_cls + loss_bbox
            val_loss += loss.item()
            _, predicted = class_outputs.max(1)
            total += labels.size(0)
            
            print("predicted, label")
            print(predicted)
            print(labels)
            correct += predicted.eq(labels).sum().item()
    
    print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss/len(train_loader):.4f}, '
          f'Val Loss: {val_loss/len(val_loader):.4f}, Val Acc: {100.*correct/total:.2f}%')

hello
tensor(0.0129, device='mps:0')
tensor(328252.5938, device='mps:0')
predicted, label
tensor([2, 3, 4, 3, 2, 3, 2, 1], device='mps:0')
tensor([2, 3, 4, 3, 2, 3, 2, 1], device='mps:0')
hello
tensor(0.1278, device='mps:0')
tensor(445894.3750, device='mps:0')
predicted, label
tensor([0, 2, 1, 4, 1, 4, 4, 2], device='mps:0')
tensor([0, 2, 1, 4, 2, 4, 4, 2], device='mps:0')
hello
tensor(0.0337, device='mps:0')
tensor(291917.6875, device='mps:0')
predicted, label
tensor([1, 0, 2, 2, 0, 3, 2, 0], device='mps:0')
tensor([1, 0, 2, 2, 0, 3, 2, 0], device='mps:0')
hello
tensor(0.0059, device='mps:0')
tensor(332484.5625, device='mps:0')
predicted, label
tensor([0, 1, 1, 1, 0, 2, 1, 2], device='mps:0')
tensor([0, 1, 1, 1, 0, 2, 1, 2], device='mps:0')
hello
tensor(1.0502, device='mps:0')
tensor(400550.8750, device='mps:0')
predicted, label
tensor([4, 3, 3, 2, 1, 0, 1, 2], device='mps:0')
tensor([4, 3, 3, 2, 2, 0, 0, 2], device='mps:0')
hello
tensor(0.5759, device='mps:0')
tensor(369667.4375, dev