In [17]:
!pip install opencv-python



In [26]:
import torch
from torch.utils.data import Dataset, DataLoader
import json
import cv2
import os
import numpy as np

class HandGestureDataset(Dataset):
    """Hand-gesture images paired with a class label and a bounding box.

    The annotation file is a JSON object keyed by image file name. For each
    entry the ``'gesture'`` value is used as the class name and the ``'bbox'``
    value is converted to a float32 tensor.

    Attributes:
        annotations: The parsed annotation mapping (file name -> annotation).
        img_dir: Directory that image file names are joined onto.
        transform: Optional callable applied to the RGB image array.
        img_names: Image file names in annotation-file order (index -> name).
        classes: Sorted list of the distinct gesture names.
        class_to_idx: Mapping from gesture name to integer label.
    """

    def __init__(self, json_file, img_dir, transform=None):
        """Load the annotations and build the class <-> index mapping.

        Args:
            json_file: Path to the JSON annotation file.
            img_dir: Directory containing the annotated images.
            transform: Optional callable applied to each loaded image.
        """
        with open(json_file, 'r') as f:
            self.annotations = json.load(f)
        self.img_dir = img_dir
        self.transform = transform
        # Materialise the key order once; rebuilding this list on every
        # __getitem__ call costs O(len(dataset)) per sample.
        self.img_names = list(self.annotations)
        # Sort the class names: the iteration order of a set of strings is
        # not stable between interpreter runs (hash randomisation), which
        # would silently remap label indices and make previously saved
        # weights predict the wrong gestures after a kernel restart.
        self.classes = sorted({item['gesture'] for item in self.annotations.values()})
        self.class_to_idx = {cls: idx for idx, cls in enumerate(self.classes)}

    def __len__(self):
        """Return the number of annotated images."""
        return len(self.annotations)

    def __getitem__(self, idx):
        """Return ``(image, label, bbox)`` for the sample at position ``idx``.

        Returns:
            image: The RGB image, passed through ``transform`` when one is set.
            label: Integer class index of the sample's gesture.
            bbox: float32 tensor built from the annotation's ``'bbox'`` entry.

        Raises:
            IndexError: If ``idx`` is out of range.
            FileNotFoundError: If the image cannot be read from disk.
        """
        img_name = self.img_names[idx]
        img_path = os.path.join(self.img_dir, img_name)
        image = cv2.imread(img_path)
        if image is None:
            # cv2.imread signals a missing/unreadable file by returning None
            # instead of raising; fail here with the offending path rather
            # than with an opaque error inside cvtColor.
            raise FileNotFoundError(f"Could not read image: {img_path}")
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        annotation = self.annotations[img_name]
        label = self.class_to_idx[annotation['gesture']]
        # NOTE(review): the box is returned exactly as stored; it is not
        # adjusted if `transform` changes the image geometry (e.g. a resize).
        # Confirm that is the intended regression target.
        bbox = torch.tensor(annotation['bbox'], dtype=torch.float32)

        if self.transform:
            image = self.transform(image)

        return image, label, bbox

In [31]:
from torchvision import transforms
from torch.utils.data import random_split

# Define transformations: array -> PIL image -> 224x224 -> normalised tensor.
# NOTE(review): the mean/std values are presumably the ImageNet channel
# statistics matching the pretrained backbone — confirm.
transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Create dataset
dataset = HandGestureDataset('hand_gesture_data/augmented_annotations.json', 'hand_gesture_data', transform=transform)

# Split dataset into train and validation sets (80% / 20%).
# The split is seeded so that it is identical after every kernel restart;
# with an unseeded split, weights reloaded from disk would be validated on
# images they may have been trained on.
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(42),
)
print(len(train_dataset))
print(len(val_dataset))

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

1008
252


In [28]:
import torch.nn as nn
import torchvision.models as models

class HandGestureModel(nn.Module):
    """ResNet-18 feature extractor with a classification and a box-regression head.

    The backbone's final fully connected layer is replaced by an identity so
    it emits its pooled features. Those features go through one shared hidden
    layer (128 units, ReLU) that feeds two linear heads:

    * ``fc2`` -> ``num_classes`` outputs (gesture classification)
    * ``fc3`` -> 4 outputs (bounding box regression)
    """

    def __init__(self, num_classes):
        """Build the network.

        Args:
            num_classes: Number of outputs of the classification head.
        """
        super().__init__()
        # `pretrained=True` is deprecated in current torchvision releases and
        # emits a warning; the weights enum selects the same ImageNet
        # checkpoint explicitly.
        self.resnet = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)
        num_features = self.resnet.fc.in_features
        # Drop the original classifier so the backbone returns raw features.
        self.resnet.fc = nn.Identity()
        self.fc1 = nn.Linear(num_features, 128)
        self.fc2 = nn.Linear(128, num_classes)
        self.fc3 = nn.Linear(128, 4)  # for bounding box regression

    def forward(self, x):
        """Return ``(class_output, bbox_output)`` for the input batch ``x``."""
        features = self.resnet(x)
        features = torch.relu(self.fc1(features))
        class_output = self.fc2(features)
        bbox_output = self.fc3(features)
        return class_output, bbox_output

# One classification output per distinct gesture known to the dataset.
model = HandGestureModel(len(dataset.classes))

In [23]:
import torch.optim as optim
# Training configuration: epoch budget, optimiser, and the two loss criteria
# (cross-entropy for the gesture class, mean squared error for the box).
num_epochs = 50
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)
criterion_bbox = nn.MSELoss()
criterion_cls = nn.CrossEntropyLoss()

In [None]:
import torch.optim as optim

def adjust_alpha(epoch, max_epochs, start_alpha=1.0, end_alpha=0.5):
    """Linearly interpolate a weight between ``start_alpha`` and ``end_alpha``.

    Returns ``start_alpha`` when ``epoch`` is 0 and ``end_alpha`` when
    ``epoch`` equals ``max_epochs``; values in between move along a straight
    line.
    """
    progress = epoch / max_epochs
    total_change = start_alpha - end_alpha
    return start_alpha - total_change * progress
def log_mse_loss(pred, target):
    """Return ``log(1 + MSE)`` of ``pred`` against ``target`` as a scalar tensor.

    The mean is taken over every element of the squared difference.
    """
    squared_error = (pred - target).pow(2)
    mean_squared_error = squared_error.mean()
    return torch.log(1 + mean_squared_error)

# Define loss functions and optimizer
criterion_cls = nn.CrossEntropyLoss()
criterion_bbox = nn.MSELoss()  # not used in this cell; kept for cells that reference it
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 50
# Prefer Apple's MPS backend, but fall back to CUDA or the CPU so the cell
# also runs on machines where MPS is not available.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)

# Common weight applied to both training loss terms (constant for now).
current_alpha = 1

for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    for images, labels, bboxes in train_loader:
        images, labels, bboxes = images.to(device), labels.to(device), bboxes.to(device)

        optimizer.zero_grad()
        class_outputs, bbox_outputs = model(images)
        loss_cls = current_alpha * criterion_cls(class_outputs, labels)
        # log(1 + MSE) instead of plain MSE for the box regression term.
        loss_bbox = current_alpha * log_mse_loss(bbox_outputs, bboxes)
        loss = loss_cls + loss_bbox
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    # Validation
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels, bboxes in val_loader:
            images, labels, bboxes = images.to(device), labels.to(device), bboxes.to(device)
            class_outputs, bbox_outputs = model(images)
            loss_cls = criterion_cls(class_outputs, labels)
            # Use the same box loss as in training so that the reported
            # train and validation losses are directly comparable.
            loss_bbox = log_mse_loss(bbox_outputs, bboxes)
            loss = loss_cls + loss_bbox
            val_loss += loss.item()
            _, predicted = class_outputs.max(1)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()

    print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss/len(train_loader):.4f}, '
          f'Val Loss: {val_loss/len(val_loader):.4f}, Val Acc: {100.*correct/total:.2f}%')

# Save the model
torch.save(model.state_dict(), 'hand_gesture_model_new.pth')

In [39]:
    # Stand-alone validation pass that also prints the raw per-batch tensors
    # (head outputs, losses, predictions and targets) for manual inspection.
    model.eval()
    val_loss = 0.0
    correct, total = 0, 0
    with torch.no_grad():
        for images, labels, bboxes in val_loader:
            images = images.to(device)
            labels = labels.to(device)
            bboxes = bboxes.to(device)
            class_outputs, bbox_outputs = model(images)

            # Classification head outputs, then their per-row maximum.
            print(class_outputs)
            print("?")
            print(torch.max(class_outputs, 1))

            loss_cls = criterion_cls(class_outputs, labels)
            loss_bbox = criterion_bbox(bbox_outputs, bboxes)
            print("hello")
            print(loss_cls)
            print(loss_bbox)
            loss = loss_cls + loss_bbox
            val_loss += loss.item()

            # Index of the largest output per row is the predicted class.
            _, predicted = torch.max(class_outputs, 1)
            total += labels.shape[0]

            print("predicted, label")
            print(predicted)
            print(labels)
            print(bbox_outputs)
            print(bboxes)
            correct += (predicted == labels).sum().item()

    print(f'Val Loss: {val_loss/len(val_loader):.4f}, Val Acc: {100.*correct/total:.2f}%')

tensor([[ -5.9891,  14.2598,   3.6205,   0.9848,  -6.5170],
        [-10.2815,  -9.0256,   3.2319, -10.6233, -12.7999],
        [ 18.6079,  -2.6501, -11.8894,  -4.3995,  -0.4975],
        [ -8.4887,  -1.6573,  13.9527,  -9.1623,  -2.5786],
        [-21.8731, -10.6915,   1.3448,   7.6984,  26.2825],
        [-11.9624,  13.4229,  -2.6357,  -1.2384, -27.4332],
        [-20.4723, -24.9152,  -3.8204,  -2.4752,   5.6648],
        [ 13.2944,   1.0950,  -7.5068,   0.9365,  -1.5632]], device='mps:0')
?
torch.return_types.max(
values=tensor([14.2598,  3.2319, 18.6079, 13.9527, 26.2825, 13.4229,  5.6648, 13.2944],
       device='mps:0'),
indices=tensor([1, 2, 0, 2, 4, 1, 4, 0], device='mps:0'))
hello
tensor(5.1341e-05, device='mps:0')
tensor(3696.8472, device='mps:0')
predicted, label
tensor([1, 2, 0, 2, 4, 1, 4, 0], device='mps:0')
tensor([1, 2, 0, 2, 4, 1, 4, 0], device='mps:0')
tensor([[ 217.1376,   39.0398,  603.1738,  608.6965],
        [1167.8916,  121.2079,  434.5494,  442.7199],
        [

In [35]:
# Load the saved weights onto the CPU first so the checkpoint can be read on
# machines that lack the device it was saved from; `weights_only=True`
# restricts unpickling to tensors and plain containers.
state_dict = torch.load('hand_gesture_model_new.pth', map_location='cpu', weights_only=True)
model.load_state_dict(state_dict)
# Prefer Apple's MPS backend, falling back to CUDA or the CPU when it is
# not available.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)

HandGestureModel(
  (resnet): ResNet(
    (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, t

In [40]:
# Display the model's module structure as the cell output.
model

HandGestureModel(
  (resnet): ResNet(
    (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, t

In [1]:
# Print the version of the `python3` executable found by the shell.
# NOTE(review): this may not be the interpreter the kernel is running —
# confirm with `sys.version` if the distinction matters.
!python3 -V

Python 3.12.3


In [8]:
# project_dir = "/Users/alexandertekle/Documents/Code/Handmoji/models"
# model2 = model
# dummy_input = torch.randn(1, 3, 224, 224).to(device)
# √