In [41]:
# %pip install opencv-python  # run once if cv2 is missing; %pip installs into the kernel's own environment

In [42]:
import torch
from torch.utils.data import Dataset, DataLoader
import json
import cv2
import os
import numpy as np

class HandGestureDataset(Dataset):
    """Image dataset that yields ``(image, label, bbox)`` triples.

    The annotation file is a JSON object mapping an image file name (relative
    to ``img_dir``) to a record with at least a ``'gesture'`` entry and a
    ``'bbox'`` entry.

    Args:
        json_file: Path to the JSON annotation file.
        img_dir: Directory that the image file names are joined onto.
        transform: Optional callable applied to the RGB image array.

    Attributes:
        classes: Sorted list of the distinct gesture values.
        class_to_idx: Mapping from gesture value to its index in ``classes``.
    """

    def __init__(self, json_file, img_dir, transform=None):
        with open(json_file, 'r') as f:
            self.annotations = json.load(f)
        self.img_dir = img_dir
        self.transform = transform
        # Snapshot the key order once so __getitem__ is O(1) instead of
        # rebuilding a list of all keys for every sample.
        self._img_names = list(self.annotations.keys())
        # Sort the class names: a bare set() has a hash-dependent order, which
        # would make label indices differ between kernel sessions and silently
        # invalidate checkpoints trained in an earlier session.
        # NOTE(review): assumes gesture values are mutually comparable
        # (presumably strings) -- confirm against the annotation file.
        self.classes = sorted({item['gesture'] for item in self.annotations.values()})
        self.class_to_idx = {cls: idx for idx, cls in enumerate(self.classes)}

    def __len__(self):
        """Return the number of annotated images."""
        return len(self._img_names)

    def __getitem__(self, idx):
        """Load one sample.

        Returns:
            A tuple ``(image, label, bbox)``: the RGB image (after
            ``transform`` if one was given), the integer class index, and the
            bounding box as a float32 tensor.

        Raises:
            FileNotFoundError: If OpenCV cannot read the image file.
        """
        img_name = self._img_names[idx]
        img_path = os.path.join(self.img_dir, img_name)
        image = cv2.imread(img_path)
        if image is None:
            # cv2.imread signals failure by returning None rather than raising.
            raise FileNotFoundError(f"Could not read image: {img_path}")
        # OpenCV decodes to BGR; convert to RGB before handing to the transform.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        annotation = self.annotations[img_name]
        label = self.class_to_idx[annotation['gesture']]
        bbox = torch.tensor(annotation['bbox'], dtype=torch.float32)

        if self.transform:
            image = self.transform(image)

        return image, label, bbox

In [45]:
from torchvision import transforms
from torch.utils.data import random_split

# Define transformations.
# The dataset yields RGB numpy arrays, so convert to PIL first; the mean/std
# values are the ImageNet channel statistics used with torchvision's
# pretrained ResNet weights.
transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Create dataset
dataset = HandGestureDataset('hand_gesture_data/augmented_annotations_290.json', 'hand_gesture_data', transform=transform)

# Split dataset into train and validation sets (95% / 5%).
train_size = int(0.95 * len(dataset))
val_size = len(dataset) - train_size
# Seed the split so the same images land in the validation set on every run;
# otherwise validation numbers are not comparable across kernel restarts.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = random_split(
    dataset, [train_size, val_size], generator=split_generator
)
print(len(train_dataset))
print(len(val_dataset))

# Create data loaders (only the training loader shuffles)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

2755
145


In [46]:
import torch.nn as nn
import torchvision.models as models

class HandGestureModel(nn.Module):
    """ResNet-18 backbone with a classification head and a box-regression head.

    The backbone's final fully connected layer is replaced by an identity so
    it emits its pooled feature vector. That vector goes through a shared
    128-unit hidden layer, which feeds two linear heads.

    Args:
        num_classes: Number of outputs of the classification head.
    """

    def __init__(self, num_classes):
        super().__init__()
        # `pretrained=True` is deprecated in torchvision; IMAGENET1K_V1 is the
        # weight set that flag used to select, so behaviour is unchanged.
        self.resnet = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)
        num_features = self.resnet.fc.in_features
        self.resnet.fc = nn.Identity()  # expose backbone features directly
        self.fc1 = nn.Linear(num_features, 128)  # shared hidden layer
        self.fc2 = nn.Linear(128, num_classes)   # classification logits
        self.fc3 = nn.Linear(128, 4)  # for bounding box regression

    def forward(self, x):
        """Run the network on a batch of images.

        Returns:
            A tuple ``(class_output, bbox_output)``: raw class logits and the
            four regressed box values per sample.
        """
        features = self.resnet(x)
        features = torch.relu(self.fc1(features))
        class_output = self.fc2(features)
        bbox_output = self.fc3(features)
        return class_output, bbox_output

# Size the classification head from the distinct gestures found in the dataset.
model = HandGestureModel(num_classes=len(dataset.classes))

In [23]:
# import torch.optim as optim
# criterion_cls = nn.CrossEntropyLoss()
# criterion_bbox = nn.MSELoss()
# optimizer = optim.Adam(model.parameters(), lr=0.001)
# num_epochs = 50

In [47]:
import torch.optim as optim

def log_mse_loss(pred, target):
    """Return ``log(1 + MSE(pred, target))`` as a scalar tensor.

    The logarithm compresses large squared errors so the box-regression term
    stays on a scale closer to the classification loss.
    """
    return torch.log(1 + torch.mean((pred - target) ** 2))


# Training configuration
num_epochs = 100
# Prefer Apple's MPS backend, but fall back to CPU so the cell still runs on
# machines without it.
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
# Move the model before building the optimizer, as PyTorch recommends.
model.to(device)

# Define loss functions and optimizer
criterion_cls = nn.CrossEntropyLoss()
criterion_bbox = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    for images, labels, bboxes in train_loader:
        images, labels, bboxes = images.to(device), labels.to(device), bboxes.to(device)

        optimizer.zero_grad()
        class_outputs, bbox_outputs = model(images)
        # Weight applied to both loss terms; currently fixed at 1.
        current_alpha = 1
        loss_cls = current_alpha * criterion_cls(class_outputs, labels)
        loss_bbox = (current_alpha) * log_mse_loss(bbox_outputs, bboxes)
        # Per-batch debug trace (prints one line for every batch).
        print(str(current_alpha) + " " + str(loss_cls) + " " + str(loss_bbox))
        loss = loss_cls + loss_bbox
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    # Validation
    # NOTE: the box term here is plain MSE (criterion_bbox) while training
    # optimises log(1 + MSE), so train and val loss are not directly comparable.
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels, bboxes in val_loader:
            images, labels, bboxes = images.to(device), labels.to(device), bboxes.to(device)
            class_outputs, bbox_outputs = model(images)
            loss_cls = criterion_cls(class_outputs, labels)
            loss_bbox = criterion_bbox(bbox_outputs, bboxes)
            loss = loss_cls + loss_bbox
            val_loss += loss.item()
            _, predicted = class_outputs.max(1)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()

    print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss/len(train_loader):.4f}, '
          f'Val Loss: {val_loss/len(val_loader):.4f}, Val Acc: {100.*correct/total:.2f}%')

# Save the model
torch.save(model.state_dict(), 'hand_gesture_model_290.pth')

1 tensor(1.6332, device='mps:0', grad_fn=<MulBackward0>) tensor(12.8223, device='mps:0', grad_fn=<MulBackward0>)
1 tensor(1.5072, device='mps:0', grad_fn=<MulBackward0>) tensor(12.7973, device='mps:0', grad_fn=<MulBackward0>)
1 tensor(1.9774, device='mps:0', grad_fn=<MulBackward0>) tensor(12.8635, device='mps:0', grad_fn=<MulBackward0>)
1 tensor(2.0499, device='mps:0', grad_fn=<MulBackward0>) tensor(12.8690, device='mps:0', grad_fn=<MulBackward0>)
1 tensor(1.3229, device='mps:0', grad_fn=<MulBackward0>) tensor(12.6323, device='mps:0', grad_fn=<MulBackward0>)
1 tensor(1.3971, device='mps:0', grad_fn=<MulBackward0>) tensor(12.7645, device='mps:0', grad_fn=<MulBackward0>)
1 tensor(1.4486, device='mps:0', grad_fn=<MulBackward0>) tensor(13.0125, device='mps:0', grad_fn=<MulBackward0>)
1 tensor(1.3960, device='mps:0', grad_fn=<MulBackward0>) tensor(12.8243, device='mps:0', grad_fn=<MulBackward0>)
1 tensor(1.3291, device='mps:0', grad_fn=<MulBackward0>) tensor(13.0889, device='mps:0', grad_fn

In [48]:
    # Stand-alone validation pass with verbose debug printing: runs the model
    # over val_loader, dumps raw outputs per batch, and reports mean loss and
    # classification accuracy.
    # NOTE(review): every line of this cell is indented one level; presumably
    # it runs only because IPython strips leading indentation -- confirm, and
    # dedent if this is ever moved into a plain .py file.
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for images, labels, bboxes in val_loader:
            #print(type(images))
            #print(images)
            images, labels, bboxes = images.to(device), labels.to(device), bboxes.to(device)
            class_outputs, bbox_outputs = model(images)
            # Debug: raw class logits, then the (values, indices) of the row-wise max.
            print(class_outputs)
            print("?")
            print(class_outputs.max(1))
            #print(class_outputs)
            loss_cls = criterion_cls(class_outputs, labels)
            loss_bbox = criterion_bbox(bbox_outputs, bboxes)
            # Debug: the two loss components for this batch.
            print("hello")
            print(loss_cls)
            print(loss_bbox)
            loss = loss_cls + loss_bbox
            val_loss += loss.item()
            # Predicted class = index of the largest logit in each row.
            _, predicted = class_outputs.max(1)
            total += labels.size(0)
            
            # Debug: predictions next to ground truth, for labels and boxes.
            print("predicted, label")
            print(predicted)
            print(labels)
            print(bbox_outputs)
            print(bboxes)
            correct += predicted.eq(labels).sum().item()
    
    # Loss is averaged over batches; accuracy is over all validation samples.
    print(f'Val Loss: {val_loss/len(val_loader):.4f}, Val Acc: {100.*correct/total:.2f}%')

tensor([[ -0.8452,  28.3253,   2.7266,  -2.6235,  -2.3500],
        [ -1.0356,  19.2395,  -3.1246,  -1.5226, -14.4442],
        [ -2.5237,  -4.2579,  -0.6593,  -1.9285,  10.3436],
        [ -4.1493,  -2.7771,  -9.7393,  25.9742,   1.9306],
        [ -2.5996,  18.8586, -16.3392,   8.6765, -11.1968],
        [ -3.5294,  25.4483,  -2.1880,   0.4528,  -3.6186],
        [ -6.2625,  -4.9512,  -3.1385,  35.6877,   8.7749],
        [  8.0058,  10.0866,  31.6491, -12.7945,  11.1772]], device='mps:0')
?
torch.return_types.max(
values=tensor([28.3253, 19.2395, 10.3436, 25.9742, 18.8586, 25.4483, 35.6877, 31.6491],
       device='mps:0'),
indices=tensor([1, 1, 4, 3, 1, 1, 3, 2], device='mps:0'))
hello
tensor(7.7783e-06, device='mps:0')
tensor(4307.0034, device='mps:0')
predicted, label
tensor([1, 1, 4, 3, 1, 1, 3, 2], device='mps:0')
tensor([1, 1, 4, 3, 1, 1, 3, 2], device='mps:0')
tensor([[1108.1743,  451.5036,  574.1895,  603.2247],
        [ 101.3279,  849.2167,  584.9193,  654.7008],
        [

In [35]:
# Restore previously saved weights into the existing `model` instance.
# NOTE(review): this loads 'hand_gesture_model_new.pth', whereas the training
# cell saves 'hand_gesture_model_290.pth' -- confirm this is the intended file.
# Prefer MPS, but fall back to CPU so the cell also runs without it.
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
# map_location="cpu" makes loading independent of the device the checkpoint
# was saved from; weights_only=True restricts unpickling to tensor data.
state_dict = torch.load('hand_gesture_model_new.pth', map_location="cpu", weights_only=True)
model.load_state_dict(state_dict)
model.to(device)

HandGestureModel(
  (resnet): ResNet(
    (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, t

In [40]:
# Display the module tree of the current model (layer-by-layer repr).
model

HandGestureModel(
  (resnet): ResNet(
    (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, t

In [1]:
!python3 -V

Python 3.12.3


In [8]:
# project_dir = "/Users/alexandertekle/Documents/Code/Handmoji/models"
# model2 = model
# dummy_input = torch.randn(1, 3, 224, 224).to(device)
# √