In [8]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision import transforms
import os
import json
import numpy as np
from PIL import Image

from modules.model_trainer import ModelTrainer
from modules.model_performance import ModelPerformanceVisualizer

In [9]:
class GestureDataset(Dataset):
    def __init__(self, root_dir, transform=None):
        self.root_dir = root_dir
        self.transform = transform
        self.classes = os.listdir(root_dir)
        self.class_to_idx = {cls: idx for idx, cls in enumerate(self.classes)}
        self.samples = self._load_samples()

    def _load_samples(self):
        samples = []
        for cls in self.classes:
            class_dir = os.path.join(self.root_dir, cls)
            landmarks_dir = os.path.join(class_dir, 'LANDMARKS')
            for landmark_file in os.listdir(landmarks_dir):
                image_file = landmark_file.replace('_landmarks', '')
                image_path = os.path.join(class_dir, image_file)
                landmark_path = os.path.join(landmarks_dir, landmark_file)
                samples.append((image_path, landmark_path, self.class_to_idx[cls]))
        return samples

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        image_path, landmark_path, label = self.samples[idx]
        image = Image.open(image_path).convert('RGB')
        
        with open(landmark_path, 'r') as f:
            landmarks = json.load(f)
        
        landmarks = np.array(landmarks['landmarks']).flatten()

        if self.transform:
            image = self.transform(image)

        return image, torch.tensor(landmarks, dtype=torch.float32), label

In [17]:
class YOLOv5GestureModel(nn.Module):
    def __init__(self, num_classes):
        super(YOLOv5GestureModel, self).__init__()
        # Load YOLOv5 model
        self.yolo = torch.hub.load('ultralytics/yolov5', 'yolov5s', pretrained=True)
        
        # Get the feature extractor
        self.feature_extractor = self.yolo.model.model[:9]  # Up to the 9th layer (adjust if needed)
        
        # Freeze YOLOv5 parameters
        for param in self.feature_extractor.parameters():
            param.requires_grad = False
        
        # Add custom layers for gesture recognition
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc1 = nn.Linear(512 + 42, 256)  # 512 from YOLOv5 feature extractor + 42 landmarks
        self.fc2 = nn.Linear(256, num_classes)
        self.dropout = nn.Dropout(0.5)

    def forward(self, image, landmarks):
        x = self.feature_extractor(image)
        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        x = torch.cat((x, landmarks), dim=1)
        x = torch.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        return x

In [18]:
trainer = ModelTrainer()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Set up data transforms
transform = transforms.Compose([
    transforms.Resize((640, 640)),  # YOLOv5 default input size
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Create dataset
full_dataset = GestureDataset(root_dir='DATASET', transform=transform)

train_size = int(0.8 * len(full_dataset))
val_size = len(full_dataset) - train_size
train_dataset, val_dataset = random_split(full_dataset, [train_size, val_size])

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, num_workers=4)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False, num_workers=4)

num_classes = 37  # Set to 37 classes
model = YOLOv5GestureModel(num_classes).to(device)

# Set up loss function and optimizer
criterion = nn.CrossEntrothpyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

NUM_EPOCHS = 20
EARLY_STOP = 10


# Train the model
TRAINED_MODEL, RESULTS = trainer.train(model=model, 
                                       train_loader=train_loader, 
                                       test_loader=val_loader, 
                                       optimizer=optimizer,
                                       loss_fn=criterion,
                                       epochs=NUM_EPOCHS,
                                       scheduler=None,
                                       patience=EARLY_STOP)

visualizer = ModelPerformanceVisualizer(RESULTS)
y_true, y_pred = visualizer.get_preds(model=TRAINED_MODEL, dataloader=val_loader, device=device)
visualizer.plot_all(y_true=y_true, y_pred=y_pred, classes=num_classes, save_path="model_performance/plot.jpg")

# Save the model
# torch.save(model.state_dict(), 'yolov5_gesture_recognition_model.pth')

Using cache found in C:\Users\cloud/.cache\torch\hub\ultralytics_yolov5_master
YOLOv5  2024-8-20 Python-3.9.12 torch-2.3.1+cu121 CUDA:0 (NVIDIA GeForce GTX 1070, 8192MiB)

Fusing layers... 
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients, 16.4 GFLOPs
Adding AutoShape... 


TypeError: 'DetectionModel' object is not subscriptable