<a href="https://colab.research.google.com/github/DLNinja/HandKeypointsDetection/blob/main/ResNetV2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive; the dataset and checkpoint paths used below live under
# /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os
import torch
import numpy as np
from tqdm import tqdm
from PIL import Image
from torchvision import transforms
import cv2
import matplotlib.pyplot as plt
from google.colab.patches import cv2_imshow
from tqdm import tqdm
import torch.nn.functional as F

# Display Method

In [None]:
def show_sample(image, coords, original_size=(224, 224), color='bw'):
    """Draw hand keypoints and their skeleton on an image and display it.

    Args:
        image: Image to draw on: a ``torch.Tensor``, a ``numpy.ndarray`` or
            anything ``np.array`` can convert. No axis transposition is done
            here, so it must already be in the layout ``cv2`` and
            ``plt.imshow`` accept. Non-``uint8`` data is treated as values
            in [0, 1] and scaled to [0, 255].
        coords: 21 ``(x, y)`` pairs (tensor or array). They are multiplied by
            the image width/height, so they are expected to be normalised.
        original_size: ``(height, width)`` used to scale ``coords`` to pixel
            positions.
        color: Currently unused; kept so existing calls keep working.
    """
    # Transform to numpy array
    if isinstance(image, torch.Tensor):
        image = image.cpu().detach().numpy()
    if not isinstance(image, np.ndarray):
        image = np.array(image)

    # Draw on a copy so the caller's array is left untouched.
    canvas = image.copy()
    if canvas.dtype != np.uint8:
        # Clip before casting: values outside [0, 255] would otherwise wrap
        # around when converted to uint8.
        canvas = np.clip(canvas * 255, 0, 255).astype(np.uint8)

    # Scale coordinates to image size
    img_h, img_w = original_size
    if isinstance(coords, torch.Tensor):
        coords = coords.detach().cpu().numpy()
    pixel_points = [(int(x * img_w), int(y * img_h)) for x, y in coords]

    # Keypoint index chains; consecutive indices in a chain are joined by a line.
    skeleton = [
        [0, 1, 2, 3, 4],
        [0, 5, 6, 7, 8],
        [9, 10, 11, 12],
        [13, 14, 15, 16],
        [0, 17, 18, 19, 20],
        [5, 9, 13, 17],
    ]
    for chain in skeleton:
        chain_points = [pixel_points[idx] for idx in chain]
        for start, end in zip(chain_points[:-1], chain_points[1:]):
            cv2.line(canvas, start, end, color=(0, 0, 0), thickness=1)

    # Draw the keypoints on top of the skeleton lines
    for (x, y) in pixel_points:
        cv2.circle(canvas, (x, y), radius=2, color=(255, 255, 255), thickness=-1)

    # Display the image
    plt.figure(figsize=(6, 6))
    plt.imshow(canvas)
    plt.axis('off')
    plt.show()


# Dataset Loader

In [None]:
import os
import numpy as np
import torch
from torch.utils.data import Dataset
from torchvision import transforms

class HandKeypointsDataset(Dataset):
    """Dataset over one ``.npz`` chunk holding ``images`` and ``keypoints`` arrays.

    Each item is ``(image, keypoints)``: ``image`` is the stored image passed
    through ``transforms.ToTensor()`` and ``keypoints`` is a ``float32`` tensor
    of shape ``(21, 2)``.
    """

    def __init__(self, npz_file, img_size=(224, 224)):
        """Load both arrays of ``npz_file`` into memory.

        Args:
            npz_file: Path of the ``.npz`` archive to read.
            img_size: Stored on the instance; not used by this class itself.
        """
        self.npz_file = npz_file
        self.img_size = img_size

        self.data = np.load(self.npz_file)
        # Indexing the loaded archive reads the array from disk, so pull each
        # array out exactly once and reuse these references afterwards.
        self.images = self.data['images']
        self.keypoints = self.data['keypoints']

        self.transform = transforms.Compose([
            transforms.ToTensor(),
        ])

    def __len__(self):
        """Return the number of images in the chunk."""
        # Use the in-memory array; going through self.data["images"] would
        # re-read the whole image array from the archive on every call.
        return len(self.images)

    def __getitem__(self, idx):
        """Return ``(image_tensor, keypoints)`` for sample ``idx``."""
        image = self.images[idx]
        # Each sample stores 21 keypoints with 3 values each; only the first
        # two values of every keypoint are kept.
        keypoints = self.keypoints[idx].reshape(21, 3)
        keypoints = keypoints[:, :2]        # OutputKeypoints: (21, 2)

        image = self.transform(image)

        return image, torch.tensor(keypoints, dtype=torch.float32)


# Model Definition

In [None]:
import torch.nn as nn
import torchvision.models as models

class HeatmapPoseModel(nn.Module):
    """ResNet-18 backbone with a small head that regresses keypoint coordinates.

    Despite the class name, the head outputs ``(x, y)`` coordinates directly
    (squashed into [0, 1] by a sigmoid) rather than heatmaps.

    Args:
        num_keypoints: Number of keypoints predicted per image.
    """

    def __init__(self, num_keypoints=21):
        super().__init__()
        # Remembered so forward() can reshape to the configured keypoint count.
        self.num_keypoints = num_keypoints

        # NOTE(review): recent torchvision releases prefer the `weights=`
        # argument over `pretrained=True`; kept as-is so older versions still work.
        resnet = models.resnet18(pretrained=True)
        self.backbone = nn.Sequential(*list(resnet.children())[:-1])  # Remove classifier
        self.fc = nn.Sequential(
            nn.Flatten(),
            nn.Linear(512, 256),
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.2),
            nn.Linear(256, num_keypoints * 2),  # array of x and y for each coordinate
            nn.Sigmoid()  # keeps outputs in [0, 1] range
        )

    def forward(self, x):
        """Return predictions shaped ``(batch, num_keypoints, 2)``."""
        x = self.backbone(x)
        # Reshape with the configured count instead of a hard-coded 21, so a
        # model built with another num_keypoints works.
        return self.fc(x).view(-1, self.num_keypoints, 2)

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Build the model with its default number of keypoints and move it to `device`.
model = HeatmapPoseModel().to(device)

## Choosing Model Version

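Two different models were trained (run only one of the two cells below, because both assign `model_name`, `checkpoint` and `optimizer`):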
- V2.1: optimizer with LR = 0.0001 (or 1e-4)
- V2.2: optimizer with LR = 0.0003 (or 3e-4)

### ResNet Model V2.1

In [None]:
# V2.1 configuration: name used for the exported stats file, checkpoint file
# path, and an Adam optimizer with learning rate 1e-4.
model_name = 'resnet_v2_1'
checkpoint = '/content/drive/MyDrive/HandKeypoints/models/handposeV2_1_checkpoint.pth'
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

### ResNet Model V2.2

In [None]:
# V2.2 configuration: name used for the exported stats file, checkpoint file
# path, and an Adam optimizer with learning rate 3e-4.
model_name = 'resnet_v2_2'
checkpoint = '/content/drive/MyDrive/HandKeypoints/models/handposeV2_2_checkpoint.pth'
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)

### Load The Model or Start New Instance

In [None]:
# Resume from the checkpoint file when it exists, otherwise start a new run.
# `checkpoint` keeps holding the file path, because the training loop passes it
# to torch.save as the destination; the loaded contents go in `checkpoint_state`.
if os.path.exists(checkpoint):
    checkpoint_state = torch.load(checkpoint, map_location=device)
    model.load_state_dict(checkpoint_state['model_state_dict'])
    optimizer.load_state_dict(checkpoint_state['optimizer_state_dict'])
    start_epoch = checkpoint_state['epoch'] + 1
    previous_total_loss = checkpoint_state['total_loss']
    previous_avg_loss = checkpoint_state['avg_loss']
    history = checkpoint_state['history']
    # The last recorded accuracy is the bar a new epoch has to beat before the
    # checkpoint is overwritten; fall back to 0.0 if nothing was recorded.
    recorded_acc = history['test_accuracy']
    best_acc = recorded_acc[-1] if recorded_acc else 0.0
    print(f"Resuming training from epoch {start_epoch}")
else:  # start training new instance
    start_epoch = 1
    previous_acc = 0
    history = {
        'train_loss': [],
        'test_loss': [],
        'test_accuracy': [],
        'test_avg_time_per_sample': []
    }
    best_acc = 0.0

# Training The Model

In [None]:
# Test Model method - on each epoch, after training process is finished
def test_model(model, val_paths, device, batch_size=32, threshold=0.05):
    model.eval()
    total_loss = 0.0
    total_samples = 0
    correct_preds = 0
    total_time = 0.0


    for val_path in val_paths:
      dataset = HandKeypointsDataset(npz_file=val_path)
      dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=2, pin_memory=True)
      start_time = time.time()
      with torch.no_grad():
          for images, targets in dataloader:
              images = images.to(device)
              targets = targets.to(device)

              preds = model(images)

              loss = F.l1_loss(preds, targets, reduction='sum')
              total_loss += loss.item()

              dists = torch.norm(preds - targets, dim=2)
              within_thresh = (dists < threshold).float()

              correct_preds += within_thresh.sum().item()
              total_samples += dists.numel()
      elapsed_time = time.time() - start_time
      total_time += elapsed_time
    avg_time_per_sample = total_time / total_samples

    avg_loss = total_loss / total_samples
    accuracy = correct_preds / total_samples

    return avg_loss, accuracy, avg_time_per_sample

In [None]:
from torch.utils.data import DataLoader
import random
import time

chunk_dir = "/content/drive/MyDrive/HandKeypoints/dataset"
chunk_files = sorted([f for f in os.listdir(chunk_dir) if f.endswith(".npz") and f.startswith('train')])
val_files = sorted([f for f in os.listdir(chunk_dir) if f.endswith(".npz") and f.startswith('val')])
# The validation chunk list does not change between epochs, so build it once.
val_paths = [os.path.join(chunk_dir, val_file) for val_file in val_files]
batch_size = 32

# Fail before training instead of after the first full epoch.
if not chunk_files:
    raise FileNotFoundError(f"No train*.npz chunks found in {chunk_dir}")
if isinstance(checkpoint, dict):
    # torch.save needs a path (or file object) as destination.
    raise TypeError(
        "`checkpoint` must be the checkpoint file path, but it holds a loaded "
        "checkpoint dict; reassign the path before training so the best model can be saved."
    )

final_epoch = 31
for epoch in range(start_epoch, final_epoch):
    model.train()
    total_loss = 0
    total_keypoints = 0
    print(f"\nEpoch {epoch}/{final_epoch-1}")

    # randomize chunk order, then start
    random.shuffle(chunk_files)
    for step, chunk_file in enumerate(chunk_files, start=1):
        chunk_path = os.path.join(chunk_dir, chunk_file)

        # load one dataset chunk at a time
        dataset = HandKeypointsDataset(npz_file=chunk_path)
        # DataLoader - wraps iterable around Dataset to enable easy access to the samples
        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=2, pin_memory=True)
        # ProgressBar - show real-time statistics about training; the step total
        # follows the actual number of chunks
        progress_bar = tqdm(dataloader, desc=f"[{chunk_file}] Step {step}/{len(chunk_files)}", leave=False, ncols=100, dynamic_ncols=True)

        for images, targets in progress_bar:
            images = images.to(device)
            targets = targets.to(device)
            # predict -> apply loss -> update optimizer
            preds = model(images)

            loss = F.l1_loss(preds, targets, reduction='sum')

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            # keypoints in this batch = batch size * keypoints per sample
            total_keypoints += targets.size(0) * targets.size(1)
            progress_bar.set_postfix(loss=loss.item())

    test_loss, test_acc, test_avg_time = test_model(model, val_paths, device, batch_size)

    # Show the first image of the last training batch with the prediction made
    # for it during training (before that batch's optimizer step).
    img = images[0].clone().detach().cpu().numpy()
    img = np.transpose(img, (1, 2, 0))
    img = np.ascontiguousarray(img)
    img = (img * 255).clip(0, 255).astype(np.uint8)

    predicted = preds[0].detach().cpu()
    show_sample(img, predicted, original_size=(224, 224))

    # Print epoch statistics
    avg_loss = total_loss / total_keypoints
    print(f"Epoch {epoch}, Total Loss: {total_loss:.4f}, Avg Loss: {avg_loss:.4f}")
    print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_acc:.4f}, Avg Time per Keypoint: {test_avg_time:.6f}s")

    history['train_loss'].append(avg_loss)
    history['test_loss'].append(test_loss)
    history['test_accuracy'].append(test_acc)
    history['test_avg_time_per_sample'].append(test_avg_time)

    # if better model than before, save it
    if test_acc > best_acc:
        print("Saving model!")
        best_acc = test_acc
        torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'total_loss': total_loss,
                'avg_loss': avg_loss,
                'history': history
            }, checkpoint)

# Testing Section
The model is tested on a set of 10 hand-picked images to assess its precision on hand poses of varying complexity.

In [None]:
# Load the first validation chunk; the qualitative test below indexes into it.
test_path = "/content/drive/MyDrive/HandKeypoints/dataset/val_chunk0.npz"
dataset = HandKeypointsDataset(test_path)

In [None]:
# Qualitative check: predict and draw keypoints for a fixed list of samples.
img_list = [33, 71, 76, 94, 100, 239, 249, 290, 311, 331]

# Inference mode only needs to be switched on once for the whole loop.
model.eval()

for i in img_list:
    # The dataset is indexed at i*5, while the printed label shows i.
    image, target = dataset[i * 5]

    # Add a batch dimension of size 1 and move the sample to the model's device.
    image = image.unsqueeze(0).to(device)
    with torch.no_grad():
        predicted = model(image)
    predicted = predicted.squeeze(0)

    # Back to a channels-last uint8 array for drawing.
    img = np.transpose(image.squeeze(0).clone().detach().cpu().numpy(), (1, 2, 0))
    img = (np.ascontiguousarray(img) * 255).clip(0, 255).astype(np.uint8)

    print(f"Image {i}:")
    show_sample(img, predicted.cpu(), original_size=(224, 224))

# Saving Training Stats

In [None]:
import json

# `checkpoint` holds either an already loaded checkpoint dict or the path of
# the checkpoint file, depending on which cells were run; accept both instead
# of failing on `.get` when it is still a path.
if isinstance(checkpoint, dict):
    checkpoint_stats = checkpoint
else:
    checkpoint_stats = torch.load(checkpoint, map_location=device)

# Collect the training statistics stored in the checkpoint.
export_data = {
    'model_name': model_name,
    'history': checkpoint_stats.get('history', {}),
    'final_epoch': checkpoint_stats.get('epoch'),
    'total_loss': checkpoint_stats.get('total_loss'),
    'avg_loss': checkpoint_stats.get('avg_loss')
}

save_dir = '/content/drive/MyDrive/HandKeypoints'
save_path = os.path.join(save_dir, f'{model_name}_stats.json')

# Write the statistics as <model_name>_stats.json into save_dir.
with open(save_path, 'w') as f:
    json.dump(export_data, f)