# Ergonomic pose fine-tuning

## MMPose Hyperparameter Optimization

In [1]:
from ultralytics import YOLO
import numpy as np
from PIL import Image
import torch
from functools import partial
from torch.utils.data import random_split, DataLoader, Subset, TensorDataset
from torchvision import datasets, transforms
import cv2
import time
import torch.nn as nn
import torch

# Number of target classes for the pose classification task.
num_classes = 3
# Keypoints per detected pose -- presumably the 17 COCO body keypoints; TODO confirm.
num_keypoints = 17
# Root folder of the image dataset; it is loaded with ImageFolder further below.
data_path = 'E:/Users/Vipin/Documents/BHT/3. Semester/Learning from images/Pose Dataset_binned'

In [2]:
def get_device():
    """Pick the torch device to run on.

    CUDA is preferred, then Apple's MPS backend, and the CPU is the fallback.

    Returns:
        torch.device: The selected device.
    """
    if torch.cuda.is_available():
        backend = "cuda"
    elif torch.backends.mps.is_available():
        backend = "mps"
    else:
        backend = "cpu"
    return torch.device(backend)
    
get_device()

device(type='cuda')

In [3]:
class PILToNumpyTransform:
    """Callable transform that turns a PIL image into an OpenCV-style NumPy array."""

    def __call__(self, pil_img):
        """
        Convert a PIL Image to an OpenCV Image / Numpy Array.

        ``np.array`` on a PIL image keeps PIL's channel order (RGB for an RGB
        image); the first and third channel are then swapped, so an RGB input
        comes out in BGR order, which is OpenCV's usual convention.

        Parameters:
            pil_img (PIL.Image): The PIL Image to be converted. Assumed to be a
                3-channel RGB image -- TODO confirm behaviour for other modes.

        Returns:
            np.ndarray: The converted image with the R and B channels swapped
                (BGR order for an RGB input).
        """
        # PIL image -> NumPy array, still in PIL's channel order
        rgb_array = np.array(pil_img)
        # Swap R and B. The previous COLOR_BGR2RGB constant performed the very
        # same swap; RGB2BGR simply names the direction that actually happens.
        return cv2.cvtColor(rgb_array, cv2.COLOR_RGB2BGR)

# Preprocessing applied to every image when it is loaded from disk.
# Note: PILToNumpyTransform defined above is not part of this pipeline.
data_transforms = transforms.Compose([
    transforms.Resize((480, 640)), # bring all images to one fixed size
    transforms.ToTensor(), # convert the PIL images to PyTorch tensors
])

# Load the dataset from data_path with the transforms above, then show its summary.
own_dataset = datasets.ImageFolder(root=data_path, transform=data_transforms)
display(own_dataset)

Dataset ImageFolder
    Number of datapoints: 118
    Root location: E:/Users/Vipin/Documents/BHT/3. Semester/Learning from images/Pose Dataset_binned
    StandardTransform
Transform: Compose(
               Resize(size=(480, 640), interpolation=bilinear, max_size=None, antialias=True)
               ToTensor()
           )

In [4]:
# Fixed seed so the split below is identical on every run.
generator1 = torch.Generator().manual_seed(13)  # set seed for reproducibility of the split
# Hold out 20% as the test set; the remaining 80% is used for training and
# validation (it is the dataset handed to the hyperparameter search below).
train_and_val_dataset, test_dataset = random_split(own_dataset, [0.8, 0.2], generator=generator1)  # 80% training and evaluation, 20% testing

### Define the models

In [5]:
class FinetunedYOLO(nn.Module):
    """MLP classification head on top of a YOLO network used as a fixed feature extractor.

    The wrapped network is run without gradient tracking, every tensor it
    returns is flattened and concatenated into one feature vector, and a small
    fully connected classifier maps that vector to class logits.

    Because the complete output -- batch dimension included -- is flattened into
    a single vector, the classifier input only matches ``num_yolo_features``
    when a batch holds exactly one image, and ``forward`` returns a 1-D tensor
    of ``num_classes`` logits.

    Parameters:
        yolo_model: Object exposing the underlying ``nn.Module`` as ``.model``.
        h1 (int): Width of the first hidden layer.
        h2 (int): Width of the second hidden layer (the third has ``h2 // 2`` units).
        num_yolo_features (int): Length of the flattened network output. The
            default of 730800 is presumably the value for yolov8n-pose on
            480x640 inputs -- TODO confirm when changing model or image size.
        num_classes (int): Number of output classes.
    """

    def __init__(self, yolo_model, h1, h2, num_yolo_features=730800, num_classes=3):
        super().__init__()

        # ``modules()`` yields the module itself first, so the former
        # ``list(yolo_model.model.modules())[0:1]`` selected the complete
        # network rather than only its first layer. Wrap it directly.
        # NOTE(review): as a registered sub-module it follows this module's
        # train()/eval() switches; if its output structure depends on the mode,
        # the flattened size will no longer match num_yolo_features -- verify.
        self.first_yolo_block = nn.Sequential(yolo_model.model)

        # Define the classifier
        self.classifier = nn.Sequential(
            nn.Linear(num_yolo_features, h1),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(h1, h2),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(h2, h2 // 2),
            nn.ReLU(),
            nn.Linear(h2 // 2, num_classes)
        )

    def recursive_flatten(self, tensor_struct):
        """Recursively collect the tensors of a nested list/tuple structure.

        Returns:
            list[torch.Tensor]: Every tensor found, each flattened to 1-D, in
                traversal order. Anything that is neither a tensor nor a
                list/tuple is ignored.
        """
        if isinstance(tensor_struct, torch.Tensor):
            return [torch.flatten(tensor_struct)]
        if isinstance(tensor_struct, (list, tuple)):
            collected = []
            for item in tensor_struct:
                collected.extend(self.recursive_flatten(item))
            return collected
        # Non-tensor, non-list/tuple items are ignored
        return []

    def flatten_tensors(self, tensor_struct):
        """Concatenate all tensors of a nested structure into one 1-D tensor."""
        # The pieces are already 1-D, so concatenating them yields a 1-D tensor.
        return torch.cat(self.recursive_flatten(tensor_struct))

    def forward(self, x):
        """Return the class logits (1-D, length ``num_classes``) for the input ``x``."""
        # The wrapped network is only a feature extractor: no gradients flow into it.
        with torch.no_grad():
            features = self.first_yolo_block(x)

        flattened = self.flatten_tensors(features)
        return self.classifier(flattened)

processing keypoints:

#### training the model:

- with cross-entropy loss function (fits our classification task)
- Adam optimizer

HPO:

In [6]:
import optuna
from sklearn.metrics import f1_score, confusion_matrix
import numpy as np

In [7]:
def _as_batched_logits(logits):
    """Return ``logits`` with a leading batch dimension, adding one if it is missing."""
    return logits.unsqueeze(0) if logits.dim() == 1 else logits


def train_and_eval_model(kp_model, optimizer, loss_fn, num_epochs, train_loader, val_loader, device):
    """
    Train a model and validate it after every epoch.

    Parameters:
    - kp_model: Model to train; it may return logits with or without a batch
      dimension (a 1-D output is treated as a batch of one).
    - optimizer: Optimizer that updates the trainable parameters of kp_model.
    - loss_fn: Loss called as loss_fn(logits, labels).
    - num_epochs: Number of training epochs (at least 1).
    - train_loader: Iterable of batches whose first two items are inputs and labels.
    - val_loader: Sized iterable of (inputs, labels) validation batches.
    - device: Device the model and the batches are moved to.

    Returns:
    - The average validation loss of the last epoch. This is the value the
      hyperparameter search minimises.

    Raises:
    - ValueError: If num_epochs is smaller than 1 or val_loader is empty.
    """
    if num_epochs < 1:
        raise ValueError("num_epochs must be at least 1")
    if len(val_loader) == 0:
        raise ValueError("val_loader must contain at least one batch")

    kp_model = kp_model.to(device)  # move model to device
    avg_loss = None

    for epoch in range(num_epochs):
        # Validation at the end of each epoch switches to eval mode, so training
        # mode (dropout etc.) has to be re-enabled at the start of every epoch.
        kp_model.train()

        loss = None
        for i, batch in enumerate(train_loader):
            # access images and labels
            inputs = batch[0].to(device)
            labels = batch[1].to(device)
            optimizer.zero_grad()  # Zero the parameter gradients
            logits = _as_batched_logits(kp_model(inputs))

            loss = loss_fn(logits, labels)

            loss.backward()  # Backpropagate the loss
            optimizer.step()  # Update weights
            print(f"Processed batch {i} of epoch {epoch}")

        # Reports the loss of the last training batch of the epoch.
        if loss is not None:
            print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

        kp_model.eval()  # model to evaluation mode
        total_loss = 0.0

        with torch.no_grad():  # no gradients needed for validation
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device)  # move data to device
                # Same shape handling as in training, so both losses are comparable.
                logits = _as_batched_logits(kp_model(inputs))
                total_loss += loss_fn(logits, labels).item()  # accumulate the loss

        avg_loss = total_loss / len(val_loader)  # average loss per validation batch
        print(f"Validation Loss: {avg_loss:.4f}")

    # The validation loss is the metric to optimise; accuracy or F1 could be
    # computed from the validation logits if another objective is wanted.
    return avg_loss

In [8]:
# this function was generated by gpt-4

def get_k_fold_indices(n, k=5, random_seed=None):
    """
    Build the index splits for a k-fold cross-validation.

    The indices 0..n-1 are shuffled once and cut into k consecutive validation
    folds; the first ``n % k`` folds receive one extra sample. For every fold
    the remaining indices form the training part.

    Parameters:
    - n: Total number of samples in the dataset.
    - k: Number of folds.
    - random_seed: Seed for the shuffle; when None the generator is used as
      constructed, without seeding.
      NOTE(review): confirm whether an unseeded torch.Generator gives a
      different permutation per call.

    Returns:
    - A list with k tuples (train_indices, val_indices).
    """
    rng = torch.Generator()
    if random_seed is not None:
        rng.manual_seed(random_seed)

    # One shuffled ordering shared by all folds
    shuffled = torch.randperm(n, generator=rng).tolist()

    # Every fold has `base` samples; the first `extra` folds get one more
    base, extra = divmod(n, k)

    splits = []
    offset = 0
    for fold_no in range(k):
        stop = offset + base + (1 if fold_no < extra else 0)
        held_out = shuffled[offset:stop]
        remaining = shuffled[:offset] + shuffled[stop:]
        splits.append((remaining, held_out))
        offset = stop

    return splits


In [9]:
def objective(trial, yolo_model, dataset):
    """
    Optuna objective: mean validation loss of a 5-fold cross-validation.

    For the hyperparameters suggested by the trial, a fresh FinetunedYOLO is
    trained and validated on each fold; only the classifier parameters are
    handed to the optimizer.

    Parameters:
    - trial: Optuna trial that supplies the hyperparameters.
    - yolo_model: YOLO model passed on to FinetunedYOLO.
    - dataset: Dataset that is split into the training/validation folds.

    Returns:
    - The mean of the per-fold validation scores.
    """
    # Hyperparameters of this trial
    lr = trial.suggest_categorical("lr", [1e-3])
    h1 = trial.suggest_categorical("h1", [256, 512, 1024])
    h2 = trial.suggest_categorical("h2", [256, 512, 1024])
    batch_size = trial.suggest_categorical("batch_size", [1])
    num_epochs = trial.suggest_categorical("num_epochs", [100])

    k = 5
    splits = get_k_fold_indices(len(dataset), k, random_seed=13)  # same folds for every trial
    device = get_device()

    fold_scores = []
    for fold, (train_idx, val_idx) in enumerate(splits, start=1):
        print(f"Trial {trial.number}, Fold {fold}/{k}")

        # Loaders over this fold's training and validation part
        train_loader = DataLoader(Subset(dataset, train_idx), batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(Subset(dataset, val_idx), batch_size=batch_size, shuffle=False)

        # Fresh model per fold; only the classifier head is optimised
        model = FinetunedYOLO(yolo_model, h1, h2).to(device)
        for head_param in model.classifier.parameters():
            head_param.requires_grad = True
        optimizer = torch.optim.Adam(model.classifier.parameters(), lr=lr, weight_decay=1e-5)

        # Cross-entropy loss, since this is a classification task
        fold_scores.append(
            train_and_eval_model(
                model, optimizer, nn.CrossEntropyLoss(), num_epochs, train_loader, val_loader, device
            )
        )

    return np.mean(fold_scores)


In [10]:
# Release cached GPU memory before starting the search.
torch.cuda.empty_cache()

# Pose model that every trial wraps in a FinetunedYOLO.
yolo_model = YOLO("yolov8n-pose.pt")
# Bind the fixed arguments so Optuna can call the objective with the trial only.
optimize = partial(objective, yolo_model = yolo_model, dataset = train_and_val_dataset)
# Grid to search; the keys and values mirror the suggest_categorical calls in objective().
search_space = {
    "lr": [1e-3],
    "h1": [256, 512, 1024],
    "h2": [256, 512, 1024],
    "batch_size": [1],
    "num_epochs": [100]
}

# The objective returns a validation loss, hence direction='minimize'.
study = optuna.create_study(sampler=optuna.samplers.GridSampler(search_space), direction='minimize')
# 9 trials = 3 values for h1 x 3 values for h2; all other entries have a single value.
study.optimize(optimize, n_trials=9)

# Best hyperparameters
print("Best hyperparameters:", study.best_params)

[I 2024-03-23 21:09:15,476] A new study created in memory with name: no-name-81cc2d79-e7d0-4fee-b1a8-2923a24371a3


Trial 0, Fold 1/5
Processed batch 0 of epoch 0
Processed batch 1 of epoch 0
Processed batch 2 of epoch 0
Processed batch 3 of epoch 0
Processed batch 4 of epoch 0
Processed batch 5 of epoch 0
Processed batch 6 of epoch 0
Processed batch 7 of epoch 0
Processed batch 8 of epoch 0
Processed batch 9 of epoch 0
Processed batch 10 of epoch 0
Processed batch 11 of epoch 0
Processed batch 12 of epoch 0
Processed batch 13 of epoch 0
Processed batch 14 of epoch 0
Processed batch 15 of epoch 0
Processed batch 16 of epoch 0
Processed batch 17 of epoch 0


[W 2024-03-23 21:17:42,148] Trial 0 failed with parameters: {'lr': 0.001, 'h1': 512, 'h2': 256, 'batch_size': 1, 'num_epochs': 100} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "e:\Users\Vipin\Documents\BHT\3. Semester\Learning from images\ergonomic_pose_detect\.venv\lib\site-packages\optuna\study\_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\Vipin\AppData\Local\Temp\ipykernel_10352\1964666508.py", line 32, in objective
    validation_score = train_and_eval_model(model, optimizer, nn.CrossEntropyLoss(), num_epochs, train_loader, val_loader, device)
  File "C:\Users\Vipin\AppData\Local\Temp\ipykernel_10352\2415607574.py", line 9, in train_and_eval_model
    inputs = batch[0].to(device)
KeyboardInterrupt
[W 2024-03-23 21:17:42,150] Trial 0 failed with value None.


KeyboardInterrupt: 

In [None]:
# Best hyperparameter combination found by the study; the bare expression
# on the last line displays it as the cell output.
best_params = study.best_params
best_params

In [None]:
best_params = study.best_params
# Unpack the winning hyperparameters into the module-level names used by the final training run
batch_size, lr, h1, h2, num_epochs = (
    best_params[name] for name in ("batch_size", "lr", "h1", "h2", "num_epochs")
)

#### train and evaluate final model on test set:

Optuna only reports the best hyperparameters, not the trained model itself, so we retrain on the combined training and validation set using the best parameters found.

dataloader for final evaluation:

In [None]:
# Build the loaders for the final run: the combined train+validation data for
# training and the held-out test split for evaluation.
# NOTE(review): neither `preprocess_dataset` nor `mmpose_model` is defined in the
# visible part of this notebook -- confirm where they come from before running.
train_and_eval_loader = preprocess_dataset(mmpose_model, train_and_val_dataset, get_device())
test_loader = preprocess_dataset(mmpose_model, test_dataset, get_device())

final evaluation on test set:

In [None]:
# Final run: train a fresh model with the best hyperparameters on the combined
# train+validation data, then report the loss on the held-out test set.
# NOTE(review): `KeypointScorer` is not defined in the visible part of this
# notebook -- confirm it is available before running this cell.
device = get_device()  # resolve the device once instead of on every batch

# new instance of the model:
kp_model = KeypointScorer(num_keypoints, h1, h2)
# kp_model = KeypointClassifier(num_keypoints, num_classes, h1, h2)
kp_model = kp_model.to(device)  # move model to device
kp_model.train()  # set model to training mode

# loss function and optimizer (MSE for scoring; nn.CrossEntropyLoss() would be the classification variant)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(kp_model.parameters(), lr=lr, weight_decay=1e-5)

for epoch in range(num_epochs):
    for inputs, labels in train_and_eval_loader:
        inputs, labels = inputs.to(device), labels.to(device)  # move data to device
        optimizer.zero_grad()  # Zero the parameter gradients
        classification_output = kp_model(inputs)  # model output for this batch

        # Scoring loss: MSE needs floating point predictions and targets
        loss = criterion(classification_output.float(), labels.float())

        loss.backward()  # Backpropagate the loss
        optimizer.step()  # Update weights

    # loss of the last batch of the epoch
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')


kp_model.eval()  # model to evaluation mode

total_loss = 0
all_predictions = []
all_labels = []
with torch.no_grad():  # no need to compute gradients, because we are in evaluation mode
    for inputs, labels in test_loader:  # iterate over the test dataset
        inputs, labels = inputs.to(device), labels.to(device)  # move data to device
        classification_output = kp_model(inputs)  # model output for this batch

        # Same loss computation as during training, so the test loss is
        # comparable with the training loss (the criterion is MSELoss).
        loss = criterion(classification_output.float(), labels.float())

        total_loss += loss.item()  # accumulate the loss

        # torch.max over dim 1 needs a batch dimension
        if classification_output.dim() == 1:
            classification_output = classification_output.unsqueeze(0)

        _, predicted = torch.max(classification_output, 1)
        # collect the predictions and labels
        all_predictions.extend(predicted.cpu().numpy())

        if labels.dim() == 0:
            labels = labels.unsqueeze(0)  # Add a dimension to make it iterable
        all_labels.extend(labels.cpu().numpy())
    # calculate the test metrics:
    avg_loss = total_loss / len(test_loader)  # get the average loss
    # accuracy = (np.array(all_predictions) == np.array(all_labels)).mean()
    # f1 = f1_score(all_labels, all_predictions, average='weighted')
    # conf_matrix = confusion_matrix(all_labels, all_predictions)

    print(f"Test Loss: {avg_loss:.4f}")
    # print(f"Test Accuracy: {accuracy:.4f}")
    # print(f"Test F1 Score: {f1:.4f}")
    # print("Confusion Matrix:")
    # print(conf_matrix)