# Ergonomic pose fine-tuning: 

## fine tuning yolo for our classification task

- option 1: freeze the complete model, use it as a feature extractor and add a classification head to be trained on our dataset
- option 2: cut the keypoint output layer, freeze the rest of the model and add a classification head to be trained on our dataset

### 2) YOLOv8pose model with classification head

imports:

In [2]:
from ultralytics import YOLO
import cv2
import numpy as np
from PIL import Image
import torch
from functools import partial

In [270]:
def get_device():
    if torch.cuda.is_available():
        return torch.device("cuda")
    elif torch.backends.mps.is_available():
        return torch.device("mps")
    else:
        return torch.device("cpu")
    
get_device()

device(type='mps')

## Fine-Tuning steps:

1) load the model
2) inspect the model layers
3) define a classification head
4) cut off the last layer of the original model
5) add the classification head as last layer
6) train the new classification head
7) evaluate the new fine-tuned model

### 1. Load the model

In [221]:
# Load the YOLO model
yolo_model = YOLO('yolov8n-pose.pt')

### 2. Inspect the model layers

- find out where to cut the model
- check the output feature size of the layer before the layer we want to cut

In [None]:
 # check the model inspection notebook for more details
print(yolo_model)  

### 3. defining a classification head:

In [265]:
import torch
import torch.nn as nn

num_classes = 3  
num_features = 4*6300
h1 = 512
h2 = 256


class ClassificationHead(nn.Module):
    def __init__(self, input_features, num_classes, h1, h2):
        super(ClassificationHead, self).__init__()
        self.fc1 = nn.Sequential(
            nn.Flatten(),
            nn.Linear(input_features, h1),
            nn.ReLU(),
            nn.Dropout(0.5)
        )
        self.fc2 = nn.Sequential(
            nn.Linear(h1, h2),
            nn.ReLU(),
            nn.Dropout(0.5)
        )
        self.fc3 = nn.Sequential(
            nn.Linear(h2, h2 // 2),
            nn.ReLU()
        )
        self.out = nn.Linear(h2 // 2, num_classes)
        
    
    def forward(self, flattened_input):
        x = self.fc1(flattened_input)
        x = self.fc2(x)
        x = self.fc3(x)
        output = self.out(x)
        
        return output



### 4. get rid of the last 3 layers of the YOLO pose head:

In [213]:
import torch
import torch.nn as nn
from torchinfo import summary

yolo_model=YOLO("yolov8n-pose.pt")
yolo_backbone= nn.Sequential(*list(yolo_model.model.children())[0][:22]) # keeps layer 0 to 21, without the pose head (layer 22)
pose_head= nn.Sequential(*list(yolo_model.model.children())[0][22:23])
pose_stem=nn.Sequential(pose_head[0].cv2, pose_head[0].cv3, pose_head[0].dfl)
yolo_pose = nn.Sequential(*yolo_backbone, *pose_stem)

In [258]:
summary(yolo_model)

Layer (type:depth-idx)                                  Param #
YOLO                                                    --
├─PoseModel: 1-1                                        --
│    └─Sequential: 2-1                                  --
│    │    └─Conv: 3-1                                   (464)
│    │    └─Conv: 3-2                                   (4,672)
│    │    └─C2f: 3-3                                    (7,360)
│    │    └─Conv: 3-4                                   (18,560)
│    │    └─C2f: 3-5                                    (49,664)
│    │    └─Conv: 3-6                                   (73,984)
│    │    └─C2f: 3-7                                    (197,632)
│    │    └─Conv: 3-8                                   (295,424)
│    │    └─C2f: 3-9                                    (460,288)
│    │    └─SPPF: 3-10                                  (164,608)
│    │    └─Upsample: 3-11                              --
│    │    └─Concat: 3-12                           

In [256]:

summary(yolo_pose)
#summary(pose_head)

Layer (type:depth-idx)                             Param #
Sequential                                         --
├─Sequential: 1-1                                  --
│    └─Conv: 2-1                                   --
│    │    └─Conv2d: 3-1                            (432)
│    │    └─BatchNorm2d: 3-2                       (32)
│    │    └─SiLU: 3-3                              --
│    └─Conv: 2-2                                   --
│    │    └─Conv2d: 3-4                            (4,608)
│    │    └─BatchNorm2d: 3-5                       (64)
│    │    └─SiLU: 3-6                              --
│    └─C2f: 2-3                                    --
│    │    └─Conv: 3-7                              (1,088)
│    │    └─Conv: 3-8                              (1,600)
│    │    └─ModuleList: 3-9                        (4,672)
│    └─Conv: 2-4                                   --
│    │    └─Conv2d: 3-10                           (18,432)
│    │    └─BatchNorm2d: 3-11               

#### freeze the layers of the YOLO model we use as backbone:

In [217]:
for param in yolo_pose.parameters():
   param.requires_grad = False # freeze the cut YOLO model

In [219]:
#for name, param in yolo_pose.named_parameters():
#   print(f"{name} is {'not ' if param.requires_grad else ''}frozen")

### 5. new full model (wrapper)

- take the original model with the last layer(s) cut
- add the new classification head

In [266]:
# add a classification layer instead
num_classes = 3  
num_features = 4*6300
h1 = 512
h2 = 256

class YOLOPoseClassify(nn.Module):
    def __init__(self, yolo_pose, classification_head=ClassificationHead(num_features, num_classes, h1, h2)):
        super(YOLOPoseClassify, self).__init__()
        self.yolo = yolo_pose
        self.classification_head = classification_head

    def forward(self, x):
            # forward pass through the modified YOLO model
            features = self.yolo(x)
            
            # forward pass through the classification head
            output = self.classification_head(features)
        
            return output


### 6. Train the new Frankenstein Model

#### load the own dataset:

In [222]:
from torch.utils.data import random_split, DataLoader, Subset, TensorDataset
from torchvision import datasets, transforms

In [223]:
# Normalization should be applied to the keypoints not the images I think - Vipin
  
data_transforms = transforms.Compose([
    transforms.Resize((640, 480)), # Resize images to 640x640
    transforms.ToTensor() # Convert to tensor
])

In [227]:
data_path = 'example_images'
own_dataset = datasets.ImageFolder(root=data_path, transform=data_transforms)

In [228]:
own_dataset

Dataset ImageFolder
    Number of datapoints: 8
    Root location: example_images
    StandardTransform
Transform: Compose(
               Resize(size=(640, 480), interpolation=bilinear, max_size=None, antialias=warn)
               ToTensor()
           )

#### splitting the data into train, validation and test set:

In [229]:
generator1 = torch.Generator().manual_seed(13)  # set seed for reproducibility of the split
train_and_val_dataset, test_dataset = random_split(own_dataset, [0.8, 0.2], generator=generator1)  # 80% training and evaluation, 20% testing

#### training new classification head of the yolo pose model :

- with cross-entropy loss function (fits our classification task)
- Adam optimizer

#### HPO with Optuna:

In [230]:
import optuna
from sklearn.metrics import f1_score, confusion_matrix
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


preprocess the dataset:

In [277]:
def preprocess_dataset(data, batch_size, num_workers):
    loader = DataLoader(data, batch_size=batch_size, num_workers=num_workers, shuffle=True)

    return loader

train and evaluate the model on the own dataset:

In [278]:
def train_and_eval_model(yolo_fine_model, optimizer, loss_function, num_epochs, train_loader, val_loader, device):
    
    yolo_fine_model = yolo_fine_model.to(device)  # move YOLO model to device
    yolo_fine_model.train()  # set model to training mode
    
    print("type of train_loader", type(train_loader))

    for epoch in range(num_epochs):
        for inputs, labels in train_loader:
            # access images and labels
            inputs, labels = inputs.to(device), labels.to(device)  # move data to device
            
            optimizer.zero_grad()  # Zero the parameter gradients
            classification_output = yolo_fine_model(inputs) # get results for the classification
            
            # Use this for classification
            loss = loss_function(classification_output, labels)
            
            # Use this for scoring
            # loss = loss_fn(classification_output.float(), labels.float())
            
            loss.backward()  # Backpropagate the loss
            optimizer.step()  # Update weights

            # todo: further processing, such as calculating accuracy or loss, goes here

        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')
            
        
        yolo_fine_model.eval()  # model to evaluation mode

        total_loss = 0
        all_predictions = []
        all_labels = []

        with torch.no_grad():  # no need to compute gradients, because we are in evaluation mode
            for inputs, labels in val_loader:  # iterate over validation dataset (lables are loaded automatically in dataloader from the Image Folder)
                inputs, labels = inputs.to(device), labels.to(device)  # move data to device
                
                classification_output = YOLOPoseClassify(inputs) # get results for the classification 
                
                # calculate the loss between the labels and the predictions
                loss = loss_function(classification_output, labels) 
                
                total_loss += loss.item()  # accumulate the loss
                # get predictions for output
                _, predicted = torch.max(classification_output.data, 1)
                # collect the predictions and labels
                all_predictions.extend(predicted.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())
            # calculate the validation metrics:
            avg_loss = total_loss / len(val_loader)  # get the average loss
            accuracy = (np.array(all_predictions) == np.array(all_labels)).mean()
            f1 = f1_score(all_labels, all_predictions, average='weighted')  
            conf_matrix = confusion_matrix(all_labels, all_predictions)
            
            print(f"Validation Loss: {avg_loss:.4f}")
            print(f"Validation Accuracy: {accuracy:.4f}")
            print(f"Validation F1 Score: {f1:.4f}")
            print("Confusion Matrix:")
            print(conf_matrix)
            
            # here: F1 score chosen as the metric to optimize
            # other options: - combining metrics like accuracy and F1 score to maximize on both
            #                        - or multi-objective optimization on F1 score and accuracy
    return f1 

In [279]:
# this function was generated by gpt-4

def get_k_fold_indices(n, k=5, random_seed=None):
    """
    Generate indices for k-fold cross-validation.

    Parameters:
    - n: Total number of samples in the dataset.
    - k: Number of folds.
    - random_seed: Optional seed for reproducibility.

    Returns:
    - A list of tuples, each containing (train_indices, val_indices) for a fold.
    """
    # Initialize the random generator
    g = torch.Generator()
    if random_seed is not None:
        g.manual_seed(random_seed)
    
    # Generate a random permutation of indices
    indices = torch.randperm(n, generator=g).tolist()
    
    # Calculate fold sizes
    fold_sizes = [n // k for _ in range(k)]
    for i in range(n % k):
        fold_sizes[i] += 1
    
    # Generate train and validation indices for each fold
    current = 0
    folds_indices = []
    for fold_size in fold_sizes:
        start, end = current, current + fold_size
        val_indices = indices[start:end]
        train_indices = indices[:start] + indices[end:]
        folds_indices.append((train_indices, val_indices))
        current = end
    
    return folds_indices


In [280]:
def objective(trial, dataset):
    # Define hyperparameters to optimize
    lr = trial.suggest_categorical("lr", [1e-4, 5e-5])
    h1 = trial.suggest_categorical("h1", [256, 512])
    h2 = trial.suggest_categorical("h2", [256, 512])
    #batch_size = trial.suggest_categorical("batch_size", [4, 8])
    num_epochs = trial.suggest_categorical("num_epochs", [200, 300])

    validation_scores = []

    n = len(dataset)
    k = 5
    folds_indices = get_k_fold_indices(n, k, random_seed=13)
    device = get_device()
    
    for fold, (train_idx, val_idx) in enumerate(folds_indices, start=1):
        train_subset = Subset(dataset, train_idx)
        val_subset = Subset(dataset, val_idx)
        print("type of train subset: " , type(train_subset))
        
        # Batch size goes here
        train_loader = preprocess_dataset(train_subset, batch_size=1, num_workers=4)
        val_loader = preprocess_dataset(val_subset, batch_size=1, num_workers=4)
        print("type of train loader: " , type(train_loader))
        
        # Initializing the model and optimizer with the chosen hyperparameters
        yolo_fine_model = YOLOPoseClassify(yolo_pose, classification_head=ClassificationHead(num_features, num_classes, h1, h2)).to(device)
        optimizer = torch.optim.Adam(yolo_fine_model.parameters(), lr=lr, weight_decay=1e-5)
    
        # training and evaluating the model - Watch out for the Loss - CrossEntropyLoss for classification 
        validation_score = train_and_eval_model(yolo_fine_model, optimizer, nn.CrossEntropyLoss(), num_epochs, train_loader, val_loader, device)
        validation_scores.append(validation_score)

    return np.mean(validation_scores)


In [268]:
yolo_fine_model = YOLOPoseClassify(yolo_pose, classification_head=ClassificationHead(num_features, num_classes, h1, h2))
# print the model
summary(yolo_fine_model)

Layer (type:depth-idx)                                  Param #
YOLOPoseClassify                                        --
├─Sequential: 1-1                                       --
│    └─Sequential: 2-1                                  --
│    │    └─Conv: 3-1                                   (464)
│    │    └─Conv: 3-2                                   (4,672)
│    │    └─C2f: 3-3                                    (7,360)
│    │    └─Conv: 3-4                                   (18,560)
│    │    └─C2f: 3-5                                    (49,664)
│    │    └─Conv: 3-6                                   (73,984)
│    │    └─C2f: 3-7                                    (197,632)
│    │    └─Conv: 3-8                                   (295,424)
│    │    └─C2f: 3-9                                    (460,288)
│    │    └─SPPF: 3-10                                  (164,608)
│    │    └─Upsample: 3-11                              --
│    │    └─Concat: 3-12                           

In [281]:
optimize = partial(objective, dataset = train_and_val_dataset)
search_space = {
    "lr": [1e-4, 5e-5],
    "h1": [256, 512],
    "h2": [256, 512],
    "batch_size": [4, 8],
    "num_epochs": [200, 300]
}

study = optuna.create_study(sampler=optuna.samplers.GridSampler(search_space), direction='maximize')
study.optimize(optimize, n_trials=32)

# Best hyperparameters
print("Best hyperparameters:", study.best_params)

[I 2024-03-23 01:26:28,491] A new study created in memory with name: no-name-47e7256b-6335-4009-8547-3970dfe4d4d3


type of train subset:  <class 'torch.utils.data.dataset.Subset'>
type of train loader:  <class 'torch.utils.data.dataloader.DataLoader'>
type of train_loader <class 'torch.utils.data.dataloader.DataLoader'>


[W 2024-03-23 01:26:57,357] Trial 0 failed with parameters: {'lr': 5e-05, 'h1': 512, 'h2': 256, 'num_epochs': 300} because of the following error: TypeError('cat() received an invalid combination of arguments - got (Tensor, int), but expected one of:\n * (tuple of Tensors tensors, int dim, *, Tensor out)\n * (tuple of Tensors tensors, name dim, *, Tensor out)\n').
Traceback (most recent call last):
  File "/Users/ari/Documents/Data_Science/3_semester/learning_from_images/egronomic_pose_project/ergonomic_pose_detect/ergonomic_pose.venv/lib/python3.11/site-packages/optuna/study/_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/var/folders/9c/8w6dd7612jv4k22v8dfgbq040000gn/T/ipykernel_90385/2049859940.py", line 31, in objective
    validation_score = train_and_eval_model(yolo_fine_model, optimizer, nn.CrossEntropyLoss(), num_epochs, train_loader, val_loader, device)
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

TypeError: cat() received an invalid combination of arguments - got (Tensor, int), but expected one of:
 * (tuple of Tensors tensors, int dim, *, Tensor out)
 * (tuple of Tensors tensors, name dim, *, Tensor out)


In [19]:
best_params = study.best_params
best_params

{'lr': 0.0001, 'h1': 512, 'h2': 1024, 'batch_size': 4, 'num_epochs': 300}

In [20]:
best_params = study.best_params
# extract best_parameters
batch_size = best_params['batch_size']
lr = best_params['lr']
h1 = best_params['h1']
h2 = best_params['h2']
num_epochs = best_params['num_epochs']

#### train and evaluate final model on test set:

Unfortunately Optuna cannot output the best model, so we train again on the combined train and validation set with the best parameters found.

dataloader for final evaluation:

In [21]:
train_and_eval_loader = DataLoader(train_and_val_dataset, batch_size=1, shuffle=True, num_workers=4, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False, num_workers=4, pin_memory=True)

train_and_eval_loader = preprocess_dataset(yolo_model, train_and_eval_loader, batch_size, get_device())
test_loader = preprocess_dataset(yolo_model, test_loader, batch_size, get_device())


0: 640x480 (no detections), 9.0ms
Speed: 0.0ms preprocess, 9.0ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 480)
torch.Size([0, 2])

0: 640x480 1 person, 6.5ms
Speed: 0.0ms preprocess, 6.5ms inference, 1.5ms postprocess per image at shape (1, 3, 640, 480)
torch.Size([17, 2])

0: 640x480 1 person, 6.5ms
Speed: 0.0ms preprocess, 6.5ms inference, 1.5ms postprocess per image at shape (1, 3, 640, 480)
torch.Size([17, 2])

0: 640x480 (no detections), 6.5ms
Speed: 0.0ms preprocess, 6.5ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 480)
torch.Size([0, 2])

0: 640x480 1 person, 7.0ms
Speed: 0.0ms preprocess, 7.0ms inference, 1.5ms postprocess per image at shape (1, 3, 640, 480)
torch.Size([17, 2])

0: 640x480 1 person, 7.0ms
Speed: 0.0ms preprocess, 7.0ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 480)
torch.Size([17, 2])

0: 640x480 1 person, 7.5ms
Speed: 0.0ms preprocess, 7.5ms inference, 2.5ms postprocess per image at shape (1, 3, 640, 480)


final evaluation on test set:

In [23]:
# new instance of the model:
yolo_finetuned_model = YOLOPoseClassify(num_features, num_classes, h1, h2)
yolo_finetuned_model = yolo_finetuned_model.to(get_device())  # move model to device
yolo_finetuned_model.train()  # set model to training mode

# loss function and optimizer
criterion = nn.CrossEntropyLoss() #nn.MSELoss()
optimizer = torch.optim.Adam(yolo_finetuned_model.parameters(), lr=lr, weight_decay=1e-5)

for epoch in range(num_epochs):
    for inputs, labels in train_and_eval_loader:  # Assuming data_loader is your DataLoader instance for the dataset
        inputs, labels = inputs.to(get_device()), labels.to(get_device())  # move data to device
        optimizer.zero_grad()  # Zero the parameter gradients
        classification_output = yolo_finetuned_model (inputs) # get results for the classification 
        
        # Use this for classification
        loss = criterion(classification_output, labels)
        
        # Use this for scoring
        # loss = criterion(classification_output.float(), labels.float())
        
        loss.backward()  # Backpropagate the loss
        optimizer.step()  # Update weights

        # todo: further processing, such as calculating accuracy or loss, goes here

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')
        
    
yolo_finetuned_model.eval()  # model to evaluation mode

total_loss = 0
all_predictions = []
all_labels = []
with torch.no_grad():  # no need to compute gradients, because we are in evaluation mode
    for inputs, labels in test_loader:  # iterate over validation dataset
        inputs, labels = inputs.to(get_device()), labels.to(get_device())  # move data to device
        classification_output = yolo_finetuned_model(inputs) # get results for the classification 
        
        # Use this for classification
        loss = criterion(classification_output, labels)
        
        # Use this for scoring
        # loss = criterion(classification_output.float(), labels.float())
        
        total_loss += loss.item()  # accumulate the loss
        _, predicted = torch.max(classification_output.data, 1)
        # collect the predictions and labels
        all_predictions.extend(predicted.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())
    # calculate the validation metrics:
    avg_loss = total_loss / len(test_loader)  # get the average loss
    accuracy = (np.array(all_predictions) == np.array(all_labels)).mean()
    f1 = f1_score(all_labels, all_predictions, average='weighted')  
    conf_matrix = confusion_matrix(all_labels, all_predictions)
    
    print(f"Test Loss: {avg_loss:.4f}")
    print(f"Test Accuracy: {accuracy:.4f}")
    print(f"Test F1 Score: {f1:.4f}")
    print("Confusion Matrix:")
    print(conf_matrix)

Epoch [1/300], Loss: 1.0472
Epoch [2/300], Loss: 1.1228
Epoch [3/300], Loss: 1.1898
Epoch [4/300], Loss: 0.9125
Epoch [5/300], Loss: 0.7267
Epoch [6/300], Loss: 1.0057
Epoch [7/300], Loss: 0.5941
Epoch [8/300], Loss: 1.0955
Epoch [9/300], Loss: 0.5706
Epoch [10/300], Loss: 1.4644
Epoch [11/300], Loss: 1.0247
Epoch [12/300], Loss: 0.8891
Epoch [13/300], Loss: 0.8012
Epoch [14/300], Loss: 1.3533
Epoch [15/300], Loss: 0.8050
Epoch [16/300], Loss: 0.9901
Epoch [17/300], Loss: 0.6548
Epoch [18/300], Loss: 0.9349
Epoch [19/300], Loss: 1.0869
Epoch [20/300], Loss: 0.8145
Epoch [21/300], Loss: 0.8749
Epoch [22/300], Loss: 1.4428
Epoch [23/300], Loss: 0.6674
Epoch [24/300], Loss: 1.4872
Epoch [25/300], Loss: 0.5583
Epoch [26/300], Loss: 0.3842
Epoch [27/300], Loss: 1.0396
Epoch [28/300], Loss: 0.9086
Epoch [29/300], Loss: 0.6882
Epoch [30/300], Loss: 1.0397
Epoch [31/300], Loss: 0.4248
Epoch [32/300], Loss: 0.7491
Epoch [33/300], Loss: 0.5157
Epoch [34/300], Loss: 0.4379
Epoch [35/300], Loss: 0