# Ergonomic pose fine-tuning: 

## fine tuning yolo for our classification task

- option 1: freeze the complete model, use it as a feature extractor and add a classification head to be trained on our dataset
- option 2: cut the keypoint output layer, freeze the rest of the model and add a classification head to be trained on our dataset

### 1) YOLOv8pose model as feature extractor

imports:

In [1]:
from ultralytics import YOLO
import cv2
import numpy as np
from PIL import Image
#%pip install -U ultralytics

### load the model

In [51]:
model = YOLO('yolov8n-pose.pt')

for param in model.parameters():
    param.requires_grad = False # freeze the YOLO model

#for name, param in model.named_parameters():
    #print(f"{name} is {'not ' if param.requires_grad else ''}frozen"

defining a classification layer:

In [29]:
import torch
import torch.nn as nn

# `feature_extractor` is the part of the model suitable for feature extraction

class YOLO_kp_Classifier(nn.Module):
    def __init__(self, num_keypoints, num_classes=3):
        super(YOLO_kp_Classifier, self).__init__()
        # to flatten the output 
        #self.flatten = nn.Flatten()
        # add new classification layer(s) to the model
        self.classifier = nn.Sequential(
            nn.Linear(num_keypoints*num_classes, 512),  # 17 keypoints * 3 (x, y, z coordinates for each keypoint)
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(512, num_classes),
        )
    
    def forward(self, keypoints_3d):
        #keypoints_3d = self.flatten(keypoints_3d)
        output = self.classifier(keypoints_3d)
        return output


the keypoints are calculated in 2D and 3D, which we will leverage for our predictions

initializing the model:

In [30]:
# initialize the classifier model
num_classes = 3  #  3-class classification problem
num_keypoints = 17  # 17 keypoints in the model

kp_classifier_model = YOLO_kp_Classifier(num_keypoints=num_keypoints, num_classes=num_classes)

load the own dataset:

In [31]:
from torch.utils.data import random_split, DataLoader, Subset
from torchvision import datasets, transforms

In [33]:
data_transforms = transforms.Compose([
    transforms.Resize((224, 224)), # Resize images for the model?
    transforms.ToTensor()
    #transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), # normalize
])

In [34]:
data_path = 'example_images'
own_dataset = datasets.ImageFolder(root=data_path, transform=data_transforms)
#type(own_dataset)

splitting the data into train, validation and test set:

In [37]:
generator1 = torch.Generator().manual_seed(13)  # set seed for reproducibility of the split
train_and_val_dataset, test_dataset =random_split(own_dataset, [0.8, 0.2], generator=generator1)  # 80% training and evaluation, 20% testing

#type(train_and_val_dataset)

feature extraction with yolo:

In [None]:
def get_keypoints_from_yolo(model, inputs):
    model.eval()  # YOLO model set to evaluation mode because we are not training it
    with torch.no_grad(): # gradients are not computed for the frozen model
        results = model(inputs)
        keypoints = results[0].keypoints.data  # extract the keypoints from the results
    return keypoints

processing keypoints:

In [40]:
# todo: check if this is sufficient, otherwise try z-score normalization or other methods

def process_keypoints_for_classifier(keypoints):
    # processing the keypoints in a suitable format for the classifier
    # keypoints is of shape [batch_size, 17, 3]
    # normalizing keypoints across the keypoints dimension
    # with min-max normalization i
    kp_tensor_norm = (keypoints - keypoints.min(dim=1, keepdim=True)[0]) / (keypoints.max(dim=1, keepdim=True)[0] - keypoints.min(dim=1, keepdim=True)[0])
    
    # Flatten the last two dimensions while keeping the batch dimension
    batch_flattened = kp_tensor_norm.view(keypoints.size(0), -1)  # Reshapes to [batch_size, 17*3]
    return batch_flattened

#### training the model:

- with cross-entropy loss function (fits our classification task)
- Adam optimizer

HPO:

In [41]:
#!pip install --upgrade pip
#%pip install scikit-learn
#%pip install optuna
import optuna
from sklearn.metrics import f1_score, confusion_matrix
import numpy as np

In [42]:
# initialize the classifier model
num_classes = 3  #  3-class classification problem
num_keypoints = 17  # 17 keypoints in the model
# loss function and optimizer
criterion = nn.CrossEntropyLoss()

In [43]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # else ("mps" if torch.backends.mps.is_available())
print(f"Using device: {device}")

Using device: cpu


In [44]:
def train_and_eval_model(kp_classifier_model, model, optimizer, num_epochs, train_loader, val_loader, device):
    
    kp_classifier_model = kp_classifier_model.to(device)  # move model to device
    kp_classifier_model.train()  # set model to training mode
    
    print("type of train_loader", type(train_loader))

    for epoch in range(num_epochs):
        for batch in train_loader:
            # access images and labels
            inputs = batch[0].to(device)
            labels = batch[1].to(device)
            #print("inputs", type(inputs), len(inputs), inputs)
            #inputs, labels = inputs.to(device), labels.to(device)  # move data to device
            model = model.to(device) # YOLO move model to device
            optimizer.zero_grad()  # Zero the parameter gradients
            keypoints = get_keypoints_from_yolo(model, inputs) # get keypoints from the YOLO model
            processed_kps = process_keypoints_for_classifier(keypoints) # prepare the keypoints for the classifier
            classification_output = kp_classifier_model(processed_kps) # get results for the classification 
            loss = criterion(classification_output , labels)  # Compute loss
            loss.backward()  # Backpropagate the loss
            optimizer.step()  # Update weights

            # todo: further processing, such as calculating accuracy or loss, goes here

        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')
            
        
        kp_classifier_model.eval()  # model to evaluation mode

        total_loss = 0
        all_predictions = []
        all_labels = []

        with torch.no_grad():  # no need to compute gradients, because we are in evaluation mode
            for inputs, labels in val_loader:  # iterate over validation dataset
                inputs, labels = inputs.to(device), labels.to(device)  # move data to device
                keypoints = get_keypoints_from_yolo(model, inputs) # get keypoints from the YOLO model
                processed_kps = process_keypoints_for_classifier(keypoints) # prepare the keypoints for the classifier
                classification_output = kp_classifier_model(processed_kps) # get results for the classification 
                loss = criterion(classification_output , labels)  # compute loss
                total_loss += loss.item()  # accumulate the loss
                # get predictions for output
                _, predicted = torch.max(classification_output.data, 1)
                # collect the predictions and labels
                all_predictions.extend(predicted.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())
            # calculate the validation metrics:
            avg_loss = total_loss / len(val_loader)  # get the average loss
            accuracy = (np.array(all_predictions) == np.array(all_labels)).mean()
            f1 = f1_score(all_labels, all_predictions, average='weighted')  
            conf_matrix = confusion_matrix(all_labels, all_predictions)
            
            print(f"Validation Loss: {avg_loss:.4f}")
            print(f"Validation Accuracy: {accuracy:.4f}")
            print(f"Validation F1 Score: {f1:.4f}")
            print("Confusion Matrix:")
            print(conf_matrix)
            
            # here: F1 score chosen as the metric to optimize
            # other options: - combining metrics like accuracy and F1 score to maximize on both
            #                        - or multi-objective optimization on F1 score and accuracy
    return f1 

In [45]:
# this function was generated by gpt-4

def get_k_fold_indices(n, k=5, random_seed=None):
    """
    Generate indices for k-fold cross-validation.

    Parameters:
    - n: Total number of samples in the dataset.
    - k: Number of folds.
    - random_seed: Optional seed for reproducibility.

    Returns:
    - A list of tuples, each containing (train_indices, val_indices) for a fold.
    """
    # Initialize the random generator
    g = torch.Generator()
    if random_seed is not None:
        g.manual_seed(random_seed)
    
    # Generate a random permutation of indices
    indices = torch.randperm(n, generator=g).tolist()
    
    # Calculate fold sizes
    fold_sizes = [n // k for _ in range(k)]
    for i in range(n % k):
        fold_sizes[i] += 1
    
    # Generate train and validation indices for each fold
    current = 0
    folds_indices = []
    for fold_size in fold_sizes:
        start, end = current, current + fold_size
        val_indices = indices[start:end]
        train_indices = indices[:start] + indices[end:]
        folds_indices.append((train_indices, val_indices))
        current = end
    
    return folds_indices


In [46]:
def objective(trial):
    # Define hyperparameters to optimize
    lr = trial.suggest_categorical("lr", [1e-4, 5e-4, 1e-3])
    #momentum = trial.suggest_categorical("momentum", [0.9, 0.95])
    batch_size = trial.suggest_categorical("batch_size", [4, 8, 16])
    num_epochs = trial.suggest_categorical("num_epochs", [100, 200, 300])

    # convert dataset into a list for index-based access
    dataset = train_and_val_dataset

    validation_scores = []

    n = len(dataset)
    k = 5
    folds_indices = get_k_fold_indices(n, k, random_seed=13)

    for fold, (train_idx, val_idx) in enumerate(folds_indices, start=1):
        train_subset = Subset(dataset, train_idx)
        val_subset = Subset(dataset, val_idx)
        print("type of train subset: " , type(train_subset))
        train_loader = DataLoader(train_subset, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_subset, batch_size=batch_size, shuffle=False)
        print("type of train loader: " , type(train_loader))
        # initializing the model and optimizer with the chosen hyperparameters
        kp_classifier_model = YOLO_kp_Classifier(num_keypoints=num_keypoints, num_classes=num_classes)
        optimizer = torch.optim.Adam(kp_classifier_model.classifier.parameters(), lr=lr, weight_decay=1e-5)
    
        # training and evaluating the model
        validation_score = train_and_eval_model(kp_classifier_model, model, optimizer, num_epochs, train_loader, val_loader, device)
        validation_scores.append(validation_score)

    return np.mean(validation_scores)


In [52]:
#-----------------------------------------------------------------------------------------------------------
        #  todo!! there is a bug --> YOLO downloads its own data to use...etc. #
#-----------------------------------------------------------------------------------------------------------

In [48]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)  # run 20 trials

# Best hyperparameters
print("Best hyperparameters:", study.best_params)

[I 2024-02-11 22:02:09,664] A new study created in memory with name: no-name-49138c93-bf46-440a-9b23-b1317ec3177e


type of train subset:  <class 'torch.utils.data.dataset.Subset'>
type of train loader:  <class 'torch.utils.data.dataloader.DataLoader'>
type of train_loader <class 'torch.utils.data.dataloader.DataLoader'>
New https://pypi.org/project/ultralytics/8.1.11 available 😃 Update with 'pip install -U ultralytics'
[34m[1mengine/trainer: [0mtask=pose, mode=train, model=yolov8n-pose.pt, data=coco8-pose.yaml, epochs=100, time=None, patience=50, batch=16, imgsz=640, save=True, save_period=-1, cache=False, device=cpu, workers=0, project=None, name=train, exist_ok=False, pretrained=True, optimizer=auto, verbose=True, seed=0, deterministic=True, single_cls=False, rect=False, cos_lr=False, close_mosaic=10, resume=False, amp=True, fraction=1.0, profile=False, freeze=None, multi_scale=False, overlap_mask=True, mask_ratio=4, dropout=0.0, val=True, split=val, save_json=False, save_hybrid=False, conf=None, iou=0.7, max_det=300, half=False, dnn=False, plots=True, source=None, vid_stride=1, stream_buffer=

[34m[1mtrain: [0mScanning /Users/ari/Documents/Data_Science/3_semester/learning_from_images/egronomic_pose_project/datasets/coco8-pose/labels/train.cache... 4 images, 0 backgrounds, 0 corrupt: 100%|██████████| 4/4 [00:00<?, ?it/s]
[34m[1mval: [0mScanning /Users/ari/Documents/Data_Science/3_semester/learning_from_images/egronomic_pose_project/datasets/coco8-pose/labels/val.cache... 4 images, 0 backgrounds, 0 corrupt: 100%|██████████| 4/4 [00:00<?, ?it/s]

Plotting labels to /Users/ari/Documents/Data_Science/3_semester/learning_from_images/egronomic_pose_project/ergonomic_pose_detect/runs/pose/train/labels.jpg... 





[34m[1moptimizer:[0m 'optimizer=auto' found, ignoring 'lr0=0.01' and 'momentum=0.937' and determining best 'optimizer', 'lr0' and 'momentum' automatically... 
[34m[1moptimizer:[0m AdamW(lr=0.002, momentum=0.9) with parameter groups 63 weight(decay=0.0), 73 weight(decay=0.0005), 72 bias(decay=0.0)
Image sizes 640 train, 640 val
Using 0 dataloader workers
Logging results to [1m/Users/ari/Documents/Data_Science/3_semester/learning_from_images/egronomic_pose_project/ergonomic_pose_detect/runs/pose/train[0m
Starting training for 100 epochs...

      Epoch    GPU_mem   box_loss  pose_loss  kobj_loss   cls_loss   dfl_loss  Instances       Size


      1/100         0G      2.929      8.319     0.6615      2.865      2.973         11        640: 100%|██████████| 1/1 [00:01<00:00,  1.91s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95)     Pose(P          R      mAP50  mAP50-95): 100%|██████████| 1/1 [00:00<00:00,  1.14it/s]

                   all          4         14      0.109      0.143     0.0365     0.0224          0          0          0          0






      Epoch    GPU_mem   box_loss  pose_loss  kobj_loss   cls_loss   dfl_loss  Instances       Size


      2/100         0G      3.336      9.957     0.7004       4.88      3.033          7        640: 100%|██████████| 1/1 [00:01<00:00,  1.83s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95)     Pose(P          R      mAP50  mAP50-95): 100%|██████████| 1/1 [00:00<00:00,  1.22it/s]

                   all          4         14      0.262      0.214      0.083     0.0343          0          0          0          0






      Epoch    GPU_mem   box_loss  pose_loss  kobj_loss   cls_loss   dfl_loss  Instances       Size


      3/100         0G      2.911      8.605     0.8767      3.701      3.041         14        640: 100%|██████████| 1/1 [00:01<00:00,  1.68s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95)     Pose(P          R      mAP50  mAP50-95): 100%|██████████| 1/1 [00:00<00:00,  1.08it/s]

                   all          4         14      0.494      0.143      0.141     0.0959          0          0          0          0






      Epoch    GPU_mem   box_loss  pose_loss  kobj_loss   cls_loss   dfl_loss  Instances       Size


      4/100         0G      2.795       8.92     0.8114      3.253      2.882         17        640: 100%|██████████| 1/1 [00:01<00:00,  1.96s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95)     Pose(P          R      mAP50  mAP50-95):   0%|          | 0/1 [00:00<?, ?it/s]
[W 2024-02-11 22:02:24,007] Trial 0 failed with parameters: {'lr': 0.0001, 'batch_size': 4, 'num_epochs': 200} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/Users/ari/Documents/Data_Science/3_semester/learning_from_images/egronomic_pose_project/ergonomic_pose_detect/ergonomic_pose.venv/lib/python3.11/site-packages/optuna/study/_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/var/folders/9c/8w6dd7612jv4k22v8dfgbq040000gn/T/ipykernel_94853/3745596659.py", line 29, in objective
    validation_score = train_and_eval_model(kp_classifier_model, model, optimizer, 

KeyboardInterrupt: 

In [None]:
best_params = study.best_params
# extract best_parameters
batch_size = best_params['batch_size']
lr = best_params['lr']
momentum = best_params['momentum']
num_epochs = best_params['num_epochs']

#### train and evaluate final model on test set:

Unfortunately Optuna cannot output the best model, so we train again on the combined train and validation set with the best parameters found.

dataloader for final evaluation:

In [None]:
train_and_eval_loader = DataLoader(train_and_val_dataset, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=4, pin_memory=True)

final evaluation on test set:

In [None]:
# loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(kp_classifier_model.classifier.parameters(), lr=lr, momentum=momentum, weight_decay=1e-5)

# new instance of the model:
kp_classifier_model = YOLO_kp_Classifier(num_keypoints=num_keypoints, num_classes=num_classes)
kp_classifier_model = kp_classifier_model.to(device)  # move model to device
kp_classifier_model.train()  # set model to training mode

for epoch in range(num_epochs):
    for inputs, labels in train_and_eval_loader:  # Assuming data_loader is your DataLoader instance for the dataset
        inputs, labels = inputs.to(device), labels.to(device)  # move data to device
        optimizer.zero_grad()  # Zero the parameter gradients
        keypoints = get_keypoints_from_yolo(model, inputs) # get keypoints from the YOLO model
        processed_kps = process_keypoints_for_classifier(keypoints) # prepare the keypoints for the classifier
        classification_output = kp_classifier_model(processed_kps) # get results for the classification 
        loss = criterion(classification_output , labels)  # Compute loss
        loss.backward()  # Backpropagate the loss
        optimizer.step()  # Update weights

        # todo: further processing, such as calculating accuracy or loss, goes here

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')
        
    
    kp_classifier_model.eval()  # model to evaluation mode

    total_loss = 0
    all_predictions = []
    all_labels = []
    with torch.no_grad():  # no need to compute gradients, because we are in evaluation mode
        for inputs, labels in test_loader:  # iterate over validation dataset
            inputs, labels = inputs.to(device), labels.to(device)  # move data to device
            keypoints = get_keypoints_from_yolo(model, inputs) # get keypoints from the YOLO model
            processed_kps = process_keypoints_for_classifier(keypoints) # prepare the keypoints for the classifier
            classification_output = kp_classifier_model(processed_kps) # get results for the classification 
            loss = criterion(classification_output , labels)  # compute loss
            total_loss += loss.item()  # accumulate the loss
            _, predicted = torch.max(classification_output.data, 1)
            # collect the predictions and labels
            all_predictions.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
        # calculate the validation metrics:
        avg_loss = total_loss / len(test_loader)  # get the average loss
        accuracy = (np.array(all_predictions) == np.array(all_labels)).mean()
        f1 = f1_score(all_labels, all_predictions, average='weighted')  
        conf_matrix = confusion_matrix(all_labels, all_predictions)
        
        print(f"Test Loss: {avg_loss:.4f}")
        print(f"Test Accuracy: {accuracy:.4f}")
        print(f"Test F1 Score: {f1:.4f}")
        print("Confusion Matrix:")
        print(conf_matrix)


