In [2]:
from ultralytics import YOLO
import cv2
import numpy as np
from PIL import Image

def visualize_output(image, boxes, keypoints, draw_boxes=False, keypoint_radius=20):
    """
    Draw the pose-model output on a copy of the input image.

    Parameters:
    image (numpy.ndarray): The original image (OpenCV layout). It is not modified.
    boxes (iterable): Bounding boxes, each box is [center_x, center_y, width, height].
        Only used when ``draw_boxes`` is True.
    keypoints (iterable): One sequence of (x, y) pixel coordinates per detected person.
    draw_boxes (bool): If True, also draw every bounding box. Defaults to False,
        which matches the previous behaviour where box drawing was disabled.
    keypoint_radius (int): Radius in pixels of the filled circle drawn per keypoint.

    Returns:
    numpy.ndarray: A copy of ``image`` with the annotations drawn on it.
    """
    result_image = image.copy()

    # Colours are given in OpenCV channel order (BGR).
    box_color = (0, 255, 0)
    keypoint_color = (255, 0, 0)

    # Draw bounding boxes (opt-in)
    if draw_boxes:
        for box in boxes:
            center_x, center_y, width, height = (float(value) for value in box)
            top_left = (int(center_x - width / 2), int(center_y - height / 2))
            bottom_right = (int(center_x + width / 2), int(center_y + height / 2))
            cv2.rectangle(result_image, top_left, bottom_right, box_color, 2)

    # Draw keypoints
    for person_keypoints in keypoints:
        for x, y in person_keypoints:
            x, y = float(x), float(y)
            # Keypoints the model could not locate are reported as exactly (0, 0);
            # drawing them would put a stray dot in the image corner.
            if x == 0.0 and y == 0.0:
                continue
            cv2.circle(result_image, (int(x), int(y)), keypoint_radius, keypoint_color, -1)

    return result_image

def plot_results(image, mode="inline", scale=1):
    """
    Displays an image using either a popup window or inline display.

    Parameters
    ----------
    image : numpy.ndarray or PIL.Image.Image
        The image to display. Arrays are treated as OpenCV (BGR) images.
    mode : str, optional
        The display mode. Must be either "popup" or "inline". If "popup", the image is
        displayed in an OpenCV window (blocks until a key is pressed). If "inline", the
        image is displayed inline in the notebook. Default is "inline".
    scale : float, optional
        The scaling factor for the image, applied in both modes. Default is 1.

    Raises
    ------
    ValueError
        If the mode is not "popup" or "inline".
    """
    # Validate first so an invalid mode never reaches any display code.
    if mode not in ("popup", "inline"):
        raise ValueError("Mode must be either 'popup' or 'inline'")

    if mode == "popup":
        if not isinstance(image, np.ndarray):
            # PIL uses RGB order, OpenCV windows expect BGR.
            image = cv2.cvtColor(np.asarray(image.convert("RGB")), cv2.COLOR_RGB2BGR)
        if scale != 1:
            height, width = image.shape[:2]
            image = cv2.resize(image, (int(width * scale), int(height * scale)))
        cv2.imshow('Image', image)
        cv2.waitKey(0)
        cv2.destroyAllWindows()
        return

    # Inline display goes through PIL.
    if isinstance(image, np.ndarray):
        # Convert from BGR to RGB (because OpenCV uses BGR order for color channels, whereas PIL uses RGB.)
        image = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    width, height = image.size
    display(image.resize((int(width * scale), int(height * scale))))

### load the model

In [3]:
# Create the YOLO wrapper from the 'yolov8n-pose.pt' weights; `model` is reused by the cells below.
model = YOLO('yolov8n-pose.pt')

#### small example image prediction with visualization

In [7]:
# Predict with the model
image = cv2.imread('example_images/yellow3.jpg')
if image is None:
    # cv2.imread signals an unreadable/missing file by returning None instead of raising,
    # so fail here with a clear message rather than passing None on to the model.
    raise FileNotFoundError("Could not read image 'example_images/yellow3.jpg'")

results = model(image)

boxes = results[0].boxes.xywh  # bbox outputs as [center_x, center_y, width, height]
keypoints = results[0].keypoints.xy  # pose outputs as (x, y) pixel coordinates per keypoint



0: 640x480 1 person, 78.1ms
Speed: 2.9ms preprocess, 78.1ms inference, 0.6ms postprocess per image at shape (1, 3, 640, 480)


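Besides its x and y pixel coordinates, each keypoint carries a third value. As the tensor printed below shows, this is a confidence (visibility) score between 0 and 1, not a depth/z coordinate. We will leverage all three values for our predictions.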

In [22]:
# Inspect the raw keypoint tensor of the first result.
keypoints_all = results[0].keypoints
# `.data` holds three values per keypoint; the shape is printed right after the values.
print(keypoints_all.data)
print(keypoints_all.data.shape)

tensor([[[1.2903e+03, 1.4783e+03, 9.9350e-01],
         [1.3883e+03, 1.3790e+03, 9.9267e-01],
         [1.2600e+03, 1.3812e+03, 7.7818e-01],
         [1.7325e+03, 1.3233e+03, 9.6801e-01],
         [0.0000e+00, 0.0000e+00, 4.2132e-02],
         [1.9563e+03, 1.7039e+03, 9.9153e-01],
         [1.6893e+03, 1.5877e+03, 9.9206e-01],
         [1.5165e+03, 2.2164e+03, 9.7732e-01],
         [1.2452e+03, 1.8650e+03, 9.7607e-01],
         [1.3291e+03, 2.2291e+03, 9.8469e-01],
         [8.3613e+02, 2.0155e+03, 9.8376e-01],
         [2.4183e+03, 2.8356e+03, 9.8898e-01],
         [2.2288e+03, 2.7412e+03, 9.9286e-01],
         [1.5074e+03, 2.8463e+03, 9.8288e-01],
         [1.4614e+03, 2.7262e+03, 9.8562e-01],
         [1.2421e+03, 3.3962e+03, 9.0765e-01],
         [1.3356e+03, 3.2737e+03, 9.3313e-01]]])
torch.Size([1, 17, 3])


In [8]:
# Draw the predicted keypoints onto a copy of the example image.
result = visualize_output(image, boxes, keypoints)
# Uncomment to show the annotated image inline:
# plot_results(result)

In [39]:
# Save the image for the mockup:
# cv2.imwrite returns a success flag, which shows up as this cell's output.
cv2.imwrite('ari_yellow3_kps.jpg', result)

True

## fine tuning yolo for our classification task

- option 1: freeze the complete model, use it as a feature extractor and add a classification head to be trained on our dataset
- option 2: cut the keypoint output layer, freeze the rest of the model and add a classification head to be trained on our dataset

#### 1) using yolo as feature extractor:

freezing the yolo model:

In [None]:
## freezing the model by setting the requires_grad attribute to False.
## `model` is the Ultralytics wrapper; `model.model` is the underlying torch network,
## so its parameters are reached explicitly instead of relying on the wrapper
## forwarding the nn.Module API.
for param in model.model.parameters():
    param.requires_grad = False

defining a classification layer:

In [None]:
import torch
import torch.nn as nn

# `feature_extractor` is the part of the model suitable for feature extraction

class YOLO_kp_Classifier(nn.Module):
    """Small MLP head that classifies a pose from its flattened keypoint tensor.

    Parameters
    ----------
    num_keypoints : int
        Number of keypoints per pose (17 for the pose model used in this notebook).
    num_classes : int, optional
        Number of output classes. Default is 3.
    hidden_dim : int, optional
        Width of the hidden layer. Default is 512.
    dropout : float, optional
        Dropout probability applied after the hidden layer. Default is 0.5.
    values_per_keypoint : int, optional
        Number of values stored per keypoint. Default is 3, matching the
        (x, y, confidence) triples of the ``keypoints.data`` tensor printed above.
    """

    def __init__(self, num_keypoints, num_classes=3, hidden_dim=512, dropout=0.5,
                 values_per_keypoint=3):
        super().__init__()
        # to flatten (batch, num_keypoints, values_per_keypoint) into (batch, features)
        self.flatten = nn.Flatten()
        # classification layers; the layer order is kept stable so state_dict keys
        # ("classifier.0", "classifier.3") do not change
        self.classifier = nn.Sequential(
            nn.Linear(num_keypoints * values_per_keypoint, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, num_classes),
        )

    def forward(self, keypoints_3d):
        """Return class logits of shape (batch, num_classes).

        ``keypoints_3d`` is expected as (batch, num_keypoints, values_per_keypoint);
        despite the name, the third value per keypoint in the tensor printed above is
        a confidence score rather than a z coordinate.
        """
        flat_keypoints = self.flatten(keypoints_3d)
        return self.classifier(flat_keypoints)


add cross-entropy loss function (fits our classification task):

In [None]:
# initialize the classifier
num_classes = 3  #  3-class classification problem
num_keypoints = 17  # 17 keypoints per pose, matching the (1, 17, 3) keypoint tensor printed earlier

kp_classifier_model = YOLO_kp_Classifier(num_keypoints=num_keypoints, num_classes=num_classes)

# loss function and optimizer
# CrossEntropyLoss takes the raw logits returned by the classifier.
criterion = nn.CrossEntropyLoss()
# Only the classification head's parameters are handed to the optimizer.
optimizer = torch.optim.Adam(kp_classifier_model.classifier.parameters(), lr=0.001)

In [None]:

# Put the underlying torch network in evaluation mode because we are not training it.
# Calling .eval() on the Ultralytics wrapper itself would go through nn.Module.eval()
# -> self.train(False), and the wrapper overrides train() with its own training entry
# point, so the inner network is addressed directly.
model.model.eval()
# Disable dropout in the classification head for this forward pass.
kp_classifier_model.eval()

with torch.no_grad():  # gradients are not computed for the frozen model
    # `image` is the example image loaded earlier; the original code passed the
    # Python builtin `input` here by mistake.
    results = model(image)
    keypoints = results[0].keypoints.data
    # No reshape needed: the classifier's nn.Flatten turns (persons, 17, 3) into (persons, 51).
    classification_output = kp_classifier_model(keypoints)

# Back to training mode (the default state of a fresh module) for the training loop.
kp_classifier_model.train()


training the model:

!! add dataloader

In [None]:

# TODO: add dataloader!!!

# Example training loop
num_epochs = 50  # check for appropriate number of epochs
dataloader = None  # TODO: replace with our own dataloader

if dataloader is None:
    # Fail with an explicit message instead of "'NoneType' object is not iterable".
    raise NotImplementedError("No dataloader defined yet - create one before running the training loop")

kp_classifier_model.train()  # make sure dropout is active while fitting the head

for epoch in range(num_epochs):
    running_loss = 0.0
    num_batches = 0
    # NOTE(review): assumes each batch is (keypoints, labels) with keypoints shaped
    # (batch, 17, 3) as produced by the frozen YOLO model, and labels as class indices.
    # Keypoints taken straight from YOLO results probably need a .clone() before they
    # can take part in backprop - confirm when building the dataloader.
    for inputs, labels in dataloader:
        # Train the classification head; the optimizer only holds its parameters.
        outputs = kp_classifier_model(inputs)
        loss = criterion(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        num_batches += 1
    # Report the mean loss over the epoch; max() guards against an empty dataloader.
    mean_loss = running_loss / max(num_batches, 1)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {mean_loss:.4f}')

evaluate the model: