# Reimplementation from Scratch of YOLOv8 for Car Detection

This notebook contains a from-scratch implementation of the YOLOv8 architecture, which we use for car detection.

## All the Imports

In [None]:
import torch
import torch.nn as nn
import pytorch_lightning as pl
from torch.utils.data import DataLoader, Dataset
import os
import torch
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
from PIL import Image, ImageDraw
import pandas as pd
import torch
import torch.nn.functional as F
import numpy as np
import cv2
import matplotlib.pyplot as plt
import math
import json
import random
import matplotlib.patches as patches
import sklearn

## Constants
The constants below are used to index the encoded target and the model output in order to retrieve the corresponding quantities.

* `DELTA_X = 0`
* `DELTA_Y = 1`
* `WIDTH = 2`
* `HEIGHT = 3`
* `CONFIDENCE = 4`
* `CLASS = 5`

In [None]:
# Channel indices into the encoded target / model output, in storage order:
# x offset, y offset, box width, box height, confidence, class.
DELTA_X, DELTA_Y, WIDTH, HEIGHT, CONFIDENCE, CLASS = range(6)

## Utility Functions

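* `scale_to_range`: rescales bounding-box values from one image resolution to another (by default from 676 x 380 to 128 x 128).
* `show_image_with_boxes`: draws the bounding boxes of a dataset sample on its image and displays the result.
* `from_grid_coordinate_to_bbox`: decodes a grid-encoded target/output back into pixel-space boxes `[x_center, y_center, width, height, confidence]`.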


In [None]:
def scale_to_range(bboxes, old_x=676, old_y=380, new_x = 128, new_y=128):
    """Rescale boxes in place from an ``old_x`` x ``old_y`` frame to ``new_x`` x ``new_y``.

    For every box, entries 0 and 2 are multiplied by the horizontal factor
    ``new_x / old_x`` and entries 1 and 3 by the vertical factor
    ``new_y / old_y``; each result is rounded and stored back as ``int``.
    Entries beyond the fourth (e.g. a confidence value) are left untouched.

    Args:
        bboxes: list of mutable sequences holding at least four numbers each.
        old_x: width of the source frame.
        old_y: height of the source frame.
        new_x: width of the destination frame.
        new_y: height of the destination frame.

    Returns:
        The very same ``bboxes`` object, after it has been modified in place.
    """
    horizontal = new_x / old_x
    vertical = new_y / old_y
    # Even positions hold x-like quantities, odd positions y-like ones.
    per_entry_factor = (horizontal, vertical, horizontal, vertical)

    for bbox in bboxes:
        for position, factor in enumerate(per_entry_factor):
            bbox[position] = int(np.round(bbox[position] * factor))
    return bboxes

In [None]:
def transform_in_grid_coordinates(x_center, y_center, width, height, size="large"):
    """Encode a pixel-space box of a 128 x 128 image relative to its grid cell.

    The image is divided into a square grid whose resolution depends on
    ``size``. The cell containing the box centre is located and the centre is
    expressed as a fraction of that cell, while width and height are expressed
    as a fraction of the image side.

    Args:
        x_center: x coordinate of the box centre, in pixels.
        y_center: y coordinate of the box centre, in pixels.
        width: box width, in pixels.
        height: box height, in pixels.
        size: grid resolution, one of ``"small"`` (16 x 16 cells of 8 px),
            ``"medium"`` (8 x 8 cells of 16 px) or ``"large"`` (4 x 4 cells
            of 32 px).

    Returns:
        The tuple ``(delta_x, delta_y, delta_width, delta_height, confidence,
        cl, column, row)`` where ``confidence`` and ``cl`` are both ``1``
        (a car is present), or ``None`` (after printing an error message) when
        the centre does not fall inside any cell.

    Raises:
        ValueError: if ``size`` is not one of the supported grid names.
    """
    # (cells per side, pixels per cell) for each supported grid resolution.
    grid_layouts = {"small": (16, 8), "medium": (8, 16), "large": (4, 32)}
    if size not in grid_layouts:
        raise ValueError(
            f"unknown grid size {size!r}; expected one of {sorted(grid_layouts)}"
        )
    number_of_cells, n_pixel_per_grid = grid_layouts[size]
    image_side = number_of_cells * n_pixel_per_grid  # 128 for every layout

    # Cells are scanned row by row and both cell edges are inclusive, so a
    # centre lying exactly on a shared edge is assigned to the first
    # (upper / left) cell that contains it.
    for row in range(number_of_cells):
        y_a = n_pixel_per_grid * row
        if not (y_a <= y_center <= y_a + n_pixel_per_grid):
            continue

        for column in range(number_of_cells):
            x_a = n_pixel_per_grid * column
            if not (x_a <= x_center <= x_a + n_pixel_per_grid):
                continue

            delta_x = (x_center - x_a) / n_pixel_per_grid
            delta_y = (y_center - y_a) / n_pixel_per_grid
            delta_width = width / image_side
            delta_height = height / image_side
            confidence = 1  # 100% confidence that it is a car
            cl = 1  # 1 = car, 0 = nothing
            return delta_x, delta_y, delta_width, delta_height, confidence, cl, column, row

    # Reached only when the centre lies outside the image.
    print("[ERROR]: there is a box but we have not found the cell in which it lies")
    return None



In [None]:
def show_image_with_boxes(self, idx):
    """Display sample ``idx`` with its bounding boxes outlined in red.

    ``self[idx]`` is expected to return ``(image, boxes_list)`` where every
    box unpacks into ``xmin, ymin, xmax, ymax``. The rectangles are drawn on
    the image object returned by ``self[idx]`` before it is shown.

    Args:
        self: indexable container of samples (presumably a dataset; this
            function is written as a method but defined at top level).
        idx: index of the sample to display.
    """
    image, boxes_list = self[idx]
    painter = ImageDraw.Draw(image)

    # Outline every box on the image
    for xmin, ymin, xmax, ymax in boxes_list:
        painter.rectangle([xmin, ymin, xmax, ymax], outline="red", width=2)

    # Show the annotated image without axes
    plt.imshow(image)
    plt.axis("off")
    plt.show()

In [None]:
def show_image_and_bbox(image, encoding_of_boxes, filename=None, training=False):
    """Plot an image and overlay the boxes decoded from ``encoding_of_boxes``.

    Two display modes are supported:

    * ``filename is None``: the first element of the ``image`` batch, which
      must be a ``(3, 128, 128)`` CPU tensor, is shown and the boxes are drawn
      in 128 x 128 coordinates.
    * ``filename`` given: the file is read from the Kaggle testing (or, when
      ``training`` is true, training) image directory and the boxes are
      rescaled from 128 x 128 to 676 x 380 (assumed to be the resolution of
      the files on disk) before being drawn. ``image`` is ignored in this mode.

    Every box is labelled with its confidence score.

    Args:
        image: batch of image tensors; only used when ``filename`` is None.
        encoding_of_boxes: grid-encoded boxes, decoded with
            ``from_grid_coordinate_to_bbox``.
        filename: name of the image file to load, or None.
        training: load the file from the training directory instead of the
            testing one.

    Raises:
        ValueError: if ``filename`` is None and the first image of the batch
            does not have shape ``(3, 128, 128)``.
    """
    if filename is None:
        image = image[0]
        if image.shape != torch.Size([3, 128, 128]):
            # Fail early with a clear message instead of handing None to imshow.
            raise ValueError(
                f"expected the first image of the batch to have shape (3, 128, 128), "
                f"got {tuple(image.shape)}"
            )
        # channels-first tensor -> channels-last array, as expected by imshow
        image_np = image.permute(1, 2, 0).numpy()
    else:
        if training:
            image_dir = "/kaggle/input/car-object-detection/data/training_images"
        else:
            image_dir = "/kaggle/input/car-object-detection/data/testing_images"
        img_path = os.path.join(image_dir, filename)
        image_np = np.array(Image.open(img_path).convert("RGB"))

    # Plot the image with bounding boxes
    fig, ax = plt.subplots(1)
    ax.imshow(image_np)

    bboxes = from_grid_coordinate_to_bbox(encoding_of_boxes)

    for box in bboxes:
        x_center, y_center, width, height, confidence_score = box[:5]

        # centre/size -> corner coordinates
        xmin = (2*x_center - width) / 2
        ymin = (2*y_center - height) / 2
        xmax = (2*x_center + width) / 2
        ymax = (2*y_center + height) / 2

        if filename is not None:
            # The file is shown at its own resolution, so the 128 x 128 box
            # must be scaled up accordingly.
            bbox = scale_to_range(bboxes=[[xmin, ymin, xmax, ymax]], old_x=128, old_y=128, new_x=676, new_y=380)
            xmin, ymin, xmax, ymax = bbox[0]
            width = xmax - xmin
            height = ymax - ymin

        rect = patches.Rectangle((xmin, ymin), width, height, linewidth=2, edgecolor='r', facecolor='none')
        ax.add_patch(rect)

        ax.text(xmin, ymin - 5, f'{confidence_score:.4f}', color='white', fontsize=12, bbox=dict(facecolor='red', alpha=0.5))

    plt.axis('off')  # Turn off axis
    plt.show()

    return

In [None]:
def from_grid_coordinate_to_bbox(output, size="large", threshold=0.6):
    """Decode a grid-encoded target/output into pixel-space boxes.

    Only the first element of the batch (``output[0]``) is decoded. A cell
    produces a box when its class score is at least ``threshold``; the decoded
    box is also printed.

    Args:
        output: indexable as ``output[0][channel][row][column]`` with the
            channels addressed by ``DELTA_X``, ``DELTA_Y``, ``WIDTH``,
            ``HEIGHT``, ``CONFIDENCE`` and ``CLASS``.
        size: grid resolution, one of ``"small"`` (16 x 16 cells of 8 px),
            ``"medium"`` (8 x 8 cells of 16 px) or ``"large"`` (4 x 4 cells
            of 32 px).
        threshold: minimum class score for a cell to be decoded.

    Returns:
        A list of ``[x_center, y_center, width, height, confidence]`` lists in
        pixel units of the 128 x 128 image.

    Raises:
        ValueError: if ``size`` is not one of the supported grid names.
    """
    # (cells per side, pixels per cell) for each supported grid resolution.
    grid_layouts = {"small": (16, 8), "medium": (8, 16), "large": (4, 32)}
    if size not in grid_layouts:
        raise ValueError(
            f"unknown grid size {size!r}; expected one of {sorted(grid_layouts)}"
        )
    number_of_cells, n_pixel_per_grid = grid_layouts[size]
    image_side = number_of_cells * n_pixel_per_grid  # 128 for every layout

    prediction = output[0]
    bboxes = []
    for i in range(number_of_cells):
        for j in range(number_of_cells):
            class_score = prediction[CLASS][i][j]
            if class_score >= threshold:
                # top-left corner of cell (row i, column j), in pixels
                x_a = n_pixel_per_grid * j
                y_a = n_pixel_per_grid * i

                # offsets are fractions of a cell, sizes are fractions of the image
                x = float(prediction[DELTA_X][i][j]) * n_pixel_per_grid + x_a
                y = float(prediction[DELTA_Y][i][j]) * n_pixel_per_grid + y_a
                w = float(prediction[WIDTH][i][j]) * image_side
                h = float(prediction[HEIGHT][i][j]) * image_side
                confidence = float(prediction[CONFIDENCE][i][j])

                bbox = [x, y, w, h, confidence]
                print(f"bbox = {bbox}, class = {class_score}")

                bboxes.append(bbox)

    return bboxes

## Testing

### Load the Model 

In [None]:
# Rebuild the network and restore the trained weights for inference.
model = YOLO()
model_load_path = '/kaggle/input/yolo_v8_700_epoch/pytorch/default/1/yolov8_new_700_epoch.pth'
# map_location lets a checkpoint that was saved on a GPU be restored on
# whatever device this session runs on (including CPU-only sessions).
model.load_state_dict(torch.load(model_load_path, map_location=device))
model.to(device)
model.eval()  # Set the model to evaluation mode
print(f"model loaded: {model_load_path}")

### Parse the annotated test set

In [None]:
# Location of the exported annotations for the test images
test_annotation_path = '/kaggle/input/test-annotation-car-detection-dataset/2D-on-2D_annotations_export.json'

# Read the whole annotation export into memory
with open(test_annotation_path, 'r') as file:
    data = json.load(file)

# Maps image name -> encoded target of shape (1, 6, 4, 4)
test_image_boxes = {}

for image_entry in data['images']:
    image_name = image_entry['image']

    # One [x_center, y_center, width, height, confidence] entry per annotation
    boxes = []
    for annotation in image_entry.get('annotations', []):
        bbox = annotation['boundingBox']
        x_min, y_min = bbox['x'], bbox['y']
        w, h = bbox['width'], bbox['height']

        # Centre is the midpoint between the top-left and bottom-right corners
        x_center = (x_min + (x_min + w)) / 2.0
        y_center = (y_min + (y_min + h)) / 2.0

        boxes.append([x_center, y_center, w, h, 1])

    # Bring the boxes from the default 676 x 380 frame to 128 x 128
    boxes = scale_to_range(bboxes=boxes)

    # Write every box into the grid cell that contains its centre
    number_of_cells = 4
    encoded_result = torch.zeros(6, number_of_cells, number_of_cells)
    for box in boxes:
        *encoded_values, column, row = transform_in_grid_coordinates(x_center=box[0], y_center=box[1], width=box[2], height=box[3])

        # encoded_values = delta_x, delta_y, delta_width, delta_height, confidence, class
        for channel, value in enumerate(encoded_values):
            encoded_result[channel][row][column] = value

    # Add the batch dimension and file the target under the image name
    test_image_boxes[image_name] = encoded_result.unsqueeze(0)


#for filename in test_image_boxes:
#    show_image_and_bbox(image=None, encoding_of_boxes=test_image_boxes[filename], filename=filename)

### Test the Model

#### mean Average Precision (mAP)

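The average precision is computed from the precision-recall curve sampled at $n$ score thresholds:

$$AP = \sum_{k=0}^{n-1} \left(R_k - R_{k+1}\right) P_k$$

where $P_k$ and $R_k$ are the precision and recall obtained with the $k$-th threshold, and the curve is closed with the end point $R_n = 0$, $P_n = 1$. Since there is a single class (car), the mAP coincides with the AP.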

In [None]:
def get_cls_from_encode(encoded, threshold = 0.5):
    """Turn the class map of an encoding into per-cell string labels.

    The class channel of the first batch element (``encoded[0][CLASS]``) is
    read row by row; its size is taken from the data rather than assumed, so
    every cell of the grid is labelled.

    Args:
        encoded: indexable as ``encoded[0][CLASS]``, yielding a 2-D grid of
            class scores.
        threshold: a cell is ``"positive"`` when its score is at least this
            value, ``"negative"`` otherwise.

    Returns:
        A flat list of ``"positive"`` / ``"negative"`` strings in row-major
        order.
    """
    class_map = encoded[0][CLASS]
    return [
        "positive" if cls_score >= threshold else "negative"
        for grid_row in class_map
        for cls_score in grid_row
    ]

In [None]:
def precision_recall_curve(target, out, thresholds):
    """Compute precision and recall of the class predictions at several thresholds.

    Args:
        target: encoded ground truth, as accepted by ``get_cls_from_encode``.
        out: encoded model output, as accepted by ``get_cls_from_encode``.
        thresholds: iterable of score thresholds applied to ``out``.

    Returns:
        ``(precisions, recalls)``: two lists with one value per threshold, in
        the order of ``thresholds``. A metric that is undefined for a
        threshold (no positive prediction / no positive label) is reported
        as 0.
    """
    # Imported explicitly: a bare ``import sklearn`` is not guaranteed to make
    # the ``sklearn.metrics`` submodule available as an attribute.
    from sklearn.metrics import precision_score, recall_score

    # Ground-truth labels do not depend on the decision threshold, so they are
    # derived once with a fixed 0.5 cut (class targets are encoded as 0 or 1).
    y_true = get_cls_from_encode(target, threshold=0.5)

    precisions = []
    recalls = []
    for threshold in thresholds:
        # Predicted labels at the current decision threshold
        y_pred = get_cls_from_encode(out, threshold=threshold)

        precisions.append(
            precision_score(y_true=y_true, y_pred=y_pred, pos_label="positive", zero_division=0)
        )
        recalls.append(
            recall_score(y_true=y_true, y_pred=y_pred, pos_label="positive", zero_division=0)
        )

    return precisions, recalls

In [None]:
def compute_mAP(out, target, thresholds):
    """Compute the average precision of ``out`` against ``target``.

    Precision and recall are evaluated at every threshold, the curve is closed
    with the end point (recall 0, precision 1), and the area is accumulated as
    the sum of each recall decrease multiplied by the precision at its start.

    Args:
        out: encoded model output.
        target: encoded ground truth.
        thresholds: score thresholds at which the curve is sampled.

    Returns:
        ``(AP, precisions, recalls)`` where the last two are NumPy arrays that
        include the appended end point.
    """
    precision_list, recall_list = precision_recall_curve(
        target=target,
        out=out,
        thresholds=thresholds,
    )

    # Close the curve with the end point (recall = 0, precision = 1)
    precisions = np.array(precision_list + [1])
    recalls = np.array(recall_list + [0])

    recall_drop = recalls[:-1] - recalls[1:]
    AP = np.sum(recall_drop * precisions[:-1])
    return AP, precisions, recalls

In [None]:
# Score thresholds at which the precision-recall curve is sampled:
# start at 0.2, step 0.05, stop before 0.7 (np.arange excludes the end value).
thresholds = np.arange(start=0.2, stop=0.7, step=0.05)

In [None]:
# Evaluate the model on one randomly chosen, annotated test image: plot its
# precision-recall curve, compute the F1 scores and display predicted and
# ground-truth boxes.
if TRAINING == False and TRAINING_DEBUG == False:

    # Same preprocessing as the 128 x 128 network input
    test_transform = transforms.Compose([
        transforms.Resize((128, 128)),
        transforms.ToTensor(),
    ])

    test_img_dir = '/kaggle/input/car-object-detection/data/testing_images'
    file_list = os.listdir(test_img_dir)
    TEST_SET_SIZE = len(file_list)

    # randrange excludes its upper bound, so the index is always valid
    # (randint would also return TEST_SET_SIZE itself, which selects no file).
    random_number = random.randrange(TEST_SET_SIZE)
    filename = file_list[random_number]
    print(f"filename = {filename} randomly chosen")

    # Encoded ground truth parsed from the annotation file
    target = test_image_boxes[filename]

    img_path = os.path.join(test_img_dir, filename)
    image = Image.open(img_path).convert("RGB")

    # Preprocess, add the batch dimension and move to the model's device
    image = test_transform(image)
    image = image.unsqueeze(0)
    image = image.to(device)

    model.eval()

    with torch.no_grad():

        out = model(image)

        AP, precisions, recalls = compute_mAP(out=out, target=target, thresholds=thresholds)

        plt.plot(recalls, precisions, linewidth=4, color="red")
        plt.xlabel("Recall", fontsize=12, fontweight='bold')
        plt.ylabel("Precision", fontsize=12, fontweight='bold')
        plt.title("Precision-Recall Curve", fontsize=15, fontweight="bold")
        plt.show()

        # Add a small epsilon to avoid division by zero, but handle the cases where both are zero
        epsilon = 1e-10
        f1_scores = 2 * ((precisions * recalls) / (precisions + recalls + epsilon))

        # Set F1 score to 0 where both precision and recall are 0 (instead of NaN)
        f1_scores[(precisions == 0) & (recalls == 0)] = 0

        # giving filename to the function, it displays the result in 676 x 380
        # without it, it displays it in 128 x 128

        # if training = true you want to use images from the training set

        # First the prediction, then the ground truth
        show_image_and_bbox(image.cpu(), out, filename, training=False)
        show_image_and_bbox(image.cpu(), target, filename, training=False)