## 1. Imports

In [1]:
import os

# Switch the working directory to the parent of the notebook's folder
# (presumably so that `from model import ...` resolves from there - confirm the layout).
# The target is computed only once per kernel session: a bare os.chdir("..")
# climbs one directory further every time this cell is re-run.
if "PROJECT_ROOT" not in globals():
    PROJECT_ROOT = os.path.dirname(os.getcwd())
os.chdir(PROJECT_ROOT)

In [1]:
import time

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from thop import profile

from model import YoloV8I, YoloV8I_CONFIGS

## 2. Helper functions

In [2]:
def get_model_size(model: nn.Module):
    """
    This method reports how much memory the model's tensors occupy

    The byte sizes of every parameter and every registered buffer are added up
    and converted to megabytes using 1 MB = 1024 * 1024 bytes.

    Args
    ----------
    model : object
        A torch.nn.Module object

    Returns
    -------
    float
        Combined size of parameters and buffers in MB, rounded to 3 decimals

    """
    bytes_per_megabyte = 1024 * 1024

    total_bytes = 0
    # Learnable weights
    for tensor in model.parameters():
        total_bytes += tensor.nelement() * tensor.element_size()
    # Non-learnable state registered on the modules
    for tensor in model.buffers():
        total_bytes += tensor.nelement() * tensor.element_size()

    return round(total_bytes / bytes_per_megabyte, 3)

def get_inference_time(
    model: nn.Module,
    inputs: torch.Tensor,
    device: torch.device,
    n_iters:int=10,
    gpu_warmup:bool=False
    ):
    """
    This method measures the average forward-pass latency of the model on cpu or cuda

    The model is moved to ``device`` and switched to eval mode; both changes persist
    after the call. ``inputs`` are not moved, so the caller must place them on
    ``device`` beforehand.

    Args
    ----------
    model : object
        A torch.nn.Module object
    inputs : object
        A sequence (e.g. a list) of torch tensors that is unpacked into the forward
        call as ``model(*inputs)``. The leading dimension of ``inputs[0]`` is taken
        as the batch size
    device : [object, str]
        A string or torch.device object representing the device. E.g "cpu" or torch.device("cpu")
    n_iters : int
        An integer value (>= 1) specifying the number of timed forward passes to be
        performed for a given input batch
    gpu_warmup : bool
        If set to True; runs untimed forward passes on the GPU before computing the
        actual GPU inference time. Has no effect when the device is the CPU

    Returns
    -------
    tuple
        (avg_batch_inference_time, avg_sample_inference_time), both in milliseconds
        and rounded to 3 decimals

    Raises
    ------
    ValueError
        If ``n_iters`` is smaller than 1 or the input batch is empty

    """

    if n_iters < 1:
        raise ValueError(f"n_iters must be >= 1, got {n_iters}")

    batch_size = inputs[0].shape[0]
    if batch_size < 1:
        raise ValueError("inputs[0] must contain at least one sample")

    # torch.device() normalises spellings such as "cpu:0", which a plain string
    # comparison against "cpu" would wrongly treat as a GPU.
    is_gpu = torch.device(device).type != "cpu"
    model.to(device).eval()
    timings = np.zeros((n_iters, 1))

    # Warm-up and timed passes both run without autograd bookkeeping.
    with torch.inference_mode():
        if is_gpu:
            if gpu_warmup:
                # GPU-WARM-UP: the larger the batch, the fewer warm-up passes.
                # Each entry is (largest batch size covered, number of passes);
                # batches above 16 get a single pass.
                num_warm_ups = 1
                for max_batch_size, n_passes in ((4, 12), (8, 9), (12, 6), (16, 3)):
                    if batch_size <= max_batch_size:
                        num_warm_ups = n_passes
                        break

                print(f"Performing GPU warm up with {num_warm_ups} iterations...")
                for _ in range(num_warm_ups):
                    model(*inputs)
                print("GPU warmup complete!")

            for rep in range(n_iters):
                starter = torch.cuda.Event(enable_timing=True)
                ender = torch.cuda.Event(enable_timing=True)

                starter.record()
                model(*inputs)
                ender.record()

                # WAIT FOR GPU SYNC: kernels are launched asynchronously, so the
                # events are only valid once the queued work has finished.
                torch.cuda.synchronize()

                timings[rep] = starter.elapsed_time(ender)  # milliseconds

        else:
            for rep in range(n_iters):
                start_time = time.perf_counter()
                model(*inputs)
                end_time = time.perf_counter()
                # perf_counter() reports seconds; convert to milliseconds.
                timings[rep] = (end_time - start_time) * 1000

    avg_batch_inference_time = round(np.sum(timings) / n_iters, 3)
    avg_sample_inference_time = round(avg_batch_inference_time / batch_size, 3)

    return avg_batch_inference_time, avg_sample_inference_time

def get_model_metrics(
    model: nn.Module,
    inputs: torch.Tensor,
    device: torch.device
    ):
    """
    This method profiles the model and reports MACs, FLOPs and number of parameters

    The model is moved to ``device`` and put in eval mode before profiling.
    FLOPs are derived as twice the MAC count. Per-sample figures are the batch
    figures divided by the leading dimension of ``inputs[0]``.

    Args
    ----------
    model : object
        A torch.nn.Module object
    inputs : object
        A sequence of torch tensors handed to the profiler as the model inputs
    device : object | str
        A string or torch.device object representing the device. E.g "cpu" or torch.device("cpu")

    Returns
    -------
    tuple
        (MACs per sample, MACs per batch, FLOPs per sample, FLOPs per batch,
        number of parameters). MACs and FLOPs are scaled by 1e-9, the parameter
        count by 1e-6

    """

    n_samples = inputs[0].shape[0]
    model.to(device).eval()

    total_macs, total_params = profile(model, inputs=inputs, verbose=False)

    # Scale the raw counts: parameters to millions, operations to 10^9.
    params_millions = round(total_params * 1e-6, 6)
    batch_gmacs = round(total_macs * 1e-9, 6)
    batch_gflops = 2 * batch_gmacs

    sample_gmacs = batch_gmacs / n_samples
    sample_gflops = batch_gflops / n_samples

    return sample_gmacs, batch_gmacs, sample_gflops, batch_gflops, params_millions

## 3. Define Model, device and inputs

In [3]:
NUM_CLASSES = 80

model_type = "m"
# NOTE(review): the attribute is set on the object looked up from YoloV8I_CONFIGS,
# so the registry entry itself is presumably modified - confirm that is intended.
model_config = YoloV8I_CONFIGS[model_type]
model_config.num_classes = NUM_CLASSES

model = YoloV8I(model_config)

# Use the GPU when one is present; otherwise fall back to the CPU so the
# notebook still runs end to end instead of failing on `.to(device)`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# One random dummy batch shared by every profiling cell. It is wrapped in a list
# because the helper functions unpack it as model(*inputs).
# Shape is presumably (batch, channels, height, width) - confirm.
inputs = [torch.randn(16, 3, 480, 640).to(device)]

## 4. Perform Profiling

In [4]:
# Memory footprint (parameters + buffers) of the custom YoloV8I model, in MB.
model_size = get_model_size(model)
print(f"Model Size in MB: {model_size}")

Model Size in MB: 77.594


In [5]:
# Profile the custom model on the shared dummy batch. MAC/FLOP figures are in
# units of 10^9 and the parameter count is in millions (see the print labels).
# Side effect: get_model_metrics moves `model` to `device` and sets eval mode.
M_MACs, M_MACs_batch, M_FLOPs, M_FLOPs_batch, M_params = get_model_metrics(model, inputs, device)
print(f"MACs per sample (10^9): {M_MACs}")
print(f"FLOPs per sample (10^9): {M_FLOPs}")
print(f"MACs per batch (10^9): {M_MACs_batch}")
print(f"FLOPs per batch (10^9): {M_FLOPs_batch}")
print(f"Number of Parameters (Million): {M_params}")

MACs per sample (10^9): 24.3368815
FLOPs per sample (10^9): 48.673763
MACs per batch (10^9): 389.390104
FLOPs per batch (10^9): 778.780208
Number of Parameters (Million): 20.280884


In [6]:
# Average latency of the custom model over 10 timed forward passes of the same
# batch, with untimed GPU warm-up passes first.
avg_batch_inference_time, avg_sample_inference_time = get_inference_time(
    model=model,
    inputs=inputs,
    device=device,
    n_iters=10,
    gpu_warmup=True
)
print(f"Sample Inference Time (ms): {avg_sample_inference_time}")
print(f"Batch Inference Time (ms): {avg_batch_inference_time}")

Performing GPU warm up with 3 iterations...
GPU warmup complete!
Sample Inference Time (ms): 5.876
Batch Inference Time (ms): 94.009


In [8]:
# Snapshot the custom model's metrics as a one-row frame.
# NOTE: model_size, M_* and avg_* are reassigned further down for the other
# models, so this cell must run right after the custom model's profiling cells
# and before the next model is profiled, or it records the wrong model's numbers.
model_df = pd.DataFrame({
    "Model" : ["YOLOv8I-m"],
    "Model_size (MB)" : [model_size],
    "Num_Parameters (Million)": [M_params],
    "Sample MACs (10^9)" : [M_MACs],
    "Sample FLOPs (10^9)" : [M_FLOPs],
    "Batch MACs (10^9)" : [M_MACs_batch],
    "Batch FLOPs (10^9)" : [M_FLOPs_batch],
    "Sample Latency (ms)": [avg_sample_inference_time],
    "Batch Latency (ms)": [avg_batch_inference_time]
})

## 5. Compare with Standard YOLOv8 and YOLOv12 

In [9]:
from ultralytics import YOLO

Creating new Ultralytics Settings v0.0.6 file  
View Ultralytics Settings with 'yolo settings' or at 'C:\Users\dheer\AppData\Roaming\Ultralytics\settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.


### YOLOv8

In [10]:
# Load the "yolov8m.pt" checkpoint through Ultralytics and keep only its `.model`
# attribute (presumably the underlying nn.Module, since the helper functions
# call .parameters() / .to() on it - confirm).
yolov8 = YOLO("yolov8m.pt").model

In [11]:
# Memory footprint (parameters + buffers) of YOLOv8-m, in MB.
# NOTE: this overwrites `model_size` from the previously profiled model.
model_size = get_model_size(yolov8)
print(f"Model Size in MB: {model_size}")

Model Size in MB: 98.938


In [12]:
# Profile YOLOv8-m on the same dummy batch as the custom model (overwrites the
# M_* variables). MAC/FLOP figures are in units of 10^9, parameters in millions.
# Side effect: get_model_metrics moves `yolov8` to `device` and sets eval mode.
M_MACs, M_MACs_batch, M_FLOPs, M_FLOPs_batch, M_params = get_model_metrics(yolov8, inputs, device)
print(f"MACs per sample (10^9): {M_MACs}")
print(f"FLOPs per sample (10^9): {M_FLOPs}")
print(f"MACs per batch (10^9): {M_MACs_batch}")
print(f"FLOPs per batch (10^9): {M_FLOPs_batch}")
print(f"Number of Parameters (Million): {M_params}")

MACs per sample (10^9): 29.745158375
FLOPs per sample (10^9): 59.49031675
MACs per batch (10^9): 475.922534
FLOPs per batch (10^9): 951.845068
Number of Parameters (Million): 25.90264


In [13]:
# Average latency of YOLOv8-m with the same settings used for the custom model
# (10 timed passes, GPU warm-up enabled). Overwrites the avg_* variables.
avg_batch_inference_time, avg_sample_inference_time = get_inference_time(
    model=yolov8,
    inputs=inputs,
    device=device,
    n_iters=10,
    gpu_warmup=True
)
print(f"Sample Inference Time (ms): {avg_sample_inference_time}")
print(f"Batch Inference Time (ms): {avg_batch_inference_time}")

Performing GPU warm up with 3 iterations...
GPU warmup complete!
Sample Inference Time (ms): 6.425
Batch Inference Time (ms): 102.805


In [14]:
# Snapshot the YOLOv8-m metrics as a one-row frame with the same columns as model_df.
# NOTE: reads the shared model_size / M_* / avg_* variables, so it must run
# after the YOLOv8 profiling cells and before the next model is profiled.
yolov8_df = pd.DataFrame({
    "Model" : ["YOLOv8-m"],
    "Model_size (MB)" : [model_size],
    "Num_Parameters (Million)": [M_params],
    "Sample MACs (10^9)" : [M_MACs],
    "Sample FLOPs (10^9)" : [M_FLOPs],
    "Batch MACs (10^9)" : [M_MACs_batch],
    "Batch FLOPs (10^9)" : [M_FLOPs_batch],
    "Sample Latency (ms)": [avg_sample_inference_time],
    "Batch Latency (ms)": [avg_batch_inference_time]
})

### YOLOv12

In [15]:
# Load the "yolo12m.pt" checkpoint through Ultralytics (the recorded output shows
# it being downloaded when not present locally) and keep only its `.model` attribute.
yolov12 = YOLO("yolo12m.pt").model

[KDownloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolo12m.pt to 'yolo12m.pt': 100% ━━━━━━━━━━━━ 39.0/39.0MB 31.0MB/s 1.3s0.0s


In [16]:
# Memory footprint (parameters + buffers) of YOLOv12-m, in MB.
# NOTE: this overwrites `model_size` from the previously profiled model.
model_size = get_model_size(yolov12)
print(f"Model Size in MB: {model_size}")

Model Size in MB: 77.311


In [18]:
# Profile YOLOv12-m on the same dummy batch as the other models (overwrites the
# M_* variables). MAC/FLOP figures are in units of 10^9, parameters in millions.
# Side effect: get_model_metrics moves `yolov12` to `device` and sets eval mode.
M_MACs, M_MACs_batch, M_FLOPs, M_FLOPs_batch, M_params = get_model_metrics(yolov12, inputs, device)
print(f"MACs per sample (10^9): {M_MACs}")
print(f"FLOPs per sample (10^9): {M_FLOPs}")
print(f"MACs per batch (10^9): {M_MACs_batch}")
print(f"FLOPs per batch (10^9): {M_FLOPs_batch}")
print(f"Number of Parameters (Million): {M_params}")

MACs per sample (10^9): 25.529721625
FLOPs per sample (10^9): 51.05944325
MACs per batch (10^9): 408.475546
FLOPs per batch (10^9): 816.951092
Number of Parameters (Million): 20.201216


In [20]:
# Average latency of YOLOv12-m with the same settings used for the other models
# (10 timed passes, GPU warm-up enabled). Overwrites the avg_* variables.
avg_batch_inference_time, avg_sample_inference_time = get_inference_time(
    model=yolov12,
    inputs=inputs,
    device=device,
    n_iters=10,
    gpu_warmup=True
)
print(f"Sample Inference Time (ms): {avg_sample_inference_time}")
print(f"Batch Inference Time (ms): {avg_batch_inference_time}")

Performing GPU warm up with 3 iterations...
GPU warmup complete!
Sample Inference Time (ms): 8.345
Batch Inference Time (ms): 133.523


In [21]:
# Snapshot the YOLOv12-m metrics as a one-row frame with the same columns as the
# other two frames.
# NOTE: reads the shared model_size / M_* / avg_* variables, so it must run
# directly after the YOLOv12 profiling cells.
yolov12_df = pd.DataFrame({
    "Model" : ["YOLOv12-m"],
    "Model_size (MB)" : [model_size],
    "Num_Parameters (Million)": [M_params],
    "Sample MACs (10^9)" : [M_MACs],
    "Sample FLOPs (10^9)" : [M_FLOPs],
    "Batch MACs (10^9)" : [M_MACs_batch],
    "Batch FLOPs (10^9)" : [M_FLOPs_batch],
    "Sample Latency (ms)": [avg_sample_inference_time],
    "Batch Latency (ms)": [avg_batch_inference_time]
})

## 6. Comparison Results

In [26]:
# Stack the three one-row frames into a single comparison table;
# ignore_index=True renumbers the rows 0..2 instead of keeping three 0 indices.
comparison_df = pd.concat([model_df, yolov8_df, yolov12_df], ignore_index=True)
comparison_df

Unnamed: 0,Model,Model_size (MB),Num_Parameters (Million),Sample MACs (10^9),Sample FLOPs (10^9),Batch MACs (10^9),Batch FLOPs (10^9),Sample Latency (ms),Batch Latency (ms)
0,YOLOv8I-m,77.594,20.280884,24.336882,48.673763,389.390104,778.780208,5.876,94.009
1,YOLOv8-m,98.938,25.90264,29.745158,59.490317,475.922534,951.845068,6.425,102.805
2,YOLOv12-m,77.311,20.201216,25.529722,51.059443,408.475546,816.951092,8.345,133.523
