# benchmark

> A module to benchmark Pytorch model according to: size, speed, compute and energy.

In [None]:
#| default_exp benchmark

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
#| export
import torch
import time
from codecarbon import OfflineEmissionsTracker
import numpy as np
import os
from thop import profile, clever_format
from tqdm.notebook import tqdm
from prettytable import PrettyTable
from torchprofile import profile_macs

## Size

In [None]:
#| export
def get_model_size(model, temp_path="temp_model.pth"):
    torch.save(model.state_dict(), temp_path)
    model_size = os.path.getsize(temp_path)
    os.remove(temp_path)
    
    return model_size

In [None]:
#| export
def get_num_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

## Speed

In [None]:
#| export
@torch.inference_mode()
def evaluate_gpu_speed(model, dummy_input, warmup_rounds=50, test_rounds=100):
    device = torch.device("cuda")
    model.eval()
    model.to(device)
    dummy_input = dummy_input.to(device)
    
    starter, ender = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)
    latencies = []

    # Warm up GPU
    for _ in range(warmup_rounds):
        _ = model(dummy_input)
    
    # Measure Latency
    for _ in range(test_rounds):
        starter.record()
        _ = model(dummy_input)
        ender.record()
        torch.cuda.synchronize()
        latencies.append(starter.elapsed_time(ender))  # time in milliseconds
    
    latencies = np.array(latencies)
    mean_latency = np.mean(latencies)
    std_latency = np.std(latencies)

    # Measure Throughput
    throughput = dummy_input.size(0) * 1000 / mean_latency  # Inferences per second

    return mean_latency, std_latency, throughput

In [None]:
#| export
@torch.inference_mode()
def evaluate_cpu_speed(model, dummy_input, warmup_rounds=50, test_rounds=100):
    device = torch.device("cpu")
    model.eval()
    model.to(device)
    dummy_input = dummy_input.to(device)
    
    # Warm up CPU
    for _ in range(warmup_rounds):
        _ = model(dummy_input)
    
    # Measure Latency
    latencies = []
    for _ in range(test_rounds):
        start_time = time.perf_counter()
        _ = model(dummy_input)
        end_time = time.perf_counter()
        latencies.append(end_time - start_time)
    
    latencies = np.array(latencies) * 1000  # Convert to milliseconds
    mean_latency = np.mean(latencies)
    std_latency = np.std(latencies)

    # Measure Throughput
    throughput = dummy_input.size(0) * 1000 / mean_latency  # Inferences per second

    return mean_latency, std_latency, throughput

## Compute

In [None]:
#| export
@torch.inference_mode()
def get_model_macs(model, inputs) -> int:
    return profile_macs(model, inputs)

In [None]:
#| export
@torch.inference_mode()
def evaluate_gpu_memory_usage(model, dummy_input, warmup_rounds=10, test_rounds=100):
    device = torch.device("cuda")
    model.eval()
    model.to(device)
    dummy_input = dummy_input.to(device)
    
    # Warm up GPU
    for _ in range(warmup_rounds):
        _ = model(dummy_input)
    
    # Measure Memory Usage
    memory_usages = []
    for _ in range(test_rounds):
        torch.cuda.reset_peak_memory_stats(device)
        _ = model(dummy_input)
        torch.cuda.synchronize()
        memory_usages.append(torch.cuda.memory_allocated(device))
    
    memory_usages = np.array(memory_usages)
    average_memory_usage = np.mean(memory_usages)
    peak_memory_usage = torch.cuda.max_memory_allocated(device)
    
    return average_memory_usage, peak_memory_usage

## Energy

In [None]:
#| export
@torch.inference_mode()
def evaluate_emissions(model, dummy_input, warmup_rounds=50, test_rounds=100):
    device = torch.device("cuda")
    model.eval()
    model.to(device)
    dummy_input = dummy_input.to(device)

    # Warm up GPU
    for _ in range(warmup_rounds):
        _ = model(dummy_input)
    
    # Measure Latency
    tracker = OfflineEmissionsTracker(country_iso_code="USA")
    tracker.start()
    for _ in range(test_rounds):
        _ = model(dummy_input)
    tracker.stop()
    total_emissions = tracker.final_emissions
    total_energy_consumed = tracker.final_emissions_data.energy_consumed
    
    # Calculate average emissions and energy consumption per inference
    average_emissions_per_inference = total_emissions / test_rounds
    average_energy_per_inference = total_energy_consumed / test_rounds
    
    return average_emissions_per_inference, average_energy_per_inference

In [None]:
#| export
@torch.inference_mode()
def benchmark(model, dummy_input):
    # Model Size
    disk_size = get_model_size(model)
    #num_parameters = get_num_parameters(model)
    
    # GPU Speed
    gpu_latency, gpu_std_latency, gpu_throughput = evaluate_gpu_speed(model, dummy_input)
    
    # CPU Speed
    cpu_latency, cpu_std_latency, cpu_throughput = evaluate_cpu_speed(model, dummy_input)
    
    # Model MACs
    #macs = get_model_macs(model, dummy_input)
    macs, params = profile(model, inputs=(dummy_input, ))
    macs, num_parameters = clever_format([macs, params], "%.3f")
    
    # GPU Memory Usage
    avg_gpu_memory, peak_gpu_memory = evaluate_gpu_memory_usage(model, dummy_input)
    
    # Emissions
    avg_emissions, avg_energy = evaluate_emissions(model, dummy_input)
    
    # Print results
    print(f"Model Size: {disk_size / 1e6:.2f} MB (disk), {num_parameters} parameters")
    print(f"GPU Latency: {gpu_latency:.3f} ms (± {gpu_std_latency:.3f} ms)")
    print(f"GPU Throughput: {gpu_throughput:.2f} inferences/sec")
    print(f"CPU Latency: {cpu_latency:.3f} ms (± {cpu_std_latency:.3f} ms)")
    print(f"CPU Throughput: {cpu_throughput:.2f} inferences/sec")
    print(f"Model MACs: {macs}")
    print(f"Average GPU Memory Usage: {avg_gpu_memory / 1e6:.2f} MB")
    print(f"Peak GPU Memory Usage: {peak_gpu_memory / 1e6:.2f} MB")
    print(f"Average Carbon Emissions per Inference: {avg_emissions*1e3:.6f} gCO2e")
    print(f"Average Energy Consumption per Inference: {avg_energy*1e3:.6f} Wh")

    return {

        'disk_size': disk_size,
        'num_parameters': num_parameters, 
        'gpu_latency': gpu_latency, 
        'gpu_throughput': gpu_throughput,
        'cpu_latency': cpu_latency,
        'cpu_throughput': cpu_throughput,
        'macs': macs, 
        'avg_gpu_memory': avg_gpu_memory, 
        'peak_gpu_memory': peak_gpu_memory,
        'avg_emissions': avg_emissions, 
        'avg_energy': avg_energy
        
    }

In [None]:
#|eval: false
from torchvision.models import resnet18

model = resnet18()
dummy_input = torch.randn(64, 3, 224, 224)

In [None]:
#|eval: false
benchmark(model, dummy_input)

[INFO] Register count_convNd() for <class 'torch.nn.modules.conv.Conv2d'>.
[INFO] Register count_normalization() for <class 'torch.nn.modules.batchnorm.BatchNorm2d'>.
[INFO] Register zero_ops() for <class 'torch.nn.modules.activation.ReLU'>.
[INFO] Register zero_ops() for <class 'torch.nn.modules.pooling.MaxPool2d'>.
[INFO] Register zero_ops() for <class 'torch.nn.modules.container.Sequential'>.
[INFO] Register count_adap_avgpool() for <class 'torch.nn.modules.pooling.AdaptiveAvgPool2d'>.
[INFO] Register count_linear() for <class 'torch.nn.modules.linear.Linear'>.


[codecarbon INFO @ 13:19:30] offline tracker init
[codecarbon INFO @ 13:19:30] [setup] RAM Tracking...
[codecarbon INFO @ 13:19:30] [setup] GPU Tracking...
[codecarbon INFO @ 13:19:30] Tracking Nvidia GPU via pynvml
[codecarbon INFO @ 13:19:30] [setup] CPU Tracking...
[codecarbon INFO @ 13:19:32] CPU Model on constant consumption mode: 12th Gen Intel(R) Core(TM) i9-12900K
[codecarbon INFO @ 13:19:32] >>> Tracker's metadata:
[codecarbon INFO @ 13:19:32]   Platform system: Linux-5.15.0-113-generic-x86_64-with-glibc2.31
[codecarbon INFO @ 13:19:32]   Python version: 3.9.0
[codecarbon INFO @ 13:19:32]   CodeCarbon version: 2.3.4
[codecarbon INFO @ 13:19:32]   Available RAM : 125.578 GB
[codecarbon INFO @ 13:19:32]   CPU count: 24
[codecarbon INFO @ 13:19:32]   CPU model: 12th Gen Intel(R) Core(TM) i9-12900K
[codecarbon INFO @ 13:19:32]   GPU count: 1
[codecarbon INFO @ 13:19:32]   GPU model: 1 x NVIDIA GeForce RTX 3090
[codecarbon INFO @ 13:19:33] Energy consumed for RAM : 0.000016 kWh. RA

Model Size: 46.84 MB (disk), 11.690M parameters
GPU Latency: 13.110 ms (± 0.022 ms)
GPU Throughput: 4881.84 inferences/sec
CPU Latency: 475.591 ms (± 6.319 ms)
CPU Throughput: 134.57 inferences/sec
Model MACs: 116.738G
Average GPU Memory Usage: 94.18 MB
Peak GPU Memory Usage: 504.97 MB
Average Carbon Emissions per Inference: 0.000526 gCO2e
Average Energy Consumption per Inference: 0.001386 Wh


{'disk_size': 46835512,
 'num_parameters': '11.690M',
 'gpu_latency': 13.109815979003907,
 'gpu_throughput': 4881.838166340362,
 'cpu_latency': 475.5907801212743,
 'cpu_throughput': 134.56947164467778,
 'macs': '116.738G',
 'avg_gpu_memory': 94181376.0,
 'peak_gpu_memory': 504967168,
 'avg_emissions': 5.256446662000115e-07,
 'avg_energy': 1.385974440225733e-06}

In [None]:
@torch.inference_mode()
def evaluate(model, dataloader, device=None, verbose=True):
    if device is None: device = torch.device("cuda")
    model.eval()
    model.to(device)

    with torch.no_grad():
        correct = 0
        total = 0
        local_acc = []
        loader = tqdm(dataloader.valid, desc="valid", leave=False)
        for i, data in enumerate(loader):
            images, labels = data
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0) - 1
            correct += ((predicted.as_subclass(torch.Tensor) == labels.as_subclass(torch.Tensor)).sum().item())

        acc = 100 * correct / total
        if verbose:
            print(f"Valid Accuracy: {acc:.2f} %")
        return acc

In [None]:
#| export
def compute_model_metrics(model, dls, dummy_input):
    metrics = {}
    metrics['accuracy'] = round(evaluate(model, dls, device='cpu'), 2)
    metrics['latency'] = round(evaluate_cpu_speed(model.to("cpu"), dummy_input=dummy_input)[0] * 1000, 1)
    metrics['size'] = get_model_size(model)
    try:
        metrics['params'] = round(get_num_parameters(model) / 1e6, 2)
    except RuntimeError:
        metrics['params'] = "*"
    try:
        metrics['mac'] = round(get_model_macs(model, dummy_input) / 1e6)
    except (AttributeError, RuntimeError):
        metrics['mac'] = "*"
    return metrics

In [None]:
#| export
def compare_models(model_list, dls):

    metrics_keys = ["latency", "accuracy", "params", "size", "mac"]
    metrics_names = {
        "latency": "Latency (ms/sample)",
        "accuracy": "Accuracy (%)",
        "params": "Params (M)",
        "size": "Size (MiB)",
        "mac": "MACs (M)",
    }
    table_data = {key: [metrics_names[key]] for key in metrics_keys}
    model_names = ["Original Model", "Pruned Model", "Quantized Model"]


    table = PrettyTable()
    table.field_names = [""] + model_names
    table.align = "r"
    table.align[""] = "l"

    dummy_input = next(iter(dls.valid))[0][0][None].to('cpu')

    model_metrics_list = []
    for model in model_list:
        metrics = compute_model_metrics(model, dls, dummy_input)
        model_metrics_list.append(metrics)

    for metrics in model_metrics_list:
        for key in metrics_keys:
            table_data[key].append(metrics.get(key, "*"))

    for key in metrics_keys:
        values = table_data[key]
        original_value = values[1]
        for i in range(2, len(values)):
            current_value = values[i]
            gain_info = ''
            try:
                orig_val = float(original_value)
                curr_val = float(current_value)
                if key == 'accuracy':
                    gain = curr_val - orig_val
                    gain_info = f'({gain:+.2f}%)'
                else:
                    gain = orig_val / curr_val if curr_val != 0 else float('inf')
                    gain_info = f'({gain:.2f}x)' if gain != float('inf') else '(inf)'
            except (ValueError, TypeError):
                gain_info = ''
            if gain_info:
                values[i] = f'{current_value:<8} {gain_info:>8}'
            else:
                values[i] = f'{current_value}'

    for key in metrics_keys:
        table.add_row(table_data[key])

    print(table)

In [None]:
#|eval: false
from fastai.vision.all import *
path = untar_data(URLs.PETS)
files = get_image_files(path/"images")

def label_func(f): return f[0].isupper()

dls = ImageDataLoaders.from_name_func(path, files, label_func, item_tfms=Resize(64))

In [None]:
#|eval: false
learn = vision_learner(dls, resnet18, metrics=accuracy)
learn.unfreeze()

In [None]:
#|eval: false
learn.fit_one_cycle(5)

epoch,train_loss,valid_loss,accuracy,time
0,0.714393,0.421342,0.841678,00:03
1,0.39692,0.254652,0.888363,00:03
2,0.228655,0.230342,0.907307,00:03
3,0.139857,0.181267,0.933018,00:03
4,0.07833,0.166232,0.935724,00:03


In [None]:
#|eval: false
model = deepcopy(learn.model)

In [None]:
#|eval: false
from fasterai.prune.all import *

pr_cb = PruneCallback(sparsity=25, context='local', criteria=large_final, schedule=one_cycle, layer_type=[nn.Conv2d])
learn.fit_one_cycle(3, cbs=pr_cb)

Pruning until a sparsity of [25]%


epoch,train_loss,valid_loss,accuracy,time
0,0.050833,0.203945,0.932341,00:04
1,0.157106,0.257223,0.897835,00:04
2,0.18936,0.263265,0.894452,00:04


Sparsity at the end of epoch 0: [5.2]%
Sparsity at the end of epoch 1: [24.15]%
Sparsity at the end of epoch 2: [25.0]%
Final Sparsity: [25.0]%


In [None]:
#|eval: false
pruned_model = deepcopy(learn.model)

In [None]:
#|eval: false
from fasterai.quantize.all import *

qt = Quantizer()

q_model = qt.quantize(learn.model.to('cpu'), dls)



In [None]:
#|eval: false
compare_models([model, pruned_model, q_model], dls)

valid:   0%|          | 0/24 [00:00<?, ?it/s]

Valid Accuracy: 95.12 %


valid:   0%|          | 0/24 [00:00<?, ?it/s]

Valid Accuracy: 90.92 %


valid:   0%|          | 0/24 [00:00<?, ?it/s]

Valid Accuracy: 89.61 %
+---------------------+----------------+-------------------+-------------------+
|                     | Original Model |      Pruned Model |   Quantized Model |
+---------------------+----------------+-------------------+-------------------+
| Latency (ms/sample) |         3517.7 | 2480.0    (1.42x) | 2063.2    (1.70x) |
| Accuracy (%)        |          95.12 | 90.92    (-4.20%) | 89.61    (-5.51%) |
| Params (M)          |           11.7 | 6.69      (1.75x) |                 * |
| Size (MiB)          |       46912066 | 26829378  (1.75x) | 6827042   (6.87x) |
| MACs (M)            |            149 | 86        (1.73x) |                 * |
+---------------------+----------------+-------------------+-------------------+
