In [1]:
import torch
from torch import nn
from src.utils import load_model, load_quantized_model
from src.model import ResNet, BasicBlock, resnet110
import tracemalloc
import time
import torch.quantization
import psutil
import os
from torch.profiler import profile, ProfilerActivity


In [6]:
# --- Benchmark configuration ------------------------------------------------
# Checkpoint under test; uncomment the second line to point at the quantized
# checkpoint instead.
model_path = "models/resnet110_baseline_120_mps.pth"
#model_path = "models/quantized_resnet110_baseline_120_cpu.pt"

# NOTE(review): batch_size is not referenced in the cells visible here --
# presumably consumed further down the notebook; confirm.
batch_size = 128

# CPU device object handed to load_model().
device = torch.device("cpu")

# Engine used by PyTorch for quantized kernels.
backend = 'qnnpack'
torch.backends.quantized.engine = backend


def test_model_loading():
    """Load the model under test and hand it back.

    Reads the notebook-level ``model_path`` and ``device``. This has to be
    edited by hand depending on whether a quantized or a normal model is
    loaded: for the quantized checkpoint, replace the return statement with
    ``return load_quantized_model(model_path)``.
    """
    return load_model(model_path, device=device)


model = test_model_loading()

Trace memory allocation while loading the model, using tracemalloc

In [7]:
# Re-select the quantized engine so this cell does not depend on the
# configuration cell's assignment still being in effect.
torch.backends.quantized.engine = backend

# Start measuring memory usage before loading.
# NOTE(review): tracemalloc reports memory blocks allocated through Python's
# allocator; tensor storage that PyTorch allocates natively is presumably not
# counted, so read the peak below as Python-side overhead -- confirm.
tracemalloc.start()
try:
    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring short durations (time.time() can jump with wall-clock changes).
    start_time = time.perf_counter()

    model = test_model_loading()

    load_time = time.perf_counter() - start_time
    current, peak = tracemalloc.get_traced_memory()
finally:
    # Always switch tracing off, even if loading raised, so later cells are
    # not slowed down by a tracer that was left running.
    tracemalloc.stop()

print(f"Model load time: {load_time:.4f} seconds")
print(f"Model load memory usage (peak): {peak / 1024 / 1024:.2f} MB")


Model load time: 0.1250 seconds
Model load memory usage (peak): 2.80 MB


Measure the total change in process memory when the model is loaded, using psutil. The value fluctuates and is sometimes inaccurate, depending on the state of the machine (e.g. idle, or the IDE was just started), but it gives a rough estimate.

In [8]:
def get_memory_mb():
    """Return the resident set size (RSS) of the current process in MiB."""
    bytes_per_mib = 1024 * 1024
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss  # in bytes
    return rss_bytes / bytes_per_mib

# Measure baseline memory
print("Measuring baseline memory...")
time.sleep(1)
baseline_mem = get_memory_mb()
print(f"Baseline: {baseline_mem:.2f} MB")

# Load the model
print("Loading model...")
model = test_model_loading()
model.eval()

# Wait a bit to let memory settle
time.sleep(1)
post_load_mem = get_memory_mb()
print(f"After model load: {post_load_mem:.2f} MB")

# Calculate delta
model_static_mem = post_load_mem - baseline_mem
print(f"Static memory used by model (just sitting in RAM): {model_static_mem:.2f} MB")


Measuring baseline memory...
Baseline: 256.59 MB
Loading model...
After model load: 263.02 MB
Static memory used by model (just sitting in RAM): 6.42 MB


CPU time and memory allocation during a single forward pass (after a warm-up), measured with torch.profiler

In [9]:
# Put the model in evaluation mode and build one random input
# (a single 3x32x32 sample).
model.eval()
example_inputs = torch.randn(1, 3, 32, 32)

# Warm-up: five unprofiled forward passes with gradient tracking disabled.
with torch.no_grad():
    for _ in range(5):
        _ = model(example_inputs)

# Profiling: record CPU activity, memory, input shapes and FLOP estimates
# for exactly one forward pass.
profiler_kwargs = dict(
    activities=[ProfilerActivity.CPU],
    profile_memory=True,
    record_shapes=True,
    with_stack=False,
    with_flops=True,
)
with profile(**profiler_kwargs) as prof, torch.no_grad():
    # The result stays bound to `_` so the output tensor is not released
    # inside the profiled region.
    _ = model(example_inputs)

# Per-operator averages, ordered by self CPU memory usage.
summary_table = prof.key_averages().table(sort_by="self_cpu_memory_usage")
print(summary_table)

--------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                            Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg       CPU Mem  Self CPU Mem    # of Calls  Total KFLOPs  
--------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                     aten::empty         4.01%     309.908us         4.01%     309.908us       0.349us      33.18 Mb      33.18 Mb           888            --  
                   aten::resize_         0.75%      57.836us         0.75%      57.836us       0.521us       2.53 Mb       2.53 Mb           111            --  
                       aten::add         2.07%     159.702us         2.07%     159.702us       2.957us       1.97 Mb       1.97 Mb            54       516.096  
                aten::empty_like  