In [1]:
from models import get_model, avail_models, norm_vals
from video_dataset import get_data_loader, get_wlasl_info
from configs import get_avail_splits
import torch
import gc
import torch.nn as nn
import torchvision.models as models
from torch.profiler import profile, ProfilerActivity, record_function



In [2]:
# All tensors and models in this notebook are placed on the CUDA device.
device = torch.device("cuda")
print(f"{device}")

cuda


### Available models

In [3]:
# Architecture names returned by avail_models(); each one is later passed
# to get_model() to build the corresponding network.
avail_m = avail_models()
print(*avail_m, sep=', ')

S3D, R3D_18, R(2+1)D_18, Swin3D_T, Swin3D_S, Swin3D_B, MViTv2_S, MViTv1_B


### Available splits

In [4]:
# Dataset split names returned by get_avail_splits(); the first entry is
# used below to fetch the test-set info.
avail_sp = get_avail_splits()
print(*avail_sp, sep=', ')

asl100, asl300


## Input

In [5]:
# Clip geometry and batch size used for every model below:
#   fs = frame size, nf = number of frames per clip, bs = batch size.
# (An alternative configuration left commented out by the author was
#  fs = 224, nf = 32, bs = 8.)
fs, nf, bs = 224, 16, 2

# Normalisation statistics are taken for the first architecture in avail_m.
# Author's note: normalisation is not expected to matter for this profiling run.
nvals = norm_vals(avail_m[0])

# Test-set description for the first available split.
wlasl_info = get_wlasl_info(avail_sp[0], 'test')  # asl100

# Only the loader and the class count are used; the last two return values
# are discarded.
testloader, ncls, _, _ = get_data_loader(
    set_info=wlasl_info,
    frame_size=fs,
    num_frames=nf,
    batch_size=bs,
    mean=nvals['mean'],
    std=nvals['std'],
)

print(f"Numclasses:  {ncls}")

Numclasses:  100


### Get a single video

In [6]:
# Pull the first batch from the test loader and move the frames and the
# numeric labels onto the target device. Tensor.to() returns the moved
# tensor, so the result has to be bound to a name.
dicty = next(iter(testloader))
vid0 = dicty["frames"].to(device)
target = dicty["label_num"].to(device)

print(vid0.shape)
print(f"vid0 device: {vid0.device}")  # confirm the batch is on the intended device

torch.Size([2, 3, 16, 224, 224])
vid0 device: cuda:0


## Get models

In [7]:
# Number of output classes. Taken from the data loader (ncls) rather than
# hard-coded, so the models stay consistent with whichever split was loaded
# above (ncls is 100 for the asl100 split used here).
nc = ncls
dropout = 0.0  # no dropout

# Build one instance of every available architecture, kept as
# (architecture name, model) tuples.
all_models = []
for arch in avail_m:
    model = get_model(arch, nc, dropout)
    all_models.append((arch, model))



## From PyTorch Docs:

In [8]:
def profile_it(model, inputs, title):
    """Profile a single inference forward pass and print the operator table.

    The model and inputs are moved to CUDA if available, otherwise to XPU if
    available, otherwise they stay on the CPU (the profile is then CPU-only).
    The forward pass runs in eval mode under ``torch.no_grad()`` so that the
    measurement reflects inference and does not update normalisation
    statistics; the model's previous train/eval mode is restored afterwards.

    Args:
        model: ``torch.nn.Module`` to profile. Moved to the chosen device
            in place (``Module.to``).
        inputs: Tensor passed to ``model`` as its only argument.
        title: Label used for the ``record_function`` range, recorded as
            ``"<title>_inference"``.

    Returns:
        None. The averaged profiler table (up to 200 rows, sorted by total
        time on the chosen device) is printed to stdout.
    """
    activities = [ProfilerActivity.CPU]
    # torch.xpu does not exist in every PyTorch build, so look it up defensively.
    xpu = getattr(torch, "xpu", None)
    if torch.cuda.is_available():
        device = "cuda"
        activities.append(ProfilerActivity.CUDA)
    elif xpu is not None and xpu.is_available():
        device = "xpu"
        activities.append(ProfilerActivity.XPU)
    else:
        # Do not call sys.exit() here: inside a notebook that raises
        # SystemExit in the kernel. Fall back to a CPU-only profile instead.
        print(
            "Neither CUDA nor XPU devices are available; profiling on the CPU only"
        )
        device = "cpu"

    sort_by_keyword = device + "_time_total"

    model = model.to(device)
    inputs = inputs.to(device)

    # Remember the current mode so profiling does not leave the model altered.
    was_training = model.training
    model.eval()
    try:
        with profile(activities=activities, record_shapes=True, profile_memory=True) as prof:
            with record_function(f"{title}_inference"):
                with torch.no_grad():
                    model(inputs)
    finally:
        model.train(was_training)

    print(prof.key_averages().table(sort_by=sort_by_keyword, row_limit=200))

In [9]:
# for arch, model in all_models:
#     profile_it(model, vid0, arch)
#     print("\n"*2, "-"*(250-22), "\n"*2)

In [10]:
# Scratch check of a 22-character dash run; presumably the source of the
# "250-22" separator width in the commented-out profiling loop above.
print(len("-" * 22))

22


In [None]:
# Measure GPU memory with nvidia-smi from a background thread WHILE inference is running

import time
import subprocess
import threading

def get_gpu_memory(gpu_index=0):
    """Return the used GPU memory in MiB as reported by ``nvidia-smi``.

    ``nvidia-smi`` prints one line per GPU; ``gpu_index`` selects which line
    to read, so machines with several GPUs no longer yield ``None``.

    Args:
        gpu_index: Position of the GPU in nvidia-smi's output (default 0).

    Returns:
        The ``memory.used`` value as an ``int`` (MiB), or ``None`` when
        nvidia-smi is missing, fails, times out, or its output cannot be
        parsed for the requested GPU.
    """
    try:
        result = subprocess.run(
            ['nvidia-smi', '--query-gpu=memory.used', '--format=csv,nounits,noheader'],
            capture_output=True,
            text=True,
            check=True,
            timeout=10,  # never let a hung nvidia-smi block the caller forever
        )
        readings = [line.strip() for line in result.stdout.splitlines() if line.strip()]
        return int(readings[gpu_index])
    # Only expected failures are turned into None; KeyboardInterrupt and
    # SystemExit propagate, unlike with a bare ``except:``.
    except (OSError, subprocess.SubprocessError, ValueError, IndexError):
        return None

class MemoryMonitor:
    """Sample GPU memory from a background thread while other work runs.

    Each sample is one call to ``get_gpu_memory()``, which launches an
    ``nvidia-smi`` process. ``interval`` is therefore only the pause between
    samples; the effective sampling period is that pause plus however long
    the query itself takes.

    Attributes:
        interval: Seconds to wait between two samples.
        monitoring: True between ``start()`` and ``stop()``.
        peak_memory: Largest reading seen since the last ``start()``
            (0 if no reading succeeded).
        memory_samples: All successful readings since the last ``start()``.
        thread: The worker thread, or None before the first ``start()``.
    """

    def __init__(self, interval=0.001):  # pause of 1 ms between samples
        self.interval = interval
        self.monitoring = False
        self.peak_memory = 0
        self.memory_samples = []
        self.thread = None
        # Signalled by stop(); lets the worker wake up immediately instead of
        # finishing a sleep first.
        self._stop_event = threading.Event()

    def _monitor(self):
        """Worker loop: sample first, then wait, so at least one sample is taken."""
        while True:
            mem = get_gpu_memory()
            if mem is not None:
                self.memory_samples.append(mem)
                self.peak_memory = max(self.peak_memory, mem)
            # wait() returns True as soon as stop() sets the event.
            if self._stop_event.wait(self.interval):
                break

    def start(self):
        """Reset the collected statistics and start sampling in a daemon thread."""
        # A second start() must not orphan a still-running worker.
        if self.thread is not None and self.thread.is_alive():
            self.stop()
        self._stop_event.clear()
        self.monitoring = True
        self.peak_memory = 0
        self.memory_samples = []
        self.thread = threading.Thread(target=self._monitor, daemon=True)
        self.thread.start()

    def stop(self):
        """Stop sampling, wait for the worker to finish, and return the results.

        Returns:
            Tuple ``(peak_memory, memory_samples)``.
        """
        self.monitoring = False
        self._stop_event.set()
        if self.thread:
            self.thread.join()
        return self.peak_memory, self.memory_samples

def _fmt_mib(value, with_gib=False):
    """Render an nvidia-smi reading; 'n/a' when the query failed (None)."""
    if value is None:
        return "n/a"
    text = f"{value} MiB"
    if with_gib:
        text += f" ({value / 1024:.3f} GiB)"
    return text


def _fmt_delta(after, before):
    """Render the signed difference of two readings; 'n/a' if either is missing."""
    if after is None or before is None:
        return "n/a"
    return f"{after - before:+d} MiB"


# Clear GPU memory before starting
gc.collect()
torch.cuda.empty_cache()
time.sleep(0.5)

# get_gpu_memory() returns None when nvidia-smi cannot be queried, so every
# reading below is formatted through the helpers above instead of being used
# in arithmetic directly.
baseline_memory = get_gpu_memory()
print(f"Baseline GPU Memory: {_fmt_mib(baseline_memory, with_gib=True)}\n")

# The input is the same for every model, so move it to the device once.
vid0_gpu = vid0.to(device)
torch.cuda.synchronize()
batch_size = vid0_gpu.shape[0]  # actual number of clips in this batch

for arch, model in all_models:
    # Clear everything first
    gc.collect()
    torch.cuda.empty_cache()
    time.sleep(0.3)

    print(f"\n{'='*80}")
    print(f"Model: {arch}")
    print(f"{'='*80}")

    mem_before = get_gpu_memory()

    # Load model to GPU
    model.to(device)
    model.eval()
    torch.cuda.synchronize()
    time.sleep(0.2)

    mem_after_model = get_gpu_memory()

    # Warm-up run; its output is intentionally not kept.
    with torch.no_grad():
        model(vid0_gpu)
    torch.cuda.synchronize()
    time.sleep(0.2)

    mem_after_warmup = get_gpu_memory()

    # Create memory monitor
    monitor = MemoryMonitor(interval=0.001)

    # nvidia-smi polling can miss short-lived peaks, so PyTorch's own peak
    # counter is reset here and read back after the timed run.
    torch.cuda.reset_peak_memory_stats()

    # Start monitoring and run inference
    monitor.start()

    # perf_counter() is monotonic and high-resolution, unlike time.time().
    start_time = time.perf_counter()
    with torch.no_grad():
        output = model(vid0_gpu)
    torch.cuda.synchronize()  # wait for the GPU before stopping the clock
    end_time = time.perf_counter()

    # Stop monitoring
    peak_memory, samples = monitor.stop()
    if not samples:
        # No successful reading: a peak of 0 would be misleading.
        peak_memory = None

    torch_peak_mib = torch.cuda.max_memory_allocated() / (1024 ** 2)
    elapsed = end_time - start_time

    # Model parameters
    total_params = sum(p.numel() for p in model.parameters())

    print(f"\nInference Time: {elapsed*1000:.2f} ms")
    print(f"Throughput: {batch_size / elapsed:.2f} samples/sec")
    print(f"Memory samples taken: {len(samples)}")

    print("\nGPU Memory Usage (via nvidia-smi):")
    print(f"  Baseline:          {_fmt_mib(baseline_memory, with_gib=True)}")
    print(f"  Before model:      {_fmt_mib(mem_before)}")
    print(f"  After model load:  {_fmt_mib(mem_after_model)}  ({_fmt_delta(mem_after_model, mem_before)})")
    print(f"  After warmup:      {_fmt_mib(mem_after_warmup)}")
    print(f"  PEAK during run:   {_fmt_mib(peak_memory, with_gib=True)}")
    print(f"  Peak over baseline: {_fmt_delta(peak_memory, baseline_memory)}")

    if len(samples) > 0:
        avg_memory = sum(samples) / len(samples)
        print(f"  Average during run: {int(avg_memory)} MiB ({avg_memory/1024:.3f} GiB)")

    print(f"  PyTorch allocator peak: {torch_peak_mib:.1f} MiB")

    print("\nModel Info:")
    print(f"  Total Parameters: {total_params:,}")
    print(f"  Output Shape: {output.shape}")

    # Clean up for next model: drop the output, move the weights back to the
    # CPU, then release cached GPU blocks.
    del output
    model.cpu()
    del model
    gc.collect()
    torch.cuda.empty_cache()
    time.sleep(0.3)

    mem_after_cleanup = get_gpu_memory()
    print(f"  After cleanup:     {_fmt_mib(mem_after_cleanup, with_gib=True)}")

    print(f"{'='*80}\n")

# Recreate model list: the loop above left every model in eval mode on the
# CPU, so fresh instances are built for any later cells.
print("\n" + "="*80)
print("Re-initializing models...")
print("="*80)
all_models = []
for arch in avail_m:
    model = get_model(arch, nc, dropout)
    all_models.append((arch, model))

Baseline GPU Memory: 188 MiB


Model: S3D

Inference Time: 26.47 ms
Throughput: 75.56 samples/sec

GPU Memory Usage (via nvidia-smi):
  Before model load: 188 MiB
  After model load:  224 MiB  (+36 MiB)
  Before inference:  444 MiB
  Peak during run:   444 MiB
  Total used:        444 MiB (0.434 GiB)

Model Info:
  Total Parameters: 8,012,548
  Output Shape: torch.Size([2, 100])
  After cleanup:     210 MiB


Model: R3D_18

Inference Time: 88.19 ms
Throughput: 22.68 samples/sec

GPU Memory Usage (via nvidia-smi):
  Before model load: 210 MiB
  After model load:  348 MiB  (+138 MiB)
  Before inference:  848 MiB
  Peak during run:   946 MiB
  Total used:        946 MiB (0.924 GiB)

Model Info:
  Total Parameters: 33,217,572
  Output Shape: torch.Size([2, 100])
  After cleanup:     314 MiB


Model: R(2+1)D_18

Inference Time: 127.22 ms
Throughput: 15.72 samples/sec

GPU Memory Usage (via nvidia-smi):
  Before model load: 314 MiB
  After model load:  360 MiB  (+46 MiB)
  Before inference: 

