In [None]:
from quewing import gpu_manager
from models import get_model, avail_models
import torch
import gc
import torch.nn as nn



In [2]:
# All measurements below run against the default CUDA device.
device = torch.device("cuda")

In [6]:
# Snapshot of the torch CUDA allocator counters for `device`, converted from
# bytes to GiB (1024**3 bytes) and printed one counter per line.
print("\n".join(
    f"{label}: {query(device) / 1024**3:.3f} GB"
    for label, query in (
        ("Allocated", torch.cuda.memory_allocated),
        ("Reserved", torch.cuda.memory_reserved),
        ("Max allocated", torch.cuda.max_memory_allocated),
    )
))


Allocated: 0.000 GB
Reserved: 0.000 GB
Max allocated: 0.127 GB


In [4]:
# List every architecture name reported by avail_models(), comma-separated,
# under a heading line.
print("avail_models: ", ", ".join(avail_models()), sep="\n")

avail_models: 
S3D, R3D_18, R(2+1)D_18, Swin3D_T, Swin3D_S, Swin3D_B, MViTv2_S, MViTv1_B


In [8]:
def measure_model_memory(model_name, num_classes, dropout, device, batch_size=4, input_shape=(3, 16, 112, 112)):
    """
    Measure GPU memory usage of one model over an inference pass and one training step.

    Every reading is the first element of ``gpu_manager.get_gpu_memory_usage()``.
    NOTE(review): the caller prints these values as GB, so the helper is assumed
    to report GB for the GPU in use -- confirm against ``gpu_manager``.
    Readings are taken *after* each stage completes, so the ``*_peak`` entries
    are post-stage snapshots, not true high-water marks reached inside a stage.

    Args:
        model_name: Architecture name, forwarded to ``get_model``
        num_classes: Number of output classes
        dropout: Dropout rate, forwarded to ``get_model``
        device: torch.device the model and dummy data are placed on
        batch_size: Batch size for testing
        input_shape: (channels, frames, height, width) for video models

    Returns:
        dict with the following keys (all but ``baseline`` are differences
        between two readings):
            baseline                 reading before the model is created
            model_only               after moving the model to ``device`` - baseline
            inference_peak           after the no-grad forward pass - baseline
            inference_activations    after the no-grad forward pass - model loaded
            training_forward         after the training forward pass - baseline
            training_peak            after ``loss.backward()`` - baseline
            training_gradients       after backward - after training forward
            training_with_optimizer  after ``optimizer.step()`` - baseline
            optimizer_states         after optimizer step - after backward
            after_cleanup            after releasing everything - baseline

    Raises:
        Whatever ``get_model``, the model's forward/backward pass or the
        optimizer raises. GPU-holding objects are released and the CUDA cache
        is emptied before the exception propagates.
    """
    results = {}

    # Pre-bind every local that can hold GPU memory so the ``finally`` block
    # below can drop all of them regardless of where a failure happens.
    model = dummy_input = dummy_target = None
    output = loss = optimizer = criterion = None

    # Baseline. Collect garbage *before* emptying the cache: blocks freed by the
    # collector only go back to the driver if empty_cache() runs afterwards.
    gc.collect()
    torch.cuda.empty_cache()
    baseline = gpu_manager.get_gpu_memory_usage()[0]
    results['baseline'] = baseline

    try:
        # Load model
        model = get_model(model_name, num_classes, dropout)
        model.to(device)
        torch.cuda.synchronize()

        model_loaded = gpu_manager.get_gpu_memory_usage()[0]
        results['model_only'] = model_loaded - baseline

        # Create dummy input directly on the target device (no host-side copy)
        dummy_input = torch.randn(batch_size, *input_shape, device=device)
        dummy_target = torch.randint(0, num_classes, (batch_size,), device=device)

        # INFERENCE
        model.eval()
        with torch.no_grad():
            output = model(dummy_input)
            torch.cuda.synchronize()
            inference_mem = gpu_manager.get_gpu_memory_usage()[0]
            results['inference_peak'] = inference_mem - baseline
            results['inference_activations'] = inference_mem - model_loaded

        # Drop the inference output so it does not count towards training readings
        output = None
        torch.cuda.empty_cache()

        # TRAINING
        model.train()
        criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

        # Forward pass
        output = model(dummy_input)
        torch.cuda.synchronize()
        forward_mem = gpu_manager.get_gpu_memory_usage()[0]
        results['training_forward'] = forward_mem - baseline

        # Backward pass
        loss = criterion(output, dummy_target)
        loss.backward()
        torch.cuda.synchronize()
        backward_mem = gpu_manager.get_gpu_memory_usage()[0]
        results['training_peak'] = backward_mem - baseline
        results['training_gradients'] = backward_mem - forward_mem

        # Optimizer step (includes optimizer states)
        optimizer.step()
        torch.cuda.synchronize()
        optimizer_mem = gpu_manager.get_gpu_memory_usage()[0]
        results['training_with_optimizer'] = optimizer_mem - baseline
        results['optimizer_states'] = optimizer_mem - backward_mem
    finally:
        # Cleanup, on success *and* on failure. Rebinding to None (rather than
        # `del`) also clears the references held by this frame, which a caller's
        # traceback would otherwise keep alive together with the tensors.
        model = dummy_input = dummy_target = None
        output = loss = optimizer = criterion = None
        gc.collect()
        torch.cuda.empty_cache()

    torch.cuda.synchronize()

    final_mem = gpu_manager.get_gpu_memory_usage()[0]
    results['after_cleanup'] = final_mem - baseline

    return results

In [9]:
# Compare all models
# Benchmark configuration shared by every architecture. These are passed to
# measure_model_memory explicitly so the printed header can never drift from
# what is actually measured.
NUM_CLASSES = 100
DROPOUT = 0.5
BATCH_SIZE = 4
INPUT_SHAPE = (3, 16, 112, 112)
# NOTE(review): the saved output of this cell shows S3D and both MViT variants
# rejecting this input shape ("smaller than kernel size" / "shape ... is
# invalid"); they presumably need a larger spatial size -- confirm and adjust.


def _memory_ratio(results):
    """Return training_peak / inference_peak, or NaN when the inference delta is not positive.

    A zero inference delta would raise ZeroDivisionError and a negative one
    yields a meaningless ratio; NaN keeps the report printable in both cases.
    """
    inference_peak = results['inference_peak']
    if inference_peak <= 0:
        return float('nan')
    return results['training_peak'] / inference_peak


avm = avail_models()
device = torch.device('cuda')
print("CUDA available:", torch.cuda.is_available())
print(f"\nBatch size: {BATCH_SIZE}, Input shape: {INPUT_SHAPE}\n")

all_results = {}
for model_name in avm:
    print(f"{'='*60}")
    print(f"Architecture: {model_name}")
    print(f"{'='*60}")

    try:
        results = measure_model_memory(
            model_name, NUM_CLASSES, DROPOUT, device,
            batch_size=BATCH_SIZE, input_shape=INPUT_SHAPE,
        )
        all_results[model_name] = results

        print(f"Model weights only:        {results['model_only']:.3f} GB")
        print("\nINFERENCE:")
        print(f"  Peak memory:             {results['inference_peak']:.3f} GB")
        print(f"  Activations:             {results['inference_activations']:.3f} GB")
        print("\nTRAINING:")
        print(f"  After forward pass:      {results['training_forward']:.3f} GB")
        print(f"  After backward (+ grad): {results['training_peak']:.3f} GB")
        print(f"  Gradient memory:         {results['training_gradients']:.3f} GB")
        print(f"  With optimizer states:   {results['training_with_optimizer']:.3f} GB")
        print(f"  Optimizer state memory:  {results['optimizer_states']:.3f} GB")
        print(f"\nMemory after cleanup:      {results['after_cleanup']:.3f} GB")
        print(f"\nRatio (training/inference): {_memory_ratio(results):.2f}x")

    except Exception as e:
        # Best effort: report the failure and move on to the next architecture.
        print(f"Error with {model_name}: {e}")

    # The handled exception (and the traceback that kept a failed model's
    # tensors reachable) is gone once the except block ends, so the memory can
    # now really be handed back before the next model takes its baseline.
    gc.collect()
    torch.cuda.empty_cache()

    print()

# Summary comparison
print(f"\n{'='*60}")
print("SUMMARY COMPARISON")
print(f"{'='*60}")
print(f"{'Model':<20} {'Weights':<10} {'Inference':<12} {'Training':<12} {'Ratio':<8}")
print(f"{'-'*60}")
if not all_results:
    # Make an empty table self-explanatory instead of silently printing nothing.
    print("(no model completed successfully)")
for model_name, results in all_results.items():
    ratio = _memory_ratio(results)
    print(f"{model_name:<20} {results['model_only']:<10.3f} {results['inference_peak']:<12.3f} {results['training_peak']:<12.3f} {ratio:<8.2f}x")

CUDA available: True

Batch size: 4, Input shape: (3, 16, 112, 112)

Architecture: S3D
Error with S3D: input image (T: 2 H: 3 W: 3) smaller than kernel size (kT: 2 kH: 7 kW: 7)

Architecture: R3D_18
Error with R3D_18: name 'nn' is not defined

Architecture: R(2+1)D_18
Error with R(2+1)D_18: name 'nn' is not defined

Architecture: Swin3D_T
Error with Swin3D_T: name 'nn' is not defined

Architecture: Swin3D_S
Error with Swin3D_S: name 'nn' is not defined

Architecture: Swin3D_B
Error with Swin3D_B: name 'nn' is not defined

Architecture: MViTv2_S




Error with MViTv2_S: shape '[4, 96, 8, 56, 56]' is invalid for input of size 2408448

Architecture: MViTv1_B




Error with MViTv1_B: shape '[4, 96, 8, 56, 56]' is invalid for input of size 2408448


SUMMARY COMPARISON
Model                Weights    Inference    Training     Ratio   
------------------------------------------------------------
