# GPU vs CPU Imputation Benchmark

This notebook benchmarks GPU-accelerated imputation vs CPU imputation.

In [1]:
import torch
import time
import sys
sys.path.insert(0, '/home/student/Hackathon/imputer/src')
from imputer.gpu_imputer import GPUImageImputer

import torch
import time
import sys
sys.path.insert(0, '/home/student/Hackathon/imputer/src')
from imputer.gpu_imputer import GPUImageImputer

In [2]:
# Check availability
print(f"Torch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")

Torch version: 2.9.0+cu128
CUDA available: True
CUDA device: NVIDIA GeForce RTX 3080


## Single Image Benchmark

In [3]:
image_path = "/home/student/Hackathon/imputer/girafe.jpg"

# CPU
cpu_imputer = GPUImageImputer(device="cpu")
start = time.time()
cpu_pred = cpu_imputer.impute_single(image_path)
cpu_time = time.time() - start

# GPU
if torch.cuda.is_available():
    gpu_imputer = GPUImageImputer(device="cuda")
    start = time.time()
    gpu_pred = gpu_imputer.impute_single(image_path)
    gpu_time = time.time() - start
    
    print(f"CPU time: {cpu_time:.4f}s")
    print(f"GPU time: {gpu_time:.4f}s")
    print(f"Speedup: {cpu_time/gpu_time:.2f}x")
else:
    print(f"CPU time: {cpu_time:.4f}s")
    print("GPU not available")

CPU time: 0.0218s
GPU time: 0.1750s
Speedup: 0.12x


## Batch Processing Benchmark

In [4]:
# Simulate batch of 50 images
batch_paths = [image_path] * 50

# CPU
cpu_imputer = GPUImageImputer(device="cpu", batch_size=16)
start = time.time()
cpu_batch = cpu_imputer.impute_batch(batch_paths)
cpu_batch_time = time.time() - start

# GPU
if torch.cuda.is_available():
    gpu_imputer = GPUImageImputer(device="cuda", batch_size=16)
    start = time.time()
    gpu_batch = gpu_imputer.impute_batch(batch_paths)
    gpu_batch_time = time.time() - start
    
    print(f"CPU batch time: {cpu_batch_time:.4f}s ({cpu_batch_time/50*1000:.2f}ms per image)")
    print(f"GPU batch time: {gpu_batch_time:.4f}s ({gpu_batch_time/50*1000:.2f}ms per image)")
    print(f"Speedup: {cpu_batch_time/gpu_batch_time:.2f}x")
else:
    print(f"CPU batch time: {cpu_batch_time:.4f}s ({cpu_batch_time/50*1000:.2f}ms per image)")
    print("GPU not available")

CPU batch time: 0.6033s (12.07ms per image)
GPU batch time: 0.2174s (4.35ms per image)
Speedup: 2.77x


## Masked Tensor Imputation Benchmark

In [5]:
# Large tensor batch
tensor = torch.randn(32, 3, 224, 224)
mask = torch.randint(0, 2, (32, 3, 224, 224))

# CPU
cpu_imputer = GPUImageImputer(device="cpu")
start = time.time()
cpu_imputed = cpu_imputer.impute_masked_tensor(tensor, mask)
cpu_tensor_time = time.time() - start

# GPU
if torch.cuda.is_available():
    gpu_imputer = GPUImageImputer(device="cuda")
    start = time.time()
    gpu_imputed = gpu_imputer.impute_masked_tensor(tensor, mask)
    gpu_tensor_time = time.time() - start
    
    print(f"CPU tensor time: {cpu_tensor_time:.4f}s")
    print(f"GPU tensor time: {gpu_tensor_time:.4f}s")
    print(f"Speedup: {cpu_tensor_time/gpu_tensor_time:.2f}x")
else:
    print(f"CPU tensor time: {cpu_tensor_time:.4f}s")
    print("GPU not available")

CPU tensor time: 0.0188s
GPU tensor time: 0.0157s
Speedup: 1.20x


## Summary

GPU acceleration provides significant speedup for:
- Single image inference
- Batch processing (scales better)
- Large tensor operations

## Baseline Imputer Strategies Benchmark

Test all baseline strategies (mean, median, constant) with GPU acceleration

In [6]:
from torchvision.io import read_image
from imputer.gpu_imputer import GPUBaselineImputer

# Load multiple images as background data
image_path = "/home/student/Hackathon/imputer/girafe.jpg"
data_cpu = torch.stack([read_image(image_path).float() for _ in range(10)])
data_gpu = data_cpu.clone()

# Test image
test_img = read_image(image_path).float().unsqueeze(0)

print("Testing all baseline strategies on GPU vs CPU:\n")

for strategy in ["mean", "median", "constant"]:
    print(f"--- {strategy.upper()} Strategy ---")
    
    # CPU
    cpu_baseline = GPUBaselineImputer(
        model=cpu_imputer.model,
        data=data_cpu,
        device="cpu",
        strategy=strategy,  # type: ignore
        constant_value=128.0
    )
    
    start = time.time()
    coalition = torch.tensor([True, False, True])
    cpu_imputed = cpu_baseline.impute_with_coalition(test_img, coalition)
    cpu_preds = cpu_baseline.predict_batch(cpu_imputed)
    cpu_strategy_time = time.time() - start
    
    # GPU
    if torch.cuda.is_available():
        gpu_baseline = GPUBaselineImputer(
            model=gpu_imputer.model,
            data=data_gpu,
            device="cuda",
            strategy=strategy,  # type: ignore
            constant_value=128.0
        )
        
        start = time.time()
        gpu_imputed = gpu_baseline.impute_with_coalition(test_img, coalition)
        gpu_preds = gpu_baseline.predict_batch(gpu_imputed)
        gpu_strategy_time = time.time() - start
        
        print(f"CPU: {cpu_strategy_time:.4f}s, Top prob: {cpu_preds[0].max():.4f}")
        print(f"GPU: {gpu_strategy_time:.4f}s, Top prob: {gpu_preds[0].max():.4f}")
        print(f"Speedup: {cpu_strategy_time/gpu_strategy_time:.2f}x\n")
    else:
        print(f"CPU: {cpu_strategy_time:.4f}s, Top prob: {cpu_preds[0].max():.4f}")
        print("GPU not available\n")

Testing all baseline strategies on GPU vs CPU:

--- MEAN Strategy ---
CPU: 0.0886s, Top prob: 1.0000
GPU: 0.0171s, Top prob: 1.0000
Speedup: 5.18x

--- MEDIAN Strategy ---
CPU: 0.0852s, Top prob: 1.0000
GPU: 0.0168s, Top prob: 1.0000
Speedup: 5.08x

--- CONSTANT Strategy ---
CPU: 0.0841s, Top prob: 1.0000
GPU: 0.0028s, Top prob: 1.0000
Speedup: 30.09x

CPU: 0.0852s, Top prob: 1.0000
GPU: 0.0168s, Top prob: 1.0000
Speedup: 5.08x

--- CONSTANT Strategy ---
CPU: 0.0841s, Top prob: 1.0000
GPU: 0.0028s, Top prob: 1.0000
Speedup: 30.09x



## Large Dataset Test

Process multiple images with different strategies

In [7]:
# Create dataset of 100 images
large_dataset = torch.stack([read_image(image_path).float() for _ in range(100)])
print(f"Dataset shape: {large_dataset.shape}")

# Background data (20 images)
background = large_dataset[:20]

# Test different batch sizes
batch_sizes = [16, 32, 64]

for bs in batch_sizes:
    print(f"\n=== Batch size: {bs} ===")
    
    if torch.cuda.is_available():
        gpu_bl = GPUBaselineImputer(
            model=gpu_imputer.model,
            data=background,
            device="cuda",
            batch_size=bs,
            strategy="mean"  # type: ignore
        )
        
        start = time.time()
        for i in range(0, len(large_dataset), bs):
            batch = large_dataset[i:i+bs]
            coalition = torch.ones(3, dtype=torch.bool)
            imputed = gpu_bl.impute_with_coalition(batch, coalition)
            preds = gpu_bl.predict_batch(imputed)
        gpu_large_time = time.time() - start
        
        print(f"GPU processed 100 images in {gpu_large_time:.4f}s")
        print(f"Throughput: {100/gpu_large_time:.2f} images/sec")

Dataset shape: torch.Size([100, 3, 614, 1024])

=== Batch size: 16 ===
GPU processed 100 images in 0.3894s
Throughput: 256.77 images/sec

=== Batch size: 32 ===
GPU processed 100 images in 0.3894s
Throughput: 256.77 images/sec

=== Batch size: 32 ===
GPU processed 100 images in 0.3585s
Throughput: 278.93 images/sec

=== Batch size: 64 ===
GPU processed 100 images in 0.3585s
Throughput: 278.93 images/sec

=== Batch size: 64 ===
GPU processed 100 images in 0.3662s
Throughput: 273.09 images/sec
GPU processed 100 images in 0.3662s
Throughput: 273.09 images/sec
