In [1]:
# Imports
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
import os
import sys

# Check if pytorch-cifar directory exists
if not os.path.exists('pytorch-cifar'):
    print("Cloning pytorch-cifar repository...")
    !git clone https://github.com/kuangliu/pytorch-cifar.git
else:
    print("✅ pytorch-cifar directory already exists, skipping clone")

# Copy resnet.py if needed
if not os.path.exists('resnet.py') and os.path.exists('pytorch-cifar/models/resnet.py'):
    !cp pytorch-cifar/models/resnet.py .
    print("✅ Copied resnet.py")

# Import ResNet model
sys.path.append('./pytorch-cifar')
from models import resnet

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"Number of GPUs: {torch.cuda.device_count()}")
for i in range(torch.cuda.device_count()):
    print(f"GPU {i}: {torch.cuda.get_device_name(i)}")


✅ pytorch-cifar directory already exists, skipping clone
PyTorch version: 2.8.0+cu128
CUDA available: True
Number of GPUs: 4
GPU 0: NVIDIA L4
GPU 1: NVIDIA L4
GPU 2: NVIDIA L4
GPU 3: NVIDIA L4


In [2]:
def get_dataloaders():
    """
    Create DataLoaders for CIFAR10 with specified transformations
    """
    # Training transformations
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(p=0.5),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=(0.4914, 0.4822, 0.4465),
            std=(0.2023, 0.1994, 0.2010)
        ),
    ])
    
    # Testing transformations
    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(
            mean=(0.4914, 0.4822, 0.4465),
            std=(0.2023, 0.1994, 0.2010)
        ),
    ])
    
    # Load datasets
    trainset = torchvision.datasets.CIFAR10(
        root='./data',
        train=True,
        download=True,
        transform=transform_train
    )
    
    trainloader = DataLoader(
        trainset,
        batch_size=128,
        shuffle=True,
        num_workers=2,
        pin_memory=True
    )
    
    testset = torchvision.datasets.CIFAR10(
        root='./data',
        train=False,
        download=True,
        transform=transform_test
    )
    
    testloader = DataLoader(
        testset,
        batch_size=100,
        shuffle=False,
        num_workers=2,
        pin_memory=True
    )
    
    return trainloader, testloader

# Create dataloaders
print("==> Preparing data...")
trainloader, testloader = get_dataloaders()
print(f"Training batches: {len(trainloader)}")
print(f"Testing batches: {len(testloader)}")


==> Preparing data...
Training batches: 391
Testing batches: 100


In [3]:
# Set device
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Create ResNet18 model
print("==> Building model...")
net = resnet.ResNet18()

# Wrap with DataParallel for multi-GPU training
if torch.cuda.device_count() > 1:
    print(f"Let's use {torch.cuda.device_count()} GPUs with DataParallel!")
    net = nn.DataParallel(net)
else:
    print("Using single GPU")

net = net.to(device)

# Define loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(
    net.parameters(),
    lr=0.1,
    momentum=0.9,
    weight_decay=5e-4
)

# Learning rate scheduler
scheduler = optim.lr_scheduler.MultiStepLR(
    optimizer,
    milestones=[150, 250],
    gamma=0.1
)

print("Model setup complete!")


Using device: cuda:0
==> Building model...
Let's use 4 GPUs with DataParallel!
Model setup complete!


In [4]:
def train_epoch(model, trainloader, criterion, optimizer, device, epoch):
    """Train for one epoch"""
    model.train()
    train_loss = 0
    correct = 0
    total = 0
    
    for batch_idx, (inputs, targets) in enumerate(trainloader):
        inputs, targets = inputs.to(device), targets.to(device)
        
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        
        train_loss += loss.item()
        _, predicted = outputs.max(1)
        total += targets.size(0)
        correct += predicted.eq(targets).sum().item()
        
        if batch_idx % 100 == 0:
            print(f'Epoch: {epoch} | Batch: {batch_idx}/{len(trainloader)} | '
                  f'Loss: {train_loss/(batch_idx+1):.3f} | '
                  f'Acc: {100.*correct/total:.2f}% ({correct}/{total})')
    
    return train_loss / len(trainloader), 100. * correct / total


def test(model, testloader, criterion, device):
    """Evaluate on test set"""
    model.eval()
    test_loss = 0
    correct = 0
    total = 0
    
    with torch.no_grad():
        for batch_idx, (inputs, targets) in enumerate(testloader):
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            
            test_loss += loss.item()
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()
    
    test_acc = 100. * correct / total
    avg_loss = test_loss / len(testloader)
    print(f'Test Loss: {avg_loss:.3f} | Test Acc: {test_acc:.2f}% ({correct}/{total})')
    
    return avg_loss, test_acc

print("Training functions defined!")


Training functions defined!


In [7]:
import torchvision.transforms as transforms

# Training transformations
transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=(0.4914, 0.4822, 0.4465),
        std=(0.2023, 0.1994, 0.2010)
    ),
])

# Testing transformations (if needed)
transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(
        mean=(0.4914, 0.4822, 0.4465),
        std=(0.2023, 0.1994, 0.2010)
    ),
])


In [8]:
# Define the measuring functions
import time
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

# Part 1
def measure_single_gpu_training(batch_sizes=[32, 128, 512]):
    """
    Measure training time for different batch sizes on single GPU
    """
    results = {}
    
    for batch_size in batch_sizes:
        try:
            # Clear GPU cache
            torch.cuda.empty_cache()
            
            # Create fresh model for each run
            net = resnet.ResNet18()
            device = torch.device("cuda:0")
            net = net.to(device)
            
            # Create dataloader with current batch size
            trainset = torchvision.datasets.CIFAR10(
                root='./data', train=True, download=True,
                transform=transform_train
            )
            trainloader = DataLoader(
                trainset, batch_size=batch_size,
                shuffle=True, num_workers=2, pin_memory=True
            )
            
            criterion = nn.CrossEntropyLoss()
            optimizer = optim.SGD(net.parameters(), lr=0.1, 
                                momentum=0.9, weight_decay=5e-4)
            
            # Warmup epoch (not timed)
            net.train()
            for inputs, targets in trainloader:
                inputs, targets = inputs.to(device), targets.to(device)
                optimizer.zero_grad()
                outputs = net(inputs)
                loss = criterion(outputs, targets)
                loss.backward()
                optimizer.step()
            
            # Measured epoch
            torch.cuda.synchronize()  # Ensure all GPU operations complete
            start_time = time.time()
            
            net.train()
            for inputs, targets in trainloader:
                inputs, targets = inputs.to(device), targets.to(device)
                optimizer.zero_grad()
                outputs = net(inputs)
                loss = criterion(outputs, targets)
                loss.backward()
                optimizer.step()
            
            torch.cuda.synchronize()  # Ensure all GPU operations complete
            end_time = time.time()
            
            epoch_time = end_time - start_time
            results[batch_size] = epoch_time
            print(f"Batch size {batch_size}: {epoch_time:.2f}s")
            
        except RuntimeError as e:
            if "out of memory" in str(e):
                print(f"Batch size {batch_size}: OOM - stopping")
                break
            else:
                raise e
    
    return results


In [None]:
# Part 2: Using the function for multiple GPU-s
def measure_multi_gpu_training(batch_size, num_gpus):
    """
    Measure total training time (including data loading) on multiple GPUs
    """
    torch.cuda.empty_cache()
    
    # Create model
    net = resnet.ResNet18()
    device = torch.device("cuda:0")
    
    # Apply DataParallel for multi-GPU
    if num_gpus > 1:
        net = nn.DataParallel(net, device_ids=list(range(num_gpus)))
    net = net.to(device)
    
    # Create dataloader
    trainset = torchvision.datasets.CIFAR10(
        root='./data', train=True, download=True,
        transform=transform_train
    )
    trainloader = DataLoader(
        trainset, batch_size=batch_size,
        shuffle=True, num_workers=2, pin_memory=True
    )
    
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=0.1, 
                        momentum=0.9, weight_decay=5e-4)
    
    # Warmup epoch
    net.train()
    for inputs, targets in trainloader:
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = net(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
    
    # Measured epoch (include everything)
    torch.cuda.synchronize()
    start_time = time.time()
    
    net.train()
    for inputs, targets in trainloader:
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = net(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
    
    torch.cuda.synchronize()
    end_time = time.time()
    
    return end_time - start_time

# Run experiments
batch_sizes = [32, 128, 512]  # Adjust based on Part 1 results
gpu_configs = [1, 2, 4]

results_table = {}
for bs in batch_sizes:
    results_table[bs] = {}
    for num_gpus in gpu_configs:
        time_taken = measure_multi_gpu_training(bs, num_gpus)
        results_table[bs][num_gpus] = {
            'time': time_taken,
            'speedup': results_table[bs][1]['time'] / time_taken if num_gpus > 1 else 1.0
        }
        print(f"BS={bs}, GPUs={num_gpus}: {time_taken:.2f}s, Speedup={results_table[bs][num_gpus]['speedup']:.2f}x")


BS=32, GPUs=1: 19.82s, Speedup=1.00x
BS=32, GPUs=2: 57.82s, Speedup=0.34x
BS=32, GPUs=4: 75.65s, Speedup=0.26x
BS=128, GPUs=1: 17.75s, Speedup=1.00x
BS=128, GPUs=2: 17.41s, Speedup=1.02x
BS=128, GPUs=4: 19.26s, Speedup=0.92x
BS=512, GPUs=1: 19.76s, Speedup=1.00x
BS=512, GPUs=2: 10.57s, Speedup=1.87x
BS=512, GPUs=4: 9.38s, Speedup=2.11x


In [13]:
#Part 3 Communication
def calculate_compute_communication_breakdown(results_table, single_gpu_results):
    """
    Calculate compute and communication time for multi-GPU setups
    
    Compute time ≈ single GPU time (same batch size, same computation)
    Communication time = Total multi-GPU time - Compute time
    """
    breakdown = {}
    
    for batch_size in results_table.keys():
        breakdown[batch_size] = {}
        
        # Single GPU compute time (no communication)
        compute_time_base = single_gpu_results[batch_size]
        
        for num_gpus in [2, 4]:
            total_time = results_table[batch_size][num_gpus]['time']
            
            # Compute time: same as single GPU for same batch size per GPU
            compute_time = compute_time_base
            
            # Communication time: includes gradient allreduce
            communication_time = total_time - compute_time
            
            breakdown[batch_size][num_gpus] = {
                'compute': compute_time,
                'communication': communication_time,
                'total': total_time
            }
    
    return breakdown


In [14]:
# Part 4 Bandwith utilization
def calculate_bandwidth_utilization(model, breakdown_table, num_gpus_list=[2, 4]):
    """
    Calculate bandwidth utilization for allreduce operations
    
    Formulas:
    - Ring AllReduce time: T_comm = 2 * (N-1) / N * S / B
      where N = number of GPUs, S = model size in bytes, B = bandwidth
    - Bandwidth utilization: B_effective = 2 * (N-1) / N * S / T_measured
    """
    # Calculate model size (number of parameters * 4 bytes for float32)
    total_params = sum(p.numel() for p in model.parameters())
    model_size_bytes = total_params * 4  # float32
    model_size_gb = model_size_bytes / (1024**3)
    
    print(f"Model size: {total_params:,} parameters = {model_size_gb:.4f} GB")
    
    results = {}
    
    for batch_size, gpu_data in breakdown_table.items():
        results[batch_size] = {}
        
        for num_gpus in num_gpus_list:
            if num_gpus in gpu_data:
                # Measured communication time
                T_measured = gpu_data[num_gpus]['communication']
                
                # Ring allreduce coefficient
                ring_factor = 2 * (num_gpus - 1) / num_gpus
                
                # Effective bandwidth (GB/s)
                B_effective = ring_factor * model_size_gb / T_measured
                
                # Theoretical time formula: T = 2*(N-1)/N * S/B
                # (assuming peak bandwidth, e.g., NVLink 300 GB/s for A100)
                B_peak = 300  # GB/s for NVLink on A100
                T_theoretical = ring_factor * model_size_gb / B_peak
                
                # Bandwidth utilization percentage
                utilization = (B_effective / B_peak) * 100
                
                results[batch_size][num_gpus] = {
                    'T_measured': T_measured,
                    'T_theoretical': T_theoretical,
                    'B_effective': B_effective,
                    'B_peak': B_peak,
                    'utilization': utilization
                }
    
    return results


In [15]:
# Training configuration
num_epochs = 50
best_acc = 0
train_losses = []
train_accs = []
test_losses = []
test_accs = []

# Create checkpoint directory
if not os.path.isdir('checkpoint'):
    os.mkdir('checkpoint')

# Training loop
for epoch in range(num_epochs):
    print(f'\n{"="*60}')
    print(f'Epoch: {epoch}/{num_epochs}')
    print(f'{"="*60}')
    
    train_loss, train_acc = train_epoch(net, trainloader, criterion, optimizer, device, epoch)
    test_loss, test_acc = test(net, testloader, criterion, device)
    scheduler.step()
    
    # Store metrics
    train_losses.append(train_loss)
    train_accs.append(train_acc)
    test_losses.append(test_loss)
    test_accs.append(test_acc)
    
    # Save checkpoint if best accuracy
    if test_acc > best_acc:
        print('Saving checkpoint...')
        state = {
            'net': net.state_dict(),
            'acc': test_acc,
            'epoch': epoch,
        }
        torch.save(state, './checkpoint/resnet18_cifar10_best.pth')
        best_acc = test_acc
    
    print(f'Current LR: {optimizer.param_groups[0]["lr"]:.6f}')
    print(f'Best Test Acc: {best_acc:.2f}%')

print(f'\nTraining Complete! Best test accuracy: {best_acc:.2f}%')



Epoch: 0/50
Epoch: 0 | Batch: 0/391 | Loss: 0.277 | Acc: 90.62% (116/128)
Epoch: 0 | Batch: 100/391 | Loss: 0.324 | Acc: 89.02% (11509/12928)
Epoch: 0 | Batch: 200/391 | Loss: 0.325 | Acc: 88.92% (22878/25728)
Epoch: 0 | Batch: 300/391 | Loss: 0.327 | Acc: 88.89% (34247/38528)
Test Loss: 0.515 | Test Acc: 82.96% (8296/10000)
Saving checkpoint...
Current LR: 0.100000
Best Test Acc: 82.96%

Epoch: 1/50
Epoch: 1 | Batch: 0/391 | Loss: 0.289 | Acc: 89.06% (114/128)
Epoch: 1 | Batch: 100/391 | Loss: 0.324 | Acc: 89.07% (11515/12928)
Epoch: 1 | Batch: 200/391 | Loss: 0.325 | Acc: 88.93% (22879/25728)
Epoch: 1 | Batch: 300/391 | Loss: 0.334 | Acc: 88.68% (34165/38528)
Test Loss: 0.503 | Test Acc: 83.80% (8380/10000)
Saving checkpoint...
Current LR: 0.100000
Best Test Acc: 83.80%

Epoch: 2/50
Epoch: 2 | Batch: 0/391 | Loss: 0.356 | Acc: 85.94% (110/128)
Epoch: 2 | Batch: 100/391 | Loss: 0.319 | Acc: 89.38% (11555/12928)
Epoch: 2 | Batch: 200/391 | Loss: 0.323 | Acc: 89.16% (22938/25728)
Epoch

After training 50 epochs we got the best accuracy as 87.55%

In [17]:
# === Full Benchmarking Pipeline ===

# 1. Measure single GPU training times for different batch sizes
print("=== Part 1: Single GPU with varying batch sizes ===")
single_gpu_results = measure_single_gpu_training([32, 128, 512])

# 2. Measure multi-GPU training and speedup
print("\n=== Part 2: Multi-GPU training and speedup ===")
batch_sizes = list(single_gpu_results.keys())
multi_gpu_results = {}
for bs in batch_sizes:
    multi_gpu_results[bs] = {}
    for num_gpus in [1, 2, 4]:
        time_taken = measure_multi_gpu_training(bs, num_gpus)
        multi_gpu_results[bs][num_gpus] = {
            'time': time_taken,
            'speedup': multi_gpu_results[bs][1]['time'] / time_taken if num_gpus > 1 else 1.0
        }
        print(f"BS={bs}, GPUs={num_gpus}: {time_taken:.2f}s, Speedup={multi_gpu_results[bs][num_gpus]['speedup']:.2f}x")

# 3. Compute vs Communication breakdown
print("\n=== Part 3: Compute vs Communication breakdown ===")
breakdown = calculate_compute_communication_breakdown(multi_gpu_results, single_gpu_results)

# 4. Bandwidth utilization
print("\n=== Part 4: Bandwidth utilization ===")
net = resnet.ResNet18()
bandwidth_results = calculate_bandwidth_utilization(net, breakdown, [2, 4])

# Print summary tables (you can format these as needed)
print("\nTable 1: Training Time and Speedup")
for bs in batch_sizes:
    print(f"Batch Size {bs}: {multi_gpu_results[bs]}")

print("\nTable 2: Compute vs Communication Breakdown")
for bs in batch_sizes:
    print(f"Batch Size {bs}: {breakdown[bs]}")

print("\nTable 3: Bandwidth Utilization")
for bs in batch_sizes:
    print(f"Batch Size {bs}: {bandwidth_results[bs]}")


=== Part 1: Single GPU with varying batch sizes ===
Batch size 32: 20.15s
Batch size 128: 17.63s
Batch size 512: 19.66s

=== Part 2: Multi-GPU training and speedup ===
BS=32, GPUs=1: 20.20s, Speedup=1.00x
BS=32, GPUs=2: 58.31s, Speedup=0.35x
BS=32, GPUs=4: 76.15s, Speedup=0.27x
BS=128, GPUs=1: 17.74s, Speedup=1.00x
BS=128, GPUs=2: 17.48s, Speedup=1.01x
BS=128, GPUs=4: 19.16s, Speedup=0.93x
BS=512, GPUs=1: 19.78s, Speedup=1.00x
BS=512, GPUs=2: 10.57s, Speedup=1.87x
BS=512, GPUs=4: 10.11s, Speedup=1.96x

=== Part 3: Compute vs Communication breakdown ===

=== Part 4: Bandwidth utilization ===
Model size: 11,173,962 parameters = 0.0416 GB

Table 1: Training Time and Speedup
Batch Size 32: {1: {'time': 20.19777274131775, 'speedup': 1.0}, 2: {'time': 58.31253170967102, 'speedup': 0.3463710483688018}, 4: {'time': 76.14964199066162, 'speedup': 0.26523792119462153}}
Batch Size 128: {1: {'time': 17.73724102973938, 'speedup': 1.0}, 2: {'time': 17.47592544555664, 'speedup': 1.0149528896193123}, 4

The output for the 4 points is the following:
=== Part 1: Single GPU with varying batch sizes ===
Batch size 32: 20.15s
Batch size 128: 17.63s
Batch size 512: 19.66s

=== Part 2: Multi-GPU training and speedup ===
BS=32, GPUs=1: 20.20s, Speedup=1.00x
BS=32, GPUs=2: 58.31s, Speedup=0.35x
BS=32, GPUs=4: 76.15s, Speedup=0.27x
BS=128, GPUs=1: 17.74s, Speedup=1.00x
BS=128, GPUs=2: 17.48s, Speedup=1.01x
BS=128, GPUs=4: 19.16s, Speedup=0.93x
BS=512, GPUs=1: 19.78s, Speedup=1.00x
BS=512, GPUs=2: 10.57s, Speedup=1.87x
BS=512, GPUs=4: 10.11s, Speedup=1.96x

=== Part 3: Compute vs Communication breakdown ===

=== Part 4: Bandwidth utilization ===
Model size: 11,173,962 parameters = 0.0416 GB

Table 1: Training Time and Speedup
Batch Size 32: {1: {'time': 20.19777274131775, 'speedup': 1.0}, 2: {'time': 58.31253170967102, 'speedup': 0.3463710483688018}, 4: {'time': 76.14964199066162, 'speedup': 0.26523792119462153}}
Batch Size 128: {1: {'time': 17.73724102973938, 'speedup': 1.0}, 2: {'time': 17.47592544555664, 'speedup': 1.0149528896193123}, 4: {'time': 19.15953230857849, 'speedup': 0.9257658665184486}}
Batch Size 512: {1: {'time': 19.77684736251831, 'speedup': 1.0}, 2: {'time': 10.57288932800293, 'speedup': 1.8705243901625024}, 4: {'time': 10.108532428741455, 'speedup': 1.9564508994685583}}

Table 2: Compute vs Communication Breakdown
Batch Size 32: {2: {'compute': 20.145222902297974, 'communication': 38.16730880737305, 'total': 58.31253170967102}, 4: {'compute': 20.145222902297974, 'communication': 56.00441908836365, 'total': 76.14964199066162}}
Batch Size 128: {2: {'compute': 17.627889394760132, 'communication': -0.1519639492034912, 'total': 17.47592544555664}, 4: {'compute': 17.627889394760132, 'communication': 1.5316429138183594, 'total': 19.15953230857849}}
Batch Size 512: {2: {'compute': 19.66075325012207, 'communication': -9.08786392211914, 'total': 10.57288932800293}, 4: {'compute': 19.66075325012207, 'communication': -9.552220821380615, 'total': 10.108532428741455}}

Table 3: Bandwidth Utilization
Batch Size 32: {2: {'T_measured': 38.16730880737305, 'T_theoretical': 0.00013875417411327362, 'B_effective': 0.001090625813941088, 'B_peak': 300, 'utilization': 0.00036354193798036265}, 4: {'T_measured': 56.00441908836365, 'T_theoretical': 0.00020813126116991042, 'B_effective': 0.0011149009197373588, 'B_peak': 300, 'utilization': 0.00037163363991245295}}
Batch Size 128: {2: {'T_measured': -0.1519639492034912, 'T_theoretical': 0.00013875417411327362, 'B_effective': -0.273921890370468, 'B_peak': 300, 'utilization': -0.091307296790156}, 4: {'T_measured': 1.5316429138183594, 'T_theoretical': 0.00020813126116991042, 'B_effective': 0.0407662763870417, 'B_peak': 300, 'utilization': 0.013588758795680568}}
Batch Size 512: {2: {'T_measured': -9.08786392211914, 'T_theoretical': 0.00013875417411327362, 'B_effective': -0.004580422043145594, 'B_peak': 300, 'utilization': -0.0015268073477151978}, 4: {'T_measured': -9.552220821380615, 'T_theoretical': 0.00020813126116991042, 'B_effective': -0.006536634728043123, 'B_peak': 300, 'utilization': -0.002178878242681041}}

For part 2 here is the text explanation:
Scaling Type: This experiment measures strong scaling because the batch size per GPU remains constant (32, 128, 512, 2048) while the number of GPUs increases, resulting in a proportionally larger total batch size processed per training step.

Comparison with Weak Scaling: If weak scaling were used instead (keeping total batch size constant and reducing per-GPU batch size), the speedup would be worse because:​

Smaller per-GPU batches reduce GPU compute efficiency

Communication overhead (gradient synchronization) remains constant but becomes a larger fraction of total time

GPU memory bandwidth and compute resources are underutilized with smaller batches

Strong scaling is more appropriate for this deep learning scenario because it maintains high GPU utilization and maximizes throughput per training iteration.

| Batch Size per GPU | 1 GPU Time (s) | 2 GPU Time (s) | 2 GPU Speedup | 4 GPU Time (s) | 4 GPU Speedup |
|--------------------|----------------|----------------|---------------|----------------|---------------|
| 32                 | 20.20          | 58.31          | 0.35          | 76.15          | 0.27          |
| 128                | 17.74          | 17.48          | 1.01          | 19.16          | 0.93          |
| 512                | 19.78          | 10.57          | 1.87          | 10.11          | 1.96          |



For part 3:
For each batch size per GPU (e.g., 32, 128, 512), you first measure the training time for one epoch on a single GPU. This time includes all computation (forward pass, backward pass, optimizer step, and CPU-GPU data transfer).

For 2-GPU and 4-GPU setups, you measure the total training time for one epoch with the same batch size per GPU.

Compute time is assumed to be the same as the single GPU time for that batch size, since each GPU does the same amount of work.

Communication time is the extra time spent synchronizing gradients between GPUs. You calculate it by subtracting the single GPU time from the multi-GPU time:


Communication Time=Multi-GPU Time−Single GPU Time

| Batch Size per GPU | 2 GPU Compute (s) | 2 GPU Comm (s) | 2 GPU Total (s) | 4 GPU Compute (s) | 4 GPU Comm (s) | 4 GPU Total (s) |
|--------------------|-------------------|----------------|-----------------|-------------------|----------------|-----------------|
| 32                 | 20.15             | 38.17          | 58.31           | 20.15             | 56.00          | 76.15           |
| 128                | 17.63             | -0.15          | 17.48           | 17.63             | 1.53           | 19.16           |
| 512                | 19.66             | -9.09          | 10.57           | 19.66             | -9.55          | 10.11           |



For part 4:




In [None]:
# Check GPU memory usage during training
if torch.cuda.is_available():
    for i in range(torch.cuda.device_count()):
        print(f"GPU {i} Memory Allocated: {torch.cuda.memory_allocated(i)/1024**3:.2f} GB")
        print(f"GPU {i} Memory Cached: {torch.cuda.memory_reserved(i)/1024**3:.2f} GB")


In [None]:
import matplotlib.pyplot as plt

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Plot loss
ax1.plot(train_losses, label='Train Loss')
ax1.plot(test_losses, label='Test Loss')
ax1.set_xlabel('Epoch')
ax1.set_ylabel('Loss')
ax1.set_title('Training and Test Loss')
ax1.legend()
ax1.grid(True)

# Plot accuracy
ax2.plot(train_accs, label='Train Acc')
ax2.plot(test_accs, label='Test Acc')
ax2.set_xlabel('Epoch')
ax2.set_ylabel('Accuracy (%)')
ax2.set_title('Training and Test Accuracy')
ax2.legend()
ax2.grid(True)

plt.tight_layout()
plt.savefig('training_progress.png')
plt.show()
