In [6]:
!pip install -q transformers accelerate

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import time
import os

from torch.cuda import OutOfMemoryError

# Batch-size scaling benchmark (single device, no FSDP).
#
# For batch sizes 2, 4, ..., max_batch: load a fresh distilgpt2, run one
# forward/backward/optimizer step on a repeated prompt, and record the loss,
# the wall-clock time of the step and the peak device memory. The sweep stops
# at the first batch size that raises a RuntimeError (which includes CUDA OOM).

# Fall back to CPU so the sweep still runs on machines without a GPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# torch.cuda.* memory/sync calls are only valid when a CUDA device is in use.
use_cuda = device.type == "cuda"

tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
# GPT-2 tokenizers ship without a pad token; reuse EOS so padding=True works.
tokenizer.pad_token = tokenizer.eos_token

print("Batch Size Scaling (Non-FSDP)")

batch_size = 2
max_batch = 64
# One (batch_size, loss, seconds, peak_memory_mb) tuple per successful step.
successes = []

while batch_size <= max_batch:
    # Drop the previous iteration's model/optimizer/graph *before* loading the
    # next one; otherwise two models coexist on the device during the reload
    # and inflate memory use. The last iteration's objects are intentionally
    # left bound so later notebook cells can still inspect them.
    model = optimizer = outputs = loss = None
    if use_cuda:
        torch.cuda.empty_cache()

    encoded = tokenizer(
        ["The future of AI is very bright."] * batch_size,
        return_tensors="pt",
        padding=True,
        truncation=True,
    )

    try:
        input_ids = encoded.input_ids.to(device)
        attention_mask = encoded.attention_mask.to(device)
        # -100 is the label value the HF causal-LM loss ignores, so padded
        # positions (if any) do not contribute to the loss.
        labels = input_ids.masked_fill(attention_mask == 0, -100)

        # Loading happens inside the try so an OOM while moving the weights
        # to the device is reported as a failed batch size, not a crash.
        model = AutoModelForCausalLM.from_pretrained("distilgpt2").to(device)
        optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
        model.train()

        if use_cuda:
            # Finish pending transfers so they are not billed to the step,
            # then start peak tracking from the current allocation level.
            torch.cuda.synchronize()
            torch.cuda.reset_peak_memory_stats()

        start = time.perf_counter()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        if use_cuda:
            # CUDA kernels run asynchronously; without this the timer would
            # only measure kernel launch overhead, not the actual compute.
            torch.cuda.synchronize()
        elapsed = time.perf_counter() - start

        # Peak memory is only tracked for CUDA; report 0 on CPU.
        gpu_mem = torch.cuda.max_memory_allocated() / 1e6 if use_cuda else 0.0
        loss_value = loss.item()
        successes.append(
            (batch_size, round(loss_value, 4), round(elapsed, 2), round(gpu_mem, 2))
        )
        print(f"Batch {batch_size} | Loss: {loss_value:.4f} | Time: {elapsed:.2f}s | GPU: {gpu_mem:.2f} MB")
        batch_size *= 2

    except RuntimeError as e:
        # torch.cuda.OutOfMemoryError is a RuntimeError subclass, so OOM and
        # other device errors both end the sweep here.
        print(f"Batch {batch_size} FAILED: {str(e).splitlines()[0]}")
        break

print("Non-FSDP Batch Scaling Complete")


Batch Size Scaling (Non-FSDP)
Batch 2 | Loss: 4.2208 | Time: 0.02s | GPU: 5130.27 MB
Batch 4 | Loss: 4.2352 | Time: 0.02s | GPU: 5322.03 MB
Batch 8 | Loss: 4.1710 | Time: 0.02s | GPU: 5336.15 MB
Batch 16 | Loss: 4.2240 | Time: 0.02s | GPU: 5363.26 MB
Batch 32 | Loss: 4.2185 | Time: 0.02s | GPU: 5416.06 MB
Batch 64 | Loss: 4.2549 | Time: 0.02s | GPU: 5513.04 MB
Non-FSDP Batch Scaling Complete
