In [3]:
!pip install -q transformers accelerate

import os
import time

# Configure the CUDA caching allocator *before* torch is imported so the
# setting is already in the environment whenever the allocator reads it.
# NOTE(review): if an earlier cell already initialised CUDA in this kernel the
# value is probably ignored -- restart the kernel to be sure it applies.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:64"

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Single-device (non-FSDP) baseline: run one optimizer step of distilgpt2 on a
# tiny batch and report loss, wall-clock time and peak GPU memory.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
use_cuda = device.type == "cuda"

tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
# GPT-2 ships without a pad token; reuse EOS so `padding=True` works.
tokenizer.pad_token = tokenizer.eos_token

batch = tokenizer(
    ["The future of AI is very bright."] * 8,
    return_tensors="pt",
    padding=True,
    truncation=True,
)
input_ids = batch.input_ids.to(device)
attention_mask = batch.attention_mask.to(device)
# Padded positions must not contribute to the LM loss. Because pad == eos we
# cannot mask by token id, so mask by attention_mask instead (-100 is the
# ignore index used by the Hugging Face causal-LM loss). With the identical
# prompts above nothing is padded, so `labels` equals `input_ids`.
labels = input_ids.masked_fill(attention_mask == 0, -100)

# NOTE: `from_pretrained` returns the model in eval mode, so dropout is
# inactive during this step; call `model.train()` for real fine-tuning.
model = AutoModelForCausalLM.from_pretrained("distilgpt2").to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

print("Non-FSDP in memory-limited mode")
if use_cuda:
    # The CUDA statistics APIs raise on a CPU-only host, so only touch them
    # when a GPU is actually in use.
    torch.cuda.reset_peak_memory_stats()
    # Drain pending work (e.g. the weight upload) so it is not timed.
    torch.cuda.synchronize()
start = time.perf_counter()

try:
    outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
    loss = outputs.loss
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    if use_cuda:
        # CUDA kernels are launched asynchronously; wait for them to finish
        # so the elapsed time covers the real compute, not just the launches.
        torch.cuda.synchronize()
    end = time.perf_counter()
    peak_mb = torch.cuda.max_memory_allocated() / 1e6 if use_cuda else 0.0
    print(f"[Non-FSDP] Success | Loss: {loss.item():.4f} | Time: {end-start:.2f}s | GPU: {peak_mb:.2f} MB")
except RuntimeError as e:
    # Best-effort benchmark: CUDA OOM surfaces as a RuntimeError subclass, so
    # report the failure instead of aborting the notebook.
    end = time.perf_counter()
    print(f"[Non-FSDP] Failed: {e} | Time: {end-start:.2f}s")


Non-FSDP in memory-limited mode
[Non-FSDP] Success | Loss: 4.0359 | Time: 0.03s | GPU: 3025.59 MB
