In [2]:
!pip install -q transformers accelerate

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
import time
import os

# Single-process FSDP smoke test: wrap distilgpt2 in FullyShardedDataParallel,
# run one forward/backward/optimizer step on a tiny batch, and report the loss,
# wall-clock time and peak GPU memory.

# Cap the CUDA caching allocator's split size to limit fragmentation.
# NOTE(review): this is presumably only honored if set before CUDA is first
# initialized in this process; if an earlier cell already used the GPU it may
# have no effect — confirm.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:64"

# FSDP needs a default process group even when there is only one rank.
os.environ["MASTER_ADDR"] = "localhost"
os.environ["MASTER_PORT"] = "12355"
torch.distributed.init_process_group("gloo", rank=0, world_size=1)

# Everything after init runs under try/finally so the process group is always
# torn down; otherwise any error would make re-running this cell fail because
# the default group would still be initialized.
try:
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    tokenizer = AutoTokenizer.from_pretrained("distilgpt2")

    # GPT-2 style tokenizers ship without a pad token; reuse EOS so that
    # padding=True works.
    tokenizer.pad_token = tokenizer.eos_token
    input_ids = tokenizer(
        ["The future of AI is very bright."] * 8,
        return_tensors="pt",
        padding=True,
        truncation=True,
    ).input_ids.to(device)
    # Causal-LM objective: the model shifts the labels internally.
    labels = input_ids.clone()

    model_fsdp = AutoModelForCausalLM.from_pretrained("distilgpt2").to(device)
    model_fsdp = FSDP(model_fsdp)
    # The optimizer is built after wrapping so it sees FSDP's parameters.
    optimizer = torch.optim.AdamW(model_fsdp.parameters(), lr=5e-5)

    print("FSDP in memory-limited mode")
    if use_cuda:
        # The CUDA statistics calls are only valid when a GPU is present.
        torch.cuda.reset_peak_memory_stats()
        # Drain previously queued kernels so they are not billed to this step.
        torch.cuda.synchronize()
    start = time.perf_counter()
    try:
        outputs = model_fsdp(input_ids, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        if use_cuda:
            # CUDA kernels launch asynchronously; wait for them to finish so
            # the elapsed time covers the actual compute, not just the launch.
            torch.cuda.synchronize()
        end = time.perf_counter()
        peak_mb = torch.cuda.max_memory_allocated() / 1e6 if use_cuda else 0.0
        print(f"[FSDP] Success | Loss: {loss.item():.4f} | Time: {end-start:.2f}s | GPU: {peak_mb:.2f} MB")
    except RuntimeError as e:
        # Out-of-memory and other runtime failures are reported, not raised,
        # so the notebook can continue with the next experiment.
        print(f"[FSDP] Failed: {e}")
finally:
    torch.distributed.destroy_process_group()


FSDP in memory-limited mode
[FSDP] Success | Loss: 4.0359 | Time: 0.06s | GPU: 2696.46 MB
