In [5]:
!pip install -q transformers accelerate

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
import time
import os

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
input_ids = tokenizer("The future of AI is", return_tensors="pt").input_ids.to(device)
labels = input_ids.clone()
results = {
    "Step": [],
    "FSDP Loss": [],
    "FSDP Time (s)": [],
    "FSDP GPU MB": [],
    "Non-FSDP Loss": [],
    "Non-FSDP Time (s)": [],
    "Non-FSDP GPU MB": []
}

print("FSDP version")
os.environ["MASTER_ADDR"] = "localhost"
os.environ["MASTER_PORT"] = "12355"
torch.distributed.init_process_group("gloo", rank=0, world_size=1)

fsdp_model = AutoModelForCausalLM.from_pretrained("distilgpt2").to(device)
fsdp_model = FSDP(fsdp_model)
optimizer = torch.optim.AdamW(fsdp_model.parameters(), lr=5e-5)
fsdp_model.train()

for step in range(3):
    torch.cuda.reset_peak_memory_stats()
    start = time.time()
    outputs = fsdp_model(input_ids, labels=labels)
    loss = outputs.loss
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    end = time.time()

    mem = torch.cuda.max_memory_allocated() / 1e6
    print(f"[FSDP] Step {step} | Loss: {loss.item():.4f} | Time: {end-start:.2f}s | GPU: {mem:.2f} MB")

    results["Step"].append(step)
    results["FSDP Loss"].append(round(loss.item(), 4))
    results["FSDP Time (s)"].append(round(end - start, 2))
    results["FSDP GPU MB"].append(round(mem, 2))

torch.distributed.destroy_process_group()

print("Non-FSDP version")
model = AutoModelForCausalLM.from_pretrained("distilgpt2").to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
model.train()

for step in range(3):
    torch.cuda.reset_peak_memory_stats()
    start = time.time()
    outputs = model(input_ids, labels=labels)
    loss = outputs.loss
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    end = time.time()

    mem = torch.cuda.max_memory_allocated() / 1e6
    print(f"[Non-FSDP] Step {step} | Loss: {loss.item():.4f} | Time: {end-start:.2f}s | GPU: {mem:.2f} MB")

    results["Non-FSDP Loss"].append(round(loss.item(), 4))
    results["Non-FSDP Time (s)"].append(round(end - start, 2))
    results["Non-FSDP GPU MB"].append(round(mem, 2))

import pandas as pd
df = pd.DataFrame(results)
df


FSDP version
[FSDP] Step 0 | Loss: 5.2012 | Time: 0.05s | GPU: 1998.90 MB
[FSDP] Step 1 | Loss: 4.3317 | Time: 0.05s | GPU: 1998.90 MB
[FSDP] Step 2 | Loss: 3.6513 | Time: 0.05s | GPU: 1997.61 MB
Non-FSDP version
[Non-FSDP] Step 0 | Loss: 5.0480 | Time: 0.02s | GPU: 2991.85 MB
[Non-FSDP] Step 1 | Loss: 3.9749 | Time: 0.01s | GPU: 2991.05 MB
[Non-FSDP] Step 2 | Loss: 3.4853 | Time: 0.01s | GPU: 2991.05 MB


Unnamed: 0,Step,FSDP Loss,FSDP Time (s),FSDP GPU MB,Non-FSDP Loss,Non-FSDP Time (s),Non-FSDP GPU MB
0,0,5.2012,0.05,1998.9,5.048,0.02,2991.85
1,1,4.3317,0.05,1998.9,3.9749,0.01,2991.05
2,2,3.6513,0.05,1997.61,3.4853,0.01,2991.05
