# Training Best Practices & Performance Toolkit

Optimizers and architectures only go so far—training strategy determines whether models converge efficiently. This notebook collects practical techniques for scheduling, mixed precision, gradient hygiene, and instrumentation.

## Learning Objectives

- Visualize learning rate schedules and understand when to use them.
- Apply mixed precision with gradient scaling.
- Implement gradient clipping, accumulation, and simple monitoring utilities.
- Build a training harness that combines these ideas.

## Learning Rate Schedules

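Schedulers smooth optimization by ramping the learning rate up (warmup) and decaying it over time. The example below uses cosine annealing with warm restarts, which decays the rate along a cosine curve and periodically resets it to its initial value. Plot the schedule to confirm it behaves as intended.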

In [None]:
import torch
import torch.nn as nn
import matplotlib.pyplot as plt

# Shared training objects that the later cells reuse.
model = nn.Sequential(nn.Linear(16, 64), nn.ReLU(), nn.Linear(64, 1))
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4, weight_decay=0.01)
scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=5, T_mult=2)

# Trace the schedule on a throwaway optimizer/scheduler pair configured with
# the same learning rate and restart settings. Stepping the shared `scheduler`
# here would advance it 20 steps and leave `optimizer` at a mid-cycle learning
# rate before any training has run.
_preview_optimizer = torch.optim.SGD([torch.zeros(1, requires_grad=True)], lr=3e-4)
_preview_scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
    _preview_optimizer, T_0=5, T_mult=2
)

lrs = []
for step in range(20):
    lrs.append(_preview_optimizer.param_groups[0]["lr"])
    # The dummy parameter has no gradient, so this optimizer step changes
    # nothing; it only keeps the optimizer-then-scheduler call order.
    _preview_optimizer.step()
    _preview_scheduler.step()

# Draw the recorded learning rates on an explicit figure/axes pair.
fig, ax = plt.subplots()
ax.plot(lrs, marker="o")
ax.set_title("CosineAnnealingWarmRestarts schedule")
ax.set_xlabel("Step")
ax.set_ylabel("Learning rate")
ax.grid(True)
plt.show()


## Mixed Precision Training

Mixed precision boosts throughput by using lower precision where safe. Autocast + GradScaler handle casting and scaling automatically. When CUDA is unavailable, the code falls back to full precision.

In [None]:
# Shared gradient scaler: constructed enabled only when a CUDA device is
# available, otherwise constructed with enabled=False.
scaler = torch.cuda.amp.GradScaler(
    enabled=torch.cuda.is_available(),
)

def train_step(x, y):
    """Run one optimization step on the batch ``(x, y)`` and return its loss.

    Works on the module-level ``model``, ``optimizer`` and ``scaler``. The
    forward pass and MSE loss are computed under CUDA autocast, which is
    enabled only when ``torch.cuda.is_available()`` is true.

    Returns:
        The (unscaled) MSE loss of this batch as a Python float.
    """
    amp_enabled = torch.cuda.is_available()
    optimizer.zero_grad()
    with torch.cuda.amp.autocast(enabled=amp_enabled):
        batch_loss = torch.nn.functional.mse_loss(model(x), y)
    # Backpropagate the scaled loss, then let the scaler drive the optimizer
    # step and refresh its scale factor.
    scaled_loss = scaler.scale(batch_loss)
    scaled_loss.backward()
    scaler.step(optimizer)
    scaler.update()
    return batch_loss.item()

# One random regression batch: 32 samples, 16 input features, 1 target each.
x = torch.randn(32, 16)
y = torch.randn(32, 1)
first_loss = train_step(x, y)
print(f"Loss: {first_loss:.4f}")


## Mini Task – Gradient Monitoring Utility

Create a helper that logs per-layer gradient norms and identifies the largest contributor. This aids debugging when gradients explode or vanish.

In [None]:
def gradient_stats(model: nn.Module):
    """Exercise stub: report gradient norms for the parameters of ``model``.

    Intended behaviour (per the TODO below): return a dict of
    ``{name: norm}`` and print the largest contributor. Until it is
    implemented, calling this function raises ``NotImplementedError``.
    """
    # TODO: return dict of {name: norm} and print largest contributor
    raise NotImplementedError


In [None]:
def gradient_stats(model: nn.Module):
    """Collect the L2 norm of every parameter gradient and report the largest.

    Args:
        model: Module whose ``named_parameters()`` are inspected. Parameters
            whose ``.grad`` is ``None`` are skipped.

    Returns:
        Dict mapping parameter name to the L2 norm of its gradient (a float).
        Empty when no parameter has a gradient.

    Side effects:
        Prints ``Largest gradient: <name> -> <norm>`` whenever at least one
        gradient exists. This includes the all-zero case (vanishing gradients)
        and NaN norms, which are treated as larger than any real value so a
        broken gradient is always surfaced.
    """
    norms = {
        name: param.grad.detach().norm(2).item()
        for name, param in model.named_parameters()
        if param.grad is not None
    }

    if norms:
        def _rank(name):
            value = norms[name]
            # NaN is the only float that is not equal to itself. Putting that
            # flag first ranks NaN norms above every finite or infinite one.
            return (value != value, value)

        max_name = max(norms, key=_rank)
        print(f"Largest gradient: {max_name} -> {norms[max_name]:.4f}")
    return norms

# Run another training step so the parameters hold gradients from a fresh
# backward pass (train_step does not clear them after stepping), then
# inspect them. The call is left as a bare final expression; in a notebook
# that presumably displays the returned dict as the cell output.
loss = train_step(x, y)
gradient_stats(model)


## Gradient Accumulation & Clipping

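Accumulation simulates large batches on small hardware by summing gradients over several mini-batches before each optimizer step. Clipping prevents exploding gradients; when a `GradScaler` is in use, unscale the gradients before clipping so the threshold applies to their true magnitudes.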

In [None]:
def train_with_accumulation(loader, accumulation_steps=2, grad_clip=1.0):
    """Make one pass over ``loader`` using gradient accumulation and clipping.

    Operates on the module-level ``model``, ``optimizer`` and ``scaler``.

    Args:
        loader: Iterable yielding ``(inputs, targets)`` batches.
        accumulation_steps: Number of batches whose gradients are summed
            before each optimizer step. Every batch loss is divided by this
            value so a full group approximates one large-batch gradient.
        grad_clip: Maximum gradient norm passed to ``clip_grad_norm_``.
            ``None`` disables clipping.

    Raises:
        ValueError: If ``accumulation_steps`` is smaller than 1.
    """
    if accumulation_steps < 1:
        raise ValueError("accumulation_steps must be >= 1")

    def _apply_update():
        # Gradients must be unscaled before clipping so that the threshold
        # is compared against their real magnitudes.
        if grad_clip is not None:
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()

    model.train()
    optimizer.zero_grad()
    pending = 0  # batches back-propagated since the last optimizer step
    for xb, yb in loader:
        with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):
            preds = model(xb)
            loss = torch.nn.functional.mse_loss(preds, yb) / accumulation_steps
        scaler.scale(loss).backward()
        pending += 1
        if pending == accumulation_steps:
            _apply_update()
            pending = 0

    # When the number of batches is not a multiple of accumulation_steps the
    # last few gradients would otherwise be discarded. Apply them as a final
    # (proportionally smaller, since each loss was divided by the full
    # accumulation_steps) update.
    if pending:
        _apply_update()

# Four random (inputs, targets) batches of 16 samples act as the data source.
dummy_loader = []
for _ in range(4):
    features = torch.randn(16, 16)
    targets = torch.randn(16, 1)
    dummy_loader.append((features, targets))

train_with_accumulation(dummy_loader)
print("Accumulation step completed")


## Mini Task – Early Stopper

Implement an `EarlyStopper` that tracks validation improvements, supports a patience parameter, and exposes `should_stop`.

In [None]:
class EarlyStopper:
    """Exercise stub for an early-stopping helper.

    Both methods raise ``NotImplementedError`` until they are filled in.
    """

    def __init__(self, patience=3, min_delta=0.0):
        # Accepts a patience and a min_delta setting (defaults 3 and 0.0).
        # TODO: initialize fields
        raise NotImplementedError

    def update(self, metric):
        # Receives the latest metric value.
        # TODO: return True when training should stop
        raise NotImplementedError


In [None]:
class EarlyStopper:
    """Signal when a lower-is-better metric has stopped improving.

    Args:
        patience: Number of consecutive non-improving updates tolerated
            before a stop is signalled.
        min_delta: Minimum decrease below the best value seen so far that
            counts as an improvement.

    Attributes:
        best: Best (lowest) metric observed so far; starts at ``inf``.
        counter: Consecutive updates without improvement.
    """

    def __init__(self, patience=3, min_delta=0.0):
        self.patience = patience
        self.min_delta = min_delta
        self.best = float("inf")
        self.counter = 0

    @property
    def should_stop(self):
        """``True`` while ``counter`` has reached ``patience``."""
        return self.counter >= self.patience

    def update(self, metric):
        """Record ``metric`` and return ``True`` when training should stop.

        An improvement (``metric < best - min_delta``) stores the new best,
        resets the counter and returns ``False``. Any other value increments
        the counter and returns whether patience is now exhausted.
        """
        if metric < self.best - self.min_delta:
            self.best = metric
            self.counter = 0
            return False
        self.counter += 1
        return self.should_stop

# Feed a short sequence of validation metrics to the stopper and halt the
# loop as soon as it signals a stop.
stopper = EarlyStopper(patience=2, min_delta=0.01)
for metric in (0.5, 0.45, 0.44, 0.44, 0.43):
    if not stopper.update(metric):
        continue
    print("Early stop triggered")
    break


## Comprehensive Exercise – Training Harness

Create a `Trainer` class that supports mixed precision, gradient accumulation, schedulers, checkpointing, and simple callbacks. Demonstrate usage on the dummy regression task.

In [None]:
class Trainer:
    """Exercise stub for the training harness.

    Declares the interface (constructor, ``train_epoch``, ``evaluate`` and
    ``fit``); every method raises ``NotImplementedError`` until implemented.
    """

    def __init__(self, model, optimizer, scheduler=None, grad_clip=None, accumulation_steps=1, checkpoint_path=None):
        # TODO: store components and initialize GradScaler
        raise NotImplementedError

    def train_epoch(self, loader):
        # Placeholder taking a single loader argument; not implemented yet.
        raise NotImplementedError

    def evaluate(self, loader):
        # Placeholder taking a single loader argument; not implemented yet.
        raise NotImplementedError

    def fit(self, train_loader, val_loader=None, epochs=1):
        # Placeholder; val_loader is optional and epochs defaults to 1.
        raise NotImplementedError


In [None]:
class Trainer:
    """Small training harness for MSE regression.

    Combines CUDA mixed precision (autocast + ``GradScaler``, enabled only
    when ``torch.cuda.is_available()``), gradient accumulation, optional
    gradient clipping, optional learning-rate scheduling and optional
    per-epoch checkpointing.

    Args:
        model: Module to train.
        optimizer: Optimizer updating ``model``'s parameters.
        scheduler: Optional LR scheduler, stepped once per optimizer step.
        grad_clip: Optional maximum gradient norm; ``None`` disables clipping.
        accumulation_steps: Batches accumulated per optimizer step (>= 1).
        checkpoint_path: Optional path; when set, a checkpoint is written
            after every epoch.

    Attributes:
        history: ``{"train": [...], "val": [...]}`` per-epoch mean losses
            (``val`` entries are ``None`` when no validation loader is given).

    Raises:
        ValueError: If ``accumulation_steps`` is smaller than 1.
    """

    def __init__(self, model, optimizer, scheduler=None, grad_clip=None, accumulation_steps=1, checkpoint_path=None):
        if accumulation_steps < 1:
            raise ValueError("accumulation_steps must be >= 1")
        self.model = model
        self.optimizer = optimizer
        self.scheduler = scheduler
        self.grad_clip = grad_clip
        self.accumulation_steps = accumulation_steps
        self.checkpoint_path = checkpoint_path
        self.scaler = torch.cuda.amp.GradScaler(enabled=torch.cuda.is_available())
        self.history = {"train": [], "val": []}

    def _optimizer_step(self):
        """Clip if configured, apply accumulated gradients, advance the scheduler."""
        if self.grad_clip is not None:
            # Unscale first so the clip threshold applies to real magnitudes.
            self.scaler.unscale_(self.optimizer)
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.grad_clip)
        self.scaler.step(self.optimizer)
        self.scaler.update()
        self.optimizer.zero_grad()
        if self.scheduler is not None:
            self.scheduler.step()

    def train_epoch(self, loader):
        """Train for one pass over ``loader`` and return the mean per-sample loss.

        ``loader`` may be any iterable of ``(inputs, targets)`` batches; the
        sample count is taken from the batches themselves, so a plain list
        works as well as a ``DataLoader``. Returns ``nan`` for an empty loader.
        """
        self.model.train()
        total_loss = 0.0
        seen = 0
        pending = 0  # batches back-propagated since the last optimizer step
        self.optimizer.zero_grad()
        for xb, yb in loader:
            with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):
                preds = self.model(xb)
                loss = torch.nn.functional.mse_loss(preds, yb)
            # Divide so a full accumulation group approximates the gradient
            # of one large batch.
            self.scaler.scale(loss / self.accumulation_steps).backward()
            pending += 1
            if pending == self.accumulation_steps:
                self._optimizer_step()
                pending = 0
            batch_size = xb.size(0)
            total_loss += loss.item() * batch_size
            seen += batch_size
        # Apply gradients left over when the batch count is not a multiple of
        # accumulation_steps instead of silently discarding them.
        if pending:
            self._optimizer_step()
        return total_loss / seen if seen else float("nan")

    def evaluate(self, loader):
        """Return the mean per-sample MSE over ``loader`` without updating weights.

        The model is put in eval mode for the pass and then returned to
        whichever mode it was in beforehand. Returns ``nan`` for an empty
        loader.
        """
        was_training = self.model.training
        self.model.eval()
        total = 0.0
        seen = 0
        try:
            with torch.no_grad():
                for xb, yb in loader:
                    preds = self.model(xb)
                    batch_size = xb.size(0)
                    total += torch.nn.functional.mse_loss(preds, yb).item() * batch_size
                    seen += batch_size
        finally:
            self.model.train(was_training)
        return total / seen if seen else float("nan")

    def fit(self, train_loader, val_loader=None, epochs=1):
        """Train for ``epochs`` epochs, logging and optionally checkpointing each one.

        Returns:
            The ``history`` dict with one train/val entry appended per epoch.
        """
        for epoch in range(epochs):
            train_loss = self.train_epoch(train_loader)
            val_loss = None
            if val_loader is not None:
                val_loss = self.evaluate(val_loader)
            self.history["train"].append(train_loss)
            self.history["val"].append(val_loss)
            print(f"Epoch {epoch+1}: train={train_loss:.4f} val={val_loss}")
            if self.checkpoint_path:
                checkpoint = {
                    "model": self.model.state_dict(),
                    "optimizer": self.optimizer.state_dict(),
                    "scaler": self.scaler.state_dict(),
                    "history": self.history,
                    "epoch": epoch + 1,
                }
                # Saved so a resumed run can continue the learning-rate
                # schedule from where this one stopped.
                if self.scheduler is not None:
                    checkpoint["scheduler"] = self.scheduler.state_dict()
                torch.save(checkpoint, self.checkpoint_path)
        return self.history

# Wire the shared model, optimizer and scheduler into the harness and train
# for two epochs, reusing the dummy batches for training and validation.
trainer = Trainer(
    model=model,
    optimizer=optimizer,
    scheduler=scheduler,
    grad_clip=1.0,
    accumulation_steps=2,
)
history = trainer.fit(train_loader=dummy_loader, val_loader=dummy_loader, epochs=2)


## Further Reading

- PyTorch Performance Tuning Guide: https://pytorch.org/tutorials/recipes/recipes.html#optimizing-your-model
- NVIDIA Mixed Precision Training documentation
- “Don’t Decay the Learning Rate, Increase the Batch Size” (Smith et al.)
- PyTorch Profiler for deeper instrumentation