[Reference](https://levelup.gitconnected.com/how-to-train-your-pytorch-models-much-faster-14737c8c9770)

# 1. Enable Automatic Mixed Precision Training

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

# define model, optimizer and criterion

# define scaler using amp (Automatic Mixed Precision)
scaler = torch.cuda.amp.GradScaler()

# load inputs and labels with dataloader
# NOTE: `model`, `optimizer`, `criterion` and `dataloader` must already exist
# when this cell runs; none of them is created in this cell.
for inputs, labels in dataloader:
    inputs = inputs.cuda(non_blocking=True)
    labels = labels.cuda(non_blocking=True)
    optimizer.zero_grad()

    # enable mixed precision training with the scaler
    with torch.cuda.amp.autocast():
        outputs = model(inputs)
        # The loss is computed against the `labels` unpacked from this batch.
        loss = criterion(outputs, labels)

    # Backpropagate the scaled loss, then let the scaler drive the optimizer
    # step and refresh its scale factor for the next iteration.
    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()

# 2. Find and Fix Bottlenecks

In [2]:
import torch.profiler

# Profiling plan: wait=1, warmup=1, active=3 (counted in prof.step() calls).
profile_schedule = torch.profiler.schedule(wait=1, warmup=1, active=3)
# Finished traces are handed to the TensorBoard trace handler under ./log.
trace_handler = torch.profiler.tensorboard_trace_handler('./log')

with torch.profiler.profile(
    schedule=profile_schedule,
    on_trace_ready=trace_handler,
    record_shapes=True,
    with_stack=True,
) as prof:
    for inputs, targets in dataloader:
        # One ordinary training iteration: forward, loss, backward, update.
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        # Tell the profiler that one step of the schedule has finished.
        prof.step()

# 3. Speed Up Your DataLoader

In [3]:
from torch.utils.data import DataLoader

# Loader settings gathered in one place so they are easy to tune.
loader_options = dict(
    batch_size=64,
    shuffle=True,
    num_workers=4,      # raise up to what your CPU cores allow
    pin_memory=True,    # speeds up data transfer to the GPU
    prefetch_factor=2,  # preload batches (needs PyTorch newer than v1.8.0)
)

dataloader = DataLoader(dataset, **loader_options)

# 4. Enable Static Compilation

In [4]:
import torch

# `mode` is a keyword-only argument of torch.compile, so it has to be passed
# by name; passing it positionally raises a TypeError.
# Choose ONE mode: calling torch.compile again on the result would wrap an
# already-compiled model instead of switching modes.
compile_mode = "max-autotune"  # or: "reduce-overhead"
model = torch.compile(model, mode=compile_mode)

# 5. Scale Up With Distributed Training

## 5.1) Data Parallelism on a Single Machine

In [5]:
import torch.nn as nn

# Build the layer, wrap it so batches are automatically split across the
# available GPUs, and move the wrapped module to CUDA in one expression.
model = nn.DataParallel(nn.Linear(100, 10)).cuda()

## 5.2) Serious Scaling using Distributed Data Parallel (DDP)

In [6]:
import os

import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP

# Initialize the distributed environment
# Make sure you set up your environment variables correctly
dist.init_process_group(backend='nccl')

# Each process must drive its own GPU. Launchers such as torchrun export
# LOCAL_RANK per process; fall back to device 0 for a single-process run.
local_rank = int(os.environ.get("LOCAL_RANK", 0))
torch.cuda.set_device(local_rank)

# Place the replica on this process's GPU and tell DDP which device it owns,
# so that ranks do not all pile onto the default GPU.
model = nn.Linear(100, 10).cuda(local_rank)
model = DDP(model, device_ids=[local_rank])

## 5.3) Leverage Gradient Accumulation

In [7]:
accumulation_steps = 4

# Start from clean gradients so nothing left over from earlier work leaks
# into the first accumulation window.
optimizer.zero_grad()
pending_batches = 0  # batches back-propagated since the last optimizer step

for i, (inputs, targets) in enumerate(dataloader):
    inputs, targets = inputs.cuda(non_blocking=True), targets.cuda(non_blocking=True)
    outputs = model(inputs)
    # Divide so the summed gradients of one window match a single large batch.
    loss = criterion(outputs, targets) / accumulation_steps
    loss.backward()
    pending_batches += 1

    if (i + 1) % accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad()
        pending_batches = 0

# When the number of batches is not a multiple of accumulation_steps, the
# trailing batches have gradients that were never applied; flush them here.
if pending_batches:
    optimizer.step()
    optimizer.zero_grad()

# 6. Use Task-Specialized Libraries

## 6.1) PyTorch Lightning

In [8]:
import pytorch_lightning as pl
import torch.nn.functional as F

class LitModel(pl.LightningModule):
    """LightningModule wrapping a single ``nn.Linear(100, 10)`` layer."""

    def __init__(self):
        super().__init__()
        self.layer = nn.Linear(100, 10)

    def forward(self, x):
        """Apply the linear layer to ``x`` and return the result."""
        return self.layer(x)

    def training_step(self, batch, batch_idx):
        """Return the MSE loss between the model output and the batch target."""
        features, expected = batch
        predicted = self(features)
        return F.mse_loss(predicted, expected)

    def configure_optimizers(self):
        """Return an SGD optimizer over all parameters with lr=0.01."""
        return torch.optim.SGD(self.parameters(), lr=0.01)

# Hardware type, device count and distribution strategy are separate Trainer
# arguments: `accelerator` names the hardware, `devices` how many to use and
# `strategy` how to distribute. The older `gpus=2, accelerator='ddp'` spelling
# is rejected by current Lightning releases.
# NOTE(review): plain 'ddp' presumably cannot launch from an interactive
# kernel; Lightning provides 'ddp_notebook' for that - confirm for your version.
trainer = pl.Trainer(accelerator='gpu', devices=2, strategy='ddp', precision=16)
trainer.fit(LitModel(), dataloader)

## 6.2) NVIDIA Apex

In [9]:
# NOTE(review): section 1 already uses PyTorch's built-in AMP (torch.cuda.amp);
# presumably only one of the two mechanisms should be applied to a given
# model/optimizer pair - confirm before running both cells.
from apex import amp
# Rebind `model` and `optimizer` to the objects returned by amp.initialize.
# opt_level="O1" selects one of Apex's predefined mixed-precision levels
# (see the Apex documentation for what each level changes).
model, optimizer = amp.initialize(model, optimizer, opt_level="O1")

# 7. Model Specific Optimizations

In [10]:
import torch.quantization

# Post-training static quantization must run on a model in inference mode,
# so layers such as dropout and batch norm behave as they will at deployment.
model.eval()

model.qconfig = torch.quantization.get_default_qconfig('fbgemm')
torch.quantization.prepare(model, inplace=True)

# Calibrate with your data
# Only forward passes are needed for the observers inserted by prepare(),
# so autograd bookkeeping is disabled to save time and memory.
with torch.no_grad():
    for inputs, _ in calibration_dataloader:
        model(inputs)

torch.quantization.convert(model, inplace=True)

# 8. cuDNN and GPU Tweaks

In [11]:
# Leave cuDNN free to use non-deterministic kernels (False is already the default).
torch.backends.cudnn.deterministic = False
# The actual speed-up switch: let cuDNN benchmark candidate algorithms and
# cache the fastest one. NOTE(review): this presumably helps most when input
# shapes stay constant between iterations - confirm for your workload.
torch.backends.cudnn.benchmark = True