In [2]:
import os, json, math, time
from pathlib import Path

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, random_split
from torchvision import datasets, models, transforms

from contextlib import nullcontext


In [3]:
# ----------------------------
# Config
# ----------------------------
DATA_DIR = "data"               # dataset root, read with ImageFolder further down
OUT_DIR  = "runs/font_densenet" # class list and best checkpoint are written here
BATCH_SIZE = 32                 # try 32; if OOM on MPS, use 16
EPOCHS = 12                     # 3-4 frozen + 8-9 unfrozen is a good start
VAL_SPLIT = 0.15                # fraction of samples held out for validation
SEED = 42
LEARNING_RATE = 3e-4
WEIGHT_DECAY = 1e-4
NUM_WORKERS = 0                 # macOS often safer at 0/2
PIN_MEMORY = False

# Seed torch's global RNG as well. The split uses its own seeded generator, but
# the new classifier head's initialisation, the random augmentations and the
# DataLoader shuffle order all draw from the global generator; without this
# every Restart & Run All would train a different model.
torch.manual_seed(SEED)

In [4]:
# ----------------------------
# Device (Apple Silicon friendly)
# ----------------------------
# Preference order: MPS first, then CUDA, falling back to CPU.
if torch.backends.mps.is_available():
    device_name = "mps"
elif torch.cuda.is_available():
    device_name = "cuda"
else:
    device_name = "cpu"

device = torch.device(device_name)
print(f"Using device: {device}")

Using device: mps


In [5]:
# ----------------------------
# Transforms
# ----------------------------
# Font images: keep edges crisp and apply only mild geometric jitter.
# The source images are described as black text on white (hence fill=255 for the
# affine padding); ImageNet statistics are still used for normalisation because
# the backbone starts from ImageNet-pretrained weights.
IMG_SIZE = 224

# One Normalize shared by both pipelines so the statistics cannot drift apart.
imagenet_norm = transforms.Normalize(
    mean=[0.485, 0.456, 0.406],
    std=[0.229, 0.224, 0.225],
)

# Training pipeline: random crop / affine / occasional blur, then tensor + normalise.
train_steps = [
    transforms.Grayscale(num_output_channels=3),            # ensure 3-ch
    transforms.RandomResizedCrop(IMG_SIZE, scale=(0.8, 1.0), antialias=True),
    transforms.RandomAffine(degrees=2, translate=(0.02, 0.02), shear=(-2,2), fill=255),
    transforms.RandomApply([transforms.GaussianBlur(kernel_size=3)], p=0.15),
    transforms.ToTensor(),
    imagenet_norm,
]
train_tfms = transforms.Compose(train_steps)

# Validation pipeline: deterministic resize + centre crop, no augmentation.
val_steps = [
    transforms.Grayscale(num_output_channels=3),
    transforms.Resize(256, antialias=True),
    transforms.CenterCrop(IMG_SIZE),
    transforms.ToTensor(),
    imagenet_norm,
]
val_tfms = transforms.Compose(val_steps)


In [6]:
# ----------------------------
# Dataset / Split
# ----------------------------
# The folder is scanned twice so the same images can be served through two
# pipelines: augmented (training) and deterministic (validation).
full_ds = datasets.ImageFolder(DATA_DIR, transform=train_tfms)
val_view_ds = datasets.ImageFolder(DATA_DIR, transform=val_tfms)

# The split below yields integer indices into full_ds. They only address the
# same images in val_view_ds if both scans listed identical (path, label) pairs
# in the same order, so check that explicitly instead of assuming it.
if val_view_ds.samples != full_ds.samples:
    raise RuntimeError(
        f"{DATA_DIR!r} changed between the two ImageFolder scans; "
        "train/val indices would not refer to the same images."
    )

num_classes = len(full_ds.classes)
print("Classes:", full_ds.classes)

# save class mapping for inference later
Path(OUT_DIR).mkdir(parents=True, exist_ok=True)
with open(Path(OUT_DIR) / "classes.json", "w") as f:
    json.dump(full_ds.classes, f)

# Validation size is rounded up; the split generator is seeded so the
# train/val membership is the same on every run.
n_total = len(full_ds)
n_val = int(math.ceil(n_total * VAL_SPLIT))
n_train = n_total - n_val
g = torch.Generator().manual_seed(SEED)
train_ds, val_ds = random_split(full_ds, [n_train, n_val], generator=g)

# Point the validation subset at the non-augmented view; its indices are unchanged.
val_ds.dataset = val_view_ds

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True,
                          num_workers=NUM_WORKERS, pin_memory=PIN_MEMORY)
val_loader   = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False,
                          num_workers=NUM_WORKERS, pin_memory=PIN_MEMORY)

Classes: ['Inter', 'Lato', 'Merriweather', 'Montserrat', 'Nunito', 'Open_Sans', 'Oswald', 'Playfair_Display', 'Poppins', 'Raleway', 'Roboto', 'Roboto_Condensed', 'Roboto_Mono', 'Source_Sans_Pro', 'Ubuntu']


In [7]:
# ----------------------------
# Model (DenseNet-121) + head
# ----------------------------
# Start from the ImageNet-pretrained DenseNet-121 and replace its final linear
# layer with a fresh one sized to the number of classes in the dataset.
pretrained_weights = models.DenseNet121_Weights.IMAGENET1K_V1
model = models.densenet121(weights=pretrained_weights)

in_features = model.classifier.in_features
model.classifier = nn.Linear(in_features, num_classes)

# Move to the selected device (left as the cell's last expression, so the
# notebook echoes the architecture).
model.to(device)

DenseNet(
  (features): Sequential(
    (conv0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (norm0): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu0): ReLU(inplace=True)
    (pool0): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (denseblock1): _DenseBlock(
      (denselayer1): _DenseLayer(
        (norm1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu1): ReLU(inplace=True)
        (conv1): Conv2d(64, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu2): ReLU(inplace=True)
        (conv2): Conv2d(128, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      )
      (denselayer2): _DenseLayer(
        (norm1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu

In [8]:
# ----------------------------
# Optimizer / Scheduler / Loss
# ----------------------------
# The optimizer is handed every model parameter; the scheduler spans EPOCHS
# steps and is stepped once per epoch by the training loop.
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(
    params=model.parameters(),
    lr=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer=optimizer, T_max=EPOCHS)

# ----------------------------
# Optional: Warmup with frozen backbone
# ----------------------------
def set_backbone_requires_grad(req: bool):
    """Set ``requires_grad`` to ``req`` on every parameter outside the classifier head.

    A parameter counts as part of the head when its name in
    ``model.named_parameters()`` starts with ``"classifier"``; those are left alone.
    """
    backbone_params = (
        param
        for param_name, param in model.named_parameters()
        if not param_name.startswith("classifier")
    )
    for param in backbone_params:
        param.requires_grad = req

# Only the head trains for the first FROZEN_EPOCHS epochs (stability on small
# data); the training loop re-enables the backbone once that many have passed.
FROZEN_EPOCHS = 3
set_backbone_requires_grad(False)

In [None]:
# ----------------------------
# Mixed precision: enabled on CUDA only, disabled on MPS/CPU
# ----------------------------
# The scaler is always constructed, but it is only enabled (and only used by
# the training step) when the selected device is CUDA.
use_cuda_amp = device.type == "cuda"
scaler = torch.cuda.amp.GradScaler(enabled=use_cuda_amp)

def maybe_autocast():
    """Return the context manager to wrap the forward pass in.

    On CUDA this is a float16 autocast context; on MPS/CPU it is a no-op
    ``nullcontext``. A fresh context manager is returned on every call.
    """
    if not use_cuda_amp:
        return nullcontext()
    # torch.cuda.amp.autocast(...) is deprecated; the device-generic
    # torch.autocast is the supported spelling and behaves the same for CUDA.
    return torch.autocast(device_type="cuda", dtype=torch.float16)

# ----------------------------
# Train / Eval loops
# ----------------------------
# Best validation accuracy seen so far, and the file the matching weights go to.
best_val_acc = 0.0
best_ckpt = Path(OUT_DIR, "best.ckpt.pt")

def run_epoch(loader, train=True):
    """Make one full pass over ``loader`` and return ``(mean_loss, accuracy)``.

    With ``train=True`` the model is switched to training mode and the optimizer
    is stepped after every batch (through the GradScaler when CUDA AMP is on).
    With ``train=False`` the model is switched to eval mode and gradient
    tracking is disabled. Both metrics are weighted by batch size; an empty
    loader yields ``(0.0, 0.0)``.
    """
    model.train(mode=train)
    loss_sum = 0.0
    n_correct = 0
    n_seen = 0

    for images, targets in loader:
        images = images.to(device, non_blocking=True)
        targets = targets.to(device, non_blocking=True)

        with torch.set_grad_enabled(train):
            # Forward pass and loss run under autocast (a no-op off CUDA).
            with maybe_autocast():
                outputs = model(images)
                loss = criterion(outputs, targets)

            if train:
                if use_cuda_amp:
                    # Scaled backward + step; the scaler then updates its scale.
                    scaler.scale(loss).backward()
                    scaler.step(optimizer)
                    scaler.update()
                else:
                    loss.backward()
                    optimizer.step()
                # Gradients are cleared after the step, ready for the next batch.
                optimizer.zero_grad(set_to_none=True)

        # loss is the batch mean, so weight it by batch size before summing.
        loss_sum += loss.item() * images.size(0)
        preds = outputs.max(1).indices
        n_correct += (preds == targets).sum().item()
        n_seen += targets.size(0)

    if not n_seen:
        return 0.0, 0.0
    return loss_sum / n_seen, n_correct / n_seen

for epoch in range(EPOCHS):
    t0 = time.time()

    # Warmup is over once FROZEN_EPOCHS epochs have run: let the backbone train too.
    if epoch == FROZEN_EPOCHS:
        set_backbone_requires_grad(True)

    # One training pass, one validation pass, then advance the LR schedule.
    train_loss, train_acc = run_epoch(train_loader, train=True)
    val_loss, val_acc = run_epoch(val_loader, train=False)
    scheduler.step()
    dt = time.time() - t0

    epoch_report = (
        f"Epoch {epoch+1:02d}/{EPOCHS} | "
        f"train_loss {train_loss:.4f} acc {train_acc:.3f} | "
        f"val_loss {val_loss:.4f} acc {val_acc:.3f} | {dt:.1f}s"
    )
    print(epoch_report)

    # Nothing to save unless validation accuracy strictly improved.
    if not (val_acc > best_val_acc):
        continue

    best_val_acc = val_acc
    checkpoint = {
        "model_state": model.state_dict(),
        "classes": full_ds.classes,
        "val_acc": val_acc
    }
    torch.save(checkpoint, best_ckpt)
    print(f"  ✅ Saved new best to {best_ckpt} (val_acc={val_acc:.3f})")

print(f"Best val acc: {best_val_acc:.3f}")
