In [20]:
import os, json, math, time
from pathlib import Path

import platform
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, random_split
from torchvision import datasets, models, transforms

from contextlib import nullcontext

try:
    torch.set_num_threads(1)
    torch.set_num_interop_threads(1)
    print("Threads set: num_threads=1, interop_threads=1")
except Exception as e:
    print(f"Could not set threads: {e}")

# ----------------------------
# Config
# ----------------------------
DATA_DIR = "data"               # your dataset root
OUT_DIR  = "runs/font_resnet"
BATCH_SIZE = 32                 # try 32; if OOM on MPS, use 16
EPOCHS = 12                     # 3-4 frozen + 8-9 unfrozen is a good start
VAL_SPLIT = 0.15
SEED = 42
LEARNING_RATE = 3e-4
WEIGHT_DECAY = 1e-4
NUM_WORKERS = 0                 # macOS often safer at 0/2
PIN_MEMORY = False


Could not set threads: Error: cannot set number of interop threads after parallel work has started or set_num_interop_threads called


In [21]:
# ----------------------------
# Transforms
# ----------------------------
# For font imagery: preserve crisp edges, small geometric jitter helps generalize
# Images are black text on white; we still normalize to ImageNet stats since we use pretrained weights.
IMG_SIZE = 224

train_tfms = transforms.Compose([
    transforms.Grayscale(num_output_channels=3),            # ensure 3-ch
    transforms.RandomResizedCrop(IMG_SIZE, scale=(0.8, 1.0), antialias=True),
    transforms.RandomAffine(degrees=2, translate=(0.02, 0.02), shear=(-2,2), fill=255),
    transforms.RandomApply([transforms.GaussianBlur(kernel_size=3)], p=0.15),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])

val_tfms = transforms.Compose([
    transforms.Grayscale(num_output_channels=3),
    transforms.Resize(256, antialias=True),
    transforms.CenterCrop(IMG_SIZE),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])


In [22]:
# ----------------------------
# Dataset / Split (clone base per split)
# ----------------------------
full_ds = datasets.ImageFolder(DATA_DIR)  # no transform here
num_classes = len(full_ds.classes)
print("Classes:", full_ds.classes)

# save class mapping for inference later
Path(OUT_DIR).mkdir(parents=True, exist_ok=True)
with open(Path(OUT_DIR) / "classes.json", "w") as f:
    json.dump(full_ds.classes, f)

n_total = len(full_ds)
n_val = int(math.ceil(n_total * VAL_SPLIT))
n_train = n_total - n_val
g = torch.Generator().manual_seed(SEED)
perm = torch.randperm(n_total, generator=g).tolist()
train_idx, val_idx = perm[:n_train], perm[n_train:]

import copy
train_base = copy.deepcopy(full_ds); train_base.transform = train_tfms
val_base   = copy.deepcopy(full_ds); val_base.transform   = val_tfms

from torch.utils.data import Subset
train_ds = Subset(train_base, train_idx)
val_ds   = Subset(val_base, val_idx)

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True,
                          num_workers=NUM_WORKERS, pin_memory=PIN_MEMORY)
val_loader   = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False,
                          num_workers=NUM_WORKERS, pin_memory=PIN_MEMORY)

Classes: ['Inter', 'Lato', 'Merriweather', 'Montserrat', 'Nunito', 'Open_Sans', 'Oswald', 'Playfair_Display', 'Poppins', 'Raleway', 'Roboto', 'Roboto_Condensed', 'Roboto_Mono', 'Source_Sans_Pro', 'Ubuntu']


In [23]:
# ----------------------------
# Device (Apple Silicon friendly)
# ----------------------------
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print(f"Using device: {device}")

# ----------------------------
# Model (ResNet-18) + head
# ----------------------------
model = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)

in_features = model.fc.in_features
model.fc = nn.Linear(in_features, num_classes)
model.to(device)

Using device: mps


ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [24]:
# ----------------------------
# Optimizer / Scheduler / Loss
# ----------------------------
criterion = nn.CrossEntropyLoss()
# IMPORTANT: optimizer should only see trainable params at each phase
optimizer = optim.AdamW(filter(lambda p: p.requires_grad, model.parameters()),
                        lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
                        
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS)

HEAD_NAMES = ("fc", "classifier")

def freeze_backbone_keep_head(model):
    # freeze everything
    for p in model.parameters():
        p.requires_grad = False
    # unfreeze the head (fc for ResNet, classifier for DenseNet)
    for head_name in HEAD_NAMES:
        head = getattr(model, head_name, None)
        if head is not None:
            for p in head.parameters():
                p.requires_grad = True

def unfreeze_all(model):
    for p in model.parameters():
        p.requires_grad = True

# ----------------------------
# Optional: Warmup with frozen backbone
# ----------------------------
def set_backbone_requires_grad(req: bool):
    for name, p in model.named_parameters():
        if name.startswith("fc"):
            p.requires_grad = True
        else:
            p.requires_grad = req

# freeze backbone for first few epochs for stability on small data
FROZEN_EPOCHS = 3
set_backbone_requires_grad(False)

In [25]:
# ----------------------------
# AMP for speed (CUDA only, MPS/CPU use no-op)
# ----------------------------
use_cuda_amp = (device.type == "cuda")
scaler = torch.cuda.amp.GradScaler(enabled=use_cuda_amp)

def maybe_autocast():
    # Autocast only on CUDA; use no-op on MPS/CPU
    return torch.cuda.amp.autocast(dtype=torch.float16) if use_cuda_amp else nullcontext()

# ----------------------------
# Train / Eval loops
# ----------------------------
best_val_acc = 0.0
best_ckpt = Path(OUT_DIR) / "best.ckpt.pt"

def run_epoch(loader, train=True):
    model.train(mode=train)
    running_loss, correct, total = 0.0, 0, 0

    for images, targets in loader:
        images = images.to(device, non_blocking=True)
        targets = targets.to(device, non_blocking=True)

        with torch.set_grad_enabled(train):
            with maybe_autocast():
                outputs = model(images)
                loss = criterion(outputs, targets)

            if train:
                if use_cuda_amp:
                    scaler.scale(loss).backward()
                    scaler.step(optimizer)
                    scaler.update()
                    optimizer.zero_grad(set_to_none=True)
                else:
                    loss.backward()
                    optimizer.step()
                    optimizer.zero_grad(set_to_none=True)

        running_loss += loss.item() * images.size(0)
        _, preds = outputs.max(1)
        correct += (preds == targets).sum().item()
        total += targets.size(0)

    avg_loss = running_loss / total if total else 0.0
    acc = correct / total if total else 0.0
    return avg_loss, acc

for epoch in range(EPOCHS):
    t0 = time.time()

    # unfreeze after warmup
    if epoch == FROZEN_EPOCHS:
        unfreeze_all(model)
        set_backbone_requires_grad(True)

    train_loss, train_acc = run_epoch(train_loader, train=True)
    val_loss, val_acc = run_epoch(val_loader, train=False)
    scheduler.step()

    dt = time.time() - t0
    print(f"Epoch {epoch+1:02d}/{EPOCHS} | "
          f"train_loss {train_loss:.4f} acc {train_acc:.3f} | "
          f"val_loss {val_loss:.4f} acc {val_acc:.3f} | {dt:.1f}s")

    # save best
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save({
            "model_state": model.state_dict(),
            "classes": full_ds.classes,
            "val_acc": val_acc
        }, best_ckpt)
        print(f"  ✅ Saved new best to {best_ckpt} (val_acc={val_acc:.3f})")

print(f"Best val acc: {best_val_acc:.3f}")


  scaler = torch.cuda.amp.GradScaler(enabled=use_cuda_amp)


Epoch 01/12 | train_loss 1.0127 acc 0.738 | val_loss 0.2509 acc 0.973 | 2.5s
  ✅ Saved new best to runs/font_resnet/best.ckpt.pt (val_acc=0.973)
Epoch 02/12 | train_loss 0.1193 acc 0.915 | val_loss 0.0232 acc 0.912 | 2.4s
Epoch 03/12 | train_loss 0.0822 acc 0.845 | val_loss 0.0874 acc 0.876 | 2.4s
Epoch 04/12 | train_loss 0.0604 acc 0.878 | val_loss 0.0064 acc 0.982 | 3.2s
  ✅ Saved new best to runs/font_resnet/best.ckpt.pt (val_acc=0.982)
Epoch 05/12 | train_loss 0.0197 acc 0.943 | val_loss 0.0714 acc 0.858 | 3.2s
Epoch 06/12 | train_loss 0.0036 acc 0.958 | val_loss 0.0871 acc 0.956 | 3.3s
Epoch 07/12 | train_loss 0.0023 acc 0.975 | val_loss 0.0023 acc 0.973 | 3.2s
Epoch 08/12 | train_loss 0.0312 acc 0.945 | val_loss 0.0035 acc 0.973 | 3.2s
Epoch 09/12 | train_loss 0.0430 acc 0.948 | val_loss 0.0022 acc 0.973 | 3.2s
Epoch 10/12 | train_loss 0.0434 acc 0.907 | val_loss 0.0689 acc 0.876 | 3.4s
Epoch 11/12 | train_loss 0.0150 acc 0.940 | val_loss 0.0028 acc 0.947 | 3.3s
Epoch 12/12 | tra

In [27]:
full2 = datasets.ImageFolder(DATA_DIR)
assert [full_ds.samples[i][0] for i in range(50)] == [full2.samples[i][0] for i in range(50)]

In [28]:
from collections import Counter
model.eval()
correct_per, total_per = Counter(), Counter()
with torch.no_grad():
    for imgs, t in val_loader:
        o = model(imgs.to(device)).argmax(1).cpu()
        for ti, pi in zip(t, o):
            total_per[int(ti)] += 1
            correct_per[int(ti)] += int(pi == ti)
{ full_ds.classes[k]: correct_per[k] / total_per[k] for k in total_per }

{'Roboto': 0.0,
 'Playfair_Display': 0.0,
 'Roboto_Condensed': 0.0,
 'Ubuntu': 0.0,
 'Open_Sans': 0.0,
 'Source_Sans_Pro': 0.0,
 'Raleway': 0.0,
 'Montserrat': 0.0,
 'Poppins': 0.0,
 'Inter': 1.0,
 'Roboto_Mono': 0.0,
 'Oswald': 0.0,
 'Merriweather': 0.0,
 'Lato': 0.0,
 'Nunito': 0.0}

In [18]:
from sklearn.metrics import confusion_matrix
all_p, all_t = [], []
with torch.no_grad():
    for x,y in val_loader:
        all_p.append(model(x.to(device)).argmax(1).cpu())
        all_t.append(y)
cm = confusion_matrix(torch.cat(all_t), torch.cat(all_p))
cm

array([[ 7,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 7,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 8,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 9,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [11,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 5,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 7,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 7,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 9,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 9,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 7,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 7,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [10,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 7,  0,  0,  0,  0,  0,  0