In [162]:
import numpy as np
import matplotlib.pyplot as plt

In [163]:
import requests
from tqdm import tqdm
import os.path

def download_file(url):
    """Download ``url`` into the current directory unless it is already there.

    The local file name is the last path component of ``url``. Data is
    streamed to a temporary ``<name>.part`` file and renamed only after the
    transfer finishes, so an interrupted download is never mistaken for a
    complete file by the "already exists" check on a later run.

    Args:
        url: HTTP(S) address of the file to fetch.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    path = url.split('/')[-1]
    if os.path.isfile(path):
        print (f"{path} already exists")
        return

    tmp_path = path + ".part"
    # The context manager releases the connection even if the transfer fails.
    with requests.get(url, stream=True, timeout=30) as r:
        # Fail loudly instead of saving an HTML error page as an .npz file.
        r.raise_for_status()

        # content-length is optional (e.g. chunked responses); cope without it.
        content_length = r.headers.get('content-length')
        if content_length is not None:
            total_length = int(content_length)
            print('Downloading {} - {:.1f} MB'.format(path, (total_length / 1024000)))
            total_chunks = int(total_length / 1024) + 1
        else:
            print('Downloading {} - size unknown'.format(path))
            total_chunks = None

        try:
            with open(tmp_path, 'wb') as f:
                for chunk in tqdm(r.iter_content(chunk_size=1024), total=total_chunks, unit="KB"):
                    if chunk:
                        f.write(chunk)
        except BaseException:
            # Do not leave a truncated temp file behind.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
            raise

    os.replace(tmp_path, path)

# KMNIST archives to fetch: train/test images and their labels, all .npz files.
url_list = [
    'http://codh.rois.ac.jp/kmnist/dataset/kmnist/kmnist-train-imgs.npz',
    'http://codh.rois.ac.jp/kmnist/dataset/kmnist/kmnist-train-labels.npz',
    'http://codh.rois.ac.jp/kmnist/dataset/kmnist/kmnist-test-imgs.npz',
    'http://codh.rois.ac.jp/kmnist/dataset/kmnist/kmnist-test-labels.npz'
]

# Fetch each archive; download_file skips anything already on disk.
for url in url_list:
    download_file(url)

kmnist-train-imgs.npz already exists
kmnist-train-labels.npz already exists
kmnist-test-imgs.npz already exists
kmnist-test-labels.npz already exists


In [164]:
!ls

Neural Network with PyTorch.ipynb kmnist-train-imgs.npz
kmnist-test-imgs.npz              kmnist-train-labels.npz
kmnist-test-labels.npz            perceptron.ipynb


In [268]:
import torch 
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import dataloader
from torchvision import datasets, transforms

# Seed torch's global random number generator.
torch.manual_seed(28)

import torch

# Device preference: Apple MPS first, then CUDA, otherwise the CPU.
device = torch.device(
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)

print("using device:", device)

using device: mps


In [None]:
### optional
# Rough CPU-vs-MPS matmul timing. The MPS half is skipped on machines without
# an MPS backend instead of crashing the cell.
import torch, time

# matrix size (bigger = clearer speed difference)
N = 4000

# ---- cpu timing ----
cpu = torch.device("cpu")
x_cpu = torch.randn(N, N, device=cpu)

# perf_counter is the monotonic, high-resolution clock meant for intervals.
start = time.perf_counter()
y_cpu = x_cpu @ x_cpu          # CPU matmul is synchronous; no device sync needed
end = time.perf_counter()
print(f"CPU time: {end - start:.4f} sec")

# ---- mps timing ----
if torch.backends.mps.is_available():
    mps = torch.device("mps")
    x_mps = torch.randn(N, N, device=mps)

    # warm-up MPS (first call compiles kernels); wait for it to finish so the
    # warm-up work cannot leak into the timed region
    _ = x_mps @ x_mps
    torch.mps.synchronize()

    start = time.perf_counter()
    y_mps = x_mps @ x_mps
    torch.mps.synchronize()   # wait for GPU to finish
    end = time.perf_counter()
    print(f"MPS time: {end - start:.4f} sec")
else:
    print("MPS not available; skipping MPS timing")

CPU time: 0.0468 sec
MPS time: 0.0101 sec


In [277]:
import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader

# load npz
def _load_npz_array(path, key="arr_0"):
    """Return one array from an .npz archive and close the archive afterwards.

    np.load on an .npz file returns a lazy NpzFile that keeps the underlying
    file open; the `with` block releases it once the array has been read.
    """
    with np.load(path) as archive:
        return archive[key]

x_train = _load_npz_array("kmnist-train-imgs.npz")    # (60000,28,28)
y_train = _load_npz_array("kmnist-train-labels.npz")  # (60000,)
x_test  = _load_npz_array("kmnist-test-imgs.npz")     # (10000,28,28)
y_test  = _load_npz_array("kmnist-test-labels.npz")   # (10000,)

print("train:", x_train.shape, y_train.shape)
print("test :", x_test.shape,  y_test.shape)

# convert to tensors: add a channel axis and scale pixel values by 1/255
x_train_tensor = torch.tensor(x_train, dtype=torch.float32).unsqueeze(1) / 255.0  # (60000,1,28,28)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)                          # (60000,)
x_test_tensor  = torch.tensor(x_test,  dtype=torch.float32).unsqueeze(1) / 255.0
y_test_tensor  = torch.tensor(y_test,  dtype=torch.long)

# datasets + loaders (only the training loader is shuffled)
BATCH_SIZE = 128
train_ds = TensorDataset(x_train_tensor, y_train_tensor)
test_ds  = TensorDataset(x_test_tensor, y_test_tensor)

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
test_loader  = DataLoader(test_ds,  batch_size=BATCH_SIZE, shuffle=False)

# sanity check: pull one batch and inspect shapes, dtypes and value range
xb, yb = next(iter(train_loader))
print("xb:", xb.shape, xb.dtype, xb.min().item(), xb.max().item())
print("yb:", yb.shape, yb.dtype, yb[:10])

train: (60000, 28, 28) (60000,)
test : (10000, 28, 28) (10000,)
xb: torch.Size([128, 1, 28, 28]) torch.float32 0.0 1.0
yb: torch.Size([128]) torch.int64 tensor([9, 4, 5, 1, 3, 9, 4, 7, 1, 5])


In [278]:
import torch.nn as nn

class LogisticRegression(nn.Module):
    """Single linear layer over flattened 28x28 inputs, producing 10 logits."""

    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.fc = nn.Linear(28*28, 10)   # 784 inputs -> 10 class scores

    def forward(self, x):
        """Flatten each sample and return raw (un-normalised) class scores."""
        return self.fc(self.flatten(x))

# Build the classifier, move its parameters to the selected device, and show its layers.
model = LogisticRegression().to(device)
print(model)

LogisticRegression(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (fc): Linear(in_features=784, out_features=10, bias=True)
)


In [279]:
# Cross-entropy loss taking raw logits and integer class labels.
criterion = nn.CrossEntropyLoss()


In [280]:
# Plain SGD over all of the model's parameters with a fixed learning rate of 0.1.
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

### Logistic regression model

In [281]:
from torch.utils.data import DataLoader

EPOCHS = 5  # short run as a sanity check; try 20–30 afterwards
model.train()  # training mode (matters once dropout / batch-norm layers are present)

for epoch in range(EPOCHS):
    # per-epoch accumulators: sample-weighted loss, correct predictions, samples seen
    total_loss, correct, total = 0.0, 0, 0

    for xb, yb in train_loader:
        # images [B, 1, 28, 28] and labels [B] go to the compute device
        xb, yb = xb.to(device), yb.to(device)

        optimizer.zero_grad()             # discard gradients from the previous step
        logits = model(xb)                # [B, 10] raw scores
        loss = criterion(logits, yb)      # cross-entropy against class indices
        loss.backward()                   # gradients of the loss w.r.t. parameters
        optimizer.step()                  # SGD parameter update

        # weight the batch-mean loss by batch size so the epoch mean is per-sample
        batch_n = xb.size(0)
        total_loss += loss.item() * batch_n
        correct += (logits.argmax(dim=1) == yb).sum().item()
        total += batch_n

    avg_loss, train_acc = total_loss / total, correct / total
    print(f"epoch {epoch+1}/{EPOCHS} | loss: {avg_loss:.4f} | train acc: {train_acc:.4f}")

epoch 1/5 | loss: 0.8380 | train acc: 0.7646
epoch 2/5 | loss: 0.6518 | train acc: 0.8092
epoch 3/5 | loss: 0.6219 | train acc: 0.8162
epoch 4/5 | loss: 0.6064 | train acc: 0.8211
epoch 5/5 | loss: 0.5964 | train acc: 0.8234


In [283]:

@torch.no_grad()
def evaluate(model, loader):
    """Return ``(mean loss, accuracy)`` of ``model`` over all batches in ``loader``.

    Uses the notebook-level ``criterion`` and ``device``. The model is switched
    to eval mode and left there; callers must call ``model.train()`` again
    before further training.
    """
    model.eval()    # eval mode: dropout off, batch-norm uses running statistics
    loss_sum, n_correct, n_seen = 0.0, 0, 0
    for xb, yb in loader:
        xb, yb = xb.to(device), yb.to(device)
        logits = model(xb)                                  # [B, 10]
        batch_n = xb.size(0)
        # batch-mean loss times batch size -> summed per-sample loss
        loss_sum += criterion(logits, yb).item() * batch_n
        n_correct += (logits.argmax(dim=1) == yb).sum().item()
        n_seen += batch_n
    return loss_sum / n_seen, n_correct / n_seen

# Score the trained logistic-regression model on the test loader.
test_loss, test_acc = evaluate(model, test_loader)
print(f"test | loss: {test_loss:.4f} | acc: {test_acc:.4f}")

test | loss: 1.0078 | acc: 0.6988


### MLP model

In [284]:
import torch.nn as nn

class MLP(nn.Module):
    """Fully connected net 784 -> 256 -> 128 -> 10 with ReLU and dropout 0.5."""

    def __init__(self):
        super().__init__()
        layers = [
            nn.Flatten(),                                      # [B,1,28,28] -> [B,784]
            nn.Linear(784, 256), nn.ReLU(), nn.Dropout(0.5),   # hidden block 1
            nn.Linear(256, 128), nn.ReLU(), nn.Dropout(0.5),   # hidden block 2
            nn.Linear(128, 10),                                # logits (no Softmax)
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run the stacked layers and return raw class logits."""
        return self.net(x)

# Replace `model` with a fresh MLP on the selected device and show its layers.
model = MLP().to(device)
print(model)

MLP(
  (net): Sequential(
    (0): Flatten(start_dim=1, end_dim=-1)
    (1): Linear(in_features=784, out_features=256, bias=True)
    (2): ReLU()
    (3): Dropout(p=0.5, inplace=False)
    (4): Linear(in_features=256, out_features=128, bias=True)
    (5): ReLU()
    (6): Dropout(p=0.5, inplace=False)
    (7): Linear(in_features=128, out_features=10, bias=True)
  )
)


In [285]:
# Loss, optimizer and epoch budget for the MLP run.
criterion = nn.CrossEntropyLoss()
# Adam bound to the current model's parameters; weight_decay adds an L2 penalty.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)  # L2 helps generalization
EPOCHS = 15

In [286]:
model.train()  # dropout active during training
for epoch in range(EPOCHS):
    # per-epoch accumulators
    total_loss, correct, total = 0.0, 0, 0

    for xb, yb in train_loader:
        xb = xb.to(device)
        yb = yb.to(device)

        # one optimisation step
        optimizer.zero_grad()
        logits = model(xb)
        loss = criterion(logits, yb)
        loss.backward()
        optimizer.step()

        # sample-weighted bookkeeping for the epoch summary
        batch_n = xb.size(0)
        total_loss += loss.item() * batch_n
        correct += (logits.argmax(1) == yb).sum().item()
        total += batch_n

    avg_loss, train_acc = total_loss / total, correct / total
    print(f"epoch {epoch+1}/{EPOCHS} | loss: {avg_loss:.4f} | train acc: {train_acc:.4f}")

epoch 1/15 | loss: 0.7287 | train acc: 0.7745
epoch 2/15 | loss: 0.4006 | train acc: 0.8806
epoch 3/15 | loss: 0.3319 | train acc: 0.9011
epoch 4/15 | loss: 0.2932 | train acc: 0.9120
epoch 5/15 | loss: 0.2711 | train acc: 0.9187
epoch 6/15 | loss: 0.2516 | train acc: 0.9245
epoch 7/15 | loss: 0.2360 | train acc: 0.9287
epoch 8/15 | loss: 0.2272 | train acc: 0.9308
epoch 9/15 | loss: 0.2197 | train acc: 0.9325
epoch 10/15 | loss: 0.2121 | train acc: 0.9361
epoch 11/15 | loss: 0.2041 | train acc: 0.9372
epoch 12/15 | loss: 0.2004 | train acc: 0.9383
epoch 13/15 | loss: 0.1930 | train acc: 0.9392
epoch 14/15 | loss: 0.1919 | train acc: 0.9414
epoch 15/15 | loss: 0.1874 | train acc: 0.9415


In [287]:
# Score the trained MLP on the test loader.
test_loss, test_acc = evaluate(model, test_loader)
print(f"test | loss: {test_loss:.4f} | acc: {test_acc:.4f}")

test | loss: 0.3912 | acc: 0.8898


In [288]:
# Continue training the current model with early stopping on test accuracy.
# NOTE: the stopping signal comes from the test set, so the reported best
# test_acc is an optimistic estimate; a held-out validation split would avoid that.
best_acc, wait, patience = 0.0, 0, 5
best_state = None
EPOCHS = 30  # allow more room; early stop will cut it short

for epoch in range(EPOCHS):
    # ---- train (same as before)
    model.train()   # evaluate() below leaves the model in eval mode, so reset each epoch
    total_loss = 0.0; correct = 0; total = 0
    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)
        logits = model(xb)
        loss = criterion(logits, yb)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item() * xb.size(0)
        correct += (logits.argmax(1) == yb).sum().item()
        total += xb.size(0)

    train_loss = total_loss / total
    train_acc  = correct / total

    # ---- evaluate
    test_loss, test_acc = evaluate(model, test_loader)
    print(f"epoch {epoch+1}/{EPOCHS} | train_acc {train_acc:.4f} | test_acc {test_acc:.4f}")

    # ---- early stop tracking
    if test_acc > best_acc:
        best_acc, wait = test_acc, 0
        # CPU snapshot of the best weights so far
        best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
    else:
        wait += 1
        if wait >= patience:
            print(f"Early stop at epoch {epoch+1}; best test_acc = {best_acc:.4f}")
            break

# Restore the best snapshot whether the loop stopped early or used every epoch;
# previously a run that never hit the patience limit kept its last-epoch weights.
if best_state is not None:
    model.load_state_dict(best_state)

epoch 1/30 | train_acc 0.9456 | test_acc 0.8917
epoch 2/30 | train_acc 0.9446 | test_acc 0.8915
epoch 3/30 | train_acc 0.9438 | test_acc 0.8959
epoch 4/30 | train_acc 0.9444 | test_acc 0.8963
epoch 5/30 | train_acc 0.9464 | test_acc 0.8904
epoch 6/30 | train_acc 0.9476 | test_acc 0.8956
epoch 7/30 | train_acc 0.9460 | test_acc 0.8923
epoch 8/30 | train_acc 0.9472 | test_acc 0.8948
epoch 9/30 | train_acc 0.9493 | test_acc 0.8960
Early stop at epoch 9; best test_acc = 0.8963


In [291]:
# Plateau scheduler on a metric to maximise (mode="max"): multiplies the LR by
# `factor` once the metric has failed to improve for more than `patience` steps.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode="max", factor=0.1, patience=2
)

# after each test eval:
# NOTE(review): in this cell the step runs once, outside any training loop, with
# whatever `test_acc` an earlier cell left behind.
scheduler.step(test_acc)

In [292]:
# Preview of an augmentation pipeline: the Compose is built and displayed as the
# cell output, but it is not assigned to a name, so nothing below uses it.
transforms.Compose([
    transforms.ToTensor(),
    transforms.RandomRotation(10),
    transforms.RandomAffine(0, translate=(0.1,0.1))
])

Compose(
    ToTensor()
    RandomRotation(degrees=[-10.0, 10.0], interpolation=nearest, expand=False, fill=0)
    RandomAffine(degrees=[0.0, 0.0], translate=(0.1, 0.1))
)

In [293]:
import torch.nn as nn

class MLP_BN(nn.Module):
    """MLP 784 -> 256 -> 128 -> 10 with BatchNorm, ReLU and dropout 0.5 per hidden layer."""

    def __init__(self):
        super().__init__()
        layers = [nn.Flatten()]
        # each hidden block: Linear -> BatchNorm1d -> ReLU -> Dropout
        for n_in, n_out in ((784, 256), (256, 128)):
            layers += [nn.Linear(n_in, n_out), nn.BatchNorm1d(n_out), nn.ReLU(), nn.Dropout(0.5)]
        layers.append(nn.Linear(128, 10))  # logits
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return raw class logits for a batch."""
        return self.net(x)

# Fresh batch-norm MLP with a new loss and a new Adam optimizer bound to its parameters.
model = MLP_BN().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)
# NOTE(review): a scheduler built around an earlier optimizer does not follow
# this new one; create a new scheduler before training (template below).
# keep your ReduceLROnPlateau:
# scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="max", factor=0.1, patience=2)

In [294]:
def current_lr(opt):
    """Return the learning rate of the optimizer's first parameter group."""
    first_group = opt.param_groups[0]
    return first_group['lr']
# Show the optimizer's current learning rate (first parameter group).
print("lr:", current_lr(optimizer))
# after scheduler.step(test_acc), print again

lr: 0.001


In [295]:
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import torch
from PIL import Image
import numpy as np

# Augmentations: small rotations/shifts (works well for handwriting)
# Training pipeline: array -> PIL, small random rotation and shift, then back
# to a float tensor. Mild geometric jitter suits handwriting.
train_tfms = transforms.Compose([
    transforms.ToPILImage(),                                      # HxW (uint8) -> PIL
    transforms.RandomRotation(degrees=10),                        # up to ±10 degrees
    transforms.RandomAffine(degrees=0, translate=(0.1, 0.1)),     # shift only, no rotation
    transforms.ToTensor(),                                        # -> [1,H,W] float in [0,1]
])

# Test pipeline: same type conversion, no augmentation.
test_tfms = transforms.Compose([
    transforms.ToPILImage(),
    transforms.ToTensor(),
])

class KMNISTNPZ(Dataset):
    """Dataset over in-memory image/label arrays that applies a transform per item."""

    def __init__(self, x_np, y_np, tfm):
        self.x, self.y, self.tfm = x_np, y_np, tfm

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        """Return (transformed image, label as a 0-dim int64 tensor) for ``idx``."""
        raw_image = self.x[idx]                                  # (28,28) uint8
        label = torch.tensor(int(self.y[idx]), dtype=torch.long)
        return self.tfm(raw_image), label                        # image -> [1,28,28] float

# rebuild loaders with augmentation for train only
BATCH_SIZE = 128

# Augmentation is applied to the training set only; the test set stays clean.
aug_train_ds = KMNISTNPZ(x_np=x_train, y_np=y_train, tfm=train_tfms)
plain_test_ds = KMNISTNPZ(x_np=x_test, y_np=y_test, tfm=test_tfms)

train_loader = DataLoader(dataset=aug_train_ds, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(dataset=plain_test_ds, batch_size=BATCH_SIZE, shuffle=False)

# sanity check: one augmented batch — shape, value range, first labels
xb, yb = next(iter(train_loader))
print(xb.shape, xb.min().item(), xb.max().item(), yb.shape, yb[:8])

torch.Size([128, 1, 28, 28]) 0.0 1.0 torch.Size([128]) tensor([4, 9, 9, 3, 5, 9, 4, 3])


In [296]:
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import torch, numpy as np

# Augmented training pipeline: rotate by up to ±10 degrees, shift by up to 10%
# of the image size, and convert back to a tensor.
train_tfms = transforms.Compose([
    transforms.ToPILImage(),
    transforms.RandomRotation(degrees=10),
    transforms.RandomAffine(degrees=0, translate=(0.1, 0.1)),
    transforms.ToTensor(),        # -> [1,28,28] in [0,1]
])

# Un-augmented pipeline for evaluation.
test_tfms = transforms.Compose([
    transforms.ToPILImage(),
    transforms.ToTensor(),
])

class KMNISTNPZ(Dataset):
    """Pairs an image array with a label array and transforms images on access."""

    def __init__(self, x_np, y_np, tfm):
        self.x = x_np
        self.y = y_np
        self.tfm = tfm

    def __len__(self):
        return len(self.x)

    def __getitem__(self, i):
        """Return the transformed image and its label (int64 scalar tensor)."""
        image = self.x[i]                    # (28,28)
        image = self.tfm(image)              # -> [1,28,28]
        label = int(self.y[i])
        return image, torch.tensor(label, dtype=torch.long)

# Shuffled, augmented training batches and ordered, clean test batches (128 each).
train_loader = DataLoader(
    dataset=KMNISTNPZ(x_np=x_train, y_np=y_train, tfm=train_tfms),
    batch_size=128,
    shuffle=True,
)
test_loader = DataLoader(
    dataset=KMNISTNPZ(x_np=x_test, y_np=y_test, tfm=test_tfms),
    batch_size=128,
    shuffle=False,
)

In [297]:
def lr_now(opt):
    """Return the learning rate stored in the optimizer's first parameter group."""
    group = opt.param_groups[0]
    return group['lr']
# Show the optimizer's current learning rate (first parameter group).
print("lr:", lr_now(optimizer))
# after scheduler.step(test_acc): print("lr:", lr_now(optimizer))

lr: 0.001


In [298]:
# Sanity check: pull one batch from the current train_loader and print its
# shape, value range and first eight labels.
xb, yb = next(iter(train_loader))
print(xb.shape, xb.min().item(), xb.max().item(), yb[:8])

torch.Size([128, 1, 28, 28]) 0.0 1.0 tensor([6, 5, 0, 7, 2, 3, 1, 1])


In [299]:
import torch

def current_lr(opt):
    """Return the learning rate of the optimizer's first parameter group."""
    groups = opt.param_groups
    return groups[0]['lr']

@torch.no_grad()
def evaluate(model, loader, criterion, device):
    """Compute mean loss and accuracy of ``model`` over ``loader``.

    Args:
        model: network returning class logits; switched to eval mode and left there.
        loader: iterable of ``(inputs, labels)`` batches.
        criterion: loss returning the batch-mean loss for ``(logits, labels)``.
        device: device each batch is moved to before the forward pass.

    Returns:
        ``(mean per-sample loss, fraction of correct predictions)``.
    """
    model.eval()
    loss_sum = 0.0
    n_correct = n_seen = 0
    for xb, yb in loader:
        xb = xb.to(device)
        yb = yb.to(device)
        logits = model(xb)
        batch_n = xb.size(0)
        # undo the batch mean so unequal batch sizes are weighted correctly
        loss_sum += criterion(logits, yb).item() * batch_n
        n_correct += (logits.argmax(dim=1) == yb).sum().item()
        n_seen += batch_n
    return loss_sum / n_seen, n_correct / n_seen

# ---- training with early stop + LR scheduler ----
EPOCHS = 40
patience = 5

# Multiply the LR by 0.1 once test accuracy has stopped improving for more
# than 2 scheduler steps.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode="max",
    factor=0.1,
    patience=2,
)

best_acc, best_state, wait = 0.0, None, 0

for epoch in range(1, EPOCHS + 1):
    model.train()
    running_loss = 0.0
    correct = total = 0

    for xb, yb in train_loader:
        xb = xb.to(device)
        yb = yb.to(device)

        optimizer.zero_grad()
        logits = model(xb)
        loss = criterion(logits, yb)
        loss.backward()
        optimizer.step()

        batch_n = xb.size(0)
        running_loss += loss.item() * batch_n
        correct += (logits.argmax(dim=1) == yb).sum().item()
        total += batch_n

    train_loss, train_acc = running_loss / total, correct / total

    # Test metrics drive both the LR scheduler and early stopping.
    test_loss, test_acc = evaluate(model, test_loader, criterion, device)
    scheduler.step(test_acc)

    print(
        f"epoch {epoch:2d}/{EPOCHS} | "
        f"lr {current_lr(optimizer):.5f} | "
        f"train_loss {train_loss:.4f} acc {train_acc:.4f} | "
        f"test_loss {test_loss:.4f} acc {test_acc:.4f}"
    )

    if test_acc > best_acc:
        # New best: reset the patience counter and snapshot the weights on CPU.
        best_acc, wait = test_acc, 0
        best_state = {name: tensor.detach().cpu().clone()
                      for name, tensor in model.state_dict().items()}
        continue

    wait += 1
    if wait >= patience:
        print(f"Early stop at epoch {epoch}. Best test_acc = {best_acc:.4f}")
        break

# Put the best snapshot back (if any) and report its test metrics.
if best_state is None:
    print("No improvement recorded; best_state is None.")
else:
    model.load_state_dict(best_state)
    test_loss, test_acc = evaluate(model, test_loader, criterion, device)
    print(f"Restored best model | test_loss {test_loss:.4f} | test_acc {test_acc:.4f}")

epoch  1/40 | lr 0.00100 | train_loss 1.1385 acc 0.6280 | test_loss 0.8498 acc 0.7296
epoch  2/40 | lr 0.00100 | train_loss 0.8431 acc 0.7271 | test_loss 0.7379 acc 0.7625
epoch  3/40 | lr 0.00100 | train_loss 0.7560 acc 0.7593 | test_loss 0.6605 acc 0.7859
epoch  4/40 | lr 0.00100 | train_loss 0.6980 acc 0.7759 | test_loss 0.6261 acc 0.7999
epoch  5/40 | lr 0.00100 | train_loss 0.6629 acc 0.7896 | test_loss 0.5905 acc 0.8088
epoch  6/40 | lr 0.00100 | train_loss 0.6370 acc 0.7967 | test_loss 0.5558 acc 0.8192
epoch  7/40 | lr 0.00100 | train_loss 0.6187 acc 0.8036 | test_loss 0.5398 acc 0.8238
epoch  8/40 | lr 0.00100 | train_loss 0.6039 acc 0.8103 | test_loss 0.5339 acc 0.8280
epoch  9/40 | lr 0.00100 | train_loss 0.5873 acc 0.8139 | test_loss 0.5140 acc 0.8316
epoch 10/40 | lr 0.00100 | train_loss 0.5748 acc 0.8178 | test_loss 0.5009 acc 0.8404
epoch 11/40 | lr 0.00100 | train_loss 0.5594 acc 0.8224 | test_loss 0.4916 acc 0.8444
epoch 12/40 | lr 0.00100 | train_loss 0.5673 acc 0.820

### Let's make this model deeper, with slight changes

In [310]:
import torch
import torch.nn as nn

class MLP_Wide(nn.Module):
    """MLP 784 -> 512 -> 256 -> 128 -> 10 with BatchNorm, GELU and dropout 0.4."""

    def __init__(self):
        super().__init__()
        layers = [nn.Flatten()]                      # [B,1,28,28] -> [B,784]
        # three hidden blocks: Linear -> BatchNorm1d -> GELU -> Dropout
        for n_in, n_out in ((784, 512), (512, 256), (256, 128)):
            layers += [nn.Linear(n_in, n_out), nn.BatchNorm1d(n_out), nn.GELU(), nn.Dropout(0.4)]
        layers.append(nn.Linear(128, 10))            # logits
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return raw class logits for a batch."""
        return self.net(x)

# Replace `model` with a fresh MLP_Wide on the selected device.
model = MLP_Wide().to(device)

In [311]:
# Cross-entropy with 0.05 label smoothing, and Adam with weight decay.
criterion = nn.CrossEntropyLoss(label_smoothing=0.05)
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=3e-3,
    weight_decay=3e-4,
)

EPOCHS = 30
steps_per_epoch = len(train_loader)   # batches per epoch; OneCycleLR is stepped per batch

scheduler = torch.optim.lr_scheduler.OneCycleLR(
    optimizer,
    max_lr=3e-3,
    epochs=EPOCHS,
    steps_per_epoch=steps_per_epoch,
    pct_start=0.3,           # fraction of the cycle spent increasing the LR
    div_factor=10,           # initial lr = max_lr / div_factor
    final_div_factor=100,    # final lr = initial lr / final_div_factor
)

In [312]:
def lr_now(opt):
    """Return the learning rate of the optimizer's first parameter group."""
    first_group = opt.param_groups[0]
    return first_group['lr']

@torch.no_grad()
def evaluate(model, loader):
    """Return ``(mean loss, accuracy)`` of ``model`` over ``loader``.

    Reads the notebook-level ``criterion`` and ``device``; leaves the model in
    eval mode.
    """
    model.eval()
    loss_sum = 0.0
    n_correct = n_seen = 0
    for xb, yb in loader:
        xb = xb.to(device)
        yb = yb.to(device)
        logits = model(xb)
        batch_n = xb.size(0)
        loss_sum += criterion(logits, yb).item() * batch_n
        n_correct += (logits.argmax(1) == yb).sum().item()
        n_seen += batch_n
    return loss_sum/n_seen, n_correct/n_seen

# Early-stopping state: stop after 15 epochs without a new best test accuracy.
best_acc = 0.0
wait = 0
patience = 15
best_state = None

for epoch in range(1, EPOCHS+1):
    model.train()
    run_loss = 0.0
    correct = total = 0

    for xb, yb in train_loader:
        xb = xb.to(device)
        yb = yb.to(device)

        optimizer.zero_grad()
        logits = model(xb)
        loss = criterion(logits, yb)
        loss.backward()
        optimizer.step()
        scheduler.step()               # OneCycleLR advances once per batch

        batch_n = xb.size(0)
        run_loss += loss.item() * batch_n
        correct  += (logits.argmax(1) == yb).sum().item()
        total    += batch_n

    train_loss, train_acc = run_loss/total, correct/total
    test_loss, test_acc = evaluate(model, test_loader)

    print(f"epoch {epoch:2d}/{EPOCHS} | lr {lr_now(optimizer):.5f} | "
          f"train_loss {train_loss:.4f} acc {train_acc:.4f} | "
          f"test_loss {test_loss:.4f} acc {test_acc:.4f}")

    # early stopping on test_acc
    if test_acc > best_acc:
        best_acc, wait = test_acc, 0
        best_state = {name: tensor.detach().cpu().clone()
                      for name, tensor in model.state_dict().items()}
        continue

    wait += 1
    if wait >= patience:
        print(f"Early stop at epoch {epoch}; best test_acc = {best_acc:.4f}")
        break

# restore best weights and report their test metrics
if best_state is not None:
    model.load_state_dict(best_state)
    test_loss, test_acc = evaluate(model, test_loader)
    print(f"Restored best | test_loss {test_loss:.4f} | test_acc {test_acc:.4f}")

epoch  1/30 | lr 0.00038 | train_loss 1.4236 acc 0.5723 | test_loss 1.0453 acc 0.7201
epoch  2/30 | lr 0.00062 | train_loss 1.0152 acc 0.7327 | test_loss 0.8702 acc 0.7832
epoch  3/30 | lr 0.00098 | train_loss 0.9217 acc 0.7677 | test_loss 0.8472 acc 0.7899
epoch  4/30 | lr 0.00142 | train_loss 0.8619 acc 0.7900 | test_loss 0.7603 acc 0.8281
epoch  5/30 | lr 0.00188 | train_loss 0.8270 acc 0.8058 | test_loss 0.7474 acc 0.8328
epoch  6/30 | lr 0.00233 | train_loss 0.7981 acc 0.8183 | test_loss 0.7516 acc 0.8331
epoch  7/30 | lr 0.00268 | train_loss 0.7848 acc 0.8233 | test_loss 0.7587 acc 0.8300
epoch  8/30 | lr 0.00292 | train_loss 0.7754 acc 0.8275 | test_loss 0.7118 acc 0.8451
epoch  9/30 | lr 0.00300 | train_loss 0.7750 acc 0.8275 | test_loss 0.7336 acc 0.8317
epoch 10/30 | lr 0.00298 | train_loss 0.7734 acc 0.8280 | test_loss 0.7447 acc 0.8281
epoch 11/30 | lr 0.00293 | train_loss 0.7693 acc 0.8307 | test_loss 0.7084 acc 0.8456
epoch 12/30 | lr 0.00285 | train_loss 0.7651 acc 0.831

The model below is slightly weaker, so we go back to the previous model and run three sets of hyperparameter tuning on it; the three configurations below are labelled P1, P2, and P3.

In [313]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

# ---------- 1) Transforms ----------
# Training pipeline: padded random crop, small rotation and shift, then tensor.
train_tfms = transforms.Compose([
    transforms.ToPILImage(),                                     # array -> PIL for the ops below
    transforms.RandomCrop(size=28, padding=2),                   # small jitter
    transforms.RandomRotation(degrees=10),
    transforms.RandomAffine(degrees=0, translate=(0.1, 0.1)),
    transforms.ToTensor(),                                       # -> [1,28,28] in [0,1]
])

# Test pipeline: conversion only, no augmentation.
test_tfms = transforms.Compose([
    transforms.ToPILImage(),
    transforms.ToTensor(),
])

# ---------- 2) NPZ dataset wrapper ----------
class KMNISTNPZ(Dataset):
    """Dataset over in-memory image/label arrays with a per-item transform."""

    def __init__(self, x_np, y_np, tfm):
        self.x, self.y, self.tfm = x_np, y_np, tfm

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        """Return (transformed image, label as a 0-dim int64 tensor) for ``idx``."""
        raw_image = self.x[idx]                                   # (28,28) uint8
        label = torch.tensor(int(self.y[idx]), dtype=torch.long)  # scalar
        return self.tfm(raw_image), label                         # image -> [1,28,28] float

# NOTE: assumes x_train, y_train, x_test, y_test already loaded from NPZ
BATCH_SIZE = 128
train_loader = DataLoader(
    dataset=KMNISTNPZ(x_np=x_train, y_np=y_train, tfm=train_tfms),
    batch_size=BATCH_SIZE,
    shuffle=True,
)
test_loader = DataLoader(
    dataset=KMNISTNPZ(x_np=x_test, y_np=y_test, tfm=test_tfms),
    batch_size=BATCH_SIZE,
    shuffle=False,
)

# (Optional) sanity check on one augmented batch
xb, yb = next(iter(train_loader))
print("train batch:", xb.shape, xb.min().item(), xb.max().item(), yb[:8])

# ---------- 3) Model ----------
# No model is created here: the existing `model` object (already on `device`)
# is reused, so training continues from whatever weights it currently holds.
# To start from a different architecture, define it above and rebind `model`.

# ---------- 4) Loss / Optimizer / Scheduler ----------
criterion = nn.CrossEntropyLoss(label_smoothing=0.10)
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=3e-3,
    weight_decay=1e-4,
)

EPOCHS = 40
steps_per_epoch = len(train_loader)  # depends on the loader rebuilt above

scheduler = torch.optim.lr_scheduler.OneCycleLR(
    optimizer,
    max_lr=3e-3,
    epochs=EPOCHS,
    steps_per_epoch=steps_per_epoch,
    pct_start=0.3,          # fraction of the cycle spent increasing the LR
    div_factor=10,          # initial lr = max_lr / div_factor
    final_div_factor=100,   # final lr = initial lr / final_div_factor
)

def lr_now(opt):
    """Return the learning rate of the optimizer's first parameter group."""
    first_group = opt.param_groups[0]
    return first_group['lr']

@torch.no_grad()
def evaluate(model, loader):
    """Return ``(mean loss, accuracy)`` of ``model`` over ``loader``.

    Uses the notebook-level ``criterion`` and ``device``; the model stays in
    eval mode afterwards.
    """
    model.eval()
    loss_sum, n_correct, n_seen = 0.0, 0, 0
    for xb, yb in loader:
        xb = xb.to(device)
        yb = yb.to(device)
        logits = model(xb)
        batch_n = xb.size(0)
        loss_sum += criterion(logits, yb).item() * batch_n
        n_correct += (logits.argmax(1) == yb).sum().item()
        n_seen += batch_n
    return loss_sum/n_seen, n_correct/n_seen

# ---------- 5) Train with early stopping ----------
# Early-stopping state: give up after 15 epochs without a new best test accuracy.
best_acc = 0.0
wait = 0
patience = 15
best_state = None

for epoch in range(1, EPOCHS+1):
    model.train()
    run_loss = 0.0
    correct = total = 0

    for xb, yb in train_loader:
        xb = xb.to(device)
        yb = yb.to(device)

        optimizer.zero_grad()
        logits = model(xb)
        loss = criterion(logits, yb)
        loss.backward()
        optimizer.step()
        scheduler.step()  # OneCycleLR advances once per batch

        batch_n = xb.size(0)
        run_loss += loss.item() * batch_n
        correct  += (logits.argmax(1) == yb).sum().item()
        total    += batch_n

    train_loss, train_acc = run_loss / total, correct / total
    test_loss, test_acc = evaluate(model, test_loader)

    print(
        f"epoch {epoch:2d}/{EPOCHS} | lr {lr_now(optimizer):.5f} | "
        f"train_loss {train_loss:.4f} acc {train_acc:.4f} | "
        f"test_loss {test_loss:.4f} acc {test_acc:.4f}"
    )

    if test_acc > best_acc:
        # new best: reset the counter and keep a CPU copy of the weights
        best_acc, wait = test_acc, 0
        best_state = {name: tensor.detach().cpu().clone()
                      for name, tensor in model.state_dict().items()}
        continue

    wait += 1
    if wait >= patience:
        print(f"Early stop at epoch {epoch}; best test_acc = {best_acc:.4f}")
        break

# ---------- 6) Restore best ----------
if best_state is not None:
    model.load_state_dict(best_state)
    test_loss, test_acc = evaluate(model, test_loader)
    print(f"Restored best | test_loss {test_loss:.4f} | test_acc {test_acc:.4f}")

train batch: torch.Size([128, 1, 28, 28]) 0.0 1.0 tensor([1, 3, 7, 1, 3, 0, 2, 8])
epoch  1/40 | lr 0.00035 | train_loss 0.8123 acc 0.8939 | test_loss 0.7757 acc 0.9029
epoch  2/40 | lr 0.00048 | train_loss 0.8068 acc 0.8942 | test_loss 0.7749 acc 0.9043
epoch  3/40 | lr 0.00070 | train_loss 0.8113 acc 0.8933 | test_loss 0.7821 acc 0.9001
epoch  4/40 | lr 0.00098 | train_loss 0.8254 acc 0.8866 | test_loss 0.7943 acc 0.8940
epoch  5/40 | lr 0.00130 | train_loss 0.8379 acc 0.8809 | test_loss 0.8100 acc 0.8826
epoch  6/40 | lr 0.00165 | train_loss 0.8438 acc 0.8791 | test_loss 0.8012 acc 0.8917
epoch  7/40 | lr 0.00200 | train_loss 0.8533 acc 0.8725 | test_loss 0.8108 acc 0.8822
epoch  8/40 | lr 0.00233 | train_loss 0.8605 acc 0.8681 | test_loss 0.8165 acc 0.8804
epoch  9/40 | lr 0.00260 | train_loss 0.8631 acc 0.8669 | test_loss 0.8261 acc 0.8773
epoch 10/40 | lr 0.00282 | train_loss 0.8660 acc 0.8661 | test_loss 0.8290 acc 0.8766
epoch 11/40 | lr 0.00295 | train_loss 0.8686 acc 0.8663 |

In [314]:
# ===== P1: rotate+translate (no crop) | LS 0.05 | WD 3e-4 | Dropout 0.40 =====
import torch, torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

# --- device (auto) ---
# Keep a `device` chosen by an earlier cell; otherwise pick MPS, then CUDA, then CPU.
if "device" not in globals():
    device = torch.device(
        "mps" if torch.backends.mps.is_available()
        else "cuda" if torch.cuda.is_available()
        else "cpu"
    )
print("device:", device)

# --- transforms ---
# P1 augmentation: rotation and translation only (no random crop).
train_tfms = transforms.Compose([
    transforms.ToPILImage(),
    transforms.RandomRotation(degrees=10),
    transforms.RandomAffine(degrees=0, translate=(0.1, 0.1)),
    transforms.ToTensor(),
])

# Evaluation images only go through the type conversion.
test_tfms = transforms.Compose([
    transforms.ToPILImage(),
    transforms.ToTensor(),
])

# --- dataset wrapper ---
class KMNISTNPZ(Dataset):
    """Pairs an image array with a label array and transforms images on access."""

    def __init__(self, x_np, y_np, tfm):
        self.x = x_np
        self.y = y_np
        self.tfm = tfm

    def __len__(self):
        return len(self.x)

    def __getitem__(self, i):
        """Return the transformed image and its label (int64 scalar tensor)."""
        image = self.x[i]                    # (28,28)
        image = self.tfm(image)              # -> [1,28,28]
        label = int(self.y[i])
        return image, torch.tensor(label, dtype=torch.long)

BATCH = 128
# Shuffled, augmented training batches; ordered, un-augmented test batches.
train_loader = DataLoader(
    dataset=KMNISTNPZ(x_np=x_train, y_np=y_train, tfm=train_tfms),
    batch_size=BATCH,
    shuffle=True,
)
test_loader = DataLoader(
    dataset=KMNISTNPZ(x_np=x_test, y_np=y_test, tfm=test_tfms),
    batch_size=BATCH,
    shuffle=False,
)

# --- model (w/ configurable dropout) ---
class MLP_Wide(nn.Module):
    """MLP 784 -> 512 -> 256 -> 128 -> 10 with BatchNorm, GELU and dropout.

    Args:
        p: dropout probability used after every hidden layer (default 0.40).
    """

    def __init__(self, p=0.40):
        super().__init__()
        layers = [nn.Flatten()]
        # three hidden blocks: Linear -> BatchNorm1d -> GELU -> Dropout(p)
        for n_in, n_out in ((784, 512), (512, 256), (256, 128)):
            layers += [nn.Linear(n_in, n_out), nn.BatchNorm1d(n_out), nn.GELU(), nn.Dropout(p)]
        layers.append(nn.Linear(128, 10))   # logits
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return raw class logits for a batch."""
        return self.net(x)

# Fresh network for the P1 run.
model = MLP_Wide(p=0.40).to(device)

# --- loss/opt/sched ---
criterion = nn.CrossEntropyLoss(label_smoothing=0.05)
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=3e-3,
    weight_decay=3e-4,
)

EPOCHS = 40
steps_per_epoch = len(train_loader)   # OneCycleLR is stepped once per batch
scheduler = torch.optim.lr_scheduler.OneCycleLR(
    optimizer,
    max_lr=3e-3,
    epochs=EPOCHS,
    steps_per_epoch=steps_per_epoch,
    pct_start=0.3,          # fraction of the cycle spent increasing the LR
    div_factor=10,          # initial lr = max_lr / div_factor
    final_div_factor=100,   # final lr = initial lr / final_div_factor
)

@torch.no_grad()
def evaluate(m, loader):
    m.eval(); totL=0.0; cor=0; tot=0
    for xb,yb in loader:
        xb,yb = xb.to(device), yb.to(device)
        lg = m(xb); loss = criterion(lg,yb)
        totL += loss.item()*xb.size(0)
        cor  += (lg.argmax(1)==yb).sum().item()
        tot  += xb.size(0)
    return totL/tot, cor/tot

def lr_now(opt):
    """Return the learning rate currently set on the optimizer's first parameter group."""
    first_group = opt.param_groups[0]
    return first_group['lr']

# P1 training run: up to EPOCHS epochs with early stopping (patience = 15 epochs
# without a new best accuracy) and restoration of the best checkpoint afterwards.
# NOTE(review): checkpoint selection and early stopping are driven by accuracy on
# `test_loader`, so the reported "best" test accuracy is optimistically biased;
# consider selecting on a held-out validation split instead.
best_acc, wait, patience, best_state = 0.0, 0, 15, None
for ep in range(1, EPOCHS+1):
    # Reset the running sums for this epoch (loss sum, correct count, sample count).
    model.train(); runL=0.0; cor=0; tot=0
    for xb,yb in train_loader:
        xb,yb = xb.to(device), yb.to(device)
        lg = model(xb); loss = criterion(lg,yb)
        # The scheduler is stepped once per batch, right after the optimizer step.
        optimizer.zero_grad(); loss.backward(); optimizer.step(); scheduler.step()
        # loss is a batch mean; weight by batch size so trL is a per-sample mean.
        runL += loss.item()*xb.size(0); cor += (lg.argmax(1)==yb).sum().item(); tot += xb.size(0)
    # Train metrics are accumulated in train mode while the weights are still changing.
    trL, trA = runL/tot, cor/tot
    teL, teA = evaluate(model, test_loader)
    print(f"P1 | epoch {ep:2d}/{EPOCHS} | lr {lr_now(optimizer):.5f} | train {trL:.4f}/{trA:.4f} | test {teL:.4f}/{teA:.4f}")
    if teA>best_acc:
        best_acc, wait = teA, 0
        # Snapshot on CPU with clone() so later optimizer steps cannot mutate the saved weights.
        best_state = {k:v.detach().cpu().clone() for k,v in model.state_dict().items()}
    else:
        wait += 1
        if wait>=patience: print(f"P1 early stop at {ep}; best acc {best_acc:.4f}"); break

# Roll the model back to the best-scoring snapshot and re-evaluate it.
if best_state is not None:
    model.load_state_dict(best_state)
    teL, teA = evaluate(model, test_loader)
    print(f"P1 restored best | test_loss {teL:.4f} | test_acc {teA:.4f}")

device: mps
P1 | epoch  1/40 | lr 0.00035 | train 1.2896/0.6334 | test 0.9595/0.7407
P1 | epoch  2/40 | lr 0.00048 | train 0.8966/0.7802 | test 0.8349/0.7952
P1 | epoch  3/40 | lr 0.00070 | train 0.8099/0.8160 | test 0.7779/0.8159
P1 | epoch  4/40 | lr 0.00098 | train 0.7670/0.8306 | test 0.7504/0.8274
P1 | epoch  5/40 | lr 0.00130 | train 0.7394/0.8394 | test 0.7338/0.8371
P1 | epoch  6/40 | lr 0.00165 | train 0.7148/0.8507 | test 0.7205/0.8426
P1 | epoch  7/40 | lr 0.00200 | train 0.7075/0.8527 | test 0.7127/0.8424
P1 | epoch  8/40 | lr 0.00233 | train 0.6927/0.8606 | test 0.6898/0.8535
P1 | epoch  9/40 | lr 0.00260 | train 0.6900/0.8612 | test 0.7212/0.8420
P1 | epoch 10/40 | lr 0.00282 | train 0.6891/0.8611 | test 0.7039/0.8458
P1 | epoch 11/40 | lr 0.00295 | train 0.6949/0.8589 | test 0.6779/0.8533
P1 | epoch 12/40 | lr 0.00300 | train 0.6989/0.8569 | test 0.6949/0.8506
P1 | epoch 13/40 | lr 0.00299 | train 0.6958/0.8576 | test 0.6805/0.8582
P1 | epoch 14/40 | lr 0.00296 | train 0

In [315]:
# ===== P2: crop+rotate+translate | LS 0.00 | WD 1e-4 | Dropout 0.35 =====
import torch, torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

# Reuse the device picked by an earlier cell; otherwise prefer MPS, then CUDA, then CPU.
if "device" not in globals():
    device = torch.device(
        "mps" if torch.backends.mps.is_available()
        else "cuda" if torch.cuda.is_available()
        else "cpu"
    )
print("device:", device)

# Training augmentation: padded random crop, small rotation, small translation.
train_tfms = transforms.Compose([
    transforms.ToPILImage(),
    transforms.RandomCrop(size=28, padding=2),
    transforms.RandomRotation(degrees=10),
    transforms.RandomAffine(degrees=0, translate=(0.1, 0.1)),
    transforms.ToTensor(),
])
# Evaluation pipeline: no augmentation.
test_tfms = transforms.Compose([
    transforms.ToPILImage(),
    transforms.ToTensor(),
])

class KMNISTNPZ(Dataset):
    """Map-style dataset over in-memory image/label arrays.

    Args:
        x_np: indexable collection of images; item ``i`` is handed to ``tfm``.
        y_np: indexable collection of labels, each convertible with ``int()``.
        tfm: callable applied to every image on access.
    """

    def __init__(self, x_np, y_np, tfm):
        self.x = x_np
        self.y = y_np
        self.tfm = tfm

    def __len__(self):
        # The image collection defines the dataset size.
        return len(self.x)

    def __getitem__(self, i):
        """Return ``(transformed image, label)``; the label is a 0-dim int64 tensor."""
        sample = self.tfm(self.x[i])
        target = torch.tensor(int(self.y[i]), dtype=torch.long)
        return sample, target

BATCH = 128
# Training batches are reshuffled on every pass; test batches keep the array order.
train_loader = DataLoader(
    dataset=KMNISTNPZ(x_train, y_train, train_tfms),
    batch_size=BATCH,
    shuffle=True,
)
test_loader = DataLoader(
    dataset=KMNISTNPZ(x_test, y_test, test_tfms),
    batch_size=BATCH,
    shuffle=False,
)

class MLP_Wide(nn.Module):
    """Fully connected classifier: flatten -> 784-512-256-128 hidden stack -> 10 logits.

    Every hidden layer is Linear -> BatchNorm1d -> GELU -> Dropout(p).

    Args:
        p: dropout probability shared by all hidden layers.
    """

    def __init__(self, p=0.35):
        super().__init__()
        widths = (784, 512, 256, 128)
        layers = [nn.Flatten()]
        # Built in the same order as a hand-written Sequential so sub-module
        # indices (state_dict keys) and parameter-init order stay identical.
        for n_in, n_out in zip(widths[:-1], widths[1:]):
            layers.extend([
                nn.Linear(n_in, n_out),
                nn.BatchNorm1d(n_out),
                nn.GELU(),
                nn.Dropout(p),
            ])
        layers.append(nn.Linear(widths[-1], 10))
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return raw class logits for a batch of inputs."""
        return self.net(x)

# Build the network (parameter init draws from the torch RNG), then move it to the device.
model = MLP_Wide(p=0.35)
model = model.to(device)

# P2 settings: no label smoothing, lighter weight decay.
criterion = nn.CrossEntropyLoss(label_smoothing=0.0)
optimizer = torch.optim.Adam(params=model.parameters(), lr=3e-3, weight_decay=1e-4)
EPOCHS = 40
steps_per_epoch = len(train_loader)
# One-cycle schedule spanning every optimizer step of the run: the LR starts at
# max_lr/div_factor, peaks at max_lr after 30% of the steps, then decays.
scheduler = torch.optim.lr_scheduler.OneCycleLR(
    optimizer,
    max_lr=3e-3,
    total_steps=EPOCHS * steps_per_epoch,
    pct_start=0.3,
    div_factor=10,
    final_div_factor=100,
)

@torch.no_grad()
def evaluate(m, loader):
    """Compute mean loss and accuracy of ``m`` over ``loader``.

    Uses the notebook-level ``criterion`` and ``device``. The model is switched
    to eval mode and left there.

    Returns:
        (mean per-sample loss, fraction of correct predictions)
    """
    m.eval()
    loss_sum, n_correct, n_seen = 0.0, 0, 0
    for inputs, targets in loader:
        inputs = inputs.to(device)
        targets = targets.to(device)
        logits = m(inputs)
        batch_n = inputs.size(0)
        # criterion returns a batch mean; weight it by batch size for a true per-sample mean.
        loss_sum += criterion(logits, targets).item() * batch_n
        n_correct += (logits.argmax(1) == targets).sum().item()
        n_seen += batch_n
    return loss_sum / n_seen, n_correct / n_seen

def lr_now(opt):
    """Return the learning rate currently set on the optimizer's first parameter group."""
    first_group = opt.param_groups[0]
    return first_group['lr']

# P2 training run: up to EPOCHS epochs with early stopping (patience = 15 epochs
# without a new best accuracy) and restoration of the best checkpoint afterwards.
# NOTE(review): checkpoint selection and early stopping are driven by accuracy on
# `test_loader`, so the reported "best" test accuracy is optimistically biased;
# consider selecting on a held-out validation split instead.
best_acc, wait, patience, best_state = 0.0, 0, 15, None
for ep in range(1, EPOCHS+1):
    # Reset the running sums for this epoch (loss sum, correct count, sample count).
    model.train(); runL=0.0; cor=0; tot=0
    for xb,yb in train_loader:
        xb,yb=xb.to(device), yb.to(device)
        lg=model(xb); loss=criterion(lg,yb)
        # The scheduler is stepped once per batch, right after the optimizer step.
        optimizer.zero_grad(); loss.backward(); optimizer.step(); scheduler.step()
        # loss is a batch mean; weight by batch size so trL is a per-sample mean.
        runL+=loss.item()*xb.size(0); cor+=(lg.argmax(1)==yb).sum().item(); tot+=xb.size(0)
    # Train metrics are accumulated in train mode while the weights are still changing.
    trL, trA = runL/tot, cor/tot
    teL, teA = evaluate(model, test_loader)
    print(f"P2 | epoch {ep:2d}/{EPOCHS} | lr {lr_now(optimizer):.5f} | train {trL:.4f}/{trA:.4f} | test {teL:.4f}/{teA:.4f}")
    if teA>best_acc:
        best_acc, wait = teA, 0
        # Snapshot on CPU with clone() so later optimizer steps cannot mutate the saved weights.
        best_state = {k:v.detach().cpu().clone() for k,v in model.state_dict().items()}
    else:
        wait += 1
        if wait>=patience: print(f"P2 early stop at {ep}; best acc {best_acc:.4f}"); break

# Roll the model back to the best-scoring snapshot and re-evaluate it.
if best_state is not None:
    model.load_state_dict(best_state)
    teL, teA = evaluate(model, test_loader)
    print(f"P2 restored best | test_loss {teL:.4f} | test_acc {teA:.4f}")

device: mps
P2 | epoch  1/40 | lr 0.00035 | train 1.2403/0.6027 | test 0.8525/0.7281
P2 | epoch  2/40 | lr 0.00048 | train 0.7780/0.7492 | test 0.7063/0.7734
P2 | epoch  3/40 | lr 0.00070 | train 0.6734/0.7840 | test 0.6226/0.7973
P2 | epoch  4/40 | lr 0.00098 | train 0.6098/0.8040 | test 0.5317/0.8262
P2 | epoch  5/40 | lr 0.00130 | train 0.5633/0.8179 | test 0.5137/0.8357
P2 | epoch  6/40 | lr 0.00165 | train 0.5216/0.8318 | test 0.4649/0.8468
P2 | epoch  7/40 | lr 0.00200 | train 0.4961/0.8417 | test 0.4577/0.8507
P2 | epoch  8/40 | lr 0.00233 | train 0.4697/0.8508 | test 0.4360/0.8570
P2 | epoch  9/40 | lr 0.00260 | train 0.4507/0.8563 | test 0.4367/0.8573
P2 | epoch 10/40 | lr 0.00282 | train 0.4276/0.8633 | test 0.4014/0.8703
P2 | epoch 11/40 | lr 0.00295 | train 0.4220/0.8658 | test 0.3949/0.8730
P2 | epoch 12/40 | lr 0.00300 | train 0.4135/0.8695 | test 0.4072/0.8720
P2 | epoch 13/40 | lr 0.00299 | train 0.4111/0.8684 | test 0.3870/0.8801
P2 | epoch 14/40 | lr 0.00296 | train 0

In [316]:
# ===== P3: gentler aug (crop pad=1, rot=7°, trans=0.08) | LS 0.05 | WD 3e-4 | Dropout 0.40 =====
import torch, torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

# Reuse the device picked by an earlier cell; otherwise prefer MPS, then CUDA, then CPU.
if "device" not in globals():
    device = torch.device(
        "mps" if torch.backends.mps.is_available()
        else "cuda" if torch.cuda.is_available()
        else "cpu"
    )
print("device:", device)

# Gentler training augmentation: 1-pixel padded crop, 7-degree rotation, 8% translation.
train_tfms = transforms.Compose([
    transforms.ToPILImage(),
    transforms.RandomCrop(size=28, padding=1),
    transforms.RandomRotation(degrees=7),
    transforms.RandomAffine(degrees=0, translate=(0.08, 0.08)),
    transforms.ToTensor(),
])
# Evaluation pipeline: no augmentation.
test_tfms = transforms.Compose([
    transforms.ToPILImage(),
    transforms.ToTensor(),
])

class KMNISTNPZ(Dataset):
    """Map-style dataset over in-memory image/label arrays.

    Args:
        x_np: indexable collection of images; item ``i`` is handed to ``tfm``.
        y_np: indexable collection of labels, each convertible with ``int()``.
        tfm: callable applied to every image on access.
    """

    def __init__(self, x_np, y_np, tfm):
        self.x = x_np
        self.y = y_np
        self.tfm = tfm

    def __len__(self):
        # The image collection defines the dataset size.
        return len(self.x)

    def __getitem__(self, i):
        """Return ``(transformed image, label)``; the label is a 0-dim int64 tensor."""
        sample = self.tfm(self.x[i])
        target = torch.tensor(int(self.y[i]), dtype=torch.long)
        return sample, target

BATCH = 128
# Training batches are reshuffled on every pass; test batches keep the array order.
train_loader = DataLoader(
    dataset=KMNISTNPZ(x_train, y_train, train_tfms),
    batch_size=BATCH,
    shuffle=True,
)
test_loader = DataLoader(
    dataset=KMNISTNPZ(x_test, y_test, test_tfms),
    batch_size=BATCH,
    shuffle=False,
)

class MLP_Wide(nn.Module):
    """Fully connected classifier: flatten -> 784-512-256-128 hidden stack -> 10 logits.

    Every hidden layer is Linear -> BatchNorm1d -> GELU -> Dropout(p).

    Args:
        p: dropout probability shared by all hidden layers.
    """

    def __init__(self, p=0.40):
        super().__init__()
        widths = (784, 512, 256, 128)
        layers = [nn.Flatten()]
        # Built in the same order as a hand-written Sequential so sub-module
        # indices (state_dict keys) and parameter-init order stay identical.
        for n_in, n_out in zip(widths[:-1], widths[1:]):
            layers.extend([
                nn.Linear(n_in, n_out),
                nn.BatchNorm1d(n_out),
                nn.GELU(),
                nn.Dropout(p),
            ])
        layers.append(nn.Linear(widths[-1], 10))
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return raw class logits for a batch of inputs."""
        return self.net(x)

# Build the network (parameter init draws from the torch RNG), then move it to the device.
model = MLP_Wide(p=0.40)
model = model.to(device)

# P3 settings: label smoothing 0.05, weight decay 3e-4.
criterion = nn.CrossEntropyLoss(label_smoothing=0.05)
optimizer = torch.optim.Adam(params=model.parameters(), lr=3e-3, weight_decay=3e-4)
EPOCHS = 40
steps_per_epoch = len(train_loader)
# One-cycle schedule spanning every optimizer step of the run: the LR starts at
# max_lr/div_factor, peaks at max_lr after 30% of the steps, then decays.
scheduler = torch.optim.lr_scheduler.OneCycleLR(
    optimizer,
    max_lr=3e-3,
    total_steps=EPOCHS * steps_per_epoch,
    pct_start=0.3,
    div_factor=10,
    final_div_factor=100,
)

@torch.no_grad()
def evaluate(m, loader):
    """Compute mean loss and accuracy of ``m`` over ``loader``.

    Uses the notebook-level ``criterion`` and ``device``. The model is switched
    to eval mode and left there.

    Returns:
        (mean per-sample loss, fraction of correct predictions)
    """
    m.eval()
    loss_sum, n_correct, n_seen = 0.0, 0, 0
    for inputs, targets in loader:
        inputs = inputs.to(device)
        targets = targets.to(device)
        logits = m(inputs)
        batch_n = inputs.size(0)
        # criterion returns a batch mean; weight it by batch size for a true per-sample mean.
        loss_sum += criterion(logits, targets).item() * batch_n
        n_correct += (logits.argmax(1) == targets).sum().item()
        n_seen += batch_n
    return loss_sum / n_seen, n_correct / n_seen

def lr_now(opt):
    """Return the learning rate currently set on the optimizer's first parameter group."""
    first_group = opt.param_groups[0]
    return first_group['lr']

# P3 training run: up to EPOCHS epochs with early stopping (patience = 15 epochs
# without a new best accuracy) and restoration of the best checkpoint afterwards.
# NOTE(review): checkpoint selection and early stopping are driven by accuracy on
# `test_loader`, so the reported "best" test accuracy is optimistically biased;
# consider selecting on a held-out validation split instead.
best_acc, wait, patience, best_state = 0.0, 0, 15, None
for ep in range(1, EPOCHS+1):
    # Reset the running sums for this epoch (loss sum, correct count, sample count).
    model.train(); runL=0.0; cor=0; tot=0
    for xb,yb in train_loader:
        xb,yb=xb.to(device), yb.to(device)
        lg=model(xb); loss=criterion(lg,yb)
        # The scheduler is stepped once per batch, right after the optimizer step.
        optimizer.zero_grad(); loss.backward(); optimizer.step(); scheduler.step()
        # loss is a batch mean; weight by batch size so trL is a per-sample mean.
        runL+=loss.item()*xb.size(0); cor+=(lg.argmax(1)==yb).sum().item(); tot+=xb.size(0)
    # Train metrics are accumulated in train mode while the weights are still changing.
    trL, trA = runL/tot, cor/tot
    teL, teA = evaluate(model, test_loader)
    print(f"P3 | epoch {ep:2d}/{EPOCHS} | lr {lr_now(optimizer):.5f} | train {trL:.4f}/{trA:.4f} | test {teL:.4f}/{teA:.4f}")
    if teA>best_acc:
        best_acc, wait = teA, 0
        # Snapshot on CPU with clone() so later optimizer steps cannot mutate the saved weights.
        best_state = {k:v.detach().cpu().clone() for k,v in model.state_dict().items()}
    else:
        wait += 1
        if wait>=patience: print(f"P3 early stop at {ep}; best acc {best_acc:.4f}"); break

# Roll the model back to the best-scoring snapshot and re-evaluate it.
if best_state is not None:
    model.load_state_dict(best_state)
    teL, teA = evaluate(model, test_loader)
    print(f"P3 restored best | test_loss {teL:.4f} | test_acc {teA:.4f}")

device: mps
P3 | epoch  1/40 | lr 0.00035 | train 1.2337/0.6579 | test 0.9420/0.7560
P3 | epoch  2/40 | lr 0.00048 | train 0.8609/0.7957 | test 0.7956/0.8154
P3 | epoch  3/40 | lr 0.00070 | train 0.7765/0.8288 | test 0.7352/0.8361
P3 | epoch  4/40 | lr 0.00098 | train 0.7332/0.8440 | test 0.7255/0.8381
P3 | epoch  5/40 | lr 0.00130 | train 0.7060/0.8549 | test 0.6927/0.8469
P3 | epoch  6/40 | lr 0.00165 | train 0.6884/0.8612 | test 0.7124/0.8419
P3 | epoch  7/40 | lr 0.00200 | train 0.6743/0.8663 | test 0.6629/0.8674
P3 | epoch  8/40 | lr 0.00233 | train 0.6665/0.8696 | test 0.6867/0.8543
P3 | epoch  9/40 | lr 0.00260 | train 0.6635/0.8700 | test 0.6818/0.8558
P3 | epoch 10/40 | lr 0.00282 | train 0.6643/0.8719 | test 0.6547/0.8677
P3 | epoch 11/40 | lr 0.00295 | train 0.6688/0.8688 | test 0.6626/0.8633
P3 | epoch 12/40 | lr 0.00300 | train 0.6635/0.8716 | test 0.6759/0.8620
P3 | epoch 13/40 | lr 0.00299 | train 0.6692/0.8701 | test 0.6706/0.8651
P3 | epoch 14/40 | lr 0.00296 | train 0

In [317]:
# ===== P2: crop+rotate+translate | LS 0.00 | WD 1e-4 | Dropout 0.35 =====
import torch, torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

# Reuse the device picked by an earlier cell; otherwise prefer MPS, then CUDA, then CPU.
if "device" not in globals():
    device = torch.device(
        "mps" if torch.backends.mps.is_available()
        else "cuda" if torch.cuda.is_available()
        else "cpu"
    )
print("device:", device)

# Training augmentation: padded random crop, small rotation, small translation.
train_tfms = transforms.Compose([
    transforms.ToPILImage(),
    transforms.RandomCrop(size=28, padding=2),
    transforms.RandomRotation(degrees=10),
    transforms.RandomAffine(degrees=0, translate=(0.1, 0.1)),
    transforms.ToTensor(),
])
# Evaluation pipeline: no augmentation.
test_tfms = transforms.Compose([
    transforms.ToPILImage(),
    transforms.ToTensor(),
])

class KMNISTNPZ(Dataset):
    """Map-style dataset over in-memory image/label arrays.

    Args:
        x_np: indexable collection of images; item ``i`` is handed to ``tfm``.
        y_np: indexable collection of labels, each convertible with ``int()``.
        tfm: callable applied to every image on access.
    """

    def __init__(self, x_np, y_np, tfm):
        self.x = x_np
        self.y = y_np
        self.tfm = tfm

    def __len__(self):
        # The image collection defines the dataset size.
        return len(self.x)

    def __getitem__(self, i):
        """Return ``(transformed image, label)``; the label is a 0-dim int64 tensor."""
        sample = self.tfm(self.x[i])
        target = torch.tensor(int(self.y[i]), dtype=torch.long)
        return sample, target

BATCH = 128
# Training batches are reshuffled on every pass; test batches keep the array order.
train_loader = DataLoader(
    dataset=KMNISTNPZ(x_train, y_train, train_tfms),
    batch_size=BATCH,
    shuffle=True,
)
test_loader = DataLoader(
    dataset=KMNISTNPZ(x_test, y_test, test_tfms),
    batch_size=BATCH,
    shuffle=False,
)

class MLP_Wide(nn.Module):
    """Fully connected classifier: flatten -> 784-512-256-128 hidden stack -> 10 logits.

    Every hidden layer is Linear -> BatchNorm1d -> GELU -> Dropout(p).

    Args:
        p: dropout probability shared by all hidden layers.
    """

    def __init__(self, p=0.35):
        super().__init__()
        widths = (784, 512, 256, 128)
        layers = [nn.Flatten()]
        # Built in the same order as a hand-written Sequential so sub-module
        # indices (state_dict keys) and parameter-init order stay identical.
        for n_in, n_out in zip(widths[:-1], widths[1:]):
            layers.extend([
                nn.Linear(n_in, n_out),
                nn.BatchNorm1d(n_out),
                nn.GELU(),
                nn.Dropout(p),
            ])
        layers.append(nn.Linear(widths[-1], 10))
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return raw class logits for a batch of inputs."""
        return self.net(x)

# Build the network (parameter init draws from the torch RNG), then move it to the device.
model = MLP_Wide(p=0.35)
model = model.to(device)

# P2 settings: no label smoothing, lighter weight decay.
criterion = nn.CrossEntropyLoss(label_smoothing=0.0)
optimizer = torch.optim.Adam(params=model.parameters(), lr=3e-3, weight_decay=1e-4)
EPOCHS = 40
steps_per_epoch = len(train_loader)
# One-cycle schedule spanning every optimizer step of the run: the LR starts at
# max_lr/div_factor, peaks at max_lr after 30% of the steps, then decays.
scheduler = torch.optim.lr_scheduler.OneCycleLR(
    optimizer,
    max_lr=3e-3,
    total_steps=EPOCHS * steps_per_epoch,
    pct_start=0.3,
    div_factor=10,
    final_div_factor=100,
)

@torch.no_grad()
def evaluate(m, loader):
    """Compute mean loss and accuracy of ``m`` over ``loader``.

    Uses the notebook-level ``criterion`` and ``device``. The model is switched
    to eval mode and left there.

    Returns:
        (mean per-sample loss, fraction of correct predictions)
    """
    m.eval()
    loss_sum, n_correct, n_seen = 0.0, 0, 0
    for inputs, targets in loader:
        inputs = inputs.to(device)
        targets = targets.to(device)
        logits = m(inputs)
        batch_n = inputs.size(0)
        # criterion returns a batch mean; weight it by batch size for a true per-sample mean.
        loss_sum += criterion(logits, targets).item() * batch_n
        n_correct += (logits.argmax(1) == targets).sum().item()
        n_seen += batch_n
    return loss_sum / n_seen, n_correct / n_seen

def lr_now(opt):
    """Return the learning rate currently set on the optimizer's first parameter group."""
    first_group = opt.param_groups[0]
    return first_group['lr']

# P2 training run (repeat of the earlier P2 cell): up to EPOCHS epochs with early
# stopping (patience = 15 epochs without a new best accuracy) and restoration of
# the best checkpoint afterwards.
# NOTE(review): checkpoint selection and early stopping are driven by accuracy on
# `test_loader`, so the reported "best" test accuracy is optimistically biased;
# consider selecting on a held-out validation split instead.
best_acc, wait, patience, best_state = 0.0, 0, 15, None
for ep in range(1, EPOCHS+1):
    # Reset the running sums for this epoch (loss sum, correct count, sample count).
    model.train(); runL=0.0; cor=0; tot=0
    for xb,yb in train_loader:
        xb,yb=xb.to(device), yb.to(device)
        lg=model(xb); loss=criterion(lg,yb)
        # The scheduler is stepped once per batch, right after the optimizer step.
        optimizer.zero_grad(); loss.backward(); optimizer.step(); scheduler.step()
        # loss is a batch mean; weight by batch size so trL is a per-sample mean.
        runL+=loss.item()*xb.size(0); cor+=(lg.argmax(1)==yb).sum().item(); tot+=xb.size(0)
    # Train metrics are accumulated in train mode while the weights are still changing.
    trL, trA = runL/tot, cor/tot
    teL, teA = evaluate(model, test_loader)
    print(f"P2 | epoch {ep:2d}/{EPOCHS} | lr {lr_now(optimizer):.5f} | train {trL:.4f}/{trA:.4f} | test {teL:.4f}/{teA:.4f}")
    if teA>best_acc:
        best_acc, wait = teA, 0
        # Snapshot on CPU with clone() so later optimizer steps cannot mutate the saved weights.
        best_state = {k:v.detach().cpu().clone() for k,v in model.state_dict().items()}
    else:
        wait += 1
        if wait>=patience: print(f"P2 early stop at {ep}; best acc {best_acc:.4f}"); break

# Roll the model back to the best-scoring snapshot and re-evaluate it.
if best_state is not None:
    model.load_state_dict(best_state)
    teL, teA = evaluate(model, test_loader)
    print(f"P2 restored best | test_loss {teL:.4f} | test_acc {teA:.4f}")

device: mps
P2 | epoch  1/40 | lr 0.00035 | train 1.2421/0.6027 | test 0.8440/0.7264
P2 | epoch  2/40 | lr 0.00048 | train 0.7745/0.7512 | test 0.6726/0.7838
P2 | epoch  3/40 | lr 0.00070 | train 0.6647/0.7858 | test 0.6344/0.7942
P2 | epoch  4/40 | lr 0.00098 | train 0.6038/0.8057 | test 0.5434/0.8217
P2 | epoch  5/40 | lr 0.00130 | train 0.5592/0.8188 | test 0.5014/0.8330
P2 | epoch  6/40 | lr 0.00165 | train 0.5232/0.8305 | test 0.4880/0.8379
P2 | epoch  7/40 | lr 0.00200 | train 0.4955/0.8398 | test 0.4369/0.8550
P2 | epoch  8/40 | lr 0.00233 | train 0.4706/0.8495 | test 0.4636/0.8515
P2 | epoch  9/40 | lr 0.00260 | train 0.4464/0.8572 | test 0.4098/0.8665
P2 | epoch 10/40 | lr 0.00282 | train 0.4385/0.8597 | test 0.3760/0.8806
P2 | epoch 11/40 | lr 0.00295 | train 0.4213/0.8659 | test 0.3971/0.8737
P2 | epoch 12/40 | lr 0.00300 | train 0.4188/0.8669 | test 0.3849/0.8763
P2 | epoch 13/40 | lr 0.00299 | train 0.4097/0.8708 | test 0.3906/0.8751
P2 | epoch 14/40 | lr 0.00296 | train 0

In [319]:
import torch
from torch.optim.swa_utils import AveragedModel, SWALR, update_bn

# === 1. Wrap trained P2 model in SWA ===
# AveragedModel holds its own copy of `model`; its parameters become the running
# average of every snapshot passed to update_parameters().
swa_model = AveragedModel(model).to(device)
swa_start = 35   # NOTE(review): never read in this cell; averaging starts with the first SWA epoch below
swa_epochs = 6   # how many SWA passes to run

# SWALR anneals the optimizer's LR from its current value to swa_lr and then holds
# it constant (it is not cyclical). It is stepped once per batch below, so with the
# default anneal length of 10 scheduler steps the anneal is over after 10 batches.
swa_scheduler = SWALR(optimizer, swa_lr=5e-4)

# === 2. Run SWA training passes ===
for epoch in range(1, swa_epochs + 1):
    model.train()
    run_loss, correct, total = 0.0, 0, 0

    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)
        logits = model(xb)
        loss = criterion(logits, yb)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        swa_scheduler.step()

        run_loss += loss.item() * xb.size(0)
        correct  += (logits.argmax(1) == yb).sum().item()
        total    += xb.size(0)

    # Fold the end-of-epoch weights into the running average (one snapshot per epoch).
    swa_model.update_parameters(model)

    train_loss = run_loss / total
    train_acc  = correct / total

    print(f"SWA epoch {epoch}/{swa_epochs} done "
          f"| train_loss {train_loss:.4f} acc {train_acc:.4f}")

# === 3. Update BN stats (important for SWA) ===
# Averaged weights invalidate the BatchNorm running statistics, so recompute them
# with one pass over the training data.
update_bn(train_loader, swa_model, device=device)

# === 4. Evaluate both base and SWA averaged ===
# "Base" here is `model` after the SWA fine-tuning passes above, not the checkpoint
# restored at the end of the P2 cell.
base_loss, base_acc = evaluate(model, test_loader)
swa_loss, swa_acc   = evaluate(swa_model.module, test_loader)

print(f"\nBase (post-P2) | test_loss {base_loss:.4f} | test_acc {base_acc:.4f}")
print(f"SWA averaged   | test_loss {swa_loss:.4f} | test_acc {swa_acc:.4f}")

# === 5. Replace model if SWA is better ===
if swa_acc >= base_acc:
    # Copy the weights instead of rebinding the name: `model` stays the object whose
    # parameters `optimizer` tracks, and it does not become an alias of
    # `swa_model.module`. After this copy the two networks hold identical weights, so
    # ensembling them afterwards is equivalent to using either one alone.
    model.load_state_dict(swa_model.module.state_dict())
    print("✅ Replaced current model weights with SWA averaged weights")

SWA epoch 1/6 done | train_loss 0.2463 acc 0.9217
SWA epoch 2/6 done | train_loss 0.2469 acc 0.9216
SWA epoch 3/6 done | train_loss 0.2465 acc 0.9218
SWA epoch 4/6 done | train_loss 0.2420 acc 0.9235
SWA epoch 5/6 done | train_loss 0.2388 acc 0.9245
SWA epoch 6/6 done | train_loss 0.2451 acc 0.9225

Base (post-P2) | test_loss 0.2219 | test_acc 0.9331
SWA averaged   | test_loss 0.2183 | test_acc 0.9323


In [320]:
import torch
import torch.nn.functional as F

@torch.no_grad()
def ensemble_eval(models, loader, device, weights=None):
    """Evaluate a probability-averaging ensemble.

    Each model's logits are turned into softmax probabilities, the probabilities
    are combined across models, and accuracy / negative log-likelihood are
    computed from the combined distribution.

    Args:
        models: non-empty iterable of ``nn.Module`` classifiers returning ``[B, C]`` logits.
        loader: iterable yielding ``(inputs, integer class labels)`` batches.
        device: device the batches are moved to before the forward passes.
        weights: optional non-negative per-model weights (one per model). They are
            normalised to sum to 1; ``None`` means a plain mean.

    Returns:
        (mean negative log-likelihood, accuracy) over all samples in ``loader``.

    Raises:
        ValueError: if ``models`` is empty, ``weights`` is malformed, or the loader
            yields no samples.
    """
    models = list(models)
    if not models:
        raise ValueError("ensemble_eval needs at least one model")

    mix = None
    if weights is not None:
        # float32 keeps this usable on backends without float64 support.
        mix = torch.as_tensor(weights, dtype=torch.float32, device=device).flatten()
        if mix.numel() != len(models):
            raise ValueError(
                f"expected {len(models)} weights, got {mix.numel()}"
            )
        if bool((mix < 0).any()) or float(mix.sum()) <= 0.0:
            raise ValueError("weights must be non-negative and sum to a positive value")
        # Normalise so the combination is still a probability distribution.
        mix = (mix / mix.sum()).view(-1, 1, 1)

    for m in models:
        m.eval()
    total, correct, loss_sum = 0, 0, 0.0

    for xb, yb in loader:
        xb, yb = xb.to(device), yb.to(device)

        # average probabilities (more stable than averaging logits)
        probs = torch.stack([F.softmax(m(xb), dim=1) for m in models], dim=0)  # [M, B, C]
        if mix is None:
            avg_probs = probs.mean(dim=0)                                      # [B, C]
        else:
            avg_probs = (probs * mix.to(probs.dtype)).sum(dim=0)               # [B, C]

        # accuracy
        pred = avg_probs.argmax(dim=1)
        correct += (pred == yb).sum().item()
        total += yb.size(0)

        # cross-entropy on the averaged probabilities; gather picks each sample's
        # true-class probability without building a separate index tensor.
        true_probs = avg_probs.gather(1, yb.unsqueeze(1)).squeeze(1)
        loss_sum += (-torch.log(true_probs + 1e-12)).sum().item()

    if total == 0:
        raise ValueError("loader yielded no samples")
    return loss_sum / total, correct / total

# --- evaluate ensemble of (base model) + (SWA averaged) ---
models = [model, swa_model.module]  # base + SWA
# If `model` was replaced by the SWA weights in an earlier cell, both members are the
# same network and the "ensemble" only reproduces the single-model result. Say so
# explicitly instead of reporting it as an ensemble gain.
if model is swa_model.module or all(
    torch.equal(a, b)
    for a, b in zip(model.state_dict().values(), swa_model.module.state_dict().values())
):
    print("warning: base and SWA members hold identical weights; ensemble is degenerate")
ens_loss, ens_acc = ensemble_eval(models, test_loader, device)
print(f"Ensemble (base + SWA) | test_loss {ens_loss:.4f} | test_acc {ens_acc:.4f}")

Ensemble (base + SWA) | test_loss 0.2164 | test_acc 0.9330


In [321]:
# Weighted (0.6 / 0.4) ensemble probabilities for a single batch.
# NOTE(review): `xb` is not defined in this cell; it is whatever batch an earlier
# cell left in the kernel namespace (presumably the last training batch of the SWA
# loop — confirm), so this cell fails on a fresh kernel unless those cells ran first.
weights = torch.tensor([0.6, 0.4], device=device).view(2, 1, 1)  # [M, 1, 1] broadcasts over [M, B, C]
for _member in models:
    _member.eval()  # inference: disable dropout and use BatchNorm running statistics
with torch.no_grad():  # no autograd graph is needed for these forward passes
    logits_list = [m(xb) for m in models]
    probs_list = [F.softmax(lg, dim=1) for lg in logits_list]
    avg_probs = (torch.stack(probs_list, dim=0) * weights).sum(dim=0)

In [322]:
# Persist only the weights (state_dict) of each network, not the module objects.
# Base network:
torch.save(obj=model.state_dict(), f="kmnist_mlp_base.pt")

# SWA network (the averaged copy held inside the AveragedModel wrapper):
torch.save(obj=swa_model.module.state_dict(), f="kmnist_mlp_swa.pt")