Imports and setup

In [1]:
import random
import os
import torch
from torch import nn, Tensor
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from torchvision.transforms import v2
from torch.backends import cudnn
from torch import GradScaler
from torch import optim
from tqdm import tqdm
import numpy as np
import pickle
from torchvision.transforms import v2



# Seed every random number generator the notebook uses so runs are repeatable.

seed = 42
for _seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    _seeder(seed)

# Ask cuDNN for deterministic kernels and switch off its benchmark autotuner.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
print(f"Random seed set to {seed}")

Random seed set to 42


Device setup


In [2]:
device = torch.accelerator.current_accelerator() if torch.accelerator.is_available() else torch.device("cpu")
enable_half = device.type != "cpu"
scaler = GradScaler(device, enabled=enable_half)

print("Grad scaler is enabled:", enable_half)
device

Grad scaler is enabled: True


device(type='cuda')

Dataset loading


In [3]:
# Pick the pickle locations: the Kaggle competition input directory when both
# Kaggle directories exist, otherwise a local "data" folder.
running_on_kaggle = os.path.exists("/kaggle/input") and os.path.exists("/kaggle/working")

if running_on_kaggle:
    print("Running on Kaggle.")
    SVHN_train, SVHN_test = (
        "/kaggle/input/fii-atnn-2025-competition-2/SVHN_train.pkl",
        "/kaggle/input/fii-atnn-2025-competition-2/SVHN_test.pkl",
    )
else:
    print("Not on Kaggle.")
    SVHN_train, SVHN_test = ("data/SVHN_train.pkl", "data/SVHN_test.pkl")

Running on Kaggle.


In [4]:
class SVHN_Dataset(Dataset):
    """Dataset backed by one of the two SVHN pickle files.

    The pickle is loaded fully into memory and indexed as a sequence of
    ``(image, label)`` pairs.

    Args:
        train: load ``SVHN_train`` when true, ``SVHN_test`` otherwise.
        transforms: optional callable applied to each image on access; when
            ``None`` the stored image is returned unchanged.
    """

    def __init__(self, train: bool, transforms: "v2.Transform | None" = None):
        path = SVHN_train if train else SVHN_test
        # NOTE(security): pickle.load can execute arbitrary code; only point
        # this at pickle files from a trusted source.
        with open(path, "rb") as fd:
            self.data = pickle.load(fd)

        self.transforms = transforms

    def __len__(self) -> int:
        """Number of stored samples."""
        return len(self.data)

    def __getitem__(self, i: int):
        """Return ``(image, label)`` for sample ``i``, transforming the image if configured."""
        image, label = self.data[i]
        if self.transforms is None:
            return image, label
        return self.transforms(image), label

Data preprocessing and dataloader

Experiment 1

In [5]:
# # basic_transforms = v2.Compose([
# #     v2.ToImage(),
# #     v2.ToDtype(torch.float32, scale=True),
# #     v2.Normalize((0.5, 0.5, 0.5), (0.25, 0.25, 0.25), inplace=True)
# # ])

# # Training transforms (with mirroring)
# train_transforms = v2.Compose([
#     v2.ToImage(),
#     v2.ToDtype(torch.float32, scale=True),
#     v2.RandomHorizontalFlip(),  # <-- mirroring
#     v2.Normalize((0.5, 0.5, 0.5), (0.25, 0.25, 0.25), inplace=True)
# ])

# # Test transforms (no mirroring)
# test_transforms = v2.Compose([
#     v2.ToImage(),
#     v2.ToDtype(torch.float32, scale=True),
#     v2.Normalize((0.5, 0.5, 0.5), (0.25, 0.25, 0.25), inplace=True)
# ])

# # Datasets
# train_set = SVHN_Dataset(train=True, transforms=train_transforms)
# test_set = SVHN_Dataset(train=False, transforms=test_transforms)

# # DataLoaders
# train_loader = DataLoader(train_set, batch_size=64, shuffle=True)
# test_loader = DataLoader(test_set, batch_size=500)

Experiment 2

In [6]:
# Per-channel normalisation statistics shared by the train and test pipelines.
norm_mean = (0.5, 0.5, 0.5)
norm_std = (0.25, 0.25, 0.25)

# Training pipeline: tensor conversion, then random geometric and colour
# augmentation, then normalisation.
# NOTE(review): horizontal flipping mirrors the digit images — confirm this is intended.
train_transforms = v2.Compose([
    v2.ToImage(),
    v2.ToDtype(torch.float32, scale=True),
    v2.RandomCrop(32, padding=4),
    v2.RandomHorizontalFlip(p=0.5),
    v2.RandomAffine(degrees=10, translate=(0.1, 0.1)),
    v2.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
    v2.Normalize(norm_mean, norm_std),
])

# Test pipeline: conversion and normalisation only, no random augmentation.
test_transforms = v2.Compose([
    v2.ToImage(),
    v2.ToDtype(torch.float32, scale=True),
    v2.Normalize(norm_mean, norm_std, inplace=True),
])

# Datasets
train_set = SVHN_Dataset(train=True, transforms=train_transforms)
test_set = SVHN_Dataset(train=False, transforms=test_transforms)

# DataLoaders: only the training data is shuffled; the test loader keeps
# dataset order.
train_loader = DataLoader(train_set, batch_size=64, shuffle=True)
test_loader = DataLoader(test_set, batch_size=500)

In [7]:
class VGG13(nn.Module):
    """VGG-13-style CNN with batch normalisation and a single linear head.

    Five blocks, each made of two ``Conv2d(3x3, padding=1) -> BatchNorm2d ->
    ReLU`` stages followed by a 2x2 max-pool. The five pools halve the spatial
    size five times, so the ``Linear`` head (512 input features) expects
    32x32 inputs, which collapse to 1x1 after the last pool.

    All modules live in one flat ``nn.Sequential`` so the ``state_dict`` keys
    are ``layers.0`` ... ``layers.36``.

    Args:
        num_classes: width of the output layer. Defaults to 100, the value
            that was previously hard-coded.
            NOTE(review): SVHN digit labels normally span 10 classes — confirm
            whether 100 outputs is intentional.
    """

    # Output channels of each of the five convolutional blocks.
    _BLOCK_WIDTHS = (64, 128, 256, 512, 512)

    def __init__(self, num_classes: int = 100):
        super().__init__()

        modules = []
        in_channels = 3
        for width in self._BLOCK_WIDTHS:
            # Two conv stages per block; the first one changes the channel count.
            for _ in range(2):
                modules += [
                    nn.Conv2d(in_channels, width, kernel_size=3, padding=1),
                    nn.BatchNorm2d(width),
                    nn.ReLU(inplace=True),
                ]
                in_channels = width
            modules.append(nn.MaxPool2d(kernel_size=2, stride=2))

        # Classifier
        modules += [nn.Flatten(), nn.Linear(in_channels, num_classes)]

        self.layers = nn.Sequential(*modules)

    def forward(self, x: Tensor) -> Tensor:
        """Return the ``(batch, num_classes)`` logits for a batch of images."""
        return self.layers(x)


Experiment 3: CutMix and MixUp

In [8]:
import torch.nn.functional as F

def rand_bbox(size, lam):
    """Sample a random box whose area is roughly ``(1 - lam)`` of the image.

    ``size`` is indexed at positions 2 and 3 for the two spatial extents. The
    returned ``(bbx1, bby1, bbx2, bby2)`` gives start/stop coordinates along
    position 2 (the ``bbx`` pair) and position 3 (the ``bby`` pair), clipped
    to stay inside the image.
    """
    extent_x, extent_y = size[2], size[3]

    # Each side is scaled by sqrt(1 - lam) so the box area scales by (1 - lam).
    side_ratio = np.sqrt(1. - lam)
    half_x = int(extent_x * side_ratio) // 2
    half_y = int(extent_y * side_ratio) // 2

    # Box centre is uniform over the image; the x draw comes before the y draw.
    centre_x = np.random.randint(extent_x)
    centre_y = np.random.randint(extent_y)

    return (
        np.clip(centre_x - half_x, 0, extent_x),
        np.clip(centre_y - half_y, 0, extent_y),
        np.clip(centre_x + half_x, 0, extent_x),
        np.clip(centre_y + half_y, 0, extent_y),
    )


def mixup_data(x, y, alpha=1.0):
    """Blend each sample with a randomly chosen partner from the same batch.

    Returns ``(mixed_x, y_a, y_b, lam)``: the blended inputs, the original
    labels, the partners' labels and the blend weight drawn from
    ``Beta(alpha, alpha)``. When ``alpha <= 0`` the batch is returned
    untouched with ``lam = 1.0``.
    """
    if alpha <= 0:
        return x, y, y, 1.0

    lam = np.random.beta(alpha, alpha)
    # A random permutation of the batch picks each sample's partner.
    perm = torch.randperm(x.size(0)).to(x.device)
    partners = x[perm, :]

    blended = lam * x + (1 - lam) * partners
    return blended, y, y[perm], lam


def cutmix_data(x, y, alpha=1.0):
    """Paste a random patch from a shuffled copy of the batch into each sample.

    Args:
        x: input batch; the patch is cut from dimensions 2 and 3.
        y: labels aligned with ``x``.
        alpha: Beta-distribution parameter; ``alpha <= 0`` disables mixing.

    Returns:
        ``(mixed_x, y_a, y_b, lam)`` where ``y_a`` are the original labels,
        ``y_b`` the labels of the samples the patches came from, and ``lam``
        (a Python float) is the fraction of each image that is still
        original, recomputed from the clipped box.

    The input ``x`` is left unmodified; the patch is pasted into a clone.
    """
    if alpha <= 0:
        return x, y, y, 1.0

    lam = np.random.beta(alpha, alpha)
    index = torch.randperm(x.size(0)).to(x.device)

    bbx1, bby1, bbx2, bby2 = rand_bbox(x.size(), lam)

    # Work on a copy so the caller's batch is not overwritten in place.
    mixed_x = x.clone()
    mixed_x[:, :, bbx1:bbx2, bby1:bby2] = x[index, :, bbx1:bbx2, bby1:bby2]

    # The box may have been clipped at the border, so derive lam from the
    # area that was actually replaced.
    patch_area = int(bbx2 - bbx1) * int(bby2 - bby1)
    lam = 1.0 - patch_area / (x.size(-1) * x.size(-2))

    return mixed_x, y, y[index], lam


def criterion_mixup_cutmix(criterion, preds, y_a, y_b, lam):
    """Weight the loss against both label sets: ``lam`` for ``y_a``, ``1 - lam`` for ``y_b``."""
    loss_a = criterion(preds, y_a)
    loss_b = criterion(preds, y_b)
    return lam * loss_a + (1 - lam) * loss_b


Model setup: Basic setup

In [9]:
# model = VGG13().to(device)
# model = torch.jit.script(model)
# criterion = nn.CrossEntropyLoss()
# optimizer = optim.SGD(model.parameters(), lr=0.001, fused=True)

Experiment 1: Using Adam optimizer


In [10]:
# model = VGG13().to(device)
# # Remove scripting if using features that may not be compatible with Adam
# # model = torch.jit.script(model)

# criterion = nn.CrossEntropyLoss()

# # Use Adam optimizer
# optimizer = optim.Adam(model.parameters(), lr=0.001)

# # Optional: Learning rate scheduler (ReduceLROnPlateau)
# scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
#     optimizer, mode='max', factor=0.5, patience=2
# )


Experiment 2: Adam + CosineAnnealing + LabelSmoothing

In [11]:
# Experiment 2 configuration: Adam, cosine-annealed learning rate and
# label-smoothed cross-entropy.
model = VGG13().to(device)

criterion = nn.CrossEntropyLoss(label_smoothing=0.1)

optimizer = optim.Adam(model.parameters(), lr=0.001)

# The learning rate follows a cosine curve from 0.001 down to 1e-5 over 100
# scheduler steps.
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=100, eta_min=1e-5)


Experiment 1: Training function

In [12]:
# def train():
#     model.train()
#     correct = 0
#     total = 0

#     for inputs, targets in train_loader:
#         inputs, targets = inputs.to(device, non_blocking=True), targets.to(device, non_blocking=True)
#         with torch.autocast(device.type, enabled=enable_half):
#             outputs = model(inputs)
#             loss = criterion(outputs, targets)
#         scaler.scale(loss).backward()
#         scaler.step(optimizer)
#         scaler.update()
#         optimizer.zero_grad()

#         predicted = outputs.argmax(1)
#         total += targets.size(0)
#         correct += predicted.eq(targets).sum().item()

#     return 100.0 * correct / total

Experiment 3: Training function with CutMix and MixUp


In [13]:
def train():
    """Run one training epoch over ``train_loader``.

    Each batch is randomly left as is, mixed with MixUp, or mixed with CutMix
    (equal odds). Uses the notebook-level ``model``, ``optimizer``,
    ``criterion`` and ``scaler``.

    Returns:
        ``(accuracy_percent, mean_loss)``. For mixed batches the accuracy is a
        soft score: matches against both label sets weighted by ``lam``.
    """
    mixers = {"mixup": mixup_data, "cutmix": cutmix_data}

    model.train()
    seen = 0
    hits = 0
    loss_sum = 0.0

    for batch_x, batch_y in train_loader:
        batch_x = batch_x.to(device, non_blocking=True)
        batch_y = batch_y.to(device, non_blocking=True)
        optimizer.zero_grad(set_to_none=True)

        # Pick this batch's augmentation; "none" has no entry in the table.
        mixer = mixers.get(random.choice(["none", "mixup", "cutmix"]))
        is_mixed = mixer is not None
        if is_mixed:
            batch_x, labels_a, labels_b, lam = mixer(batch_x, batch_y, alpha=1.0)

        with torch.autocast(device_type=device.type, enabled=enable_half):
            logits = model(batch_x)
            if is_mixed:
                loss = criterion_mixup_cutmix(criterion, logits, labels_a, labels_b, lam)
            else:
                loss = criterion(logits, batch_y)

        # Scaled backward pass and optimizer step for mixed precision.
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        batch_size = batch_y.size(0)
        loss_sum += loss.item() * batch_size
        seen += batch_size

        guesses = logits.argmax(1)
        if is_mixed:
            # Mixed-label accuracy is not exact, so use a lam-weighted soft count.
            hits_a = guesses.eq(labels_a).sum().item()
            hits_b = guesses.eq(labels_b).sum().item()
            hits += lam * hits_a + (1 - lam) * hits_b
        else:
            hits += guesses.eq(batch_y).sum().item()

    return 100.0 * hits / seen, loss_sum / seen


Inference

In [14]:
@torch.inference_mode()
def inference():
    """Predict a class index for every sample in ``test_loader``.

    Puts the notebook-level ``model`` in eval mode, ignores the labels the
    loader yields, and returns the arg-max predictions as one flat list in
    loader order.
    """
    model.eval()

    predictions = []
    for batch, _ in test_loader:
        batch = batch.to(device, non_blocking=True)
        with torch.autocast(device.type, enabled=enable_half):
            logits = model(batch)
        predictions += logits.argmax(1).tolist()

    return predictions

Experiment 1:

In [15]:
# best = 0.0
# best_epoch = 0
# epochs = list(range(50))  # increase epochs; early stopping will break early
# patience = 3
# no_improve = 0

# with tqdm(epochs) as tbar:
#     for epoch in tbar:
#         train_acc = train()

#         # Step the LR scheduler
#         scheduler.step(train_acc)

#         # Checkpoint saving
#         if train_acc > best:
#             best = train_acc
#             best_epoch = epoch
#             torch.save(model.state_dict(), "best_model.pth")
#             no_improve = 0
#         else:
#             no_improve += 1

#         # Early stopping
#         if no_improve >= patience:
#             print(f"Early stopping at epoch {epoch}")
#             break

#         tbar.set_description(f"Train: {train_acc:.2f}, Best: {best:.2f} at epoch {best_epoch}")


Experiment 2:

In [16]:
# best = 0.0
# best_epoch = 0
# epochs = list(range(100))  # increase epochs; early stopping will break early
# patience = 3
# no_improve = 0

# with tqdm(epochs) as tbar:
#     for epoch in tbar:
#         train_acc = train()

#         # Step the LR scheduler
#         scheduler.step(train_acc)

#         # Checkpoint saving
#         if train_acc > best:
#             best = train_acc
#             best_epoch = epoch
#             torch.save(model.state_dict(), "best_model.pth")
#             no_improve = 0
#         else:
#             no_improve += 1

#         # Early stopping
#         if no_improve >= patience:
#             print(f"Early stopping at epoch {epoch}")
#             break

#         tbar.set_description(f"Train: {train_acc:.2f}, Best: {best:.2f} at epoch {best_epoch}")


Experiment 3

In [17]:
# Experiment 3 driver: train for up to 100 epochs, checkpoint whenever the
# training accuracy reaches a new high, and stop after `patience` epochs in a
# row without one.
best = 0.0
best_epoch = 0
epochs = list(range(100))  # upper bound; early stopping usually ends the run sooner
patience = 3
no_improve = 0

with tqdm(epochs) as tbar:
    for epoch in tbar:
        # train() returns (accuracy, mean loss); only the accuracy is used here.
        train_acc, _ = train()

        # CosineAnnealingLR advances once per epoch and takes no metric.
        scheduler.step()

        improved = train_acc > best
        if improved:
            # New best: remember it and save the weights.
            best, best_epoch = train_acc, epoch
            torch.save(model.state_dict(), "best_model.pth")
        no_improve = 0 if improved else no_improve + 1

        # Early stopping
        if no_improve >= patience:
            print(f"Early stopping at epoch {epoch}")
            break

        tbar.set_description(f"Train: {train_acc:.2f}, Best: {best:.2f} at epoch {best_epoch}")


Train: 57.26, Best: 57.81 at epoch 38:  41%|████      | 41/100 [49:11<1:10:47, 72.00s/it]

Early stopping at epoch 41





In [18]:
# # Load the best checkpoint
# if os.path.exists("best_model.pth"):
#     model.load_state_dict(torch.load("best_model.pth"))
#     print("Loaded best model checkpoint.")
# else:
#     print("No checkpoint found, using current model.")

# Prepare submission: one row per test sample, with ID equal to the sample's
# position in the test loader's order.
# NOTE(review): predictions come from the weights currently held by `model`;
# the "best_model.pth" checkpoint is not reloaded before inference.
predictions = inference()
data = {
    "ID": list(range(len(predictions))),
    "target": predictions,
}

df = pd.DataFrame(data)

# Write to the Kaggle working directory when it exists; otherwise fall back to
# the current directory so the notebook also completes off Kaggle.
output_dir = "/kaggle/working" if os.path.isdir("/kaggle/working") else "."
df.to_csv(os.path.join(output_dir, "submission.csv"), index=False)

