This is after we got a 0.78 in the last run on 8th march.
MODEL

This model got 0.84517

In [6]:
# model.py
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

# ------------------------------------------------------------------------
#                       Squeeze-and-Excitation
# ------------------------------------------------------------------------
class SEBlock(nn.Module):
    def __init__(self, channels, reduction=16):
        super().__init__()
        self.fc1 = nn.Linear(channels, channels // reduction)
        self.fc2 = nn.Linear(channels // reduction, channels)

    def forward(self, x):
        b, c, _, _ = x.size()
        # Squeeze: global spatial average pooling
        squeeze = x.view(b, c, -1).mean(dim=2)
        # Excitation: two FC layers with ReLU and Sigmoid activations
        excitation = F.relu(self.fc1(squeeze))
        excitation = torch.sigmoid(self.fc2(excitation)).view(b, c, 1, 1)
        return x * excitation

# ------------------------------------------------------------------------
#                       Stochastic Depth
# ------------------------------------------------------------------------
class StochasticDepth(nn.Module):
    """Drops residual branch with probability p."""
    def __init__(self, p: float = 0.1):
        super().__init__()
        self.p = p

    def forward(self, x, residual):
        if not self.training or self.p == 0.0:
            return x + residual
        if torch.rand(1).item() < self.p:
            return x
        else:
            return x + residual

# ------------------------------------------------------------------------
#       PreAct Residual Block with SE + StochasticDepth
# ------------------------------------------------------------------------
class PreActBlock(nn.Module):
    def __init__(self, in_planes, out_planes, stride=1, drop_prob=0.0):
        super().__init__()
        self.bn1 = nn.BatchNorm2d(in_planes)
        self.se = SEBlock(in_planes)
        self.conv1 = nn.Conv2d(in_planes, out_planes, kernel_size=3,
                               stride=stride, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_planes)
        self.conv2 = nn.Conv2d(out_planes, out_planes, kernel_size=3,
                               stride=1, padding=1, bias=False)

        self.shortcut = None
        if stride != 1 or in_planes != out_planes:
            self.shortcut = nn.Conv2d(in_planes, out_planes, kernel_size=1,
                                      stride=stride, padding=0, bias=False)

        # Stochastic Depth probability
        self.sd = StochasticDepth(p=drop_prob)

    def forward(self, x):
        out = F.relu(self.bn1(x))
        out = self.se(out)  # Squeeze-Excitation on pre-activated features
        shortcut = x if self.shortcut is None else self.shortcut(out)
        out = self.conv1(out)
        out = F.relu(self.bn2(out))
        out = self.conv2(out)
        return self.sd(shortcut, out)

# ------------------------------------------------------------------------
#       Modified LightResNet for CIFAR-10 with ~4.4M parameters
#       (Channels: 40, 80, 160, 320)
# ------------------------------------------------------------------------
class LightResNet18_v2(nn.Module):
    def __init__(self, num_classes=10, drop_prob=0.1):
        super().__init__()
        # Increase the base channel width to 40 (instead of 32)
        self.in_planes = 45
        self.conv1 = nn.Conv2d(3, 45, kernel_size=3, stride=1,
                               padding=1, bias=False)

        # Four layers with 2 blocks each; channels increase as 40 -> 80 -> 160 -> 320.
        self.layer1 = self._make_layer(45, 2, stride=1, base_p=drop_prob * 1/4)
        self.layer2 = self._make_layer(90, 2, stride=2, base_p=drop_prob * 2/4)
        self.layer3 = self._make_layer(170, 2, stride=2, base_p=drop_prob * 3/4)
        self.layer4 = self._make_layer(340, 2, stride=2, base_p=drop_prob * 4/4)

        self.bn = nn.BatchNorm2d(340)
        self.linear = nn.Linear(340, num_classes)

        # Weight initialization
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out')
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 0.01)
                nn.init.constant_(m.bias, 0)

    def _make_layer(self, out_planes, blocks, stride, base_p):
        strides = [stride] + [1] * (blocks - 1)
        layers = []
        for i in range(blocks):
            # Increase dropout probability linearly across blocks
            block_p = base_p * (i + 1) / blocks
            layers.append(PreActBlock(self.in_planes, out_planes,
                                      stride=strides[i],
                                      drop_prob=block_p))
            self.in_planes = out_planes
        return nn.Sequential(*layers)

    def forward(self, x):
        out = self.conv1(x)
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        out = F.relu(self.bn(out))
        out = F.adaptive_avg_pool2d(out, 1).view(out.size(0), -1)
        out = self.linear(out)
        return out

# Quick parameter check
if __name__ == "__main__":
    model = LightResNet18_v2(num_classes=10, drop_prob=0.1)
    x = torch.randn(2, 3, 32, 32)
    y = model(x)
    print("Output shape:", y.shape)
    total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"Total trainable parameters: {total_params:,} (~{total_params/1e6:.2f}M)")


Output shape: torch.Size([2, 10])
Total trainable parameters: 4,998,947 (~5.00M)


In [7]:
# train.py
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
from torchvision import datasets
import numpy as np
import pickle
from torch.utils.data import DataLoader, Dataset
from PIL import Image
#from model import LightResNet18
from tqdm import tqdm

# We'll define a custom collate function for MixUp/CutMix
# randomly picks one method for each batch
def rand_bbox(size, lam):
    W = size[3]
    H = size[2]
    cut_rat = np.sqrt(1. - lam)
    cut_w = int(W * cut_rat)
    cut_h = int(H * cut_rat)

    cx = np.random.randint(W)
    cy = np.random.randint(H)

    bbx1 = np.clip(cx - cut_w // 2, 0, W)
    bby1 = np.clip(cy - cut_h // 2, 0, H)
    bbx2 = np.clip(cx + cut_w // 2, 0, W)
    bby2 = np.clip(cy + cut_h // 2, 0, H)
    return bbx1, bby1, bbx2, bby2

def mixup_data(x, y, alpha=1.0):
    lam = np.random.beta(alpha, alpha)
    batch_size = x.size(0)
    index = torch.randperm(batch_size)
    mixed_x = lam * x + (1 - lam) * x[index, :]
    y_a, y_b = y, y[index]
    return mixed_x, y_a, y_b, lam

def cutmix_data(x, y, alpha=1.0):
    lam = np.random.beta(alpha, alpha)
    batch_size = x.size(0)
    index = torch.randperm(batch_size)
    bbx1, bby1, bbx2, bby2 = rand_bbox(x.size(), lam)
    x[:, :, bby1:bby2, bbx1:bbx2] = x[index, :, bby1:bby2, bbx1:bbx2]
    lam = 1 - ((bbx2 - bbx1) * (bby2 - bby1) / (x.size(-1) * x.size(-2)))
    y_a, y_b = y, y[index]
    return x, y_a, y_b, lam

def mixup_cutmix_collate(batch, alpha=1.0, p=0.5):
    # Standard collate
    images, labels = list(zip(*batch))
    images = torch.stack(images, 0)
    labels = torch.tensor(labels, dtype=torch.long)

    # Flip a coin to choose MixUp or CutMix
    if np.random.rand() < p:
        # MixUp
        mixed_x, y_a, y_b, lam = mixup_data(images, labels, alpha)
        return mixed_x, (y_a, y_b, lam, 'mixup')
    else:
        # CutMix
        cutmix_x, y_a, y_b, lam = cutmix_data(images, labels, alpha)
        return cutmix_x, (y_a, y_b, lam, 'cutmix')

def unpickle(file):
    with open(file, 'rb') as fo:
        data_dict = pickle.load(fo, encoding='bytes')
    return data_dict

class CIFARDataset(Dataset):
    def __init__(self, data, labels, transform=None):
        self.data = data.reshape(-1, 3, 32, 32).astype("float32") / 255.0
        self.labels = labels
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        img = self.data[idx]*255
        img = img.astype("uint8")
        # shape: (3,32,32) => for PIL => (32,32,3)
        img = Image.fromarray(img.transpose(1,2,0))
        label = self.labels[idx]
        if self.transform:
            img = self.transform(img)
        return img, label

def load_cifar10_batches(root_dir):
    data_list, labels_list = [], []
    for i in range(1, 6):
        batch_file = f"{root_dir}/data_batch_{i}"
        batch = unpickle(batch_file)
        data_list.append(batch[b'data'])
        labels_list.extend(batch[b'labels'])
    X = np.concatenate(data_list, axis=0)
    y = np.array(labels_list)
    return X, y

def train():
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f"Using device: {device}")

    # 2) Transforms: RandAugment or AutoAugment
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.AutoAugment(transforms.AutoAugmentPolicy.CIFAR10),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2470, 0.2435, 0.2616)),
        transforms.RandomErasing(p=0.5, scale=(0.02, 0.1), value='random')
    ])
    transform_val = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2470, 0.2435, 0.2616))
    ])

    #train_dataset = CIFARDataset(X_train, y_train, transform=transform_train)
    #val_dataset   = CIFARDataset(X_val,   y_val,   transform=transform_val)
    train_dataset = datasets.CIFAR10(root='./data', train=True,
                                 download=True, transform=transform_train)
    val_dataset = datasets.CIFAR10(root='./data', train=False,
                                download=True, transform=transform_val)
    # Collate with MixUp & CutMix
    train_loader = DataLoader(train_dataset, batch_size=128,
                              shuffle=True, num_workers=4,
                              collate_fn=lambda b: mixup_cutmix_collate(b, alpha=1.0, p=0.5))
    val_loader   = DataLoader(val_dataset,   batch_size=128,
                              shuffle=False, num_workers=4)

    # 3) Initialize model
    model = LightResNet18_v2(num_classes=10, drop_prob=0.1).to(device)

    # 4) Loss, Optimizer, Scheduler
    # We'll handle MixUp/CutMix label logic manually, but also do label_smoothing
    base_criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
    optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9, nesterov=True, weight_decay=5e-4)
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=300)

    best_acc = 0.0
    num_epochs = 300

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        correct_approx = 0.0
        total_approx = 0

        train_pbar = tqdm(train_loader, desc=f"Epoch [{epoch+1}/{num_epochs}]", leave=False)
        for images, label_info in train_pbar:
            # label_info = (y_a, y_b, lam, method)
            y_a, y_b, lam, method = label_info
            images = images.to(device)
            y_a = y_a.to(device)
            y_b = y_b.to(device)

            optimizer.zero_grad()
            outputs = model(images)

            # MixUp/CutMix combined loss
            loss = lam * base_criterion(outputs, y_a) + (1 - lam) * base_criterion(outputs, y_b)
            loss.backward()
            optimizer.step()

            running_loss += loss.item() * images.size(0)
            _, preds = torch.max(outputs, 1)
            # approximate accuracy counting
            correct_approx += (preds == y_a).sum().item() * lam + (preds == y_b).sum().item() * (1 - lam)
            total_approx   += images.size(0)

            train_pbar.set_postfix(loss=f"{loss.item():.3f}")

        scheduler.step()
        epoch_loss = running_loss / len(train_loader.dataset)
        epoch_acc  = 100.0 * correct_approx / total_approx

        # ----- Validation -----
        model.eval()
        val_correct = 0
        val_total = 0
        with torch.no_grad():
            for imgs, labels in val_loader:
                imgs, labels = imgs.to(device), labels.to(device)
                outputs = model(imgs)
                _, pred = torch.max(outputs, 1)
                val_correct += (pred == labels).sum().item()
                val_total   += labels.size(0)
        val_acc = 100.0 * val_correct / val_total

        print(f"Epoch {epoch+1}/{num_epochs} | "
              f"Train Loss: {epoch_loss:.4f}, Approx Train Acc: {epoch_acc:.2f}% | "
              f"Val Acc: {val_acc:.2f}%")

        if val_acc > best_acc:
            best_acc = val_acc
            torch.save(model.state_dict(), "best_model.pth")
            print(f"   [*] Saved new best model: Val Acc = {val_acc:.2f}%")

    print(f"Training complete. Best validation accuracy: {best_acc:.2f}%")
    print("Best model saved as best_model.pth")

if __name__ == "__main__":
    train()


Using device: cuda
Files already downloaded and verified
Files already downloaded and verified




Epoch 1/300 | Train Loss: 2.1575, Approx Train Acc: 21.78% | Val Acc: 39.47%
   [*] Saved new best model: Val Acc = 39.47%




Epoch 2/300 | Train Loss: 1.9978, Approx Train Acc: 32.19% | Val Acc: 54.28%
   [*] Saved new best model: Val Acc = 54.28%




KeyboardInterrupt: 

In [8]:
# inference.py
import torch
import torchvision.transforms as transforms
import pandas as pd
import pickle
import numpy as np
from torch.utils.data import DataLoader, Dataset
#from model import LightResNet18
import torch.nn.functional as F

def unpickle(file):
    with open(file, 'rb') as fo:
        data_dict = pickle.load(fo, encoding='bytes')
    return data_dict

class CIFARTestDataset(Dataset):
    def __init__(self, data, ids, transform=None):
        """
        data: shape (N, 32, 32, 3)
        ids: array/list of image IDs
        transform: transforms to apply
        """
        self.data = data
        self.ids = ids
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        img = self.data[idx].astype("uint8")  # ensure 0..255
        img_id = self.ids[idx]
        if self.transform:
            img = self.transform(img)
        return img, img_id

def inference():
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f"Using device: {device}")

    # 1) Load the best model
    model = LightResNet18_v2(num_classes=10, drop_prob=0.1).to(device)
    model.load_state_dict(torch.load("best_model.pth", map_location=device))
    model.eval()

    # 2) Load custom test set .pkl
    test_file = "/content/cifar_test_nolabel.pkl"
    test_dict = unpickle(test_file)
    print("Keys in test_dict:", test_dict.keys())

    # Reshape if it's (N, 3072). If it's already (N, 32,32,3), remove reshape
    test_images = test_dict[b'data'].reshape(-1, 32, 32, 3)
    test_ids = [str(i) for i in range(len(test_images))]

    # 3) Define test transforms
    transform_test = transforms.Compose([
        transforms.ToPILImage(),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2470, 0.2435, 0.2616))
    ])

    test_dataset = CIFARTestDataset(test_images, test_ids, transform=transform_test)
    test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False, num_workers=4)

    # 4) Inference with 2-pass TTA
    predictions = []
    image_ids = []
    with torch.no_grad():
        for imgs, ids in test_loader:
            imgs = imgs.to(device)

            # Pass 1: Normal
            out_normal = model(imgs)
            probs_normal = F.softmax(out_normal, dim=1)

            # Pass 2: Horizontal flip
            imgs_flipped = torch.flip(imgs, dims=[3])  # flip W dimension
            out_flipped = model(imgs_flipped)
            probs_flipped = F.softmax(out_flipped, dim=1)

            # Average probabilities
            final_probs = (probs_normal + probs_flipped) / 2.0
            _, predicted = torch.max(final_probs, 1)

            predictions.extend(predicted.cpu().numpy().tolist())
            image_ids.extend(ids)

    # 5) Save submission
    submission_df = pd.DataFrame({"ID": image_ids, "Labels": predictions})
    submission_df.to_csv("submission.csv", index=False)
    print("Submission file saved as submission.csv")

if __name__ == "__main__":
    inference()


Using device: cuda
Keys in test_dict: dict_keys([b'data', b'ids'])


  model.load_state_dict(torch.load("best_model.pth", map_location=device))


Submission file saved as submission.csv


In [9]:
import pandas as pd

# Load and inspect submission file
submission_df = pd.read_csv("submission.csv")
print(submission_df.head())  # Show first few rows
print(submission_df["Labels"].value_counts())  # Show label distribution


   ID  Labels
0   0       6
1   1       1
2   2       8
3   3       6
4   4       9
Labels
1    1062
3    1062
5    1054
8    1050
7    1036
9     981
4     974
6     953
2     946
0     882
Name: count, dtype: int64
