In [1]:
import pandas as pd
import torch
import copy
import torch.nn as nn
from torchvision import transforms
import numpy as np
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import torch.nn.functional as F

In [2]:
# Preprocessing pipelines for the 28x28 digit images.
# Both pipelines share the same normalisation constants (MNIST mean & std).
mnist_mean = (0.1307,)
mnist_std = (0.3081,)

# Training: light geometric augmentation before tensor conversion.
train_steps = [
    transforms.ToPILImage(),                            # array/tensor -> PIL image
    transforms.RandomRotation(10),                      # random rotation, degrees=10
    transforms.RandomAffine(0, translate=(0.1, 0.1)),   # random shift, no extra rotation
    transforms.ToTensor(),                              # back to tensor, scaled to [0,1]
    transforms.Normalize(mnist_mean, mnist_std),
]
train_transform = transforms.Compose(train_steps)

# Validation / inference: same conversion and normalisation, no augmentation.
val_steps = [
    transforms.ToPILImage(),
    transforms.ToTensor(),
    transforms.Normalize(mnist_mean, mnist_std),
]
val_transform = transforms.Compose(val_steps)

In [3]:
class DigitDataset(Dataset):
    """Labelled dataset of 28x28 single-channel digit images.

    Args:
        X: Array-like of pixel values, 784 values per sample (anything
            ``np.asarray`` accepts: ndarray, DataFrame, nested list, ...).
            Values are cast to ``uint8`` and reshaped to ``(N, 28, 28)``.
        y: Array-like of labels, one per sample. Converted to an ndarray so
            that lookups are always positional (a pandas Series with a
            shuffled index would otherwise be indexed by label).
        transform: Optional callable applied to each image in ``__getitem__``.

    Raises:
        ValueError: If ``X`` and ``y`` contain a different number of samples,
            or if ``X`` cannot be reshaped into 28x28 images.
    """

    def __init__(self, X, y, transform=None):
        self.X = np.asarray(X).astype(np.uint8).reshape(-1, 28, 28)  # keep as image-like array
        self.y = np.asarray(y)
        # __len__ reports len(y) while __getitem__ indexes X, so the two must agree.
        if len(self.X) != len(self.y):
            raise ValueError(
                f"X has {len(self.X)} samples but y has {len(self.y)} labels"
            )
        self.transform = transform

    def __len__(self):
        """Return the number of samples."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for position ``idx``, with the transform applied to the image if one is set."""
        image = self.X[idx]
        label = self.y[idx]

        # Explicit None check: a callable that happens to be falsy must still run.
        if self.transform is not None:
            image = self.transform(image)

        return image, label

In [4]:
# Seed the RNGs up front so that a fresh "Restart & Run All" reproduces the
# same split, shuffling, augmentation draws and weight initialisation.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

df = pd.read_csv('digit-recognizer/train.csv')  # replace with actual path

# First column holds the label, the remaining columns hold the pixels.
X = df.iloc[:, 1:].values  # pixels
y = df.iloc[:, 0].values   # labels

# Hold out 10% of the rows for validation.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=SEED)

train_dataset = DigitDataset(X_train, y_train, transform=train_transform)
# A dedicated seeded generator makes the per-epoch shuffle order reproducible
# regardless of how much of the global RNG other cells consume.
train_loader = DataLoader(
    train_dataset,
    batch_size=64,
    shuffle=True,
    generator=torch.Generator().manual_seed(SEED),
)

val_dataset = DigitDataset(X_val, y_val, transform=val_transform)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

In [5]:
class CNNModel(nn.Module):
    """Three-convolution CNN classifier for single-channel 28x28 images.

    The feature extractor widens the channels 1 -> 32 -> 64 -> 128 with 3x3
    convolutions (padding 1), channel dropout after every convolution, one
    batch-norm after the first activation and two 2x2 max-pools
    (28x28 -> 14x14 -> 7x7). The head flattens the 128x7x7 map and maps it
    through 256 hidden units to 10 output scores (no softmax is applied).
    """

    def __init__(self):
        super().__init__()

        feature_stack = [
            # stage 1: 28x28 -> 28x28
            nn.Conv2d(1, 32, kernel_size=3, padding=1),
            nn.Dropout2d(p=0.2),
            nn.ReLU(),
            nn.BatchNorm2d(32),
            # stage 2: 28x28 -> 28x28, then pooled to 14x14
            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.Dropout2d(p=0.2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            # stage 3: 14x14 -> 14x14, then pooled to 7x7
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.Dropout2d(p=0.2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        ]
        self.conv_layers = nn.Sequential(*feature_stack)

        head = [
            nn.Flatten(),
            nn.Linear(128 * 7 * 7, 256),
            nn.ReLU(),
            nn.Dropout(p=0.5),
            nn.Linear(256, 10),
        ]
        self.fc_layers = nn.Sequential(*head)

    def forward(self, x):
        """Run the feature extractor followed by the classification head."""
        features = self.conv_layers(x)
        return self.fc_layers(features)

In [6]:
# Model 1 training setup: AdamW with the learning rate halved every 2 epochs.
model1 = CNNModel()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model1.parameters(), lr=0.001, weight_decay=1e-5)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.5)

# Early-stopping bookkeeping.
patience = 5  # stop after 5 consecutive epochs without a new best accuracy
best_val_acc = 0
epochs_without_improvement = 0
best_model_wts = copy.deepcopy(model1.state_dict())

for epoch in range(100):  # upper bound; early stopping normally ends training sooner
    # ---- training pass ----
    model1.train()
    total_loss = 0
    for images, labels in train_loader:
        optimizer.zero_grad()
        outputs = model1(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()  # sum of per-batch losses over the epoch

    scheduler.step()

    # ---- validation pass ----
    model1.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for images, labels in val_loader:
            outputs = model1(images)
            preds = outputs.argmax(dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
    val_acc = correct / total

    print(f"Epoch {epoch+1}, Train Loss: {total_loss:.4f}, Val Acc: {val_acc:.4f}")

    # ---- early stopping ----
    if val_acc <= best_val_acc:
        # No strict improvement: count it and stop once patience is used up.
        epochs_without_improvement += 1
        if epochs_without_improvement >= patience:
            print("Early stopping triggered.")
            break
    else:
        # New best: snapshot the weights in memory and on disk, reset the counter.
        best_val_acc = val_acc
        best_model_wts = copy.deepcopy(model1.state_dict())
        epochs_without_improvement = 0
        torch.save(model1.state_dict(), "best_model.pt")

Epoch 1, Train Loss: 244.6045, Val Acc: 0.9829
Epoch 2, Train Loss: 100.7971, Val Acc: 0.9867
Epoch 3, Train Loss: 69.2830, Val Acc: 0.9893
Epoch 4, Train Loss: 57.7086, Val Acc: 0.9917
Epoch 5, Train Loss: 48.5856, Val Acc: 0.9919
Epoch 6, Train Loss: 45.5581, Val Acc: 0.9926
Epoch 7, Train Loss: 41.6774, Val Acc: 0.9931
Epoch 8, Train Loss: 40.7047, Val Acc: 0.9926
Epoch 9, Train Loss: 37.0766, Val Acc: 0.9929
Epoch 10, Train Loss: 36.0120, Val Acc: 0.9931
Epoch 11, Train Loss: 35.8690, Val Acc: 0.9931
Epoch 12, Train Loss: 34.5244, Val Acc: 0.9929
Early stopping triggered.


In [7]:
class CNNModel2(nn.Module):
    """CNN built from three double-convolution blocks and a pooled classifier.

    Each block applies two (3x3 conv -> batch-norm -> ReLU) steps, a 2x2
    max-pool and dropout(0.25). The channels grow 1 -> 64 -> 128 -> 256 while
    the spatial size shrinks 28x28 -> 14x14 -> 7x7 -> 3x3. Global average
    pooling reduces the map to 1x1 and the classifier maps 256 -> 128 -> 10
    output scores (no softmax is applied).
    """

    @staticmethod
    def _conv_block(in_channels, out_channels):
        """Build one (conv-BN-ReLU) x2 + max-pool + dropout block."""
        layers = []
        channels = in_channels
        for _ in range(2):
            layers.append(nn.Conv2d(channels, out_channels, kernel_size=3, padding=1))
            layers.append(nn.BatchNorm2d(out_channels))
            layers.append(nn.ReLU(inplace=True))
            channels = out_channels  # the second conv keeps the channel count
        layers.append(nn.MaxPool2d(kernel_size=2, stride=2))
        layers.append(nn.Dropout(0.25))
        return nn.Sequential(*layers)

    def __init__(self):
        super().__init__()

        self.block1 = self._conv_block(1, 64)     # 28x28 -> 14x14
        self.block2 = self._conv_block(64, 128)   # 14x14 -> 7x7
        self.block3 = self._conv_block(128, 256)  # 7x7 -> 3x3

        self.global_avg_pool = nn.AdaptiveAvgPool2d((1, 1))  # 3x3 -> 1x1

        head = [
            nn.Flatten(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(128, 10),
        ]
        self.classifier = nn.Sequential(*head)

    def forward(self, x):
        """Apply the three blocks in order, pool globally, then classify."""
        for stage in (self.block1, self.block2, self.block3):
            x = stage(x)
        pooled = self.global_avg_pool(x)
        return self.classifier(pooled)

In [8]:
# Model 2 training setup: AdamW with the learning rate halved every 3 epochs.
# Reuses `criterion` and `patience` defined in the Model 1 training cell.
model2 = CNNModel2()
optimizer2 = torch.optim.AdamW(model2.parameters(), lr=0.001, weight_decay=1e-5)
scheduler2 = torch.optim.lr_scheduler.StepLR(optimizer2, step_size=3, gamma=0.5)

# Early-stopping bookkeeping for this model.
best_val_acc2 = 0
epochs_without_improvement2 = 0
best_model_wts2 = copy.deepcopy(model2.state_dict())

for epoch in range(100):  # upper bound; early stopping normally ends training sooner
    model2.train()
    total_loss = 0

    for images, labels in train_loader:
        optimizer2.zero_grad()
        outputs = model2(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer2.step()
        total_loss += loss.item()  # sum of per-batch losses over the epoch

    scheduler2.step()

    # Validation step
    model2.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in val_loader:
            outputs = model2(images)
            _, preds = torch.max(outputs, 1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

    val_acc = correct / total

    print(f"Epoch {epoch+1}, Train Loss: {total_loss:.4f}, Val Acc: {val_acc:.4f}")

    # Early stopping check
    if val_acc > best_val_acc2:
        best_val_acc2 = val_acc
        best_model_wts2 = copy.deepcopy(model2.state_dict())
        epochs_without_improvement2 = 0
        # Use a model-specific file name: "best_model.pt" is already written by
        # the Model 1 training cell and would be overwritten here.
        torch.save(model2.state_dict(), "best_model2.pt")  # save best model
    else:
        epochs_without_improvement2 += 1
        if epochs_without_improvement2 >= patience:
            print("Early stopping triggered.")
            break

Epoch 1, Train Loss: 185.5704, Val Acc: 0.9838
Epoch 2, Train Loss: 55.6448, Val Acc: 0.9912
Epoch 3, Train Loss: 45.7892, Val Acc: 0.9879
Epoch 4, Train Loss: 30.6979, Val Acc: 0.9919
Epoch 5, Train Loss: 28.1653, Val Acc: 0.9926
Epoch 6, Train Loss: 26.3641, Val Acc: 0.9929
Epoch 7, Train Loss: 19.0993, Val Acc: 0.9938
Epoch 8, Train Loss: 17.3804, Val Acc: 0.9931
Epoch 9, Train Loss: 17.9168, Val Acc: 0.9943
Epoch 10, Train Loss: 14.1133, Val Acc: 0.9943
Epoch 11, Train Loss: 13.6612, Val Acc: 0.9940
Epoch 12, Train Loss: 14.2235, Val Acc: 0.9943
Epoch 13, Train Loss: 12.4773, Val Acc: 0.9948
Epoch 14, Train Loss: 11.7111, Val Acc: 0.9943
Epoch 15, Train Loss: 9.9794, Val Acc: 0.9948
Epoch 16, Train Loss: 10.6661, Val Acc: 0.9948
Epoch 17, Train Loss: 10.1872, Val Acc: 0.9948
Epoch 18, Train Loss: 11.0586, Val Acc: 0.9945
Early stopping triggered.


In [12]:
class CNNModel3(nn.Module):
    """Five-convolution CNN with one dilated layer and a pooled linear head.

    Every convolution is 3x3 and is followed by batch-norm and ReLU. The
    second convolution uses dilation 2 (with padding 2). Two 2x2 max-pools
    reduce 28x28 -> 14x14 -> 7x7. Global average pooling then produces a
    256-vector, which goes through dropout(0.4) and a linear layer to 10
    output scores (no softmax is applied).
    """

    def __init__(self):
        super().__init__()

        def conv_bn_relu(c_in, c_out, **conv_kwargs):
            # One 3x3 convolution followed by batch-norm and ReLU, as a flat list.
            return [
                nn.Conv2d(c_in, c_out, kernel_size=3, **conv_kwargs),
                nn.BatchNorm2d(c_out),
                nn.ReLU(),
            ]

        layers = []
        layers += conv_bn_relu(1, 32, padding=1)
        layers += conv_bn_relu(32, 64, padding=2, dilation=2)  # dilated conv
        layers.append(nn.MaxPool2d(2, 2))                      # 28x28 -> 14x14
        layers += conv_bn_relu(64, 128, padding=1)
        layers += conv_bn_relu(128, 128, padding=1)
        layers.append(nn.MaxPool2d(2, 2))                      # 14x14 -> 7x7
        layers += conv_bn_relu(128, 256, padding=1)
        self.conv_layers = nn.Sequential(*layers)

        self.global_avg_pool = nn.AdaptiveAvgPool2d((1, 1))  # output: (B, 256, 1, 1)

        head = [
            nn.Flatten(),        # (B, 256)
            nn.Dropout(0.4),
            nn.Linear(256, 10),  # final classification
        ]
        self.fc = nn.Sequential(*head)

    def forward(self, x):
        """Extract features, average them spatially, then classify."""
        feature_map = self.conv_layers(x)
        pooled = self.global_avg_pool(feature_map)
        return self.fc(pooled)


In [13]:
# Model 3 training setup: AdamW with the learning rate scaled by 0.6 every 3 epochs.
# Reuses `criterion` and `patience` defined in the Model 1 training cell.
model3 = CNNModel3()
optimizer3 = torch.optim.AdamW(model3.parameters(), lr=0.001, weight_decay=1e-5)
scheduler3 = torch.optim.lr_scheduler.StepLR(optimizer3, step_size=3, gamma=0.6)

# Early-stopping bookkeeping for this model.
best_val_acc3 = 0
epochs_without_improvement3 = 0
best_model_wts3 = copy.deepcopy(model3.state_dict())

for epoch in range(100):
    # ---- training pass ----
    model3.train()
    total_loss = 0
    for images, labels in train_loader:
        optimizer3.zero_grad()
        outputs = model3(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer3.step()
        total_loss += loss.item()  # sum of per-batch losses over the epoch

    scheduler3.step()

    # ---- validation pass ----
    model3.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for images, labels in val_loader:
            outputs = model3(images)
            preds = outputs.argmax(dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
    val_acc = correct / total

    print(f"[Model 3] Epoch {epoch+1}, Train Loss: {total_loss:.4f}, Val Acc: {val_acc:.4f}")

    # ---- early stopping ----
    if val_acc <= best_val_acc3:
        # No strict improvement: count it and stop once patience is used up.
        epochs_without_improvement3 += 1
        if epochs_without_improvement3 >= patience:
            print("Model 3: Early stopping triggered.")
            break
    else:
        # New best: snapshot the weights in memory and on disk, reset the counter.
        best_val_acc3 = val_acc
        best_model_wts3 = copy.deepcopy(model3.state_dict())
        epochs_without_improvement3 = 0
        torch.save(model3.state_dict(), "best_model3.pt")

[Model 3] Epoch 1, Train Loss: 145.4234, Val Acc: 0.9810
[Model 3] Epoch 2, Train Loss: 41.6911, Val Acc: 0.9857
[Model 3] Epoch 3, Train Loss: 33.2588, Val Acc: 0.9883
[Model 3] Epoch 4, Train Loss: 23.2935, Val Acc: 0.9926
[Model 3] Epoch 5, Train Loss: 21.1578, Val Acc: 0.9886
[Model 3] Epoch 6, Train Loss: 18.7398, Val Acc: 0.9848
[Model 3] Epoch 7, Train Loss: 15.2485, Val Acc: 0.9898
[Model 3] Epoch 8, Train Loss: 13.8384, Val Acc: 0.9924
[Model 3] Epoch 9, Train Loss: 13.7924, Val Acc: 0.9933
[Model 3] Epoch 10, Train Loss: 11.0826, Val Acc: 0.9940
[Model 3] Epoch 11, Train Loss: 9.7137, Val Acc: 0.9940
[Model 3] Epoch 12, Train Loss: 9.0619, Val Acc: 0.9931
[Model 3] Epoch 13, Train Loss: 8.3363, Val Acc: 0.9938
[Model 3] Epoch 14, Train Loss: 7.7449, Val Acc: 0.9940
[Model 3] Epoch 15, Train Loss: 8.2545, Val Acc: 0.9938
Model 3: Early stopping triggered.


In [16]:
# Restore the best Model 1 weights captured during training and re-score
# them on the validation set.
model1.load_state_dict(best_model_wts)
model1.eval()

correct, total = 0, 0
with torch.no_grad():
    for images, labels in val_loader:
        outputs = model1(images)
        preds = outputs.argmax(dim=1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)

val_acc = correct / total
print(f"[Model 1], Val Acc: {val_acc:.4f}")

[Model 1], Val Acc: 0.9931


In [17]:
# Restore the best Model 2 weights captured during training and re-score
# them on the validation set.
model2.load_state_dict(best_model_wts2)
model2.eval()

correct, total = 0, 0
with torch.no_grad():
    for images, labels in val_loader:
        outputs = model2(images)
        preds = outputs.argmax(dim=1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)

val_acc = correct / total
print(f"[Model 2], Val Acc: {val_acc:.4f}")

[Model 2], Val Acc: 0.9948


In [18]:
# Restore the best Model 3 weights captured during training and re-score
# them on the validation set.
model3.load_state_dict(best_model_wts3)
model3.eval()

correct, total = 0, 0
with torch.no_grad():
    for images, labels in val_loader:
        outputs = model3(images)
        preds = outputs.argmax(dim=1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)

val_acc = correct / total
print(f"[Model 3], Val Acc: {val_acc:.4f}")

[Model 3], Val Acc: 0.9940


In [19]:
# Persist the current weights of each ensemble member to its own file.
for net, weights_file in (
    (model1, 'model1.pth'),
    (model2, 'model2.pth'),
    (model3, 'model3.pth'),
):
    torch.save(net.state_dict(), weights_file)

In [20]:
# Reload each model's saved weights and switch it to inference mode.
for net, weights_file in (
    (model1, 'model1.pth'),
    (model2, 'model2.pth'),
    (model3, 'model3.pth'),
):
    net.load_state_dict(torch.load(weights_file))
    net.eval()

# Last expression: the cell displays the Model 3 architecture.
model3

CNNModel3(
  (conv_layers): Sequential(
    (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2))
    (4): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU()
    (6): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (7): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (9): ReLU()
    (10): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (12): ReLU()
    (13): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (14): Conv2d(128, 256, kernel_size=(3, 3), 

In [21]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [22]:
# Move every ensemble member onto the selected device
# (nn.Module.to returns the module itself, so rebinding keeps the same objects).
model1 = model1.to(device)
model2 = model2.to(device)
model3 = model3.to(device)

# Last expression: the cell displays the Model 3 architecture.
model3

CNNModel3(
  (conv_layers): Sequential(
    (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2))
    (4): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU()
    (6): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (7): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (9): ReLU()
    (10): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (12): ReLU()
    (13): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (14): Conv2d(128, 256, kernel_size=(3, 3), 

In [23]:
# Score the three-model ensemble on the validation set: average the
# per-model softmax probabilities and take the most likely class.
ensemble = (model1, model2, model3)
correct, total = 0, 0

with torch.no_grad():
    for images, labels in val_loader:
        images, labels = images.to(device), labels.to(device)

        # Class probabilities from each model, evaluated in model order.
        outputs1, outputs2, outputs3 = (
            F.softmax(net(images), dim=1) for net in ensemble
        )

        avg_output = (outputs1 + outputs2 + outputs3) / 3
        preds = avg_output.argmax(dim=1)

        correct += (preds == labels).sum().item()
        total += labels.size(0)

val_acc = correct / total

print(f"Validation Accuracy: {100 * correct / total:.2f}%")

Validation Accuracy: 99.60%


In [24]:
class TestDataset(Dataset):
    """Unlabelled dataset of 28x28 single-channel digit images.

    Args:
        X: Array-like of pixel values, 784 values per sample (anything
            ``np.asarray`` accepts: ndarray, DataFrame, nested list, ...).
            Values are cast to ``uint8`` and reshaped to ``(N, 28, 28)``.
        transform: Optional callable applied to each image in ``__getitem__``.

    Raises:
        ValueError: If ``X`` cannot be reshaped into 28x28 images.
    """

    def __init__(self, X, transform=None):
        # np.asarray lets callers pass a DataFrame or list directly, not only an ndarray.
        self.X = np.asarray(X).astype(np.uint8).reshape(-1, 28, 28)
        self.transform = transform

    def __len__(self):
        """Return the number of images."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the image at position ``idx``, with the transform applied if one is set."""
        image = self.X[idx]
        # Explicit None check: a callable that happens to be falsy must still run.
        if self.transform is not None:
            image = self.transform(image)
        return image

In [25]:
# Read the unlabelled test set; every column is a pixel value.
df_test = pd.read_csv('digit-recognizer/test.csv')  # shape: (28000, 784)
X_test = df_test.to_numpy()  # numpy array of shape (28000, 784)

# Inference uses the validation pipeline (no augmentation) and keeps row order.
test_dataset = TestDataset(X_test, transform=val_transform)
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=64,
    shuffle=False,
)

In [26]:
# Predict a label for every test image with the three-model ensemble:
# average the per-model softmax probabilities and take the most likely class.
predictions = []
ensemble = (model1, model2, model3)

with torch.no_grad():
    for images in test_loader:
        images = images.to(device)

        # Class probabilities from each model, evaluated in model order.
        outputs1, outputs2, outputs3 = (
            F.softmax(net(images), dim=1) for net in ensemble
        )

        avg_output = (outputs1 + outputs2 + outputs3) / 3
        preds = avg_output.argmax(dim=1)

        # Collect on the CPU, in loader order.
        predictions.extend(preds.cpu().numpy())


In [27]:
# Build the submission table: 1-based image ids paired with the predicted labels.
image_ids = np.arange(1, len(predictions) + 1)
submission_df = pd.DataFrame({'ImageId': image_ids, 'Label': predictions})

# Write without the DataFrame index so only the two columns are exported.
submission_df.to_csv('submission.csv', index=False)