In [None]:
                                                          #----------------------------------------
                                                                   #Student Information#
                                                          #----------------------------------------
                                                           # David Engstrom - Student ID: 301537614
                                                           # Brian Salas    - Student ID: 301789398

#Import Pytorch
import torch
#Import torch neural network
import torch.nn as nn
#Import torch optimization
import torch.optim as optim
#Get the torch DataLoader class
from torch.utils.data import DataLoader
#Get the ready-to-use datasets and image preprocessing functions
from torchvision import datasets, transforms
#Handles file paths cleaner and safer across OS
from pathlib import Path

#----------------------------
# Config / Hyperparameters
#----------------------------
BATCH_SIZE = 128   # samples per mini-batch handed out by the data loaders
EPOCHS = 5         # number of full passes over the training set
LR = 1e-3          # learning rate given to the optimizer
SEED = 11          # seed for PyTorch's random number generator
DEVICE = "cpu"     # all tensors and the model stay on the CPU

# Seed PyTorch's global RNG before any model or loader is built (reproducibility)
torch.manual_seed(SEED)

#----------------------------
# Preprocessing: image -> tensor, then normalize with mean=0.1307, std=0.3081
# (the values commonly used for MNIST)
#----------------------------
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.1307,), std=(0.3081,)),
    ]
)

# MNIST training split, stored under ./data and downloaded on first use
train_ds = datasets.MNIST("data", train=True, transform=transform, download=True)

# MNIST test split with the same preprocessing
test_ds = datasets.MNIST("data", train=False, transform=transform, download=True)

# Training batches: reshuffled every epoch, loaded by 2 worker subprocesses
train_loader = DataLoader(
    dataset=train_ds,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=2,
)

# Test batches: fixed order (order is irrelevant for accuracy), 2 worker subprocesses
test_loader = DataLoader(
    dataset=test_ds,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=2,
)

#----------------------------
# Define a Convolutional Neural Network (CNN) for MNIST
#----------------------------
class MNISTCNN(nn.Module):
    """Convolutional classifier producing 10 raw class scores per image.

    The network has two stages:

    * ``features``   - three 3x3 convolutions (1 -> 32 -> 64 -> 128 channels,
      padding 1 so the spatial size is kept) with ReLU activations and two
      2x2 max-pools, each halving height and width.
    * ``classifier`` - flatten, a 256-unit hidden layer with ReLU and 30%
      dropout, and a final 10-way linear layer.

    The first linear layer expects ``128 * 7 * 7`` inputs, which is what a
    1x28x28 image yields after the two pooling steps (28 -> 14 -> 7).
    """

    def __init__(self):
        super().__init__()

        # Layers are instantiated in network order so that seeded weight
        # initialisation draws random numbers in a fixed, predictable sequence.
        conv_stack = [
            nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2),   # halves height and width
            nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2),   # halves height and width again
        ]
        self.features = nn.Sequential(*conv_stack)

        flat_size = 128 * 7 * 7            # channels * height * width after pooling
        head = [
            nn.Flatten(),
            nn.Linear(flat_size, 256),
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.3),             # active only in training mode
            nn.Linear(256, 10),            # one output per class
        ]
        self.classifier = nn.Sequential(*head)

    def forward(self, x):
        """Return the logits for batch ``x`` (no softmax is applied)."""
        return self.classifier(self.features(x))

# Build the network, then place its parameters on the configured device
model = MNISTCNN()
model = model.to(DEVICE)

#----------------------------
# Loss & Optimizer
#----------------------------

# Cross-entropy on raw logits (LogSoftmax + negative log-likelihood in one module);
# the usual loss for single-label, multi-class problems such as digit recognition
criterion = nn.CrossEntropyLoss()

# Adam: an adaptive-learning-rate optimizer. It updates every parameter of
# `model` from its gradients, using LR as the base step size.
optimizer = optim.Adam(params=model.parameters(), lr=LR)

# ---------------------------
# Train loop for one epoch
# ---------------------------
def train_one_epoch(epoch, loader=None):
    """Run one optimisation pass over a data loader and print a summary.

    Uses the module-level ``model``, ``criterion``, ``optimizer`` and
    ``DEVICE``. The model is switched to training mode and its weights are
    updated once per batch.

    Args:
        epoch: Epoch number; only used in the printed summary line.
        loader: Iterable yielding ``(images, labels)`` batches. When omitted
            (or ``None``) the module-level ``train_loader`` is used, so
            existing ``train_one_epoch(epoch)`` calls behave as before.

    Returns:
        None. The per-sample average loss and the accuracy over the pass are
        printed as ``Epoch N: train loss=..., acc=...``.

    Raises:
        ZeroDivisionError: If the loader yields no samples.
    """
    # Resolve the default at call time so a later rebinding of train_loader is honoured
    batches = train_loader if loader is None else loader

    model.train()          # training mode (e.g. dropout active)
    loss_sum = 0.0         # sum of per-sample losses over the pass
    n_correct = 0          # correctly classified samples
    n_seen = 0             # samples processed

    for images, labels in batches:
        images = images.to(DEVICE)
        labels = labels.to(DEVICE)

        optimizer.zero_grad()              # drop gradients left from the previous step
        logits = model(images)             # forward pass
        loss = criterion(logits, labels)
        loss.backward()                    # compute gradients
        optimizer.step()                   # apply the update

        batch_n = labels.size(0)
        # The loss is scaled by the batch size so the epoch figure is a
        # per-sample average even when the last batch is smaller (this assumes
        # criterion returns the batch mean, as CrossEntropyLoss does by default).
        loss_sum += loss.item() * batch_n
        n_correct += (logits.argmax(dim=1) == labels).sum().item()
        n_seen += batch_n

    epoch_loss = loss_sum / n_seen
    epoch_acc = n_correct / n_seen
    print(f"Epoch {epoch}: train loss={epoch_loss:.4f}, acc={epoch_acc:.4f}")

@torch.no_grad()                   # no gradient tracking while scoring
def evaluate(loader=None):
    """Return the classification accuracy of the module-level ``model``.

    The model is put into evaluation mode (dropout disabled) and is left in
    that mode when the function returns.

    Args:
        loader: Iterable yielding ``(images, labels)`` batches. When omitted
            (or ``None``) the module-level ``test_loader`` is used, so
            existing ``evaluate()`` calls behave as before. Passing a loader
            makes it possible to score other splits, e.g. a validation set.

    Returns:
        Fraction of samples whose highest-scoring class equals the label,
        as a float in ``[0, 1]``.

    Raises:
        ZeroDivisionError: If the loader yields no samples.
    """
    # Resolve the default at call time so a later rebinding of test_loader is honoured
    batches = test_loader if loader is None else loader

    model.eval()
    n_correct = 0
    n_seen = 0

    for images, labels in batches:
        images = images.to(DEVICE)
        labels = labels.to(DEVICE)

        # Predicted class = index of the largest logit for each sample
        preds = model(images).argmax(dim=1)

        n_correct += (preds == labels).sum().item()
        n_seen += labels.size(0)

    return n_correct / n_seen

# ---------------------------
# Run training loop
# ---------------------------

# Make sure the checkpoint folder exists before the first save
Path("checkpoints").mkdir(exist_ok=True)

# Highest test accuracy observed so far
# NOTE(review): checkpoints are selected on test-set accuracy, so the reported
# "best" figure is an optimistic estimate of generalisation.
best_acc = 0.0

for epoch in range(1, EPOCHS + 1):
    train_one_epoch(epoch)         # one pass over the training data
    test_acc = evaluate()          # accuracy on the test data
    print(f"Test acc after epoch {epoch}: {test_acc:.4f} ({test_acc*100:.2f}%)")

    # Only a strictly better epoch replaces the stored checkpoint
    if not test_acc > best_acc:
        continue

    best_acc = test_acc
    torch.save(model.state_dict(), "checkpoints/mnist_cnn_best.pt")
    print(f"Saved new best model with acc={best_acc:.4f} ({best_acc*100:.2f}%)")

# Final summary once every epoch has run
print(f"Best test accuracy: {best_acc:.4f} ({best_acc*100:.2f}%)")

Epoch 1: train loss=0.1488, acc=0.9537
Test acc after epoch 1: 0.9892 (98.92%)
Saved new best model with acc=0.9892 (98.92%)
Epoch 2: train loss=0.0428, acc=0.9862
Test acc after epoch 2: 0.9915 (99.15%)
Saved new best model with acc=0.9915 (99.15%)
Epoch 3: train loss=0.0289, acc=0.9911
Test acc after epoch 3: 0.9910 (99.10%)
Epoch 4: train loss=0.0242, acc=0.9926
Test acc after epoch 4: 0.9928 (99.28%)
Saved new best model with acc=0.9928 (99.28%)
Epoch 5: train loss=0.0176, acc=0.9945
Test acc after epoch 5: 0.9923 (99.23%)
Best test accuracy: 0.9928 (99.28%)
