# Homework 6 - Experiments on MNIST for 10-class Classification

Please implement the following two model classes:
- MnistMLP() - Design a 2-layer MLP
- MnistCNN() - Design a 2-layer CNN 

Please train the 2-layer MLP and CNN models on the MNIST dataset and print the training results for each epoch.

In [1]:
from torchvision.datasets import MNIST
from torchvision.transforms import Compose,ToTensor,Normalize
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
import os
import torch
import numpy as np

# Mini-batch sizes: BATCH_SIZE is the default for get_dataloader,
# TEST_BATCH_SIZE is the size used when evaluating on the test split.
BATCH_SIZE = 128
TEST_BATCH_SIZE = 1000

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# dataloader for the dataset
def get_dataloader(train, batch_size=BATCH_SIZE):
    """Build a shuffled ``DataLoader`` over one split of MNIST.

    Args:
        train: Forwarded to ``MNIST(train=...)``; ``True`` selects the
            training split, ``False`` the test split.
        batch_size: Number of samples per batch (defaults to ``BATCH_SIZE``).

    Returns:
        A ``DataLoader`` created with ``shuffle=True`` over the requested
        split, with every image converted to a tensor and normalised.

    The dataset lives under ``./data`` and is requested with
    ``download=True``.
    """
    # Convert each image to a tensor, then standardise it with the fixed
    # single-channel mean/std constants below.
    preprocessing = Compose([
        ToTensor(),
        Normalize(mean=(0.1307,), std=(0.3081,)),
    ])
    mnist_split = MNIST(
        root='./data',
        train=train,
        transform=preprocessing,
        download=True,
    )
    return DataLoader(mnist_split, batch_size=batch_size, shuffle=True)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# 2-layer MLP
class MnistMLP(nn.Module):
    """Two-layer fully connected classifier operating on flattened images.

    Architecture: ``Linear(in_features, hidden_features) -> ReLU ->
    Linear(hidden_features, num_classes) -> log_softmax``.

    Args:
        in_features: Number of values per flattened sample. Defaults to
            ``28 * 28`` (one 28x28 single-channel image).
        hidden_features: Width of the hidden layer. Defaults to 128.
        num_classes: Number of output classes. Defaults to 10.

    With the default arguments this is the 784-128-10 MNIST network.
    """

    def __init__(self, in_features=28 * 28, hidden_features=128, num_classes=10):
        super().__init__()
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.fc2 = nn.Linear(hidden_features, num_classes)

    def forward(self, x):
        """Return per-class log-probabilities.

        Args:
            x: Tensor whose elements can be regrouped into rows of
                ``in_features`` values, e.g. ``(N, 1, 28, 28)`` for the
                default configuration.

        Returns:
            Tensor of shape ``(N, num_classes)`` whose rows are
            log-probabilities (``exp`` of each row sums to 1).
        """
        # Flatten every sample into one row. ``reshape`` is used instead of
        # ``view`` so that non-contiguous inputs (e.g. transposed or sliced
        # tensors) are accepted rather than raising a RuntimeError.
        x = x.reshape(-1, self.fc1.in_features)

        # Hidden layer with ReLU non-linearity.
        x = F.relu(self.fc1(x))

        # Output layer producing one raw score per class.
        x = self.fc2(x)

        # Log-probabilities over the class dimension; this is the input
        # format expected by ``F.nll_loss``.
        return F.log_softmax(x, dim=1)

In [3]:
# 2-layer CNN
class MnistCNN(nn.Module):
    """Two-convolution-layer CNN classifier for 28x28 single-channel images.

    Architecture::

        Conv2d(1, 32, 3, padding=1)  -> max_pool(2) -> ReLU
        Conv2d(32, 64, 3, padding=1) -> max_pool(2) -> ReLU
        flatten -> Linear(64*7*7, 128) -> ReLU -> Linear(128, 10) -> log_softmax

    The 3x3 convolutions use padding 1 and stride 1, so they keep the
    spatial size; each 2x2 pooling halves it (28 -> 14 -> 7), which is where
    the ``64 * 7 * 7`` input width of ``fc1`` comes from.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.fc1 = nn.Linear(64 * 7 * 7, 128)  # flattened conv features -> hidden
        self.fc2 = nn.Linear(128, 10)          # hidden -> one score per class

    def forward(self, x):
        """Return per-class log-probabilities.

        Args:
            x: Image batch of shape ``(N, 1, 28, 28)``.

        Returns:
            Tensor of shape ``(N, 10)`` whose rows are log-probabilities.
        """
        # First conv stage: convolution, 2x2 max pooling, then ReLU.
        x = F.relu(F.max_pool2d(self.conv1(x), 2))

        # Second conv stage, same pattern.
        x = F.relu(F.max_pool2d(self.conv2(x), 2))

        # Flatten the feature maps into one row per sample. ``reshape`` is
        # used instead of ``view`` so that activations stored in a
        # non-contiguous layout (e.g. channels_last) do not raise a
        # RuntimeError.
        x = x.reshape(-1, self.fc1.in_features)

        # Fully connected hidden layer with ReLU.
        x = F.relu(self.fc1(x))

        # Output layer producing one raw score per class.
        x = self.fc2(x)

        # Log-probabilities over the class dimension; this is the input
        # format expected by ``F.nll_loss``.
        return F.log_softmax(x, dim=1)

## Train the MLP model

In [4]:
# Build the MLP, move it to the selected device, and give Adam its parameters.
model = MnistMLP()
model = model.to(device)
optimizer = Adam(model.parameters(), lr=1e-3)

In [5]:
def train(epoch, num_epochs, log_interval=100):
    """Run one pass over the MNIST training split, updating the global model.

    Uses the module-level ``model``, ``optimizer`` and ``device``. They are
    looked up when the function is called, so rebinding them between calls
    changes which network is trained.

    Args:
        epoch: Zero-based index of the current epoch; used only in the log
            line (printed as ``epoch + 1``).
        num_epochs: Total number of epochs; used only in the log line.
        log_interval: Print the current batch loss every ``log_interval``
            steps. ``0`` or ``None`` disables logging. Defaults to 100.
    """
    # Put mode-dependent layers (dropout, batch norm, ...) into training
    # mode, in case the model was previously switched to eval mode.
    model.train()

    data_loader = get_dataloader(True)
    total_step = len(data_loader)
    for step, (images, labels) in enumerate(data_loader, start=1):
        images = images.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        # The model outputs log-probabilities, so NLL loss is the matching criterion.
        loss = F.nll_loss(model(images), labels)
        loss.backward()
        optimizer.step()

        if log_interval and step % log_interval == 0:
            print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'.format(epoch+1, num_epochs, step, total_step, loss.item()))

In [6]:
def test():
    """Evaluate the global ``model`` on the MNIST test split and print the result.

    Uses the module-level ``model`` and ``device``. Accuracy and NLL loss are
    averaged per sample over the whole split, so the result stays correct
    even when the last batch is smaller than ``TEST_BATCH_SIZE``.

    The model is switched to eval mode for the duration of the evaluation and
    its previous train/eval mode is restored afterwards. Nothing is returned;
    the metrics are printed.
    """
    test_dataloader = get_dataloader(train = False,batch_size=TEST_BATCH_SIZE)

    total_loss = 0.0
    total_correct = 0
    total_samples = 0

    # Remember the current mode so evaluation does not leave a side effect
    # on subsequent training.
    was_training = model.training
    model.eval()
    try:
        # No gradients are needed for evaluation.
        with torch.no_grad():
            for images, labels in test_dataloader:
                images = images.to(device)
                labels = labels.to(device)
                output = model(images)
                # Sum (not mean) per batch so the final average is per sample.
                total_loss += F.nll_loss(output, labels, reduction='sum').item()
                # Predicted class = index of the largest log-probability.
                total_correct += output.argmax(dim=-1).eq(labels).sum().item()
                total_samples += labels.size(0)
    finally:
        model.train(was_training)

    if total_samples:
        mean_acc = total_correct / total_samples
        mean_loss = total_loss / total_samples
    else:
        # Empty loader: there is nothing to average.
        mean_acc = mean_loss = float('nan')
    print("Mean accuracy: ", mean_acc, "Mean loss: ", mean_loss)

In [7]:
# Baseline: evaluate the freshly initialised (untrained) network.
test()

# Train for a fixed number of epochs, then evaluate again.
num_epochs = 3
for epoch_idx in range(num_epochs):
    train(epoch_idx, num_epochs)
test()

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ./data/MNIST/raw/train-images-idx3-ubyte.gz


9913344it [00:05, 1788335.66it/s]                             


Extracting ./data/MNIST/raw/train-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to ./data/MNIST/raw/train-labels-idx1-ubyte.gz


29696it [00:00, 457880.59it/s]           


Extracting ./data/MNIST/raw/train-labels-idx1-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw/t10k-images-idx3-ubyte.gz


1649664it [00:01, 1543288.22it/s]                             


Extracting ./data/MNIST/raw/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz


5120it [00:00, 2898088.59it/s]          


Extracting ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw

Mean accuracy:  0.0778 Mean loss:  2.3645055
Epoch [1/3], Step [100/469], Loss: 0.4857
Epoch [1/3], Step [200/469], Loss: 0.3060
Epoch [1/3], Step [300/469], Loss: 0.1634
Epoch [1/3], Step [400/469], Loss: 0.2794
Epoch [2/3], Step [100/469], Loss: 0.0652
Epoch [2/3], Step [200/469], Loss: 0.1567
Epoch [2/3], Step [300/469], Loss: 0.1642
Epoch [2/3], Step [400/469], Loss: 0.0900
Epoch [3/3], Step [100/469], Loss: 0.0728
Epoch [3/3], Step [200/469], Loss: 0.0645
Epoch [3/3], Step [300/469], Loss: 0.0895
Epoch [3/3], Step [400/469], Loss: 0.1474
Mean accuracy:  0.9712001 Mean loss:  0.095482014


## Train the CNN model

In [8]:
# Rebind the globals used by train()/test() to a fresh CNN and its optimiser.
model = MnistCNN()
model = model.to(device)
optimizer = Adam(model.parameters(), lr=1e-3)

In [9]:
# Baseline: evaluate the freshly initialised (untrained) network.
test()

# Train for a fixed number of epochs, then evaluate again.
num_epochs = 3
for epoch_idx in range(num_epochs):
    train(epoch_idx, num_epochs)
test()

Mean accuracy:  0.060399998 Mean loss:  2.303498
Epoch [1/3], Step [100/469], Loss: 0.1260
Epoch [1/3], Step [200/469], Loss: 0.1176
Epoch [1/3], Step [300/469], Loss: 0.0708
Epoch [1/3], Step [400/469], Loss: 0.0134
Epoch [2/3], Step [100/469], Loss: 0.0666
Epoch [2/3], Step [200/469], Loss: 0.1009
Epoch [2/3], Step [300/469], Loss: 0.0624
Epoch [2/3], Step [400/469], Loss: 0.0390
Epoch [3/3], Step [100/469], Loss: 0.0325
Epoch [3/3], Step [200/469], Loss: 0.0217
Epoch [3/3], Step [300/469], Loss: 0.0547
Epoch [3/3], Step [400/469], Loss: 0.0479
Mean accuracy:  0.9858001 Mean loss:  0.04219047
