# Homework 4

As always, please provide your solution in a Jupyter Notebook.

1. Now that you have all the tools to train an MLP with high performance on MNIST, try reaching 0-loss (or 100% accuracy) on the training data (with a small epsilon, e.g. 99.99% training performance -- don't worry if you overfit!). The implementation is completely up to you. You just need to keep it an MLP without using fancy layers (e.g., keep the `Linear` layers, don't use `Conv1d` or something like this, don't use attention). You are free to use any LR scheduler or optimizer, any one of batchnorm/groupnorm, regularization methods... If you use something we haven't seen during lectures, please motivate your choice and explain (as briefly as possible) how it works.

2. Try reaching 0-loss on the training data with **permuted labels**. Assess the model on the test data (without permuted labels) and comment on the result. For guidance, see [3](https://arxiv.org/abs/1611.03530). *Tip:* To permute the labels, act on `trainset.targets` with an appropriate torch function. Then, you can pass this "permuted" `Dataset` to a `DataLoader` like so:
`trainloader_permuted = torch.utils.data.DataLoader(trainset_permuted, batch_size=batch_size_train, shuffle=True)`.
You can now use this `DataLoader` inside the training function. An additional perspective motivating this exercise: [link](https://youtu.be/vl2QsVWEqdA)

P.S. I increased the number of files to upload from 1 to 5, so if you want you may include up to 4 local image files in your solution.

In [1]:
import torch
import os
import sys
from torch import nn
from matplotlib import pyplot as plt


sys.path.append("../labs")

from scripts import mnist
from scripts.train_utils import accuracy, AverageMeter

In [2]:
class MLP(torch.nn.Module):
    """Fully connected classifier for 28x28 inputs with 10 output classes.

    The input is flattened and passed through four ``Linear`` layers of widths
    784 -> 16 -> 32 -> 24 -> 10, with a ReLU after each of the first three.
    The forward pass returns per-class log-probabilities.
    """

    def __init__(self):
        super().__init__()
        self.flat = torch.nn.Flatten()
        self.layer1 = torch.nn.Linear(28*28, 16)
        self.layer2 = torch.nn.Linear(16, 32)
        self.layer3 = torch.nn.Linear(32, 24)
        self.layer4 = torch.nn.Linear(24, 10)

    def forward(self, X):
        """Return log-probabilities of shape ``(batch, 10)`` for the batch ``X``."""
        out = self.flat(X)
        out = self.layer1(out)
        out = torch.nn.functional.relu(out)
        out = self.layer2(out)
        out = torch.nn.functional.relu(out)
        out = self.layer3(out)
        out = torch.nn.functional.relu(out)
        out = self.layer4(out)
        # Normalise over the class dimension explicitly: leaving `dim` out
        # relies on a deprecated implicit choice and triggers a warning on
        # every forward pass. After Flatten the tensor is always 2-D, so
        # dim=1 is the class axis.
        # NOTE: log-softmax is idempotent, so feeding these outputs to
        # nn.CrossEntropyLoss (which applies log-softmax itself) is harmless.
        out = torch.nn.functional.log_softmax(out, dim=1)
        return out

In [3]:
def train_epoch(model, dataloader, loss_fn, optimizer, loss_meter, performance_meter, performance, device):
    """Run a single optimisation pass over every mini-batch in ``dataloader``.

    For each ``(inputs, targets)`` pair the batch is moved to ``device``, the
    model is evaluated, the loss is back-propagated and the optimizer takes
    one step. Both meters are then updated through
    ``update(val=..., n=batch_size)``: ``loss_meter`` with the scalar loss and
    ``performance_meter`` with ``performance(predictions, targets)``.

    Nothing is returned; results are accumulated in the two meters.
    """
    for inputs, targets in dataloader:
        # Put the batch on the requested device (GPU if specified).
        inputs, targets = inputs.to(device), targets.to(device)
        batch_size = inputs.shape[0]

        # Clear gradients left over from the previous iteration.
        optimizer.zero_grad()

        # Forward pass, loss, backward pass, parameter update.
        predictions = model(inputs)
        batch_loss = loss_fn(predictions, targets)
        batch_loss.backward()
        optimizer.step()

        # The metric is computed on the predictions made *before* the update.
        batch_performance = performance(predictions, targets)

        # Record the statistics of this mini-batch, weighted by its size.
        loss_meter.update(val=batch_loss.item(), n=batch_size)
        performance_meter.update(val=batch_performance, n=batch_size)


def train_model(model, dataloader, loss_fn, optimizer, num_epochs, checkpoint_loc=None, checkpoint_name="checkpoint.pt", performance=accuracy, device=None):
    """Train ``model`` for ``num_epochs`` epochs and report progress per epoch.

    Args:
        model: the module to train; it is moved to ``device`` and put in
            training mode.
        dataloader: iterable of ``(X, y)`` mini-batches.
        loss_fn: callable ``loss_fn(y_hat, y)`` returning the loss to minimise.
        optimizer: optimizer bound to the model parameters.
        num_epochs: number of passes over ``dataloader``.
        checkpoint_loc: folder for the checkpoint; created if missing. When
            ``None`` no checkpoint is written.
        checkpoint_name: file name of the checkpoint inside ``checkpoint_loc``.
            When ``None`` no checkpoint is written.
        performance: callable ``performance(y_hat, y)`` giving the batch metric.
        device: device string; defaults to ``"cuda:0"`` when CUDA is available,
            otherwise ``"cpu"``.

    Returns:
        ``(loss_meter.sum, performance_meter.avg)`` of the last epoch, or
        ``(None, None)`` when ``num_epochs`` is 0 and no epoch was run.
    """
    # establish device
    if device is None:
        device = "cuda:0" if torch.cuda.is_available() else "cpu"
    print(f"Training on {device}")

    # create the folder for the checkpoints (if it's not None)
    if checkpoint_loc is not None:
        os.makedirs(checkpoint_loc, exist_ok=True)

    model.to(device)
    model.train()

    # a checkpoint is only produced if both its name and folder are given
    save_checkpoints = checkpoint_name is not None and checkpoint_loc is not None

    # Stats of the most recent epoch. Initialised up front so that
    # num_epochs == 0 returns cleanly instead of hitting an unbound local.
    final_loss, final_performance = None, None

    # epoch loop
    for epoch in range(num_epochs):
        # fresh meters each epoch, so the printed stats are per-epoch
        loss_meter = AverageMeter()
        performance_meter = AverageMeter()

        train_epoch(model, dataloader, loss_fn, optimizer, loss_meter, performance_meter, performance, device)

        print(f"Epoch {epoch+1} completed. Loss - total: {loss_meter.sum} - average: {loss_meter.avg}; Performance: {performance_meter.avg}")

        # the same file is overwritten at every epoch
        if save_checkpoints:
            checkpoint_dict = {
                "parameters": model.state_dict(),
                "optimizer": optimizer.state_dict(),
                "epoch": epoch
            }
            torch.save(checkpoint_dict, os.path.join(checkpoint_loc, checkpoint_name))

        final_loss, final_performance = loss_meter.sum, performance_meter.avg

    return final_loss, final_performance

def test_model(model, dataloader, performance=accuracy, loss_fn=None, device=None):
    """Evaluate ``model`` on ``dataloader`` without tracking gradients.

    Args:
        model: the module to evaluate; it is moved to ``device`` and put in
            evaluation mode (and left in that mode afterwards).
        dataloader: iterable of ``(X, y)`` mini-batches.
        performance: callable ``performance(y_hat, y)`` giving the batch metric.
        loss_fn: optional callable ``loss_fn(y_hat, y)``; when ``None`` the
            loss is neither computed nor reported.
        device: device string; defaults to ``"cuda:0"`` when CUDA is available,
            otherwise ``"cpu"``.

    Returns:
        ``(fin_loss, fin_perf)``: ``loss_meter.sum`` (``None`` if no
        ``loss_fn`` was given) and ``performance_meter.avg``.
    """
    # establish device
    if device is None:
        device = "cuda:0" if torch.cuda.is_available() else "cpu"
    print(f"Testing on {device}")

    # create an AverageMeter for the loss if passed
    loss_meter = AverageMeter() if loss_fn is not None else None
    performance_meter = AverageMeter()

    # The batches are moved to `device` below, so the model must live there
    # too (train_model does the same); otherwise a CPU model would be fed
    # CUDA tensors whenever the device defaults to the GPU.
    model.to(device)
    model.eval()
    with torch.no_grad():
        for X, y in dataloader:
            X = X.to(device)
            y = y.to(device)
            y_hat = model(X)
            acc = performance(y_hat, y)
            if loss_fn is not None:
                loss = loss_fn(y_hat, y)
                loss_meter.update(loss.item(), X.shape[0])
            performance_meter.update(acc, X.shape[0])

    # get final performances
    fin_loss = loss_meter.sum if loss_fn is not None else None
    fin_perf = performance_meter.avg
    print(f"TESTING - loss {fin_loss if fin_loss is not None else '--'} - performance {fin_perf}")
    return fin_loss, fin_perf

Let's use vanilla SGD with momentum:

In [4]:
# Optimizer hyper-parameters: learning rate, momentum, weight decay (L2 penalty).
lr = 0.1
momentum = 0.9
wd = 5e-4

# Number of full passes over the training data.
num_epochs = 30

# Network, training criterion, and SGD optimizer over all network parameters.
model = MLP()
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(
    model.parameters(),
    lr=lr,
    momentum=momentum,
    weight_decay=wd,
)

In [5]:
# Mini-batch sizes for the training and the test DataLoader respectively.
minibatch_size_train = 256
minibatch_size_test = 512

# Build the MNIST loaders and datasets. The training loader must use the
# *training* batch size: passing minibatch_size_test here was a copy-paste
# slip that left minibatch_size_train unused.
trainloader, testloader, trainset, testset = mnist.get_data(
    data_root="../labs/datasets/",
    batch_size_train=minibatch_size_train,
    batch_size_test=minibatch_size_test,
)

Let's train our network:

In [8]:
# Train on the CPU for `num_epochs` epochs. checkpoint_loc keeps its default
# of None, so no checkpoint file is written.
train_model(
    model,
    trainloader,
    loss_fn,
    optimizer,
    num_epochs=num_epochs,
    device="cpu",
)

Training on cpu
Epoch 1 completed. Loss - total: 17731.810479164124 - average: 0.2955301746527354; Performance: 0.9110333333333334
Epoch 2 completed. Loss - total: 12130.750637054443 - average: 0.20217917728424073; Performance: 0.9382333333333334
Epoch 3 completed. Loss - total: 10631.62873840332 - average: 0.17719381230672201; Performance: 0.9460166666666666
Epoch 4 completed. Loss - total: 9510.347358226776 - average: 0.1585057893037796; Performance: 0.9510833333333333
Epoch 5 completed. Loss - total: 9027.319495916367 - average: 0.15045532493193944; Performance: 0.95385
Epoch 6 completed. Loss - total: 8393.216874599457 - average: 0.13988694790999096; Performance: 0.9570833333333333
Epoch 7 completed. Loss - total: 7721.327761888504 - average: 0.12868879603147507; Performance: 0.9597333333333333
Epoch 8 completed. Loss - total: 7570.34238743782 - average: 0.12617237312396368; Performance: 0.9607
Epoch 9 completed. Loss - total: 7577.6309159994125 - average: 0.1262938485999902; Perfo

(5874.961523771286, 0.9680666666666666)

And now let's test it:

In [10]:
# Accuracy-only evaluation on the test set: no loss_fn is passed, so the loss
# is reported as '--' and returned as None.
test_model(
    model,
    testloader,
    device="cpu",
)

Testing on cpu
TESTING - loss -- - performance 0.96785


(None, 0.96785)

In [9]:
# Same evaluation, this time also accumulating the loss over the test set.
test_model(
    model,
    testloader,
    loss_fn=loss_fn,
    performance=accuracy,
    device="cpu",
)

Testing on cpu
TESTING - loss 5965.80991601944 - performance 0.96785


(5965.80991601944, 0.96785)