# CNN Hyperparameters
In this notebook, you will observe the effect of various hyperparameters on the training of a deep convolutional neural network.

In [None]:
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms

# Alternative dataset location, currently disabled:
# data_root = "/home/space/datasets/mnist"
# Relative directory for the MNIST data; presumably passed as the dataset root
# to `datasets.MNIST` inside `setup` — TODO confirm once `setup` is implemented.
data_root = "./data/MNIST"

In [None]:
# Data-loading settings; presumably forwarded to the DataLoader(s) created
# later in the notebook — TODO confirm once `setup` is implemented.
num_workers = 4
pin_memory = False

# Select the compute device in order of preference: CUDA, then MPS, then CPU.
if torch.cuda.is_available():
    # First NVIDIA GPU.
    device = torch.device("cuda:0")
elif (
    hasattr(torch.backends, "mps")  # attribute is missing in older PyTorch builds
    and torch.backends.mps.is_available()
    and torch.backends.mps.is_built()
):
    # Apple Silicon (Metal).
    device = torch.device("mps")
else:
    # CPU fallback (slowest option).
    device = torch.device("cpu")

print(f"Device set to {device}!")


# Define network architecture

Implement the `__init__` and `forward` methods of the class `Net` with the same architecture as in *cnn_example.ipynb*. Add a dropout layer to the model after the second convolutional layer using **`nn.Dropout2d`**. The dropout ratio can be specified when creating the network and should be stored as a member variable of the class `Net`, i.e. it can be accessed via `self.dropout_ratio` inside the class. *See cnn_example.ipynb as a reference*.

In [None]:
# IPython/Jupyter help syntax: the trailing `?` displays the documentation of
# `nn.Dropout2d`. This line is not valid in a plain Python script.
nn.Dropout2d?

In [None]:
class Net(nn.Module):
    """Exercise stub for the CNN used throughout this notebook.

    The task text asks for the architecture from *cnn_example.ipynb* plus an
    `nn.Dropout2d` layer after the second convolutional layer, with the
    dropout ratio given at construction and stored as `self.dropout_ratio`.

    NOTE: the `raise` below sits directly in the class body, so running this
    cell fails with `NotImplementedError` while the class is being defined,
    until the placeholder is replaced by `__init__` and `forward`.
    """
    # >>>>> YOUR CODE HERE
    raise NotImplementedError("Replace this line by your code.")
    # <<<<< END YOUR CODE

## Data loading and model setup
Implement a function `setup` that builds the data loader as well as the model and optimizer. All relevant hyperparameters are passed as optional arguments to the function. *See cnn_example.ipynb as a reference*.

In [None]:
def setup(dropout_ratio=0.5, lr=1e-2, momentum=0.5, batch_size=128, mean=0.1307, std=0.3081):
    """Exercise stub: build the training data loader, the model and the optimizer.

    Every hyperparameter has a default, so callers can override a single
    value, e.g. `setup(momentum=0.9)`.

    Args:
        dropout_ratio: Dropout ratio of the network (presumably forwarded to
            the `Net` constructor — TODO confirm).
        lr: Learning rate of the optimizer.
        momentum: Momentum of the optimizer.
        batch_size: Number of samples per training batch.
        mean: Mean used to normalize the input data (presumably via
            `transforms.Normalize` — TODO confirm).
        std: Standard deviation used to normalize the input data (see `mean`).

    Returns:
        The tuple `(train_loader, device, model, optimizer)`. The placeholder
        code must bind `train_loader`, `model` and `optimizer`; `device`
        resolves to the module-level variable unless it is reassigned here.

    Raises:
        NotImplementedError: Always, until the placeholder is replaced.
    """
    # >>>>> YOUR CODE HERE
    raise NotImplementedError("Replace this line by your code.")
    # <<<<< END YOUR CODE
    return train_loader, device, model, optimizer

## Training
Below you are given a function that performs the training for a single epoch.

In [None]:
def train_epoch(model, device, train_loader, optimizer):
    """Train `model` for one full pass over `train_loader`.

    Args:
        model: Network to optimize. Its output is passed to `F.nll_loss`,
            which expects log-probabilities. The model itself is not moved
            here, so it is assumed to already live on `device`.
        device: Device each batch is moved to before the forward pass.
        train_loader: Iterable yielding `(data, target)` batches.
        optimizer: Optimizer that updates the parameters of `model`.

    Returns:
        A list with one Python float per batch: the loss of that batch,
        computed before the parameter update of the same iteration. The list
        is empty if `train_loader` yields no batches.
    """
    model.train()  # training mode (affects layers such as dropout)

    losses = []
    for data, target in train_loader:
        data, target = data.to(device), target.to(device)

        # Gradients accumulate across backward passes, so reset them per batch.
        optimizer.zero_grad()
        loss = F.nll_loss(model(data), target)
        loss.backward()
        optimizer.step()

        # `.item()` stores a plain float instead of keeping the graph alive.
        losses.append(loss.item())
    return losses

Set up the data loader, model and optimizer, run the training (e.g. for 3 epochs) and plot the evolution of the training loss.

In [None]:
# Exercise cell: the placeholder should set up the loader/model/optimizer,
# train (e.g. for 3 epochs) and plot the training loss per iteration into the
# figure opened here.
plt.figure()
# >>>>> YOUR CODE HERE
raise NotImplementedError("Replace this line by your code.")
# <<<<< END YOUR CODE
plt.xlabel("Iteration")
plt.ylabel("Train Loss")
plt.show()

## Hyperparameter Tuning
Try out several values (given below) of the following hyperparameters:
* Learning rate
* Momentum
* Batch size

Visualize the learning curves in one plot for comparison.

*Optional*: Investigate the result if you assume that the data are already normalized (change the mean and standard deviation accordingly).

In [None]:
# Candidate values for the hyperparameter sweeps in the following cells.
lrs = [1.0, 0.01, 0.0001]  # learning rates
momenta = [0.0, 0.5, 0.9]
# NOTE: runs with different batch sizes perform a different number of
# iterations per epoch, so compare them per epoch, not per iteration.
batch_sizes = [32, 128, 1024]
# No sweep over the dropout ratios is visible in this part of the notebook.
dropout_ratios = [0.0, 0.5, 0.7]

### Learning rate

In [None]:
# Exercise cell: one training run per learning rate in `lrs`. Each run should
# add a labelled loss curve to the shared figure, since `plt.legend()` below
# only lists curves that carry a label.
plt.figure()
for lr in lrs:
    # >>>>> YOUR CODE HERE
    raise NotImplementedError("Replace this line by your code.")
    # <<<<< END YOUR CODE

plt.legend()
plt.xlabel("Iteration")
plt.ylabel("Train Loss")
plt.show()

### Momentum

In [None]:
# Exercise cell: one training run per momentum value in `momenta`. Each run
# should add a labelled loss curve to the shared figure, since `plt.legend()`
# below only lists curves that carry a label.
plt.figure()
for momentum in momenta:
    # >>>>> YOUR CODE HERE
    raise NotImplementedError("Replace this line by your code.")
    # <<<<< END YOUR CODE

plt.legend()
plt.xlabel("Iteration")
plt.ylabel("Train Loss")  # label the y-axis like the other training-loss plots
plt.show()

### Batch size

In [None]:
# Make sure you compare the losses for the same number of samples seen (not the number of iterations of gradient descent).
# Exercise cell: one training run per batch size in `batch_sizes`. Each run
# should add a labelled loss curve, since `plt.legend()` below only lists
# curves that carry a label.

plt.figure()
for batch_size in batch_sizes:
    # >>>>> YOUR CODE HERE
    raise NotImplementedError("Replace this line by your code.")
    # <<<<< END YOUR CODE

plt.legend()
plt.xlabel("Number of samples seen")
plt.ylabel("Train Loss")  # label the y-axis like the other training-loss plots
plt.show()

## Generalization error
Even though the training loss can give valuable hints on the hyperparameters, it is typically not what we are interested in. Much more important is the performance of the model on unseen data, the so-called validation/test data. Implement a data loader `test_loader` for the test data (similar to the training data loader but set the `train` parameter of `datasets.MNIST` to `False`).

In [None]:
# Exercise cell: the placeholder must define `test_loader`, a data loader over
# the MNIST test split (`train=False`), as described in the text above.
# >>>>> YOUR CODE HERE
raise NotImplementedError("Replace this line by your code.")
# <<<<< END YOUR CODE
# Reports the size of the dataset wrapped by the loader.
print(f"Evaluating on {len(test_loader.dataset)} samples.")

Implement an evaluation function that runs and evaluates the model. Compute the loss **and** the accuracy on the evaluation data.

In [None]:
def evaluate(model, device, data_loader):
    """Exercise stub: evaluate `model` on every batch of `data_loader`.

    The loop, the prediction and the counting of correct predictions are
    given. The forward pass, the per-batch loss and the final averaging are
    placeholders to be filled in.

    Args:
        model: Network to evaluate; it is switched to eval mode here.
        device: Not used by the given code; presumably the placeholder code
            moves `data` and `target` to it — TODO confirm.
        data_loader: Iterable yielding `(data, target)` batches.

    Returns:
        The tuple `(avg_loss, accuracy)`; both names must be assigned by the
        final placeholder. The print statements elsewhere in this notebook
        append "%" to the accuracy, so it is presumably meant in percent —
        TODO confirm.

    Raises:
        NotImplementedError: Until the placeholders are replaced.
    """
    model.eval()
    losses = list()
    correct = 0
    num_samples = 0

    with torch.no_grad():  # Disable gradient tracking; it is not needed for evaluation
        for data, target in data_loader:
            # Compute output of network (must bind the name `output`, used below)
            # >>>>> YOUR CODE HERE
            raise NotImplementedError("Replace this line by your code.")
            # <<<<< END YOUR CODE

            # Compute loss and store in list
            # >>>>> YOUR CODE HERE
            raise NotImplementedError("Replace this line by your code.")
            # <<<<< END YOUR CODE


            prediction = output.max(1, keepdim=True)[1] # get the index of the max log-probability
            # Count the predictions that match the targets in this batch
            correct += prediction.eq(target.view_as(prediction)).sum().item()
            num_samples += len(data)

    # Compute average loss and accuracy (from `losses`, `correct`, `num_samples`)
    # >>>>> YOUR CODE HERE
    raise NotImplementedError("Replace this line by your code.")
    # <<<<< END YOUR CODE

    return avg_loss, accuracy

### Performance at chance level
Verify your evaluation function by running it on an untrained model on both the training and the test set. You can get a randomly initialized network either by using the `setup` function from above or by calling the constructor of the `Net` class.

Which values do you expect for the test loss and accuracy?

In [None]:
# Exercise cell: the placeholder must define `avg_train_loss`,
# `train_accuracy`, `avg_test_loss` and `test_accuracy` for an untrained
# model; the prints below read exactly these four names.
# >>>>> YOUR CODE HERE
raise NotImplementedError("Replace this line by your code.")
# <<<<< END YOUR CODE

# Losses are shown with four decimals, accuracies without decimals plus "%".
print(f"Training set: \n \t Average loss: {avg_train_loss:.4f}, Accuracy: {train_accuracy:.0f}%")
print(f"Test set: \n \t Average loss: {avg_test_loss:.4f}, Accuracy: {test_accuracy:.0f}%")

### Performance on trained model
Evaluate a trained model (training procedure is provided below) on both the training and validation data.

In [None]:
# Train a freshly set-up model with the default hyperparameters for 10 epochs
# and gather the losses returned by every epoch in one flat list.
train_loader, device, model, optimizer = setup()

losses = []
for epoch in tqdm(range(10)):
    epoch_losses = train_epoch(model, device, train_loader, optimizer)
    losses += epoch_losses

In [None]:
# Exercise cell: the placeholder must define `avg_train_loss`,
# `train_accuracy`, `avg_test_loss` and `test_accuracy` for the trained
# model; the prints below read exactly these four names.
# >>>>> YOUR CODE HERE
raise NotImplementedError("Replace this line by your code.")
# <<<<< END YOUR CODE

# Losses are shown with four decimals, accuracies without decimals plus "%".
print(f"Training set: \n \t Average loss: {avg_train_loss:.4f}, Accuracy: {train_accuracy:.0f}%")
print(f"Test set: \n \t Average loss: {avg_test_loss:.4f}, Accuracy: {test_accuracy:.0f}%")

### Evolution of test performance over training
Below you are given code to track and plot the test accuracy for different epochs during training.

In [None]:
# Fresh loader/model/optimizer, this time with momentum 0.9.
train_loader, device, model, optimizer = setup(momentum=0.9)

# `train_losses` gets one mean loss per epoch. The test metrics are recorded
# only for the epochs stored in `val_epochs`. `train_accuracies` is created
# but not filled in this cell.
train_losses, test_losses = [], []
train_accuracies, test_accuracies = [], []
val_epochs = []

for epoch in tqdm(range(8)):
    # Training
    epoch_losses = train_epoch(model, device, train_loader, optimizer)
    train_losses.append(np.mean(epoch_losses))

    # Evaluation happens only at even epochs (0, 2, 4, 6); skip the others.
    if epoch % 2 != 0:
        continue

    avg_test_losses, test_accuracy = evaluate(model, device, test_loader)
    test_losses.append(avg_test_losses)
    test_accuracies.append(test_accuracy)
    val_epochs.append(epoch)

In [None]:
# Plot the loss curves on the left axis and the test accuracy on a twin
# right axis that shares the epoch axis.
fig, ax1 = plt.subplots()
ax1.set(xlabel='Epoch', ylabel='Loss')
ax1.plot(train_losses, color="b", label="Train")
ax1.plot(val_epochs, test_losses, color="r", label="Test")

ax2 = ax1.twinx()
# The "*" in the axis label refers to the star marker of the accuracy curve,
# which has no legend entry of its own.
ax2.set_ylabel('* Accuracy')
ax2.plot(val_epochs, test_accuracies, color="r", marker="*")

fig.legend()
plt.show()