# Exercise 1.4

## Classification of CIFAR10 images
### Optimizers
In this exercise we will classify the images from the CIFAR10 dataset. We will use different optimizers and compare their convergence speed. First we import the libraries that we need.

**NB! The exercise is formulated in a Jupyter notebook for ease of communication, but you should feel *very* free to carry out the entire exercise without the notebook. If you do carry it out in a notebook, please finish by migrating your code over to a script that you can run from the terminal**

In [1]:
import numpy as np
from tqdm.notebook import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.datasets as datasets
from torch.utils.data import DataLoader
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import os

We always check that we are running on a GPU

In [2]:
# Restrict CUDA to device index 0 for this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Query availability once and reuse the answer for both the message and the device.
gpu_available = torch.cuda.is_available()
print("The code will run on GPU." if gpu_available
      else "The code will run on CPU. Go to Edit->Notebook Settings and choose GPU as the hardware accelerator")
device = torch.device('cuda' if gpu_available else 'cpu')

The code will run on GPU.


In this exercise we will classify images from the [CIFAR10](https://www.cs.toronto.edu/~kriz/cifar.html) dataset. 
CIFAR10 has 60000 colour images of size 32x32 equally distributed in 10 classes.
* You should load this dataset (hint: it is a built-in dataset in pytorch).

In [4]:
batch_size = 64

# Same preprocessing for both splits: convert to tensor, then normalise each
# of the three channels with mean 0.5 and std 0.5.
cifar_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])

# Training split: reshuffled every epoch.
trainset = datasets.CIFAR10(root='./data', train=True, download=True,
                            transform=cifar_transform)
train_loader = DataLoader(trainset, batch_size=batch_size, shuffle=True)

# Test split: fixed order.
testset = datasets.CIFAR10(root='./data', train=False, download=True,
                           transform=cifar_transform)
test_loader = DataLoader(testset, batch_size=batch_size, shuffle=False)

Files already downloaded and verified
Files already downloaded and verified


* Make a CNN to train on the CIFAR10 dataset

In [5]:
class Network(nn.Module):
    """Small CNN classifier: two 3x3 conv layers followed by a two-layer MLP head.

    Args:
        BN: when True, a ``BatchNorm2d`` layer is inserted after each
            conv + ReLU pair.
    """

    def __init__(self, BN=False):
        super().__init__()

        # The layer order is the same for both variants; the batch-norm
        # layers are simply left out when BN is False.
        layers = [nn.Conv2d(3, 8, kernel_size=3, padding=1), nn.ReLU()]
        if BN:
            layers.append(nn.BatchNorm2d(8))
        layers.append(nn.MaxPool2d(2, 2))
        layers += [nn.Conv2d(8, 16, kernel_size=3, padding=1), nn.ReLU()]
        if BN:
            layers.append(nn.BatchNorm2d(16))
        self.convolutional = nn.Sequential(*layers)

        # 16 channels on a 16x16 map (for 32x32 inputs) -> 10 class scores.
        # No softmax here: the head returns raw scores.
        self.fully_connected = nn.Sequential(
            nn.Linear(16 * 16 * 16, 500),
            nn.ReLU(),
            nn.Linear(500, 10),
        )

        self.dropout = nn.Dropout(0.25)

    def forward(self, x):
        """Return the class scores for a minibatch ``x``."""
        features = self.convolutional(x)
        # Flatten everything except the leading minibatch dimension.
        flat = features.view(features.size(0), -1)
        return self.fully_connected(self.dropout(flat))

In [6]:
#We define the training as a function so we can easily re-use it.
def train(model, optimizer, num_epochs=10):
    """Run the train/evaluate loop and collect per-epoch metrics.

    Batches come from the notebook-level ``train_loader`` / ``test_loader``,
    accuracies are normalised by ``len(trainset)`` / ``len(testset)``, and
    every batch is moved to ``device``.

    Args:
        model: network to optimise; its output is passed to ``F.cross_entropy``.
        optimizer: optimizer whose ``zero_grad``/``step`` drive the updates.
        num_epochs: number of passes over ``train_loader``.

    Returns:
        dict with keys ``'train_acc'``, ``'test_acc'``, ``'train_loss'`` and
        ``'test_loss'``; each is a list with one entry per epoch. Losses are
        the mean of the per-batch losses.
    """
    criterion = F.cross_entropy
    history = {name: [] for name in ('train_acc', 'test_acc', 'train_loss', 'test_loss')}

    for epoch in tqdm(range(num_epochs), unit='epoch'):
        # --- optimisation pass over the training set ---
        model.train()
        hits_train, losses_train = 0, []
        for minibatch_no, (data, target) in tqdm(enumerate(train_loader), total=len(train_loader)):
            data = data.to(device)
            target = target.to(device)

            optimizer.zero_grad()  # clear gradients left from the previous step
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()

            losses_train.append(loss.item())
            hits_train += (output.argmax(1) == target).sum().item()

        # --- evaluation pass over the test set ---
        model.eval()
        hits_test, losses_test = 0, []
        for data, target in test_loader:
            data = data.to(device)
            target = target.to(device)
            with torch.no_grad():
                output = model(data)
            losses_test.append(criterion(output, target).item())
            hits_test += (output.argmax(1) == target).sum().item()

        # --- bookkeeping and progress report ---
        mean_train_loss = np.mean(losses_train)
        mean_test_loss = np.mean(losses_test)
        history['train_acc'].append(hits_train / len(trainset))
        history['test_acc'].append(hits_test / len(testset))
        history['train_loss'].append(mean_train_loss)
        history['test_loss'].append(mean_test_loss)
        print(f"Loss train: {mean_train_loss:.3f}\t test: {mean_test_loss:.3f}\t",
              f"Accuracy train: {history['train_acc'][-1]*100:.1f}%\t test: {history['test_acc'][-1]*100:.1f}%")
    return history

 * Train the network and make a plot of the loss and accuracy for both training and test data, with the epoch on the x-axis

In [12]:
print("SGD lr: 0.1")
# Fresh network, placed on the selected device
model = Network().to(device)
# Plain SGD with step size 0.1
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
out_dict1 = train(model, optimizer)


SGD lr: 0.1


  0%|          | 0/10 [00:00<?, ?epoch/s]

  0%|          | 0/782 [00:00<?, ?it/s]

Loss train: 1.674	 test: 1.538	 Accuracy train: 39.6%	 test: 45.6%


  0%|          | 0/782 [00:00<?, ?it/s]

Loss train: 1.294	 test: 1.305	 Accuracy train: 53.9%	 test: 54.1%


  0%|          | 0/782 [00:00<?, ?it/s]

Loss train: 1.125	 test: 1.242	 Accuracy train: 60.1%	 test: 55.9%


  0%|          | 0/782 [00:00<?, ?it/s]

Loss train: 0.987	 test: 1.387	 Accuracy train: 65.3%	 test: 53.4%


  0%|          | 0/782 [00:00<?, ?it/s]

Loss train: 0.867	 test: 1.240	 Accuracy train: 69.3%	 test: 59.0%


  0%|          | 0/782 [00:00<?, ?it/s]

Loss train: 0.738	 test: 1.192	 Accuracy train: 74.0%	 test: 61.0%


  0%|          | 0/782 [00:00<?, ?it/s]

Loss train: 0.620	 test: 1.340	 Accuracy train: 78.0%	 test: 58.6%


  0%|          | 0/782 [00:00<?, ?it/s]

Loss train: 0.510	 test: 1.211	 Accuracy train: 81.9%	 test: 63.0%


  0%|          | 0/782 [00:00<?, ?it/s]

Loss train: 0.411	 test: 1.374	 Accuracy train: 85.7%	 test: 62.6%


  0%|          | 0/782 [00:00<?, ?it/s]

Loss train: 0.341	 test: 1.991	 Accuracy train: 88.0%	 test: 56.7%


In [13]:
print("SGD lr: 0.01")
# Fresh network, placed on the selected device
model = Network().to(device)
# Plain SGD with step size 0.01
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
out_dict2 = train(model, optimizer)

SGD lr: 0.01


  0%|          | 0/10 [00:00<?, ?epoch/s]

  0%|          | 0/782 [00:00<?, ?it/s]

Loss train: 2.106	 test: 1.919	 Accuracy train: 23.4%	 test: 30.1%


  0%|          | 0/782 [00:00<?, ?it/s]

Loss train: 1.812	 test: 1.750	 Accuracy train: 35.2%	 test: 37.0%


  0%|          | 0/782 [00:00<?, ?it/s]

Loss train: 1.628	 test: 1.589	 Accuracy train: 42.0%	 test: 42.9%


  0%|          | 0/782 [00:00<?, ?it/s]

Loss train: 1.497	 test: 1.507	 Accuracy train: 46.2%	 test: 46.0%


  0%|          | 0/782 [00:00<?, ?it/s]

Loss train: 1.402	 test: 1.380	 Accuracy train: 50.0%	 test: 49.9%


  0%|          | 0/782 [00:00<?, ?it/s]

Loss train: 1.329	 test: 1.306	 Accuracy train: 52.6%	 test: 53.5%


  0%|          | 0/782 [00:00<?, ?it/s]

Loss train: 1.274	 test: 1.312	 Accuracy train: 54.7%	 test: 52.6%


  0%|          | 0/782 [00:00<?, ?it/s]

Loss train: 1.224	 test: 1.221	 Accuracy train: 56.4%	 test: 56.2%


  0%|          | 0/782 [00:00<?, ?it/s]

Loss train: 1.179	 test: 1.177	 Accuracy train: 58.4%	 test: 57.5%


  0%|          | 0/782 [00:00<?, ?it/s]

Loss train: 1.136	 test: 1.137	 Accuracy train: 59.7%	 test: 59.7%


In [14]:
print("Adam lr: 0.1")
# Fresh network, placed on the selected device
model = Network().to(device)
# Adam with step size 0.1
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
out_dict3 = train(model, optimizer)


Adam lr: 0.1


  0%|          | 0/10 [00:00<?, ?epoch/s]

  0%|          | 0/782 [00:00<?, ?it/s]

Loss train: 8.248	 test: 2.315	 Accuracy train: 9.9%	 test: 10.0%


  0%|          | 0/782 [00:00<?, ?it/s]

Loss train: 2.312	 test: 2.305	 Accuracy train: 9.9%	 test: 10.0%


  0%|          | 0/782 [00:00<?, ?it/s]

Loss train: 2.312	 test: 2.310	 Accuracy train: 10.0%	 test: 10.0%


  0%|          | 0/782 [00:00<?, ?it/s]

Loss train: 2.310	 test: 2.311	 Accuracy train: 10.2%	 test: 10.0%


  0%|          | 0/782 [00:00<?, ?it/s]

Loss train: 2.311	 test: 2.316	 Accuracy train: 10.0%	 test: 10.0%


  0%|          | 0/782 [00:00<?, ?it/s]

Loss train: 2.312	 test: 2.310	 Accuracy train: 9.7%	 test: 10.0%


  0%|          | 0/782 [00:00<?, ?it/s]

Loss train: 2.313	 test: 2.308	 Accuracy train: 9.9%	 test: 10.0%


  0%|          | 0/782 [00:00<?, ?it/s]

Loss train: 2.312	 test: 2.308	 Accuracy train: 9.9%	 test: 10.0%


  0%|          | 0/782 [00:00<?, ?it/s]

Loss train: 2.313	 test: 2.312	 Accuracy train: 10.1%	 test: 10.0%


  0%|          | 0/782 [00:00<?, ?it/s]

Loss train: 2.311	 test: 2.310	 Accuracy train: 10.1%	 test: 10.0%


In [15]:
print("Adam lr: 0.01")

# Fresh network, placed on the selected device
model = Network().to(device)
# Adam with step size 0.01
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
out_dict4 = train(model, optimizer)


Adam lr: 0.01


  0%|          | 0/10 [00:00<?, ?epoch/s]

  0%|          | 0/782 [00:00<?, ?it/s]

Loss train: 1.806	 test: 1.686	 Accuracy train: 33.5%	 test: 39.5%


  0%|          | 0/782 [00:00<?, ?it/s]

Loss train: 1.584	 test: 1.525	 Accuracy train: 42.4%	 test: 43.9%


  0%|          | 0/782 [00:00<?, ?it/s]

Loss train: 1.520	 test: 1.446	 Accuracy train: 44.7%	 test: 46.8%


  0%|          | 0/782 [00:00<?, ?it/s]

Loss train: 1.463	 test: 1.486	 Accuracy train: 47.1%	 test: 46.3%


  0%|          | 0/782 [00:00<?, ?it/s]

Loss train: 1.414	 test: 1.427	 Accuracy train: 48.9%	 test: 48.2%


  0%|          | 0/782 [00:00<?, ?it/s]

Loss train: 1.394	 test: 1.395	 Accuracy train: 49.8%	 test: 49.4%


  0%|          | 0/782 [00:00<?, ?it/s]

Loss train: 1.371	 test: 1.414	 Accuracy train: 50.8%	 test: 48.8%


  0%|          | 0/782 [00:00<?, ?it/s]

Loss train: 1.349	 test: 1.462	 Accuracy train: 51.7%	 test: 48.0%


  0%|          | 0/782 [00:00<?, ?it/s]

Loss train: 1.334	 test: 1.449	 Accuracy train: 52.3%	 test: 48.6%


  0%|          | 0/782 [00:00<?, ?it/s]

Loss train: 1.310	 test: 1.393	 Accuracy train: 53.1%	 test: 49.9%


In [16]:

print("Adam lr: 0.01 with BatchNorm")

# Fresh batch-norm variant of the network, placed on the selected device
model = Network(BN=True).to(device)
# Adam with step size 0.01
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
out_dict5 = train(model, optimizer)

Adam lr: 0.01 with BatchNorm


  0%|          | 0/10 [00:00<?, ?epoch/s]

  0%|          | 0/782 [00:00<?, ?it/s]

Loss train: 1.671	 test: 1.248	 Accuracy train: 44.7%	 test: 55.2%


  0%|          | 0/782 [00:00<?, ?it/s]

Loss train: 1.202	 test: 1.153	 Accuracy train: 57.3%	 test: 59.1%


  0%|          | 0/782 [00:00<?, ?it/s]

Loss train: 1.097	 test: 1.188	 Accuracy train: 61.5%	 test: 59.6%


  0%|          | 0/782 [00:00<?, ?it/s]

Loss train: 1.015	 test: 1.077	 Accuracy train: 64.8%	 test: 62.8%


  0%|          | 0/782 [00:00<?, ?it/s]

Loss train: 0.948	 test: 1.213	 Accuracy train: 67.5%	 test: 61.3%


  0%|          | 0/782 [00:00<?, ?it/s]

Loss train: 0.889	 test: 1.139	 Accuracy train: 69.6%	 test: 63.9%


  0%|          | 0/782 [00:00<?, ?it/s]

Loss train: 0.831	 test: 1.199	 Accuracy train: 71.9%	 test: 62.0%


  0%|          | 0/782 [00:00<?, ?it/s]

Loss train: 0.781	 test: 1.233	 Accuracy train: 73.7%	 test: 63.5%


  0%|          | 0/782 [00:00<?, ?it/s]

Loss train: 0.728	 test: 1.181	 Accuracy train: 75.5%	 test: 62.8%


  0%|          | 0/782 [00:00<?, ?it/s]

Loss train: 0.711	 test: 1.181	 Accuracy train: 76.4%	 test: 63.6%


In [17]:
# One (legend label, metrics dict) pair per training run above. The labels
# match the optimizer and learning rate actually used in each cell.
runs = [
    ('SGD lr: 0.1', out_dict1),
    ('SGD lr: 0.01', out_dict2),
    ('Adam lr: 0.1', out_dict3),
    ('Adam lr: 0.01', out_dict4),
    ('Adam with BN lr: 0.01', out_dict5),
]

fig, axes = plt.subplots(1, 2, figsize=(18, 6))

# Left panel: loss curves; right panel: accuracy curves. Train and test
# curves of each run are drawn back to back so the colour order stays stable.
for label, history in runs:
    axes[0].plot(history['train_loss'], label=f'Train loss ({label})')
    axes[0].plot(history['test_loss'], label=f'Test loss ({label})')

    axes[1].plot(history['train_acc'], label=f'Train accuracy ({label})')
    axes[1].plot(history['test_acc'], label=f'Test accuracy ({label})')

for ax in axes:
    ax.set_xlabel('Epoch')
    ax.legend()
axes[0].set_title('Loss')
axes[1].set_title('Accuracy')

Text(0.5, 1.0, 'Accuracy')

* Discuss what you see. Are you overfitting to the training data? Do you not learn anything? What can you change to do better?

* Repeat the above steps but using Adam as the optimizer. Use PyTorch's default parameters. Do you learn faster?
* Which optimizer works best for you?
* Plot the train and test errors for both SGD and Adam in one plot
* Try adding Batch normalisation after your convolutional layers. Does it help?

## ResNet

Now you will create and train a ResNet.
* Implement the Residual block as a network below using convolutional kernel size $3\times3$ according to the figure below
![Residual block](https://cdn-images-1.medium.com/max/800/1*D0F3UitQ2l5Q0Ak-tjEdJg.png)

In [None]:
class ResNetBlock(nn.Module):
    """Residual block: conv -> ReLU -> conv, added to the input, then ReLU.

    Args:
        n_features: number of input channels; both convolutions keep it unchanged.
        BN: when True, a ``BatchNorm2d`` follows the inner ReLU and another
            follows the output ReLU.
    """

    def __init__(self, n_features, BN=False):
        super().__init__()

        def conv3x3():
            # 3x3 convolution with stride 1 and padding 1
            return nn.Conv2d(n_features, n_features, kernel_size=3, stride=1, padding=1)

        branch = [conv3x3(), nn.ReLU()]
        if BN:
            branch.append(nn.BatchNorm2d(n_features))
        branch.append(conv3x3())
        self.convolutional = nn.Sequential(*branch)

        # Activation applied after the skip connection has been added.
        if BN:
            self.relu = nn.Sequential(nn.ReLU(), nn.BatchNorm2d(n_features))
        else:
            self.relu = nn.ReLU()

    def forward(self, x):
        """Return ``relu(x + convolutional(x))``."""
        return self.relu(x + self.convolutional(x))

The following code is a sanity check of your residual block network

In [None]:
#Sanity test of your implementation
C = 4
res_block = ResNetBlock(C)

# A block without batch norm should expose exactly two conv layers:
# one weight and one bias tensor each.
block_state = res_block.state_dict()
assert len(block_state) == 4

expected_shapes = {'bias': (C,), 'weight': (C, C, 3, 3)}
for name, weight in block_state.items():
    # Zero the parameter in place so the conv branch contributes nothing.
    weight.mul_(0)
    desired_shape = expected_shapes[name.split('.')[-1]]
    assert weight.shape == desired_shape

# With all parameters at zero the block must reduce to relu(x).
x = torch.randn(32, C, 32, 32)
max_deviation = (res_block(x) - F.relu(x)).abs().max()
assert max_deviation == 0
print("Passed sanity check")

Passed sanity check


We define a network that uses your `ResNetBlock`

In [None]:
class ResNet(nn.Module):
    """Stem convolution, a stack of ``ResNetBlock``s and a three-layer MLP head.

    Args:
        n_in: number of channels of the input images.
        n_features: number of channels used throughout the residual stack.
        num_res_blocks: how many ``ResNetBlock``s follow the stem.
        BN: forwarded to every ``ResNetBlock``; also adds a ``BatchNorm2d``
            after the stem's ReLU.
    """

    def __init__(self, n_in, n_features, num_res_blocks=3, BN=False):
        super().__init__()

        # The stem lifts the input to the channel count the blocks expect.
        stack = [nn.Conv2d(n_in, n_features, kernel_size=3, stride=1, padding=1),
                 nn.ReLU()]
        if BN:
            stack.append(nn.BatchNorm2d(n_features))
        stack.extend(ResNetBlock(n_features, BN) for _ in range(num_res_blocks))
        self.res_blocks = nn.Sequential(*stack)

        # The head expects 32x32 feature maps with n_features channels and
        # returns 10 raw class scores (no softmax).
        self.fc = nn.Sequential(
            nn.Linear(32 * 32 * n_features, 2048),
            nn.ReLU(),
            nn.Linear(2048, 512),
            nn.ReLU(),
            nn.Linear(512, 10),
        )

    def forward(self, x):
        """Return the class scores for a minibatch ``x``."""
        features = self.res_blocks(x)
        # Flatten everything except the leading minibatch dimension.
        flat = features.view(features.size(0), -1)
        return self.fc(flat)

Let's train our new ResNet!

In [None]:
# ResNet with 3 input channels and 8 feature channels, on the selected device
model = ResNet(3, 8).to(device)
# Plain SGD with step size 0.1
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
out_dict1 = train(model, optimizer)

  0%|          | 0/10 [00:00<?, ?epoch/s]

  0%|          | 0/782 [00:00<?, ?it/s]

Loss train: 1.657	 test: 1.425	 Accuracy train: 40.4%	 test: 49.7%


  0%|          | 0/782 [00:00<?, ?it/s]

Loss train: 1.258	 test: 1.196	 Accuracy train: 54.7%	 test: 57.9%


  0%|          | 0/782 [00:00<?, ?it/s]

Loss train: 1.016	 test: 1.738	 Accuracy train: 63.8%	 test: 48.2%


  0%|          | 0/782 [00:00<?, ?it/s]

Loss train: 0.766	 test: 1.355	 Accuracy train: 72.7%	 test: 57.0%


  0%|          | 0/782 [00:00<?, ?it/s]

Loss train: 0.470	 test: 1.350	 Accuracy train: 83.5%	 test: 59.3%


  0%|          | 0/782 [00:00<?, ?it/s]

Loss train: 0.233	 test: 2.717	 Accuracy train: 92.1%	 test: 51.2%


  0%|          | 0/782 [00:00<?, ?it/s]

Loss train: 0.171	 test: 2.218	 Accuracy train: 94.4%	 test: 56.3%


  0%|          | 0/782 [00:00<?, ?it/s]

Loss train: 0.089	 test: 2.212	 Accuracy train: 97.0%	 test: 59.3%


  0%|          | 0/782 [00:00<?, ?it/s]

Loss train: 0.062	 test: 2.151	 Accuracy train: 98.0%	 test: 59.1%


  0%|          | 0/782 [00:00<?, ?it/s]

Loss train: 0.045	 test: 2.360	 Accuracy train: 98.5%	 test: 60.3%


In [None]:
# ResNet with 3 input channels and 8 feature channels, on the selected device
model = ResNet(3, 8).to(device)
# Adam with step size 0.001
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
out_dict2 = train(model, optimizer)

In [None]:
# Batch-norm variant of the ResNet, on the selected device
model = ResNet(3, 8, BN=True).to(device)
# Adam with step size 0.001
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
out_dict3 = train(model, optimizer)

In [None]:
# One (legend label, metrics dict) pair per ResNet run above. The labels
# match the optimizer and learning rate actually used in each cell.
runs = [
    ('SGD lr: 0.1', out_dict1),
    ('Adam lr: 0.001', out_dict2),
    ('Adam with BN lr: 0.001', out_dict3),
]

fig, axes = plt.subplots(1, 2, figsize=(18, 6))

# Left panel: loss curves; right panel: accuracy curves. Train and test
# curves of each run are drawn back to back so the colour order stays stable.
for label, history in runs:
    axes[0].plot(history['train_loss'], label=f'Train loss ({label})')
    axes[0].plot(history['test_loss'], label=f'Test loss ({label})')

    axes[1].plot(history['train_acc'], label=f'Train accuracy ({label})')
    axes[1].plot(history['test_acc'], label=f'Test accuracy ({label})')

for ax in axes:
    ax.set_xlabel('Epoch')
    ax.legend()
axes[0].set_title('Loss')
axes[1].set_title('Accuracy')



Do you get nan loss at some point during training? 
This can be caused by the numerical instability of using softmax and log as two functions. 
* Change your network and loss to use a layer that combines the softmax and log into one, such as `nn.LogSoftmax`. You can also use `nn.CrossEntropyLoss`, which also integrates `nn.NLLLoss`.