## ResNet-18
**How to write a ResNet-18 with pytorch**

<br>
The following cells contain code for building a ResNet-18 in PyTorch and training it on the CIFAR-10 dataset. The training losses and the hidden-layer activations at the end of training are then visualised.

In [None]:
!pip install torch torchvision

import numpy as np

import torch
from torch.nn import Conv2d, MaxPool2d
import torch.nn as nn
import torch.nn.functional as F

import torch.optim as optim

from torch.utils.data import DataLoader
from torch.utils.data import sampler

import torchvision.datasets as dset
import torchvision.transforms as T

In [None]:
# define resnet building blocks

class ResidualBlock(nn.Module):
    """Two-convolution residual block.

    The main path (``self.left``) is conv3x3 -> BatchNorm -> ReLU -> conv3x3
    -> BatchNorm. Its output is summed with ``self.shortcut(x)`` and the sum
    is passed through a final ReLU.
    """

    def __init__(self, inchannel, outchannel, stride=1):
        """Create the main path and the shortcut.

        Args:
            inchannel: number of channels of the incoming feature map.
            outchannel: number of channels produced by the block.
            stride: stride of the first 3x3 convolution, and of the 1x1
                shortcut convolution when one is needed.
        """
        super().__init__()

        main_path = [
            Conv2d(inchannel, outchannel, kernel_size=3, stride=stride,
                   padding=1, bias=False),
            nn.BatchNorm2d(outchannel),
            nn.ReLU(inplace=True),
            Conv2d(outchannel, outchannel, kernel_size=3, stride=1,
                   padding=1, bias=False),
            nn.BatchNorm2d(outchannel),
        ]
        self.left = nn.Sequential(*main_path)

        # The input can only be added as-is when the main path keeps both the
        # spatial size and the channel count; otherwise project it with a
        # strided 1x1 convolution so the two tensors have matching shapes.
        shape_preserved = stride == 1 and inchannel == outchannel
        if shape_preserved:
            self.shortcut = nn.Sequential()
        else:
            self.shortcut = nn.Sequential(
                Conv2d(inchannel, outchannel, kernel_size=1, stride=stride,
                       padding=0, bias=False),
                nn.BatchNorm2d(outchannel),
            )

    def forward(self, x):
        """Return ``relu(left(x) + shortcut(x))``."""
        summed = self.left(x)
        summed += self.shortcut(x)
        return F.relu(summed)


    
    # define resnet

class ResNet(nn.Module):
    """ResNet made of a 3x3 stem, four stages of two blocks, pooling and a linear head."""

    def __init__(self, ResidualBlock, num_classes = 10):
        """Assemble the network.

        Args:
            ResidualBlock: callable ``(in_channels, out_channels, stride)``
                returning the module used for every block of every stage.
            num_classes: size of the output of the final linear layer.
        """
        super().__init__()

        # Channel count fed to the next block; make_layer() updates it.
        self.inchannel = 64

        stem = [
            Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(64),
            nn.ReLU(),
        ]
        self.conv1 = nn.Sequential(*stem)

        self.layer1 = self.make_layer(ResidualBlock, 64, 2, stride=1)
        self.layer2 = self.make_layer(ResidualBlock, 128, 2, stride=2)
        self.layer3 = self.make_layer(ResidualBlock, 256, 2, stride=2)
        self.layer4 = self.make_layer(ResidualBlock, 512, 2, stride=2)

        # The linear head expects exactly 512 features, i.e. a 1x1 map after
        # pooling with a 4x4 window.
        self.maxpool = MaxPool2d(4)
        self.fc = nn.Linear(512, num_classes)

    def make_layer(self, block, channels, num_blocks, stride):
        """Build one stage and record its output width in ``self.inchannel``.

        Only the first block of the stage receives ``stride``; the remaining
        ones use stride 1. The first block takes ``self.inchannel`` input
        channels, all later blocks take ``channels``.
        """
        n_followers = num_blocks - 1
        block_strides = [stride] + [1] * n_followers
        block_inputs = [self.inchannel] + [channels] * n_followers

        stage = [
            block(n_in, channels, s)
            for n_in, s in zip(block_inputs, block_strides)
        ]
        self.inchannel = channels
        return nn.Sequential(*stage)

    def forward(self, x):
        """Run the stem, the four stages and the pooling, then classify."""
        pipeline = (self.conv1, self.layer1, self.layer2,
                    self.layer3, self.layer4, self.maxpool)
        for stage in pipeline:
            x = stage(x)

        # Flatten everything except the batch dimension.
        flat = x.view(x.size(0), -1)
        return self.fc(flat)
    
    
def ResNet18(num_classes=10):
    """Build a ResNet-18: a ``ResNet`` assembled from ``ResidualBlock`` modules.

    Args:
        num_classes: number of outputs of the final linear layer. Defaults to
            10, the value ``ResNet`` itself defaults to, so ``ResNet18()``
            behaves as before.

    Returns:
        A freshly initialised ``ResNet`` instance.
    """
    return ResNet(ResidualBlock, num_classes=num_classes)

In [None]:
# Per-channel mean and standard deviation handed to T.Normalize below.
CIFAR10_MEAN = (0.4914, 0.4822, 0.4465)
CIFAR10_STD = (0.2023, 0.1994, 0.2010)

# Training pipeline: convert to tensor, normalise, then augment by flipping
# each sample horizontally with probability 0.2.
train_transform = T.Compose([
    T.ToTensor(),
    T.Normalize(CIFAR10_MEAN, CIFAR10_STD),
    T.RandomHorizontalFlip(p=0.2),
])

# Evaluation pipeline: same conversion and normalisation, no augmentation.
test_transform = T.Compose([
    T.ToTensor(),
    T.Normalize(CIFAR10_MEAN, CIFAR10_STD),
])

In [None]:
# Dataset location and mini-batch size shared by both loaders.
data_dir = './data'
batch_size = 8

# Training split: downloaded on demand, augmented by train_transform and
# reshuffled every epoch.
train_dataset = dset.CIFAR10(
    root=data_dir,
    train=True,
    download=True,
    transform=train_transform,
)
loader_train = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Test split: no augmentation and a fixed sample order.
test_dataset = dset.CIFAR10(
    root=data_dir,
    train=False,
    download=True,
    transform=test_transform,
)
loader_test = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [None]:
# Functions
# Set USE_GPU to False to force CPU execution even when CUDA is present.
USE_GPU = True

# Floating-point type that input batches are cast to.
dtype = torch.float32

# Use CUDA only when it is both requested and available.
device = torch.device('cuda' if USE_GPU and torch.cuda.is_available() else 'cpu')

# The training loop reports the loss once every `print_every` iterations.
print_every = 100


def check_accuracy(loader, model):
    """Measure the classification accuracy of `model` on `loader`.

    The model is switched to evaluation mode and is left in that mode.

    Inputs:
    - loader: iterable yielding (x, y) batches of inputs and integer labels.
    - model: A PyTorch Module producing one score per class for each sample.

    Returns: the fraction of correctly classified samples as a float in
    [0, 1], or 0.0 when the loader yields no samples. The same value is also
    stored in the module-level variable `acc`, which existing callers read.
    """
    global acc

    num_correct = 0
    num_samples = 0
    model.eval()  # set model to evaluation mode
    with torch.no_grad():
        for x, y in loader:
            x = x.to(device=device, dtype=dtype)  # move to device
            y = y.to(device=device, dtype=torch.long)
            scores = model(x)
            _, preds = scores.max(1)
            # Kept as a tensor while looping so no per-batch host sync is needed.
            num_correct += (preds == y).sum()
            num_samples += preds.size(0)

    num_correct = int(num_correct)
    # An empty loader would otherwise divide by zero.
    acc = num_correct / num_samples if num_samples else 0.0
    print('Got %d / %d correct (%.2f)' % (num_correct, num_samples, 100 * acc))
    return acc

        

def train_part(model, optimizer, epochs=1, loader=None, loss_log=None,
               epoch_log=None):
    """
    Train a model with cross-entropy loss using the PyTorch Module API.

    Inputs:
    - model: A PyTorch Module giving the model to train.
    - optimizer: An Optimizer object we will use to train the model
    - epochs: (Optional) A Python integer giving the number of epochs to train for
    - loader: (Optional) iterable of (x, y) batches. Defaults to the
      module-level `loader_train`.
    - loss_log: (Optional) list that receives the loss of every reported
      iteration. Defaults to the module-level list `losses`, which is created
      if it does not exist yet.
    - epoch_log: (Optional) list that receives the 1-based epoch number of
      every reported iteration. Defaults to the module-level list `epochh`,
      which is created if it does not exist yet.

    Returns: Nothing, but prints the training loss every `print_every`
    iterations and records it in the two logs.
    """
    if loader is None:
        loader = loader_train

    # Calling this function before `losses` / `epochh` were defined used to
    # raise NameError at the first reported iteration; create them on demand.
    module_globals = globals()
    if loss_log is None:
        loss_log = module_globals.setdefault('losses', [])
    if epoch_log is None:
        epoch_log = module_globals.setdefault('epochh', [])

    model = model.to(device=device)  # move the model parameters to CPU/GPU
    for e in range(epochs):
        print(len(loader))
        for t, (x, y) in enumerate(loader):
            # Re-assert training mode on every step so an evaluation pass run
            # in between cannot leave the model in eval mode.
            model.train()
            x = x.to(device=device, dtype=dtype)  # move to device, e.g. GPU
            y = y.to(device=device, dtype=torch.long)

            scores = model(x)
            loss = F.cross_entropy(scores, y)

            # Zero out all of the gradients for the variables which the optimizer
            # will update.
            optimizer.zero_grad()

            loss.backward()

            # Update the parameters of the model using the gradients
            optimizer.step()

            if t % print_every == 0:
                loss_value = loss.item()
                print('Epoch: %d, Iteration %d, loss = %.4f' % (e, t, loss_value))
                # store the loss for plotting
                loss_log.append(loss_value)
                epoch_log.append(e + 1)
                print()

In [None]:
# Optimising network performance

# hyperparameter tuning with random search
# the same process is done separately for both Adam and SGD
# when SGD is the optimiser, momentum is tuned as well
from torch.utils.data import random_split
import numpy as np
import pandas as pd

# store the hyperparameters in lists
lr = []
batchsize = []
weightdecay = []
accuracy = []

# train 25 models by randomly sampling hyperparameters
# after finding the best model, zoom into the neighbourhood of its hyperparameter space
# and perform subsequent rounds of random search
# lastly, hyperparameters like epoch are also tuned individually to achieve better validation performance
num_model = 25

# candidate batch sizes and the fixed train/validation split sizes
batch = np.array([2, 4, 8, 16, 32, 64])
val_size = 10000
train_size = 40000

for i in range(num_model):
    learning_rate = 10**(-2*np.random.rand()-2)  # 10^-4 to 10^-2, sampling from log scale
    lr.append(learning_rate)

    batch_size = int(np.random.choice(batch))
    batchsize.append(batch_size)

    weight_decay = 10**(-2*np.random.rand()-2)  # 10^-4 to 10^-2, sampling from log scale
    weightdecay.append(weight_decay)

    # split data into training and validation set
    # re-seeding here gives every candidate the same split
    torch.manual_seed(5)
    # I did not do cross validation here because both the training and validation sets
    # have a large number of observations, and training data already underwent augmentation.
    # The validation accuracy should be a good enough indicator of the model's performance.
    # NOTE: `val` is a subset of train_dataset, so validation images also pass
    # through train_transform, including its random horizontal flip.
    train, val = random_split(train_dataset, [train_size, val_size])

    # load data into loader
    # train_part() reads the module-level `loader_train`, so it has to be rebound here
    loader_train = DataLoader(train, batch_size=batch_size, shuffle=True)
    loader_val = DataLoader(val, batch_size=batch_size, shuffle=False)

    # train_part() appends to the module-level `losses` and `epochh`; they must
    # exist before it is called, otherwise its first report raises NameError
    losses = []
    epochh = []

    # define and train the network
    model = ResNet18()
    # define optimiser and relevant hyperparameters here
    optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    # training loop
    train_part(model, optimizer, epochs=10)
    # check_accuracy() publishes its result through the global `acc`
    acc = 0
    check_accuracy(loader_val, model)
    accuracy.append(acc)
    print(acc)


# view results in pandas dataframe
print(lr)
print(batchsize)
print(weightdecay)
print(accuracy)

result = {'lr': lr, 'batch_size': batchsize, 'weight_decay': weightdecay, 'accuracy': accuracy}
df = pd.DataFrame(result)
df = df.sort_values(by=['accuracy'], ascending=False)

print(df)


In [None]:
# define and train the network with the optimal hyperparameters
# lr = 0.0001, batch_size = 8, flipping probability = 0.2
# weight_decay = 0.0008, optimiser = Adam, epoch = 10

# The hyperparameter search rebinds the module-level `batch_size` and
# `loader_train` to its last random draw and its 40000-sample split.
# train_part() reads `loader_train`, so rebuild it here to really train on the
# full training set with the batch size stated above.
batch_size = 8
loader_train = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

model = ResNet18()
optimizer = optim.Adam(model.parameters(), lr=0.0001, weight_decay=0.0008)

# fresh logs for the loss plot; train_part() appends to these
losses = []
epochh = []
train_part(model, optimizer, epochs=10)


# report test set accuracy

check_accuracy(loader_test, model)  # test accuracy at 86.15% (figure noted by the author)


# save the model
torch.save(model.state_dict(), 'model.pt')


In [None]:
# Plot losses
!pip install matplotlib

from collections import Counter

import matplotlib.pyplot as plt

# `epochh` stores the same integer for every loss logged within one epoch, so
# plotting against it directly stacks all of an epoch's points on one x value.
# Spread the points of each epoch evenly over [epoch - 1, epoch) instead.
logs_per_epoch = Counter(epochh)
logs_seen = Counter()
epoch_axis = []
for epoch in epochh:
    epoch_axis.append(epoch - 1 + logs_seen[epoch] / logs_per_epoch[epoch])
    logs_seen[epoch] += 1

plt.plot(epoch_axis, losses)
plt.xlabel('epoch')
plt.ylabel('training loss')

In [None]:
# Visualise the hidden layers

# Outputs captured by the forward hooks, keyed by layer name.
activation = {}


def get_activation(name):
    """Return a forward hook that stores the hooked module's output under `name`."""
    def hook(module, inputs, output):
        activation[name] = output.detach()
    return hook


vis_labels = ['conv1', 'layer1', 'layer2', 'layer3', 'layer4']

# Keep the handles so the hooks can be removed again; otherwise every re-run
# of this cell would stack another set of hooks on the model.
hook_handles = [
    getattr(model, label).register_forward_hook(get_activation(label))
    for label in vis_labels
]

# data, _ = cifar10_test[0]
data, _ = test_dataset[0]
data = data.unsqueeze(0).to(device=device, dtype=dtype)

# Evaluation mode and no_grad: a single-image forward pass must not update
# the batch-norm running statistics or build an autograd graph.
model.eval()
with torch.no_grad():
    output = model(data)

for handle in hook_handles:
    handle.remove()

for idx, label in enumerate(vis_labels):

    # drop the batch dimension -> (channels, height, width)
    act = activation[label].squeeze(0).cpu()

    if idx < 2:
        ncols = 8
    else:
        ncols = 32

    nrows = act.size(0) // ncols

    # squeeze=False keeps axarr two-dimensional even for a single row
    fig, axarr = plt.subplots(nrows, ncols, squeeze=False)
    fig.suptitle(label)

    for i in range(nrows):
        for j in range(ncols):
            # row-major channel index: each row holds `ncols` channels
            axarr[i, j].imshow(act[i * ncols + j])
            axarr[i, j].axis('off')

    fig.tight_layout()
    fig.savefig(f'{label}')