In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Residual Networks
In this notebook we are going to build a very deep convolutional network, using Residual Networks (ResNets). In theory, very deep networks can represent very complex functions; but in practice, they are hard to train. Residual Networks, introduced by [He et al.](https://arxiv.org/pdf/1512.03385.pdf), allow you to train much deeper networks than were previously feasible.

### Import Packages

In [3]:
import numpy as np
import torch
import torch.nn as nn
from torchvision import datasets
from torchvision import transforms
from torch.utils.data.sampler import SubsetRandomSampler


# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

### Problem of Very Deep Neural Networks
In recent years, neural networks have become much deeper, with state-of-the-art networks evolving from having just a few layers (e.g., AlexNet) to over a hundred layers.

* The main benefit of a very deep network is that it can represent very complex functions. It can also learn features at many different levels of abstraction, from edges (at the shallower layers, closer to the input) to very complex features (at the deeper layers, closer to the output). 

* However, using a deeper network doesn't always help. A huge barrier to training them is vanishing gradients: very deep networks often have a gradient signal that goes to zero quickly, thus making gradient descent prohibitively slow.

* More specifically, during gradient descent, as you backpropagate from the final layer back to the first layer, you are multiplying by the weight matrix on each step, and thus the gradient can decrease exponentially quickly to zero (or, in rare cases, grow exponentially quickly and "explode," from gaining very large values). 

* During training, you might therefore see the magnitude (or norm) of the gradient for the shallower layers decrease to zero very rapidly as training proceeds.

### Building a Residual Network
In ResNets, a "shortcut" or a "skip connection" allows the model to skip layers:  

<center><img src="Images/skip_connection.png" style="width:650px;height:200px;"></center>
<caption><center> <u> <font color='purple'> <b>Figure 1</b> </u><font color='purple'>  : A ResNet block showing a skip-connection <br> </center></caption>

The image on the left shows the "main path" through the network. The image on the right adds a shortcut to the main path. By stacking these ResNet blocks on top of each other, you can form a very deep network.

#### The Identity Block

The identity block is the standard block used in ResNets, and corresponds to the case where the input activation (say $a^{[l]}$) has the same dimension as the output activation (say $a^{[l+2]}$).

In the code cell below, we are going to implement a slightly more powerful version of this identity block, in which the skip connection "skips over" 3 hidden layers rather than 2 layers. It looks like this: 

<center><img src="Images/idblock3_kiank.png" style="width:650px;height:150px;"></center>

In [4]:
class IdentityBlock(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size=3, stride = 1):
        super().__init__()
        F1, F2, F3 = out_channels
        self.conv1 = nn.Sequential(
                        nn.Conv2d(in_channels, F1, kernel_size = 1, stride = stride, padding = 'valid'),
                        nn.BatchNorm2d(F1),
                        nn.ReLU())
        self.conv2 = nn.Sequential(
                        nn.Conv2d(F1, F2, kernel_size = kernel_size, stride = 1, padding = 'same'),
                        nn.BatchNorm2d(F2),
                        nn.ReLU())
        self.conv3 = nn.Sequential(
                        nn.Conv2d(F2, F3, kernel_size=1, stride=1, padding='valid'),
                        nn.BatchNorm2d(F3))
    
        self.relu = nn.ReLU()
        
    def forward(self, x):
        residual = x
        out = self.conv1(x)
        out = self.conv2(out)
        out = self.conv3(out)
        out += residual
        out = self.relu(out)
        return out

#### Convolution Block
The ResNet "convolutional block" is the second block type. You can use this type of block when the input and output dimensions don't match up. The difference with the identity block is that there is a CONV2D layer in the shortcut path:
<center><img src="images/convblock_kiank.png" style="width:650px;height:150px;"></center>

In [5]:
class ConvolutionBlock(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size=3, stride = 1):
        super().__init__()
        F1, F2, F3 = out_channels
        self.conv1 = nn.Sequential(
                        nn.Conv2d(in_channels, F1, kernel_size = 1, stride = stride, padding = 'valid'),
                        nn.BatchNorm2d(F1),
                        nn.ReLU())
        self.conv2 = nn.Sequential(
                        nn.Conv2d(F1, F2, kernel_size = kernel_size, stride = 1, padding = 'same'),
                        nn.BatchNorm2d(F2),
                        nn.ReLU())
        self.conv3 = nn.Sequential(
                        nn.Conv2d(F2, F3, kernel_size=1, stride=1, padding='valid'),
                        nn.BatchNorm2d(F3))
        
        self.shortcut_conv = nn.Sequential(
                        nn.Conv2d(in_channels, F3, kernel_size=1, stride=stride, padding='valid'),
                        nn.BatchNorm2d(F3))
    
        self.relu = nn.ReLU()
        
    def forward(self, x):
        residual = x
        out = self.conv1(x)
        out = self.conv2(out)
        out = self.conv3(out)
        residual = self.shortcut_conv(residual)
        out += residual
        out = self.relu(out)
        return out

### Residual Network Stages
We now have the necessary blocks to build a very deep ResNet. The following figure describes in detail the architecture of this neural network. "ID BLOCK" in the diagram stands for "Identity block," and "ID BLOCK x3" means you should stack 3 identity blocks together.
<center><img src="images/resnet_kiank.png" style="width:850px;height:150px;"></center>

In [6]:
class ResidualStage(nn.Module):

    def get_Identity_blocks(self):
        identity_blocks = []
        in_channel = self.out_channels[-1]
        for _ in range(self.n_identity):
            block = IdentityBlock(in_channel, self.out_channels, self.filter_size).to(device)
            identity_blocks.append(block)
        return identity_blocks 

    def __init__(self, n_identity, in_channels, out_channels, filter_size=3, stride=2):
        super().__init__()
        self.n_identity = n_identity
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.filter_size = filter_size
        self.stride = stride
        self.conv = ConvolutionBlock(in_channels, out_channels, filter_size, stride)
        self.identity_blocks = self.get_Identity_blocks()

    def forward(self, x):
        out = self.conv.forward(x)
        for id_block in self.identity_blocks:
            out = id_block(out)
        return out

## Building ResNet Model (50 layers)

In [7]:
class ResNet50(nn.Module):

    def __init__(self, n_classes):
        super().__init__()
        self.n_classes = n_classes
        self.stage_0 = nn.Sequential(
                       nn.Conv2d(3, 64, 7, 2, padding=3),
                       nn.BatchNorm2d(64),
                       nn.ReLU(),
                       nn.MaxPool2d(3, 2, 1)
        )
        self.stage_1 = ResidualStage(2, 64, [64, 64, 256], filter_size=3, stride=1).to(device)
        self.stage_2 = ResidualStage(3, 256, [128, 128, 512], filter_size=3, stride=2).to(device)
        self.stage_3 = ResidualStage(5, 512, [256, 256, 1024], filter_size=3, stride=2).to(device)
        self.stage_4 = ResidualStage(2, 1024, [512, 512, 2048], filter_size=3, stride=2).to(device)
        self.avgpool = nn.AvgPool2d(7, stride=1)
        self.flat = nn.Flatten()
        self.fc = nn.Linear(2048, self.n_classes)
        
    def forward(self, x):
        out = self.stage_0(x)
        out = self.stage_1(out)
        out = self.stage_2(out)
        out = self.stage_3(out)
        out = self.stage_4(out)
        out = self.avgpool(out)
        out = self.flat(out)
        out = self.fc(out)
        return out


### Load and Transform `CIFAR-10` dataset

In [8]:
transform = transforms.Compose(
    [transforms.Resize((224,224)),
     transforms.ToTensor(),
     transforms.Normalize(
         mean=[0.4914, 0.4822, 0.4465],
         std=[0.2023, 0.1994, 0.2010])
    ])

batch_size = 16

trainset = datasets.CIFAR10(root='./data', train=True,
                                        download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size,
                                          shuffle=True, num_workers=2)

testset = datasets.CIFAR10(root='./data', train=False,
                                       download=True, transform=transform)

testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size,
                                         shuffle=False, num_workers=2)

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


  0%|          | 0/170498071 [00:00<?, ?it/s]

Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified


## Set Cost Function and Optimizer

In [9]:
num_classes = 10
num_epochs = 10
learning_rate = 0.01

model = ResNet50(num_classes).to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9, weight_decay=0.001)  

# Train the model
total_step = len(trainloader)
print(f"Number of batches in training data : {total_step}")

Number of batches in training data : 3125


## Train the Network

In [10]:
for epoch in range(num_epochs):  # loop over the dataset multiple times
    running_loss = 0.0
    for i, data in enumerate(trainloader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data
        inputs = inputs.to(device)
        labels = labels.to(device)
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        del inputs, labels, outputs
        torch.cuda.empty_cache()
        # print statistics
        running_loss += loss.item()
        
    with torch.no_grad():
        total_valid_loss = 0.0
        for images, labels in testloader:
            images = images.to(device)
            labels = labels.to(device)
            valid_outputs = model(images)
            valid_batch_loss = criterion(valid_outputs, labels)
            total_valid_loss += valid_batch_loss.item()
            
        test_batch_len = len(testloader)
        print(f'[{epoch + 1}, {i + 1:5d}] training loss: {running_loss / total_step:.6f} validation loss: {total_valid_loss / test_batch_len:.6f}')
        running_loss = 0.0

print('Finished Training')

[1,  3125] training loss: 3.170051 validation loss: 3.385314
[2,  3125] training loss: 2.792747 validation loss: 2.458987
[3,  3125] training loss: 2.700126 validation loss: 2.192587
[4,  3125] training loss: 2.146834 validation loss: 1.952478
[5,  3125] training loss: 2.030197 validation loss: 2.208726
[6,  3125] training loss: 1.893452 validation loss: 1.960695
[7,  3125] training loss: 2.015880 validation loss: 1.746025
[8,  3125] training loss: 2.038822 validation loss: 2.275704
[9,  3125] training loss: 2.018946 validation loss: 2.594853
[10,  3125] training loss: 1.937749 validation loss: 1.711144
Finished Training
