# Advanced CNNs

In [22]:
# imports.
import ipdb
import numpy as np
import torch
from torch import nn
from torch.nn import functional as F

In [23]:
# imports.
import numpy as np
import torch
import torchvision
from torchvision import transforms
from torch.autograd import Variable
from torch.utils.data import DataLoader

### DataLoader

In [24]:
# first we download the MNIST Dataset.
mnist_train = torchvision.datasets.MNIST(root='./data', 
                                         train=True, 
                                         download=True, 
                                         transform=transforms.Compose([transforms.ToTensor(), transforms.Normalize((.1307,), (.3081,))]))
mnist_test  = torchvision.datasets.MNIST(root='./data', 
                                         train=False, 
                                         transform=transforms.Compose([transforms.ToTensor(), transforms.Normalize((.1307,), (.3081,))]))

In [25]:
# then we load it.
train_loader = DataLoader(dataset=mnist_train, batch_size=128, shuffle=True)
test_loader  = DataLoader(dataset=mnist_test, batch_size=128, shuffle=False)

A big question mark is what sizes of filters should we use, (1,1), (2,2), (3,3) ?

The idea of Inception modules is simple: We're going to try all possible filters. And we're going to concatenate their activations all together.

`1x1` convolutions serve as the dimenstionality reducers (depth reducers to be specific) that limit the number of expensive `NxN` convolutions that follow. 

So Let's try to implement this Inception Module:

<img src="InceptionModule.png" />

In [34]:
class BasicConv2d(nn.Module):
    '''
    This is the Basic Convolution Layer with
    Batch Normalization and the ReLu activation Function.
    '''
    def __init__(self, in_channels, out_channels, **kwargs):
        super(BasicConv2d, self).__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, bias=False, **kwargs)
        self.bn = nn.BatchNorm2d(num_features=out_channels, eps=0.001)
    
    def forward(self, x):
        x = self.conv(x)
        x = self.bn(x)
        
        return F.relu(x, inplace=True)

In [35]:
class InceptionA_v2(nn.Module):
    def __init__(self, in_channels, pool_features):
        super(InceptionA_v2, self).__init__()
        self.branch1x1 = BasicConv2d(in_channels, 16, kernel_size=1)
        
        self.branch5x5_1 = BasicConv2d(in_channels, 16, kernel_size=1)
        self.branch5x5_2 = BasicConv2d(16, 24, kernel_size=5, padding=2)
        
        self.branch3x3dbl_1 = BasicConv2d(in_channels, 16, kernel_size=1)
        self.branch3x3dbl_2 = BasicConv2d(16, 24, kernel_size=3, padding=1)
        self.branch3x3dbl_3 = BasicConv2d(24, 24, kernel_size=3, padding=1)
        
        self.branch_pool = BasicConv2d(in_channels, pool_features, kernel_size=1)
    
    def forward(self, x):
        
        # debugging starts here.
        #ipdb.set_trace()
        
        branch1x1 = self.branch1x1(x)
        
        branch5x5 = self.branch5x5_1(x)
        branch5x5 = self.branch5x5_2(branch5x5)
        
        branch3x3dbl = self.branch3x3dbl_1(x)
        branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl)
        branch3x3dbl = self.branch3x3dbl_3(branch3x3dbl)
        
        branch_pool = F.avg_pool2d(x, kernel_size=3, stride=1, padding=1)
        branch_pool = self.branch_pool(branch_pool)
        
        outputs = [branch1x1, branch5x5, branch3x3dbl, branch_pool]
        return torch.cat(outputs, 1)

Even though the graph seems somewhat complicated, if we implement it branch by branch it will be very simple, easy, and intuitive (using PyTorch).

You can just use the InceptionA Graph in other Graphs as you like, they will act as an I/O pipe that you can think about as an ordinary layer, Let's do it:

In [36]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=10, kernel_size=5)
        self.conv2 = nn.Conv2d(in_channels=88, out_channels=20, kernel_size=5)
        
        self.incept1 = InceptionA_v2(in_channels=10, pool_features=24)
        self.incept2 = InceptionA_v2(in_channels=20, pool_features=24)
        
        self.mp = nn.MaxPool2d(kernel_size=2)
        self.fc = nn.Linear(in_features=1408, out_features=10)
    
    def forward(self, x):
        in_size = x.size(0)
        
        # debugging starts here.
        #ipdb.set_trace()
        
        # Convolution Layer 1 + MaxPooling + ReLu + Inception 1
        x = F.relu(self.mp(self.conv1(x)))
        x = self.incept1(x)
        
        # Convolution Layer 2 + MaxPooling + ReLu + Inception 2
        x = F.relu(self.mp(self.conv2(x)))
        x = self.incept2(x)
        
        # flatten the tensor.
        x = x.view(in_size, -1)
        
        # Linear Layer for Classification.
        x = self.fc(x)
        
        # We output Probabilities over the 10 Labels.
        return F.log_softmax(x)

In [37]:
model = Net()

### Loss & Optimizer Definition

In [38]:
criterion = torch.nn.CrossEntropyLoss()

In [39]:
optimizer = torch.optim.SGD(params=model.parameters(), lr=0.01, momentum=0.5)

### Training

In [40]:
def train(epochs):
    # set the model in training mode.
    model.train()
    
    for epoch in range(epochs):
        # let's loop over the train_loader batches.
        for batch_idx, (data, target) in enumerate(train_loader):
            data, target = Variable(data), Variable(target)
            optimizer.zero_grad()
            
            # debugging starts here.
            #ipdb.set_trace()
            
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()

            if batch_idx % 10 == 0:
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(epoch, 
                                                                               batch_idx * len(data), 
                                                                               len(train_loader.dataset), 
                                                                               100. * batch_idx / len(train_loader), 
                                                                               loss.data[0]))

In [41]:
train(1)





In [42]:
def validate():
    
    # sets the model in evaluation mode.
    model.eval()
    
    test_loss = 0
    correct = 0
    
    for data, target in test_loader:
        data, target = Variable(data, volatile=True), Variable(target)
        output = model(data)
        
        # sum up batch loss.
        test_loss += criterion(output, target).data[0]
        # get the index of the max log probability.
        pred = torch.max(output.data, 1)[1]
        correct += pred.eq(target.data.view_as(pred)).cpu().sum()
    
    test_loss /= len(test_loader.dataset)
    print('\nValidation set Loss: {:.4f}, accuracy: {}/{} ({:.0f}%)\n'.format(test_loss, 
                                                                              correct, 
                                                                              len(test_loader.dataset), 
                                                                              100. * correct / len(test_loader.dataset)))

In [43]:
validate()




Validation set Loss: 0.0007, accuracy: 9791/10000 (98%)



Improved accuracy to **98%** using a simple inception model.

# Exercice 
Implement the Full Inception v3/v4
<img src="Inception_V3V4.png" />

### Can we just keep going deeper, Stacking More layers ?

<img src="NotSoGood.png" />
<img src="ComparingPerformance.png" />

* 56 layer net has higher training error and test error than 20-layer net.
* "Overly deep" plain nets have higher training error.
* A general phenomenon, observed in many datasets.

## Problems with stacking layers

* the vanishing gradients problem.
* Degradation problem: with increased network depth accuracy gets saturated and then rapidly degrades.

## A Proposed Solution
By using Residual Networks, you keep the gradient alive by injecting the input in later layers:

<img src="VanishingGradientsSolution.png" />

### How to Design a Neural Network ?

<img src="NN_design.png" />

### Exercice: Implement DenseNet

<img src="DenseNet.png" />