# <center>A comprehensive example of CNN with Pytorch</center>

>*Convolutional networks are simply neural networks that use convolution in place of general matrix multiplication in at least one of their layers.*

__Deep Learning__, I. Goodfellow & al.

Our aim in this section is to use multiple methods introduced by CNN in order to outperform simple fully connected neural networks. We will first describe each method and explain the motivation behind. Then, we will train a model using all these methods and compare it to simple neural networks. 

Note : we will mostly use __Deep Learning__, I. Goodfellow & al.

<a name="table"></a>
- **I- [The convolution operator as a network simplifier](#convolution)**
	- 1- [Principle of convolution](#principle_conv)
	- 2- [Motivation behind convolution](#motivation_conv)
- **II- [Pooling to improve statistical robustness](#pooling)**
	- 1- [What is pooling ?](#what_pool)
	- 2- [Different ways of pooling](#diff_pool)
    - 3- [Pooling is useful for object detection](#detect_pool)
- **III- [Batch normalization to reduce internal covariate shift](#batch)**
	- 1- [The problem of the internal covariate shift](#covariate)
    - 2- [The method of Batch Normalization](#method)

<a name="convolution"></a>
# I- The convolution operator as a network simplifier

# III- Creating and training a CNN model on MNIST

##### Import Pytorch and other useful librairies

In [1]:
from matplotlib import pyplot as plt
plt.gray()
import math
import numpy as np
import pandas as pd

import torch
import torchvision.datasets as datasets
import torch.nn.functional as F
from torch import nn
from torch import optim
import copy

In [2]:
loss_func = F.cross_entropy

def accuracy(Y_hat, Y):
    preds = torch.argmax(Y_hat, dim=1)
    return (preds == Y).float().mean()

##### Load and preprocess dataset

In [38]:
#import data
mnist_trainset = datasets.MNIST(root='../data', train=True, download=True, transform=None)
mnist_testset = datasets.MNIST(root='../data', train=False, download=True, transform=None)

#load trainset into tensors
train_loader = torch.utils.data.DataLoader(mnist_trainset, batch_size=1, shuffle=True)
X_train = train_loader.dataset.data
Y_train = train_loader.dataset.targets

#load testset into tensors
test_loader = torch.utils.data.DataLoader(mnist_testset, batch_size=10000, shuffle=False)
X_test = test_loader.dataset.data
Y_test = test_loader.dataset.targets

#scale data to [0:1] and convert to float32
X_train = (X_train.to(dtype=torch.float32) / X_train.max().to(dtype=torch.float32))
X_test = (X_test.to(dtype=torch.float32) / X_test.max().to(dtype=torch.float32))

#Flatten train and test data
X_train = X_train.reshape(X_train.shape[0],1,28,28)
X_test = X_test.reshape(X_test.shape[0],1,28,28)

print("Train examples : ",X_train.shape[0])
print("Test examples : ",X_test.shape[0])
print("Nb of features : ",X_train.shape[1])

Train examples :  60000
Test examples :  10000
Nb of features :  1


In [39]:
X_train = X_train[:1000]

##### Define the CNN architecture

In [40]:
class NeuralNet(nn.Module):
    
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 24, kernel_size=5, stride=1, padding=2)
        self.max1 = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        self.bn1 = nn.BatchNorm2d(24)
        self.conv2 = nn.Conv2d(24, 48, kernel_size=5, stride=1, padding=2)
        self.max2 = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        self.bn2 = nn.BatchNorm2d(48)
        self.conv3 = nn.Conv2d(48, 64, kernel_size=5, stride=1, padding=2)
        self.max3 = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        self.bn3 = nn.BatchNorm2d(64)
        self.linear4 = nn.Linear(64*3*3,256)
        self.bn4 = nn.BatchNorm1d(256)
        self.linear5 = nn.Linear(256,10)
        
        self.number_params = 18
        
        self.mu = [None] * self.number_params
        
        self.copy_snapshot()
 
    def forward(self, x):
        #print("--------FORWARD---------")
        x = torch.relu(self.conv1(x))
        #print("conv1 :" , x.shape)
        x = self.max1(x)
        x = self.bn1(x)
        #print("max1 :" , x.shape)
        x = torch.relu(self.conv2(x))
        #print("conv2 :" , x.shape)
        x = self.max2(x)
        x = self.bn2(x)
        #print("max2 :" , x.shape)
        x = torch.relu(self.conv3(x))
        #print("conv3 :" , x.shape)
        x = self.max3(x)
        x = self.bn3(x)
        #print("max3 :" , x.shape)
        x = self.linear4(torch.relu(x.reshape(x.shape[0],-1)))
        #print("linear4 :" , x.shape)
        x = self.bn4(x)
        x = self.linear5(torch.softmax(x,1))
        #print("linear5 :" , x.shape)
        return x
    
    def forward_snapshot(self, x):
        #print("--------FORWARD---------")
        x = torch.relu(self.conv1_snapshot(x))
        #print("conv1 :" , x.shape)
        x = self.max1(x)
        x = self.bn1_snapshot(x)
        #print("max1 :" , x.shape)
        x = torch.relu(self.conv2_snapshot(x))
        #print("conv2 :" , x.shape)
        x = self.max2(x)
        x = self.bn2_snapshot(x)
        #print("max2 :" , x.shape)
        x = torch.relu(self.conv3_snapshot(x))
        #print("conv3 :" , x.shape)
        x = self.max3(x)
        x = self.bn3_snapshot(x)
        #print("max3 :" , x.shape)
        x = self.linear4_snapshot(torch.relu(x.reshape(x.shape[0],-1)))
        #print("linear4 :" , x.shape)
        x = self.bn4_snapshot(x)
        x = self.linear5_snapshot(torch.softmax(x,1))
        #print("linear5 :" , x.shape)
        return x
    
    def copy_snapshot(self):
        self.conv1_snapshot = copy.deepcopy(self.conv1)
        self.bn1_snapshot = copy.deepcopy(self.bn1)
        self.conv2_snapshot = copy.deepcopy(self.conv2)
        self.bn2_snapshot = copy.deepcopy(self.bn2)
        self.conv3_snapshot = copy.deepcopy(self.conv3)
        self.bn3_snapshot = copy.deepcopy(self.bn3)
        self.linear4_snapshot = copy.deepcopy(self.linear4)
        self.bn4_snapshot = copy.deepcopy(self.bn4)
        self.linear5_snapshot = copy.deepcopy(self.linear5)

        i=0
        for param in self.parameters():
            if (i < self.number_params) :
                self.mu[i] = torch.zeros(param.shape)
                i+=1

    def update_SGD(self, lr=0.1):
        params = list(self.parameters())
        for i in range(self.number_params // 2,self.number_params):
            params[i].data = params[i].data - lr * params[i].grad.data
        
    def update_SVRG(self,lr):
        params = list(self.parameters())
        k = len(params) // 2
        for i in range(k):
            params[i].data = params[i].data - lr * (params[i].grad.data - params[i+k].grad.data + self.mu[i].data)     
    
    def update_mu(self,batch_size):
        params = list(self.parameters())
        for i in range(len(self.mu)):
            self.mu[i].data = self.mu[i].data + params[i+self.number_params].grad.data / batch_size
        
        
        
                     
                            
    def fit(self,optimizer,epochs,batch_size,lr,decay):
        n = X_train.shape[0]
        model.train()
        
        #Warm start
        for _ in range(3):
            for i in range((n - 1) // batch_size + 1):
                optimizer.zero_grad()
                X = X_train[ i * batch_size : (i+1) * batch_size ]
                Y = Y_train[ i * batch_size : (i+1) * batch_size ]
                pred = self.forward( X )
                loss = loss_func( pred , Y )
                loss.backward()
                self.update_SGD()
                
            print("0\t",loss.item())

        self.copy_snapshot()
    
        for epoch in range(epochs):
            model.train()
            #update mu
            for i in range((n - 1) // batch_size + 1):
                optimizer.zero_grad()
                X = X_train[ i * batch_size : (i+1) * batch_size ]
                Y = Y_train[ i * batch_size : (i+1) * batch_size ]
                pred = self.forward_snapshot( X )
                loss_snapshot = loss_func( pred , Y )
                loss_snapshot.backward()
                self.update_mu(batch_size)
            
            
            for m in range(5):
                for i in range((n - 1) // batch_size + 1):
                    optimizer.zero_grad()
                    
                    #Snapshot gradient computation
                    X = X_train[ i * batch_size : (i+1) * batch_size ]
                    Y = Y_train[ i * batch_size : (i+1) * batch_size ]
                    pred = self.forward_snapshot( X )
                    loss_snapshot = loss_func( pred , Y )
                    loss_snapshot.backward()
                    
                    #'real' gradient computation
                    X = X_train[ i * batch_size : (i+1) * batch_size ]
                    Y = Y_train[ i * batch_size : (i+1) * batch_size ]
                    pred = self.forward( X )
                    loss = loss_func( pred , Y )
                    loss.backward()
                    self.update_SVRG(lr)
                    
           
                print(epoch * 5 + m+1,"\t",loss.item())
            
            self.copy_snapshot()
            with torch.no_grad():
                model.eval()
                print("Test set \t", round(accuracy( model.forward(X_test) , Y_test).item(),3))

In [41]:
model = NeuralNet()

In [42]:
opt = optim.SGD(model.parameters(), lr=1)
epochs = 2
batch_size = 100
learning_rate = 0.001

model.fit(opt,epochs,batch_size,learning_rate,decay)

0	 2.3003013134002686
0	 2.298699378967285
0	 2.2973012924194336
1 	 2.296299457550049
2 	 2.296624183654785
3 	 2.2971198558807373
4 	 2.2977921962738037
5 	 2.2986438274383545
Test set 	 0.103
6 	 2.298912286758423
7 	 2.2992868423461914
8 	 2.2998552322387695
9 	 2.3006157875061035
10 	 2.3015732765197754
Test set 	 0.102


In [110]:
for tens in model.parameters():
    print(tens.shape)

torch.Size([24, 1, 5, 5])
torch.Size([24])
torch.Size([24])
torch.Size([24])
torch.Size([48, 24, 5, 5])
torch.Size([48])
torch.Size([48])
torch.Size([48])
torch.Size([64, 48, 5, 5])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([256, 576])
torch.Size([256])
torch.Size([256])
torch.Size([256])
torch.Size([10, 256])
torch.Size([10])
torch.Size([24, 1, 5, 5])
torch.Size([24])
torch.Size([48, 24, 5, 5])
torch.Size([48])
torch.Size([64, 48, 5, 5])
torch.Size([64])
torch.Size([256, 576])
torch.Size([256])
torch.Size([10, 256])
torch.Size([10])
torch.Size([24])
torch.Size([24])
torch.Size([48])
torch.Size([48])
torch.Size([64])
torch.Size([64])
torch.Size([256])
torch.Size([256])


###### Load, Preprocess and predict test set from Kaggle

In [104]:
#Load data from CSV
test = pd.read_csv('../data/MNIST/test.csv')
test_tensor = torch.tensor(test.values)

#Preprocess
test_tensor = (test_tensor.to(dtype=torch.float32) / test_tensor.max().to(dtype=torch.float32))
test_tensor = test_tensor.reshape(test_tensor.shape[0],1,28,28)

#Predict
test_tensor = model.forward(test_tensor)
test_tensor = test_tensor.argmax(1)

##### Save predictions to a csv file

In [19]:
#Convert to a numpy array
arr = test_tensor.numpy()

# write CSV
np.savetxt('../data/MNIST/predictions.csv', arr)

NameError: name 'test_tensor' is not defined

In [62]:
for i in range(len(model.mu)):
    print(model.mu[i].shape)

torch.Size([24, 1, 5, 5])
torch.Size([24])
torch.Size([24])
torch.Size([24])
torch.Size([48, 24, 5, 5])
torch.Size([48])
torch.Size([48])
torch.Size([48])
torch.Size([64, 48, 5, 5])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([256, 576])
torch.Size([256])
torch.Size([256])
torch.Size([256])
torch.Size([10, 256])
torch.Size([10])
torch.Size([24, 1, 5, 5])
torch.Size([24])
torch.Size([24])
torch.Size([24])
torch.Size([48, 24, 5, 5])
torch.Size([48])
torch.Size([48])
torch.Size([48])
torch.Size([64, 48, 5, 5])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([256, 576])
torch.Size([256])
torch.Size([256])
torch.Size([256])
torch.Size([10, 256])
torch.Size([10])


In [58]:
for param in model.parameters():
    print(param.shape)

torch.Size([24, 1, 5, 5])
torch.Size([24])
torch.Size([24])
torch.Size([24])
torch.Size([48, 24, 5, 5])
torch.Size([48])
torch.Size([48])
torch.Size([48])
torch.Size([64, 48, 5, 5])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([256, 576])
torch.Size([256])
torch.Size([256])
torch.Size([256])
torch.Size([10, 256])
torch.Size([10])
torch.Size([24, 1, 5, 5])
torch.Size([24])
torch.Size([24])
torch.Size([24])
torch.Size([48, 24, 5, 5])
torch.Size([48])
torch.Size([48])
torch.Size([48])
torch.Size([64, 48, 5, 5])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([256, 576])
torch.Size([256])
torch.Size([256])
torch.Size([256])
torch.Size([10, 256])
torch.Size([10])
