<a href="https://colab.research.google.com/github/ThrupthiAnn/SummerSchool2021_HandsOn_Aug7/blob/main/CNN/3_Weight_Initialization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Trying out different weight initializer techniques

We begin by building a CNN architecture for an image classification task on the CIFAR10 dataset. In this part of the tutorial, we will learn how to use different weight initialization techniques when training a CNN.

To keep data loading simple, we will use the torchvision package, created as part of PyTorch, which provides data loaders for standard datasets such as ImageNet, CIFAR10, and MNIST.

### CIFAR10 dataset
![CIFAR10](https://github.com/ckraju/summer/blob/main/data/resnet/cifar10.png?raw=1)

In [None]:
#a Tensor library with GPU support
import torch

#Datasets, Transforms and Models specific to Computer Vision
import torchvision
import torchvision.transforms as transforms

#differentiation library that supports all differentiable Tensor operations in torch
from torch.autograd import Variable

#a neural networks library integrated with autograd functionality
import torch.nn as nn
import torch.nn.functional as F

#an optimization package with standard optimization methods such as SGD, RMSProp, LBFGS, Adam etc.
import torch.optim as optim

#Weight Initialization
import torch.nn.init as weight_init

#scientific computing library for Python
import numpy as np

#plotting and visualization library
import matplotlib.pyplot as plt
#Display on the notebook
%matplotlib inline 
plt.ion() #Turn interactive mode on.

### Dataloaders and Transforms

In [None]:
# Preprocessing applied to every image before it reaches the network:
# ToTensor converts the image to a tensor, then Normalize computes
# (x - 0.5) / 0.5 independently on each of the three channels.
normalize = transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
transform = transforms.Compose([transforms.ToTensor(), normalize])

# Built-in CIFAR10 dataset classes; both splits are downloaded on first use
# and share the same preprocessing.
trainset = torchvision.datasets.CIFAR10(
    root='../../data/lab1', train=True, download=True, transform=transform)
testset = torchvision.datasets.CIFAR10(
    root='../../data/lab1', train=False, download=True, transform=transform)

# Loaders handle batching and parallel loading. Only the training split is
# shuffled; the test split keeps its original order.
train_loader = torch.utils.data.DataLoader(
    trainset, batch_size=4, shuffle=True, num_workers=2)
test_loader = torch.utils.data.DataLoader(
    testset, batch_size=4, shuffle=False, num_workers=2)

# Human-readable names for the 10 class indices
classes = (
    'plane', 'car', 'bird', 'cat', 'deer',
    'dog', 'frog', 'horse', 'ship', 'truck',
)



Files already downloaded and verified
Files already downloaded and verified



### Defining the model

To create a network, we first subclass the base class nn.Module. You only have to define the forward function; the backward function (where gradients are computed) is defined for you automatically by autograd. You can use any Tensor operation in the forward function.

In [None]:

# Weight initialization scheme that Model.__init__ applies to the weights of
# all conv and fully connected layers. Must be one of 'uniform', 'normal',
# 'constant', 'Xavier' or 'custom' (case-sensitive). Any other value matches
# none of the branches, so the layers keep PyTorch's default initialization.

weight_initialization_scheme = 'uniform'


def weight_init_custom_conv(module):
    """Fill a conv layer's weights in place from a zero-mean normal distribution.

    The standard deviation is sqrt(2 / n), where n is
    kernel_height * kernel_width * out_channels.

    Args:
        module: a layer exposing ``kernel_size`` (indexable, two entries),
            ``out_channels`` and ``weight``.
    """
    import math

    kernel_h, kernel_w = module.kernel_size[0], module.kernel_size[1]
    fan = kernel_h * kernel_w * module.out_channels
    std = math.sqrt(2. / fan)
    # Only the weights are resampled; the bias is not touched here.
    module.weight.data.normal_(0, std)

def weight_init_custom_linear(module):
    """Fill a linear layer's weights in place from a zero-mean normal distribution.

    The standard deviation is sqrt(2 / n), where n is
    in_features * out_features.

    Args:
        module: a layer exposing ``in_features``, ``out_features`` and ``weight``.
    """
    import math

    # NOTE(review): n is the product of both dimensions, which is much larger
    # than the fan-in alone, so the resulting std is very small. Confirm this
    # is the intended "custom" scale.
    n_connections = module.in_features * module.out_features
    std = math.sqrt(2. / n_connections)
    # Only the weights are resampled; the bias is not touched here.
    module.weight.data.normal_(0, std)

    
class Model(nn.Module):
    """Small LeNet-style CNN with a selectable weight initialization scheme.

    Architecture: conv(3->6, 5x5) -> ReLU -> maxpool(2) -> conv(6->16, 5x5)
    -> ReLU -> maxpool(2) -> fc(400->120) -> ReLU -> fc(120->84) -> ReLU
    -> fc(84->10). The first fully connected layer expects 16 * 5 * 5
    features, which is what 3x32x32 inputs produce after the two conv/pool
    stages.

    Args:
        init_scheme: one of 'uniform', 'normal', 'constant', 'Xavier' or
            'custom'. When omitted (None), the module-level
            ``weight_initialization_scheme`` is used. Any other value leaves
            PyTorch's default initialization in place.
    """

    #define the learnable parameters by calling the respective modules (nn.Conv2d, nn.MaxPool2d etc.)
    def __init__(self, init_scheme=None):
        super(Model, self).__init__()

        #calling conv2d module for convolution
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=6, kernel_size=5, stride=1, padding=0, bias=True)

        #calling MaxPool2d module for max pooling with downsampling of 2
        self.pool_1 = nn.MaxPool2d(kernel_size=2, stride=2)

        self.conv2 = nn.Conv2d(6, 16, 5)
        self.pool_2 = nn.MaxPool2d(kernel_size=2, stride=2)

        #fully connected layers
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

        # Fall back to the notebook-wide setting when no scheme is passed
        if init_scheme is None:
            init_scheme = weight_initialization_scheme
        self._init_weights(init_scheme)

    def _init_weights(self, scheme):
        """Apply `scheme` once to the weights of every conv and linear layer.

        Only weights are modified; biases keep their default initialization.
        """
        # Order matters for the random schemes: layers consume the RNG in
        # this sequence.
        layers = (self.conv1, self.conv2, self.fc1, self.fc2, self.fc3)

        if scheme == 'uniform':
            # uniform_ with its default bounds (0, 1)
            for layer in layers:
                weight_init.uniform_(layer.weight)
        elif scheme == 'normal':
            # normal_ with its default mean 0 and std 1
            for layer in layers:
                weight_init.normal_(layer.weight)
        elif scheme == 'constant':
            for layer in layers:
                weight_init.constant_(layer.weight, 0.1)
        elif scheme == 'Xavier':
            for layer in layers:
                weight_init.xavier_normal_(layer.weight)
        elif scheme == 'custom':
            weight_init_custom_conv(self.conv1)
            weight_init_custom_conv(self.conv2)
            weight_init_custom_linear(self.fc1)
            weight_init_custom_linear(self.fc2)
            weight_init_custom_linear(self.fc3)
        # Any other value: keep the layers' default initialization.

    #defining the structure of the network
    def forward(self, x):
        """Map a batch of images to a (batch, 10) tensor of class scores."""
        #Applying relu activation after each conv layer
        x = self.pool_1(F.relu(self.conv1(x)))
        x = self.pool_2(F.relu(self.conv2(x)))

        #reshaping to 1d for giving input to fully connected units.
        # Keeping the batch dimension fixed means a wrongly sized input fails
        # at fc1 instead of silently being folded into a different batch size.
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# Use the GPU when one is available and fall back to the CPU otherwise, so
# that building the model does not fail on machines without CUDA.
# Batches fed to the model must live on this same device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Model().to(device)

#Printing the network architecture
print(model)



Model(
  (conv1): Conv2d(3, 6, kernel_size=(5, 5), stride=(1, 1))
  (pool_1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
  (pool_2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=400, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)


### Define the optimizer

In [None]:
# Classification loss computed from the network's raw class scores
criterion = nn.CrossEntropyLoss()

# Plain SGD with momentum over every parameter of the model
optimizer = optim.SGD(
    params=model.parameters(),
    lr=0.01,
    momentum=0.9,
)

### Train the network

In [None]:
# Train on whichever device the model's parameters already live on, so the
# loop works unchanged on both GPU and CPU.
device = next(model.parameters()).device

for epoch in range(5):  # loop over the dataset multiple times

    total_loss = 0.0  # running sum of the per-batch losses
    correct = 0       # running count of correctly classified samples
    for inputs, labels in train_loader:
        # move the batch to the model's device
        inputs, labels = inputs.to(device), labels.to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # .item() yields plain Python numbers, so the running totals do not
        # accumulate as tensors
        total_loss += loss.item()

        # Calculate no of correct classifications
        _, predicted_class = outputs.max(1)
        correct += predicted_class.eq(labels).sum().item()

    # loss is averaged over batches, accuracy over individual samples
    print("Epoch: {0} | loss: {1} | accuracy: {2}".format(epoch, total_loss/len(train_loader)
                                                          , correct/float(len(train_loader.dataset))))

Epoch: 0 | loss: 2.3082973957061768 | accuracy: 0.0995199978351593
Epoch: 1 | loss: 2.308103084564209 | accuracy: 0.10223999619483948
Epoch: 2 | loss: 2.30814266204834 | accuracy: 0.09933999925851822
Epoch: 3 | loss: 2.308627128601074 | accuracy: 0.09737999737262726
Epoch: 4 | loss: 2.3084802627563477 | accuracy: 0.09781999886035919
