# Session 6:
## Target: 
Run below versions for 25 epochs and report findings:<br>
with L1 + BN <br>
with L2 + BN <br>
with L1 and L2 with BN <br>
with GBN <br>
with L1 and L2 with GBN <br>

## Results:
Parameters: 8786 <br>
Base Model NLL with BN (Batch size 128) - Train Accuracy / Test Accuracy:  98.72% (max 98.79%) / 99.51% (max 99.53%) <br>
with L1 + BN (Batch size 128) - Train Accuracy / Test Accuracy:  98.78% (max 98.83%) / 99.42% (max 99.43%) <br>
with L2 + BN (Batch size 128) - Train Accuracy / Test Accuracy: 98.77% (max 98.79%) / 99.36% (max 99.38%) <br>
with L1 and L2 with BN (Batch size 128) - Train Accuracy / Test Accuracy: 98.77% (max 98.80%) / 99.42% (max 99.45%) <br>
with GBN (Batch size 512) - Train Accuracy / Test Accuracy: 98.3 (max 98.3) / 99.23 (max 99.24) <br>
with L1 and L2 with GBN (Batch size 512) - Train Accuracy / Test Accuracy: 98.34% (max 98.34%) / 99.18% (max 99.20%) <br>

## Analysis: <br>
- L1 Regularization reduces the gap between train and test accuracies, in this case train accuracy increased and test accuracy decreased. As expected from regularization. <br>

- L2 Regularization reduces the gap between train and test accuracies, in this case train accuracy increased and test accuracy decreased. As expected from regularization. <br>

- L1+L2 regularisation seems to give better results, with test accuracy also increasing while the gap between train and test reduced.

- For L1+L2 with / without GBN, it is seen that Ghost Batch normalisation seems to reduce the gap between train and test accuracy, in this case increased the train accuracy but decreased the test accuracy. <br>
However, if we look NLL + BN and NLL + GBN comparison, NLL + BN gives higher train and test accuracy. <br>


# Import Libraries

In [1]:
# importing all the Python Packages & torch Library.
from __future__ import print_function
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms

# Define Data Transformations & Dataset for Train/Test

In [2]:
train_transforms = transforms.Compose([
                        transforms.RandomRotation((-10.0, 10.0), fill=(1,)),
                        # transforms.RandomErasing(),
                        transforms.ToTensor(),
                        transforms.Normalize((0.1307,), (0.3081,))
                    ])

test_transforms = transforms.Compose([
                        transforms.ToTensor(),
                        transforms.Normalize((0.1307,), (0.3081,))
                    ])

train_data = datasets.MNIST('./data', train=True, download=True, transform=train_transforms)
test_data = datasets.MNIST('./data', train=False, download=True, transform=test_transforms)

# Dataloader Arguments & Train / Test Dataloaders

In [3]:
# seed the model to obtain consistent results
torch.manual_seed(1)
# this is the batch size , in 1 pas no of images passed together.
batch_size = 128
num_splits = 2
use_cuda = torch.cuda.is_available()

print("CUDA Available?", use_cuda)

# kwargs = {'num_workers': 2, 'pin_memory': True} if use_cuda else {}

# dataloader arguments - something you'll fetch these from cmdprmt
# dataloader_args = dict(shuffle=True, batch_size=batch_size, num_workers=4, pin_memory=True) if use_cuda else dict(shuffle=True, batch_size=int(batch_size/2))



# load the training data and perform standard normalization 
# parameter for normalization is mean and std dev.
# train dataloader
# train_loader = torch.utils.data.DataLoader(train, **dataloader_args)

# test dataloader
# test_loader = torch.utils.data.DataLoader(test, **dataloader_args)

CUDA Available? False


# Ghost Batch Normalisation

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F


class GhostBatchNorm(nn.BatchNorm2d):
    """
    From : https://github.com/davidcpage/cifar10-fast/blob/master/bag_of_tricks.ipynb

    Batch norm seems to work best with batch size of around 32. The reasons presumably have to do 
    with noise in the batch statistics and specifically a balance between a beneficial regularising effect 
    at intermediate batch sizes and an excess of noise at small batches.
    
    Our batches are of size 512 and we can't afford to reduce them without taking a serious hit on training times, 
    but we can apply batch norm separately to subsets of a training batch. This technique, known as 'ghost' batch 
    norm, is usually used in a distributed setting but is just as useful when using large batches on a single node. 
    It isn't supported directly in PyTorch but we can roll our own easily enough.
    """
    def __init__(self, num_features, num_splits, eps=1e-05, momentum=0.1, weight=True, bias=True):
        super(GhostBatchNorm, self).__init__(num_features, eps=eps, momentum=momentum)
        self.weight.data.fill_(1.0)
        self.bias.data.fill_(0.0)
        self.weight.requires_grad = weight
        self.bias.requires_grad = bias        
        self.num_splits = num_splits
        self.register_buffer('running_mean', torch.zeros(num_features*self.num_splits))
        self.register_buffer('running_var', torch.ones(num_features*self.num_splits))

    def train(self, mode=True):
        if (self.training is True) and (mode is False):
            self.running_mean = torch.mean(self.running_mean.view(self.num_splits, self.num_features), dim=0).repeat(self.num_splits)
            self.running_var = torch.mean(self.running_var.view(self.num_splits, self.num_features), dim=0).repeat(self.num_splits)
        return super(GhostBatchNorm, self).train(mode)
        
    def forward(self, input):
        N, C, H, W = input.shape
        if self.training or not self.track_running_stats:
            return F.batch_norm(
                input.view(-1, C*self.num_splits, H, W), self.running_mean, self.running_var, 
                self.weight.repeat(self.num_splits), self.bias.repeat(self.num_splits),
                True, self.momentum, self.eps).view(N, C, H, W) 
        else:
            return F.batch_norm(
                input, self.running_mean[:self.num_features], self.running_var[:self.num_features], 
                self.weight, self.bias, False, self.momentum, self.eps)



# The Model

In [5]:
class Net(nn.Module):
    def __init__(self, batchnorm):
        super(Net, self).__init__()
        
        if (batchnorm == "GBN"):
          self.conv1 = nn.Sequential(
              
              nn.Conv2d(in_channels=1, out_channels=4, kernel_size=(3, 3), padding=0, bias = True), # output 26X26X4 : RF- 3X3
              nn.ReLU(),
              GhostBatchNorm(4, num_splits, weight=False), # Ghost Batch Normalization after each convolution.
              nn.Dropout2d(0.05), # dropout of 5% at each layer
              
              nn.Conv2d(4, 8, 3), # output 24X24X8 : RF - 5x5
              nn.ReLU(),
              GhostBatchNorm(8, num_splits, weight=False),  
              nn.Dropout2d(0.05),  # dropout of 5% at each layer

              nn.Conv2d(8, 16, 3), # output 22X22X16 : RF - 7X7
              nn.ReLU(),
              GhostBatchNorm(16, num_splits, weight=False),  
              nn.Dropout2d(0.05),  # dropout of 5% at each layer

              nn.MaxPool2d(2, 2)       # output 11X11X16 : RF - 8x8 
              )
          self.conv2 = nn.Sequential(
              
            
              nn.Conv2d(16, 16, 3), # output 9X9X16 : RF - 12x12
              nn.ReLU(),
              GhostBatchNorm(16, num_splits, weight=False),
              nn.Dropout2d(0.05), # 5% dropout

              nn.Conv2d(16, 16, 3, padding=1), # output 9X9X16 : RF - 16x16
              nn.ReLU(),
              GhostBatchNorm(16, num_splits, weight=False),
              nn.Dropout2d(0.05), # 5% dropout

              )
          self.conv3 = nn.Sequential(
            
              nn.Conv2d(16, 16, 3,padding=1), # output 9X9X16 : RF - 20 X 20
              nn.ReLU(),
              GhostBatchNorm(16, num_splits, weight=False),
              nn.Dropout2d(0.05), # 5% dropout

              nn.Conv2d(16, 10, 1), # output 7X7X10 : RF- 20 X 20
              nn.AvgPool2d(7) # output 1x1x10 : RF - 32x32

              )
        else: # batchnorm == "BN"
          self.conv1 = nn.Sequential(
              
              nn.Conv2d(in_channels=1, out_channels=4, kernel_size=(3, 3), padding=0, bias = True), # output 26X26X4 : RF- 3X3
              nn.ReLU(),
              nn.BatchNorm2d(4), # Batch Normalization after each convolution.
              nn.Dropout2d(0.05), # dropout of 5% at each layer
              
              nn.Conv2d(4, 8, 3), # output 24X24X8 : RF - 5x5
              nn.ReLU(),
              nn.BatchNorm2d(8),  # Batch Normalization after each convolution.
              nn.Dropout2d(0.05),  # dropout of 5% at each layer

              nn.Conv2d(8, 16, 3), # output 22X22X16 : RF - 7X7
              nn.ReLU(),
              nn.BatchNorm2d(16),  # Batch Normalization after each convolution.
              nn.Dropout2d(0.05),  # dropout of 5% at each layer

              nn.MaxPool2d(2, 2)       # output 11X11X16 : RF - 8x8 
              )
          self.conv2 = nn.Sequential(
              
            
              nn.Conv2d(16, 16, 3), # output 9X9X16 : RF - 12x12
              nn.ReLU(),
              nn.BatchNorm2d(16),
              nn.Dropout2d(0.05), # 5% dropout

              nn.Conv2d(16, 16, 3, padding=1), # output 9X9X16 : RF - 16x16
              nn.ReLU(),
              nn.BatchNorm2d(16),
              nn.Dropout2d(0.05), # 5% dropout

              )
          self.conv3 = nn.Sequential(
            
              nn.Conv2d(16, 16, 3,padding=1), # output 9X9X16 : RF - 20 X 20
              nn.ReLU(),
              nn.BatchNorm2d(16),
              nn.Dropout2d(0.05), # 5% dropout

              nn.Conv2d(16, 10, 1), # output 7X7X10 : RF- 20 X 20
              nn.AvgPool2d(7) # output 1x1x10 : RF - 32x32

              )

    def forward(self, x):
        
        x = self.conv1(x)
        x= self.conv2(x)
        x= self.conv3(x)
        x = x.view(-1, 10)
        return F.log_softmax(x, -1)

# Print Summary of Model

In [6]:
!pip install torchsummary
from torchsummary import summary
device = torch.device("cuda" if use_cuda else "cpu")
model = Net("GBN").to(device)
summary(model, input_size=(1, 28, 28))

[33mDEPRECATION: Python 2.7 reached the end of its life on January 1st, 2020. Please upgrade your Python as Python 2.7 is no longer maintained. pip 21.0 will drop support for Python 2.7 in January 2021. More details about Python 2 support in pip can be found at https://pip.pypa.io/en/latest/development/release-process/#python-2-support[0m
Defaulting to user installation because normal site-packages is not writeable
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1            [-1, 4, 26, 26]              40
              ReLU-2            [-1, 4, 26, 26]               0
    GhostBatchNorm-3            [-1, 4, 26, 26]               8
         Dropout2d-4            [-1, 4, 26, 26]               0
            Conv2d-5            [-1, 8, 24, 24]             296
              ReLU-6            [-1, 8, 24, 24]               0
    GhostBatchNorm-7            [-1, 8, 24, 24]              16
  

# Define Train and Test functions

In [7]:
from tqdm import tqdm
from torch.autograd import Variable

train_losses = []
test_losses = []
train_acc = []
test_acc = []
l1_factor = 0.00001


  # Function to train 
'''
  Args: 
  Model : created model to be used for training
  device : GPU or cpu
  train_laoded: data on which the training has to be done
  Optimizer : the optimization algorithm to be used
  epoch : no fo epoch 

'''
def train(model, device, train_loader, optimizer, epoch, losstype):
    model.train() # Set the model on training mode
    pbar = tqdm(train_loader)
    correct = 0
    processed = 0
    cross_entropy = nn.CrossEntropyLoss().to(device)


    for batch_idx, (data, target) in enumerate(pbar):
      data, target = data.to(device), target.to(device) # moving the data to device
      optimizer.zero_grad() # zero the graidents 
      output = model(data) # getting the model output

      loss = 0
      if (losstype == "nll") or (losstype == "L2"):
        loss = F.nll_loss(output, target) # calculating the The negative log likelihood loss
      elif (losstype == "L1") or (losstype == "L1L2"):
        # print ("In Train, losstype = L1 / L1L2")
        # Calculate loss
        # loss = F.nll_loss(y_pred, target)
        loss = cross_entropy(output, target)
        # l1_crit = nn.L1Loss(reduce=False)
        reg_loss = 0
        for param in model.parameters():
            reg_loss += torch.sum(abs(param))

        loss += l1_factor * reg_loss

      train_losses.append(loss)
      loss.backward() # flowing the gradients backward.
      optimizer.step() # paameter updated basd on the current gradient.
      
      pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
      correct += pred.eq(target.view_as(pred)).sum().item()
      processed += len(data)        
      
      pbar.set_description(desc= f'loss={loss.item()} batch_id={batch_idx} Train Accuracy={100*correct/processed:0.2f}')
      train_acc.append(100*correct/processed)
    return train_losses, train_acc

  # Function to test 
'''
  Args: 
  Model : created model to be used for training
  device : GPU or cpu
  test_laoded: data on which the testing has to be done
  

'''
def test(model, device, test_loader): #, losstype):
  model.eval() # seting up the model for evalaution.
  test_loss = 0 # setting the test loss to 0
  correct = 0 # countign the no of coorect classfication.
  with torch.no_grad(): # turn off gradients, since we are in test mode
    for data, target in test_loader:
        data, target = data.to(device), target.to(device)   # copy the data to device.
        output = model(data) # predict the output

        test_loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
        pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
        correct += pred.eq(target.view_as(pred)).sum().item()

  test_loss /= len(test_loader.dataset) # calculating the test loss.
  test_losses.append(test_loss)

  print('\nTest set: Average loss: {:.4f}, Test Accuracy: {}/{} ({:.2f}%)\n'.format(
    test_loss, correct, len(test_loader.dataset),
    100. * correct / len(test_loader.dataset)))
  test_acc.append(100. * correct / len(test_loader.dataset))
  return test_losses, test_acc

In [8]:
def getmisclassifiedImage(model, device, test_loader):
  misclassified = []
  misclassified_pred = []
  misclassified_target = []
  misclassfiled_list = []
  model.eval()

  with torch.no_grad():
    for data, target in test_loader:
      data, target = data.to(device), target.to(device)
     
      output = model(data)
      pred = output.argmax(dim =1, keepdim =True)
      
      list_misclassified = (pred.eq(target.view_as(pred)) == False)
      batch_misclassified = data[list_misclassified]
      batch_mis_pred = pred[list_misclassified]
      batch_mis_target = target.view_as(pred)[list_misclassified]

      misclassified.append(batch_misclassified)
      misclassified_pred.append(batch_mis_pred)
      misclassified_target.append(batch_mis_target)
                                  
  # group all the batched together
  
  misclassified = torch.cat(misclassified)
  misclassified_pred = torch.cat(misclassified_pred)
  misclassified_target = torch.cat(misclassified_target)  
                                
 
  misclassfiled_list.append(misclassified)
  misclassfiled_list.append(misclassified_pred)
  misclassfiled_list.append(misclassified_pred)

  return list(map(lambda x, y, z: (x, y, z), misclassified, misclassified_pred, misclassified_target))


In [9]:
%matplotlib inline
import matplotlib.pyplot as plt
import random
def Plot_misclassifed(model, device, test_loader):
 plt.style.use("dark_background")
 misclassified = getmisclassifiedImage(model, device, test_loader)
 num_images = 25
 fig = plt.figure(figsize=(12, 12))
 for idx, (image, pred, target) in enumerate(random.choices(misclassified, k=num_images)):
     image, pred, target = image.cpu().numpy(), pred.cpu(), target.cpu()
     ax = fig.add_subplot(5, 5, idx+1)
     ax.axis('off')
     ax.set_title('target {}\npred {}'.format(target.item(), pred.item()), fontsize=12)
     ax.imshow(image.squeeze())
 plt.show()

# Run the model

In [None]:
from torch.optim.lr_scheduler import StepLR
import pickle, os


# with L1 + BN
# with L2 + BN
# with L1 and L2 with BN
# with GBN
# with L1 and L2 with GBN 
approach_options = [["nll", "BN"],["L1", "BN"], ["L2", "BN"], ["L1L2", "BN"], ["nll", "GBN"], ["L1L2", "GBN"]]
# approach_options = [["nll", "GBN"], ["L1", "BN"], ["L2", "BN"], ["L1L2", "BN"],  ["L1L2", "GBN"]]
approach_dicts = {}
gbn_misclassified = []
gbn_predictions = []
overwrite = True

for approach in approach_options:
  train_losses = []
  test_losses = []
  train_acc = []
  test_acc = []

  # print ("approach = ", approach)
  label = approach[0] + ' with ' + approach[1]
  if (overwrite == False) and os.path.isfile('EVAS6-' + label + '.pkl'):
    with open ('EVAS6-' + label + '.pkl','rb') as f:
      approach_dicts[label] = pickle.load(f)
    #from google.colab import files
    #files.download('EVAS6-' + label + '.pkl')

  else:
    model = Net(approach[1]).to(device) # move the model to device.
    if (approach[0] == "L2") or (approach[0] == "L1L2"):
      optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=1e-5)  # intiating the SGD optimizer
    else:
      optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9)  # intiating the SGD optimizer

    scheduler = StepLR(optimizer, step_size=6, gamma=0.1)
    approach_dict = {}

    if approach[1] == "GBN":
      batch_size = 512
      num_splits = 4   
    else:
      batch_size = 128
      num_splits = 2 
      #train_loader.batch_size = 512
      #test_loader.batch_size = 512
      # dataloader arguments - something you'll fetch these from cmdprmt
    dataloader_args = dict(shuffle=True, batch_size=batch_size, num_workers=4, pin_memory=True) if use_cuda else dict(shuffle=True, batch_size=int(batch_size/2))

    # load the training data and perform standard normalization 
    # parameter for normalization is mean and std dev.
    # train dataloader
    train_loader = torch.utils.data.DataLoader(train_data, **dataloader_args)

    # test dataloader
    test_loader = torch.utils.data.DataLoader(test_data, **dataloader_args)

    for epoch in range(1, 26):
        print("Approach: Loss, BN = ",approach[0], approach[1],"epoch =", epoch)
        train(model, device, train_loader, optimizer, epoch, approach[0])
        scheduler.step()
        test(model, device, test_loader)
    
    approach_dict["train_losses"] = train_losses
    approach_dict["train_acc"] = train_acc
    approach_dict["test_losses"] = test_losses
    approach_dict["test_acc"] = test_acc
    approach_dicts[label] = approach_dict
    val_data = {'test_acc':test_acc,'test_losses':test_losses}
    if (overwrite == True) and os.path.exists('EVAS6-' + label + '.pkl'):
        os.remove('EVAS6-' + label + '.pkl') #this deletes the file
    with open('EVAS6-' + label + '.pkl','wb') as f:
      pickle.dump(val_data,f)
  if (approach[0] == "nll") and (approach[1] == "GBN"):
    #gbn_misclassified, _, gbn_predictions = find_missclassified(model)
    Plot_misclassifed(model, device, test_loader)



  0%|          | 0/938 [00:00<?, ?it/s]

Approach: Loss, BN =  nll BN epoch = 1


loss=0.4058868885040283 batch_id=145 Train Accuracy=75.24:  16%|█▌        | 146/938 [00:15<01:21,  9.72it/s] 

In [None]:
#%matplotlib inline
#import matplotlib.pyplot as plt

#fig, axs = plt.subplots(2,2,figsize=(15,10))
#axs[0, 0].plot(train_losses)
#axs[0, 0].set_title("Training Loss")
#axs[1, 0].plot(train_acc)
#axs[1, 0].set_title("Training Accuracy")
#axs[0, 1].plot(test_losses)
#axs[0, 1].set_title("Test Loss")
#axs[1, 1].plot(test_acc)
#axs[1, 1].set_title("Test Accuracy")

# Validation Accuracy

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import pickle
plt.style.use("dark_background")
fig = plt.figure(figsize=(11, 9))


for label, approach_dict in approach_dicts.items():
  plt.plot(approach_dict['test_acc'],label = label)


plt.title('Validation Accuracy vs. Epoch')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
if os.path.exists('evas6-acc.png'):
    os.remove('evas6-acc.png') #this deletes the file
fig.savefig('evas6-acc.png',dpi=150)
plt.show()

from google.colab import files
files.download('evas6-acc.png')


# Test Loss

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use("dark_background")
fig = plt.figure(figsize=(11, 9))
from google.colab import files

for label, approach_dict in approach_dicts.items():
  plt.plot(approach_dict['test_losses'],label = label)
  files.download('EVAS6-' + label + '.pkl')


plt.title('Validation Loss vs. Epoch')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
fig.savefig('evas6-loss.png',dpi=150)
files.download('evas6-loss.png')
