<a href="https://colab.research.google.com/github/AngeValli/ResNet18_on_MNIST_Ange_Valli/blob/main/RestNet_18_on_MNIST_Ange_VALLI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [None]:
import torch
from torch.utils.tensorboard import SummaryWriter
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torchvision import datasets, transforms, models
import csv

import pandas as pd
import seaborn as sns
import numpy as np

# Part one : First script using PyTorch tensors (Numpy-style, without autograd)



Compute forward and backward pass to update weights using gradient descent

In [None]:
# -*- coding: utf-8 -*-
# Two-layer fully connected network (linear -> ReLU -> linear) fitted to random
# data. The forward pass, the backward pass and the gradient-descent update are
# all written out by hand on plain tensors; autograd is not involved.

dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU

# Batch size, input width, hidden width and output width.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and random regression targets.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Random starting weights for the two layers.
w1 = torch.randn(D_in, H, device=device, dtype=dtype)
w2 = torch.randn(H, D_out, device=device, dtype=dtype)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: hidden pre-activation, ReLU, then the output layer.
    h = torch.mm(x, w1)
    h_relu = torch.clamp(h, min=0)
    y_pred = torch.mm(h_relu, w2)

    # Sum-of-squares loss, reported on every 100th step.
    loss = (y_pred - y).pow(2).sum().item()
    if (t + 1) % 100 == 0:
        print(t, loss)

    # Backward pass: chain rule applied layer by layer, from the loss down to w1.
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = torch.mm(h_relu.t(), grad_y_pred)
    grad_h_relu = torch.mm(grad_y_pred, w2.t())
    # ReLU passes no gradient where its input was negative.
    grad_h = grad_h_relu.masked_fill(h < 0, 0)
    grad_w1 = torch.mm(x.t(), grad_h)

    # Plain gradient-descent step, applied in place.
    w1.sub_(learning_rate * grad_w1)
    w2.sub_(learning_rate * grad_w2)

99 770.0751953125
199 4.263453960418701
299 0.03719189018011093
399 0.0006322634872049093
499 6.916717393323779e-05


# Part two : script using nn.Module







In [None]:
# Download (if needed) the MNIST training split through torchvision into the
# 'MNIST torchvision/' subfolder and turn it into plain tensors.
mnist = datasets.MNIST(root='MNIST torchvision/', download = True)
# Flatten every 28x28 image into a row of 784 float32 values. Casting the tensor
# directly avoids the round trip through nested Python lists, which materialises
# one Python object per pixel.
x = mnist.data.reshape(mnist.data.shape[0], 28 * 28).float()
# Class labels as int64, the dtype nn.CrossEntropyLoss expects for class indices.
y = mnist.targets.long()

In [None]:
class Net(nn.Module):
  """Two-layer fully connected classifier: linear -> ReLU -> linear -> sigmoid.

  Args:
    D_in: number of input features.
    H: number of hidden units.
    D_out: number of outputs.
  """

  def __init__(self, D_in, H, D_out):
    super(Net, self).__init__()
    self.FC1 = nn.Linear(D_in, H)
    self.relu = nn.ReLU()
    self.FC2 = nn.Linear(H, D_out)

  def forward(self, x):
    """Return sigmoid-squashed scores for a batch ``x`` whose last dimension is ``D_in``."""
    hidden = self.relu(self.FC1(x))
    # The second layer must consume the *activated* hidden units. Feeding it the
    # pre-activation output of FC1 would collapse the network into a single
    # linear map and leave the ReLU unused.
    scores = self.FC2(hidden)
    # NOTE(review): nn.CrossEntropyLoss applies log-softmax itself and expects
    # unbounded scores; squashing them into (0, 1) here limits how low the loss
    # can go. Kept so the output range seen by existing callers does not
    # change; consider returning `scores` directly.
    return torch.sigmoid(scores)

In [None]:
# Mini-batch size used by the training loop, followed by the layer widths:
# 784 = 28 * 28 flattened pixels in, 200 hidden units, 10 outputs.
N = 10000
D_in, H, D_out = 784, 200, 10
model = Net(D_in=D_in, H=H, D_out=D_out)

In [None]:
# SGD with momentum and L2 weight decay over all parameters of the model,
# trained against a cross-entropy objective.
optimizer = optim.SGD(
    params=model.parameters(),
    lr=5e-05,
    momentum=0.9,
    weight_decay=1e-3,
)
criterion = nn.CrossEntropyLoss()

In [None]:
# Train the two-layer network on MNIST and log the loss / accuracy curves to
# TensorBoard (event files are written to the runs/ subfolder).
nb_epochs = 20
writer = SummaryWriter('runs')
# Number of mini-batches processed so far. Used as the TensorBoard x-axis so that
# every logged point gets its own step instead of piling up on the epoch number.
global_step = 0

try:
  for num_epoch in range(1, nb_epochs+1) :
    # Reset the counters so the reported accuracy covers this epoch only,
    # not a running average over every epoch seen so far.
    total = 0
    correct = 0
    for i in range(0, x.shape[0], N):
      inputs = x[i:i+N]
      labels = y[i:i+N]
      # Gradients accumulate across backward() calls, so clear them first.
      optimizer.zero_grad()
      outputs = model(inputs) # forward pass
      loss = criterion(outputs, labels)
      loss.backward() # back-propagation
      optimizer.step() # weight update

      # Running accuracy over the batches of the current epoch.
      predicted = outputs.detach().argmax(dim=1)
      total += labels.size(0)
      correct += (predicted == labels).sum().item()
      accuracy = correct / total
      global_step += 1
      writer.add_scalar('Loss/train', loss.item(), global_step)
      writer.add_scalar('Accuracy/train', accuracy, global_step)

    # `loss` is the loss of the last mini-batch of the epoch.
    print('epoch {}, loss {:.6f}, accuracy {}'.format(num_epoch, loss.item(), accuracy))
finally:
  # Flush and release the event file even if training is interrupted.
  writer.close()

epoch 1, loss 2.409241, accuracy 0.07791666666666666
epoch 2, loss 2.390407, accuracy 0.07966666666666666
epoch 3, loss 2.366583, accuracy 0.08221666666666666
epoch 4, loss 2.340013, accuracy 0.085525
epoch 5, loss 2.313879, accuracy 0.08947333333333334
epoch 6, loss 2.290153, accuracy 0.09378611111111111
epoch 7, loss 2.268213, accuracy 0.09829523809523809
epoch 8, loss 2.247641, accuracy 0.10285
epoch 9, loss 2.227880, accuracy 0.10736851851851852
epoch 10, loss 2.207004, accuracy 0.11181833333333334
epoch 11, loss 2.187290, accuracy 0.11612121212121213
epoch 12, loss 2.168538, accuracy 0.12035972222222223
epoch 13, loss 2.150850, accuracy 0.1244948717948718
epoch 14, loss 2.134138, accuracy 0.12853333333333333
epoch 15, loss 2.117911, accuracy 0.13251222222222223
epoch 16, loss 2.102536, accuracy 0.13640208333333334
epoch 17, loss 2.087545, accuracy 0.14016176470588235
epoch 18, loss 2.073259, accuracy 0.14388425925925927
epoch 19, loss 2.058774, accuracy 0.14754649122807018
epoch 2

In [None]:
# Show the training summaries: load the TensorBoard notebook extension and
# point it at the `runs` directory that the SummaryWriter above writes to.
%load_ext tensorboard
%tensorboard --logdir=runs

# Part three : ResNet 18

This code is inspired by the Kaggle Digit Recognizer competition (https://www.kaggle.com/c/digit-recognizer) and implements a ResNet-18-style network trained on the MNIST dataset.

In [None]:
# Fetch the training data (train.csv, read by the next cell) from this GitHub repository:
# https://github.com/wehrley/Kaggle-Digit-Recognizer/blob/master/train.csv
# NOTE(review): degit presumably copies the repository files into the current
# working directory, with --force allowing a non-empty destination; it also
# needs Node.js (npx) to be available — confirm on the target runtime.
!npx degit --force wehrley/Kaggle-Digit-Recognizer

In [None]:
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# Read the digit data downloaded as train.csv; every column is parsed as float32.
# The frame gets its own name so that `train` below only ever refers to the
# PyTorch dataset (re-running later cells never sees a DataFrame under that name).
train_df = pd.read_csv("train.csv", dtype = np.float32)

# The "label" column holds the targets; every other column is a feature.
# Features are divided by 255, presumably to bring 8-bit pixel values into [0, 1].
targets_numpy = train_df["label"].to_numpy()
features_numpy = train_df.drop(columns="label").to_numpy() / 255

# 80% of the rows go to the training set and 20% to the test set.
# The fixed random_state keeps the split reproducible.
features_train, features_test, targets_train, targets_test = train_test_split(
    features_numpy,
    targets_numpy,
    test_size = 0.2,
    random_state = 42,
)

# Convert to tensors; the labels become int64 class indices.
featuresTrain = torch.from_numpy(features_train)
targetsTrain = torch.from_numpy(targets_train).type(torch.LongTensor)
featuresTest = torch.from_numpy(features_test)
targetsTest = torch.from_numpy(targets_test).type(torch.LongTensor)

# PyTorch train and test datasets (also read by data_load further down).
train = torch.utils.data.TensorDataset(featuresTrain, targetsTrain)
test = torch.utils.data.TensorDataset(featuresTest, targetsTest)

# Mini-batches of 100 samples.
batch_size = 100

# Both loaders keep the dataset order (no reshuffling between epochs).
train_loader = torch.utils.data.DataLoader(train, batch_size = batch_size, shuffle = False)
test_loader = torch.utils.data.DataLoader(test, batch_size = batch_size, shuffle = False)

# Training budget: about 2500 optimisation steps, expressed as a whole number of
# passes over the training set (rounded down), with a learning rate of 0.05.
n_iters = 2500
num_epochs = int(n_iters / (len(features_train) / batch_size))
learning_rate = 0.05

In [None]:
# 3x3 convolution helper shared by the residual blocks and the network stem.
def conv3x3(in_channels, out_channels, stride=1):
    """Build a bias-free 3x3 ``nn.Conv2d`` with padding 1 and the given stride."""
    conv_kwargs = dict(kernel_size=3, stride=stride, padding=1, bias=False)
    return nn.Conv2d(in_channels, out_channels, **conv_kwargs)


# Basic residual block: two 3x3 conv + batch-norm layers wrapped by a skip connection.
class ResidualBlock(nn.Module):
    """Residual block computing ``relu(bn2(conv2(relu(bn1(conv1(x))))) + shortcut(x))``.

    Args:
        in_channels: channels of the incoming feature map.
        out_channels: channels produced by both convolutions.
        stride: stride of the first convolution.
        downsample: optional module applied to the input before it is added to
            the main branch; when ``None`` the input itself is used.
    """

    def __init__(self, in_channels, out_channels, stride=1, downsample=None):
        super().__init__()
        # Only the first convolution receives the stride and the channel change.
        self.conv1 = conv3x3(in_channels, out_channels, stride)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(out_channels, out_channels)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.downsample = downsample

    def forward(self, x):
        """Apply the block to ``x`` and return the activated sum of both branches."""
        main = self.relu(self.bn1(self.conv1(x)))
        main = self.bn2(self.conv2(main))
        shortcut = x if self.downsample is None else self.downsample(x)
        main += shortcut
        return self.relu(main)


# Small ResNet built from three stages of residual blocks (16, 32 and 64 channels).
class ResNet(nn.Module):
    """Residual network for single-channel images.

    The stem is a 3x3 convolution + batch norm + ReLU producing 16 channels.
    It is followed by three stages of ``block`` modules with 16, 32 and 64
    output channels (stages two and three start with stride 2), global
    average pooling and a linear classifier.

    Args:
        block: module class instantiated as
            ``block(in_channels, out_channels, stride, downsample)``.
        layers: sequence whose first three entries give the number of blocks in
            stages one, two and three; any further entries are ignored.
        num_classes: size of the output layer.

    Raises:
        ValueError: if ``layers`` has fewer than three entries.
    """

    def __init__(self, block, layers, num_classes=10):
        super(ResNet, self).__init__()
        if len(layers) < 3:
            raise ValueError(
                "layers must provide a block count for each of the 3 stages, got {}".format(len(layers)))
        self.in_channels = 16
        self.conv = conv3x3(1, 16)
        self.bn = nn.BatchNorm2d(16)
        self.relu = nn.ReLU(inplace=True)
        # Each stage reads its own entry of `layers`.
        self.layer1 = self.make_layer(block, 16, layers[0])
        self.layer2 = self.make_layer(block, 32, layers[1], 2)
        self.layer3 = self.make_layer(block, 64, layers[2], 2)
        # Global average pooling: equivalent to AvgPool2d(8) on the 8x8 maps a
        # 32x32 input produces, but also accepts other input resolutions.
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Linear(64, num_classes)

    def make_layer(self, block, out_channels, blocks, stride=1):
        """Build one stage of ``blocks`` residual blocks ending with ``out_channels`` channels.

        Only the first block receives ``stride`` and, when the shape changes, a
        conv + batch-norm projection for its shortcut branch.
        """
        downsample = None
        if (stride != 1) or (self.in_channels != out_channels):
            # The shortcut must match the main branch in channels and resolution.
            downsample = nn.Sequential(
                conv3x3(self.in_channels, out_channels, stride=stride),
                nn.BatchNorm2d(out_channels))
        layers = [block(self.in_channels, out_channels, stride, downsample)]
        self.in_channels = out_channels
        for _ in range(1, blocks):
            layers.append(block(out_channels, out_channels))
        return nn.Sequential(*layers)

    def forward(self, x):
        """Return class scores of shape ``(batch, num_classes)`` for images ``x`` of shape ``(batch, 1, H, W)``."""
        out = self.conv(x)
        out = self.bn(out)
        # Stem non-linearity: `self.relu` is created in __init__ for this step.
        out = self.relu(out)
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.avg_pool(out)
        out = out.view(out.size(0), -1)
        out = self.fc(out)
        return out

In [None]:
# Instantiate a network from the ResNet class implemented above.
net_args = dict(
    block=ResidualBlock,
    # Passed through unchanged as the `layers` argument of ResNet.
    layers=[2, 2, 2, 2],
)
model = ResNet(block=net_args["block"], layers=net_args["layers"])

In [None]:
# Adam over all parameters of the ResNet model, trained against a cross-entropy loss.
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

In [None]:
# Train the model. Every 250 optimisation steps the test accuracy is measured and
# recorded, together with the current training loss and step count, in
# accuracy_list, loss_list and iteration_list.
loss_list = []
iteration_list = []
accuracy_list = []
count = 0
for epoch in range(num_epochs):
    for images, labels in train_loader:
        # Each row is a flattened image (assumes 28 * 28 = 784 pixel columns —
        # TODO confirm against train.csv). Restore the 2-D layout and zero-pad
        # 2 pixels per side to 32x32. An in-place resize_ to 32x32 would ask
        # for more elements than the batch holds, which misaligns the pixel
        # rows and exposes uninitialised memory.
        inputs = F.pad(images.reshape(-1, 1, 28, 28), (2, 2, 2, 2))
        # Clear gradients
        optimizer.zero_grad()
        # Forward propagation
        outputs = model(inputs)
        # Softmax + cross-entropy loss
        loss = criterion(outputs, labels)
        # Back-propagation and parameter update
        loss.backward()
        optimizer.step()
        count += 1
        if count % 250 == 0:
            # Measure accuracy on the whole test set. eval() makes batch norm
            # use its running statistics, and no_grad() skips graph building.
            correct = 0
            total = 0
            model.eval()
            with torch.no_grad():
                for test_images, test_labels in test_loader:
                    test_inputs = F.pad(test_images.reshape(-1, 1, 28, 28), (2, 2, 2, 2))
                    # Predicted class = index of the largest score
                    predicted = model(test_inputs).argmax(dim=1)
                    total += test_labels.size(0)
                    correct += (predicted == test_labels).sum().item()
            model.train()
            accuracy = 100 * correct / float(total)
            # Store plain Python numbers so the lists can be plotted directly.
            loss_list.append(loss.item())
            iteration_list.append(count)
            accuracy_list.append(accuracy)
            if count % 500 == 0:
                # Print Loss
                print('Iteration: {}  Loss: {}  Accuracy: {} %'.format(count, loss.item(), accuracy))

Iteration: 500  Loss: 2.302534818649292  Accuracy: 8.357142448425293 %
Iteration: 1000  Loss: 2.309339761734009  Accuracy: 10.821428298950195 %
Iteration: 1500  Loss: 2.320518732070923  Accuracy: 10.630952835083008 %
Iteration: 2000  Loss: 2.2974584102630615  Accuracy: 10.821428298950195 %


## Grid search for hyperparameter optimization

In [None]:
# Rebuild the train / test DataLoaders for a given batch size.
def data_load(batch_size) :
  """Return ``(train_loader, test_loader)`` with the requested batch size.

  Both loaders wrap the notebook-level ``train`` and ``test`` datasets, looked
  up when the function is called, and iterate them in order (no shuffling).
  """
  make_loader = torch.utils.data.DataLoader
  train_loader = make_loader(train, batch_size = batch_size, shuffle = False)
  test_loader = make_loader(test, batch_size = batch_size, shuffle = False)
  return train_loader, test_loader

# Helper: turn a batch of flattened digit images into padded 4-D network inputs.
def _flat_digits_to_32x32(images):
  """Reshape flattened images to ``(batch, 1, 28, 28)`` and zero-pad them to 32x32.

  Assumes every row of ``images`` holds 28 * 28 = 784 pixel values — TODO
  confirm against train.csv. The 32x32 size is what the network's two stride-2
  stages followed by ``AvgPool2d(8)`` reduce to a single 64-feature vector.
  """
  return F.pad(images.reshape(-1, 1, 28, 28), (2, 2, 2, 2))


# Train a freshly initialised network for a bounded number of steps and report
# its accuracy on the test set; used to score one grid-search configuration.
def resnet18_simplified_for_grid_search(optimizer, batch_size=64, max_iterations=250) :
  """Train a new ResNet briefly and return its test accuracy in percent.

  Args:
    optimizer: a ``torch.optim`` optimizer instance acting as a template. Only
      its class and its ``defaults`` (learning rate, weight decay, ...) are
      used: an optimizer of the same kind is rebuilt over the parameters of the
      network created here, because the one passed in is bound to some other
      model and stepping it would leave the new network untrained.
    batch_size: mini-batch size for both the training and the test loader.
    max_iterations: maximum number of training steps; training also stops at
      the end of one pass over the training loader.

  Returns:
    The accuracy over the whole test loader, as a Python float in [0, 100].

  Raises:
    ValueError: if the training or the test loader yields no sample.
  """
  net_args = {
      "block": ResidualBlock,
      "layers": [2, 2, 2, 2]
  }
  model = ResNet(**net_args)
  # Same optimizer type and hyper-parameters, but attached to the new network.
  optimizer = type(optimizer)(model.parameters(), **optimizer.defaults)
  train_loader, test_loader = data_load(batch_size)

  steps = 0
  model.train()
  for images, labels in train_loader:
    inputs = _flat_digits_to_32x32(images)
    # Clear gradients
    optimizer.zero_grad()
    # Forward propagation, then softmax + cross-entropy loss
    loss = criterion(model(inputs), labels)
    # Back-propagation and parameter update
    loss.backward()
    optimizer.step()
    steps += 1
    if steps == max_iterations:
      break
  if steps == 0:
    raise ValueError("the training loader is empty")

  # Score every test batch, including a smaller final one. eval() makes batch
  # norm use its running statistics and no_grad() skips graph building.
  correct = 0
  total = 0
  model.eval()
  with torch.no_grad():
    for images, labels in test_loader:
      predicted = model(_flat_digits_to_32x32(images)).argmax(dim=1)
      correct += (predicted == labels).sum().item()
      total += labels.size(0)
  if total == 0:
    raise ValueError("the test loader is empty")
  return 100 * correct / float(total)

In [None]:
# Hyper-parameter values explored by the grid search.

# First sweep: learning rates, weight decays and batch sizes.
learning_rate_list_1 = [
    1.0,
    0.1,
    0.01,
    0.001,
]
weight_decay_list = [
    0.1,
    0.01,
    0.001,
]
batch_size_1 = [
    5,
    200,
]

# Second sweep: another set of learning rates and batch sizes.
learning_rate_list_2 = [
    0.11,
    0.9910,
    0.9905,
]
batch_size_2 = [
    5,
    64,
    200,
]

In [None]:
# Run the grid search and append one result line per configuration to
# grid_search.csv.

# Configurations as (learning rate, weight decay, batch size), in run order:
# for each learning rate of the first sweep, every weight decay at batch size
# 64, then every batch size of the first sweep at weight decay 0.001; finally
# the second sweep of learning rates x batch sizes at weight decay 0.001.
grid_configurations = []
for lr in learning_rate_list_1:
  grid_configurations.extend((lr, weight_decay, 64) for weight_decay in weight_decay_list)
  grid_configurations.extend((lr, 0.001, batch_size_value) for batch_size_value in batch_size_1)
for lr in learning_rate_list_2:
  grid_configurations.extend((lr, 0.001, batch_size_value) for batch_size_value in batch_size_2)

# The `with` block closes the file even if one of the runs raises.
with open("grid_search.csv", "a") as f:
  for lr, weight_decay, batch_size_value in grid_configurations:
    # NOTE(review): this optimizer is built over the notebook-level `model`,
    # while resnet18_simplified_for_grid_search creates its own network
    # internally — check that the optimizer ends up driving the network that
    # is actually being trained and scored.
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    current_accuracy = resnet18_simplified_for_grid_search(optimizer, batch_size=batch_size_value)
    f.write(" Learning rate = {}, Weight Decay = {}, Batch_size = {}, Accuracy = {} ".format(lr, weight_decay, batch_size_value, current_accuracy) + '\n')
    # Persist each result right away so a long search can be inspected or resumed.
    f.flush()