In [3]:
#in this script we will train a classifier on mnist and then test it on usps
#we will use the same classifier for both datasets
#classifier is Resnet-50 based on the paper "Deep Residual Learning for Image Recognition"

In [4]:
#in this script we will build a classifier for mnist dataset
#we will use pretraiined Resnet-50 model and train it on mnist dataset

In [5]:
experiment_name = 'mnist_classifier'
version = 'v1'

#concat experiment name and version to get experiment id
experiment_id = experiment_name + '_' + version

In [6]:
#imports
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
# from torchvision import datasets, transforms
from torch.autograd import Variable
import numpy as np
import matplotlib.pyplot as plt
import os
import sys
import time
import copy
import torchvision.models as models
import torch.utils.model_zoo as model_zoo
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
from PIL import Image
import pandas as pd
import random
import math
import torch.nn.init as init
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.distributed as dist
import torch.optim
import torch.utils.data
import torch.utils.data.distributed
import torchvision.transforms as transforms
import torchvision.datasets as datasets

In [7]:
from torchvision.models import resnet50, ResNet50_Weights
from torchvision.io import read_image
from torchsummary import summary
#import tenserboard
from torch.utils.tensorboard import SummaryWriter

In [48]:
#we will use the sklearn confusion matrix

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.datasets import make_classification

In [9]:
#GPU name
#
GPU_NAME = 'cuda'

In [10]:
#device
device = torch.device(GPU_NAME if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [11]:
#cuda cache clear
torch.cuda.empty_cache()

In [12]:
#defining the hyperparameters
batch_size = 64
learning_rate = 0.001
momentum = 0.9
num_epochs = 2
IMAGE_SIZE = 224
CHANNELS_IMG = 1
NUM_CLASSES = 10



#### Step 1: Initialize model with the best available weights

In [13]:
#creating the model
weights = ResNet50_Weights.DEFAULT
#send weight sto gpu
# weights = weights.to(device)
#sending the model to GPU

model = resnet50(weights=weights).to(device)

In [14]:
#print model summary
summary(model, (3, 224, 224))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 64, 112, 112]           9,408
       BatchNorm2d-2         [-1, 64, 112, 112]             128
              ReLU-3         [-1, 64, 112, 112]               0
         MaxPool2d-4           [-1, 64, 56, 56]               0
            Conv2d-5           [-1, 64, 56, 56]           4,096
       BatchNorm2d-6           [-1, 64, 56, 56]             128
              ReLU-7           [-1, 64, 56, 56]               0
            Conv2d-8           [-1, 64, 56, 56]          36,864
       BatchNorm2d-9           [-1, 64, 56, 56]             128
             ReLU-10           [-1, 64, 56, 56]               0
           Conv2d-11          [-1, 256, 56, 56]          16,384
      BatchNorm2d-12          [-1, 256, 56, 56]             512
           Conv2d-13          [-1, 256, 56, 56]          16,384
      BatchNorm2d-14          [-1, 256,

In [15]:
#print the model
model

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

#### Step 2: Initialize the inference transforms

In [16]:
preprocess = weights.transforms()
#add on more transform to make channels 3 if there are only 1 channel
# preprocess.transforms.append(transforms.Lambda(lambda x: x.repeat(3, 1, 1) if x.shape[0] == 1 else x))

In [17]:
#define the transform for the dataset
transform_mnist_resnet = transforms.Compose(
    [
    #convert to pil image
    # transforms.ToPILImage(),
    #check if PIL Image then leave as it is, else convert to PIL Image
    # transforms.Lambda(lambda x: x if isinstance(x, Image.Image) else transforms.functional.to_pil_image(x)),
    # #if greyscale then convert to 3 channels using , transforms.functional.to_grayscale( num_output_channels=3)
    # #only if the image is greyscale, then convert to 3 channels, else leave as it is
    # transforms.Lambda(lambda x: transforms.functional.to_grayscale(x, num_output_channels=3) if x.shape[0] == 1 else x),

    #print type of image
    # transforms.Lambda(lambda x: print(type(x))),
    # resize to 224x224
    # transforms.Resize(IMAGE_SIZE),

    #apply preprocess transform
    # preprocess,
    #apply : preprocess = weights.transforms()
    #apply preprocess to input image
    # transforms.Lambda(lambda x: preprocess(x)),


    



    

    # if torch tensor then leave as it is, else convert to tensor
    transforms.Lambda(lambda x: x if isinstance(x, torch.Tensor) else transforms.functional.to_tensor(x)),
    #

    #resize to 224x224
    transforms.Resize(IMAGE_SIZE),

    #check if channels are 1, then convert to 3 channels
    transforms.Lambda(lambda x: x.repeat(3, 1, 1) if x.shape[0] == 1 else x),

    transforms.Lambda(lambda x: preprocess(x)),

    #if channels are 3, then make them 1
    transforms.Lambda(lambda x: x[0].unsqueeze(0) if x.shape[0] == 3 else x),
    
    # normalize
    transforms.Normalize(
            [0.5 for _ in range(CHANNELS_IMG)], [0.5 for _ in range(CHANNELS_IMG)]
        ),
    ]
)

#### Dataset

In [18]:
# for training we will use MNIST dataset in pytorch library
#for testing we will use USPS dataset

#### train data - MNIST
#### test data - USPS

In [19]:
#load train data
train_data = datasets.MNIST(root='./data/', download=True, transform=transform_mnist_resnet) 
#load train data
train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=True)

In [20]:

#load test data
#USPS dataset
test_data = datasets.USPS(root='./data/', download=True, transform=transform_mnist_resnet)


#load test data
test_loader = torch.utils.data.DataLoader(test_data, batch_size=batch_size, shuffle=True)



In [21]:
#print the length of train and test data
print(len(train_data))
#print the shape of train data
print(train_data[0][0].shape)
#print label of train data
print(train_data[0][1])

60000
torch.Size([1, 224, 224])
5


In [22]:
#print length of test data
print(len(test_data))
#print shape of test data
print(test_data[0][0].shape)
#print label of test data
print(test_data[0][1])

7291
torch.Size([1, 224, 224])
6


In [23]:
#get the size of the train data and test data
train_size = len(train_data)
test_size = len(test_data)
#print
print('Train size: ', train_size)
print('Test size: ', test_size)

Train size:  60000
Test size:  7291


In [24]:
#print shape of the data
print('Train data shape: ', train_data.data.shape)

Train data shape:  torch.Size([60000, 28, 28])


In [25]:
#get the size of images in train and test data
# train_image_size = train_data.data.shape
# test_image_size = test_data[0][0].size()
#print
# print('Train image size: ', train_image_size)
# print('Test image size: ', test_image_size)
# train_size  = train_data.data.shape()
# print('Train image size: ', train_size)

In [26]:
# #iterate through the dataset and print the dimensions of first image and then use break
# for i, (images, labels) in enumerate(train_loader):
#     # print('Train image size: ', images[0].size())
#     # print(images.size())
#     # print(labels.size())
#     break

#### we will customize resnet50 for 10 class classifier [mnist]

In [27]:
#we will chnage the linear fc layer to 10 classes
#and make the first  conv layer to 1 channel
def  change_model(model, num_classes=10, channels=1):
    #change the first conv layer to 1 channel
    model.conv1 = torch.nn.Conv2d(channels, 64, kernel_size=7, stride=2, padding=3, bias=False)
    #change the last fc layer to 10 classes
    model.fc = torch.nn.Linear(in_features=2048, out_features=num_classes, bias=True)
    return model

In [28]:
#now change the model
model = change_model(model, num_classes=NUM_CLASSES, channels=CHANNELS_IMG)

In [29]:
#show the model
model

ResNet(
  (conv1): Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

In [30]:
#send model to gpu
model = model.to(device)

In [None]:
#we will write model to tensorboard
#we will use tensorboard to visualize the model
#create a writer
writer = SummaryWriter('runs/models/mnist_resnet50_classifier')
#write the model to tensorboard
writer.add_graph(model, torch.rand(1, 1, 224, 224).to(device))
#close the writer
writer.close()


#### loss function

In [31]:
#we define the loss functoin
#for 10 classes we will use cross entropy loss
criterion = nn.CrossEntropyLoss()

#### optimizer

In [32]:
#we will define the optimizer
#we will use SGD optimizer
optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=momentum)

#### Training routine

In [33]:
 #function to return gradient  norm
#write a function to calculate the gradient penalty
def gradient_norm(dnn, current_batch):

    BATCH_SIZE, C, H, W = current_batch.shape
    #print batch size, c,h,w
    # print("batch size, c, h, w", BATCH_SIZE, C, H, W)
    if BATCH_SIZE%2==1:
        #remove the last element
        current_batch = current_batch[:-1]
    #if batch size is 0 , then just return
    if BATCH_SIZE==0:
        return 0
    
    half_batch = int(BATCH_SIZE / 2)
    # current_batch = current_batch.to(device)
    # current_batch = Variable(current_batch, requires_grad=True)
    #we select the first half of the batch
    first_half = current_batch[:half_batch]
    #we select the second half of the batch
    second_half = current_batch[half_batch:]
    #we create a random number between 0 and 1
    # alpha = torch.rand(half_batch, 1)
    #we expand the alpha to the size of the first half of the batch
    # alpha = alpha.expand(first_half.size())
    #we create alpha as a random number between 0 and 1 which will allow us to interpolate between the first half and the second half
    
    alpha = torch.rand(half_batch, 1, 1, 1).repeat(1, C, H, W)
    #we expand the alpha to the size of the first half of the batch
    # alpha = alpha.expand(first_half.size())


    #we move alpha to the device
    alpha = alpha.to(device)
    #we interpolate between the first half and the second half
    interpolates = alpha * first_half + ((1 - alpha) * second_half)
    #we move interpolates to the device
    interpolates = interpolates.to(device)
    # interpolates = interpolates
    #we create a variable of interpolates
    interpolates = Variable(interpolates, requires_grad=True)
    #we pass interpolates through the cnn
    disc_interpolates = dnn(interpolates)
    #we calculate the gradients
    gradients = torch.autograd.grad(outputs=disc_interpolates, inputs=interpolates,
                                    grad_outputs=torch.ones(disc_interpolates.size()).to(device),
                                    create_graph=True, retain_graph=True, only_inputs=True)[0]
    #we calculate the gradient penalty
    # calculate gradient norm 
    gradients_norm = gradients.norm(2, dim=1)
    #mean of the gradient norm without subtracting 1 or lambda
    gradient_norm_mean = (gradients_norm **2).mean()
    #max of sqrt of the gradient norm without subtracting 1 or lambda
    # gradient_norm_max = (gradients_norm **2).max( dim=0, keepdim=True)[0]

    #delete the variables from the memory
    del first_half
    del second_half
    del alpha
    del interpolates
    del disc_interpolates
    del gradients
    del gradients_norm
    #cache the garbage
    torch.cuda.empty_cache()


    
    # gradient_penalty = ((gradients.norm(2, dim=1) - 1) ** 2).mean()    #have to check this formula    / * LAMBDA
    #gradient penalty  should be max(0, gradient_penalty-1)
    #we return the gradient penalty
    return gradient_norm_mean
    # , gradient_norm_max

    


In [34]:
#we will define the scheduler
#we will use stepLR scheduler
# scheduler = StepLR(optimizer, step_size=1, gamma=gamma)



In [35]:
#we will use tensorboard to visualize the training
#we will plot the loss and accuracy
#we will also track the gradient penalty of the network

#create writer for tensorboard
writer = SummaryWriter(f'runs/'+experiment_id)

2022-11-09 12:06:29.349592: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [34]:
# now we will define the train function, we will also track the gradient norm of the network
def train(model,  train_loader, optimizer, epoch=num_epochs, device = device):

    #make model to train mode
    model.train()
    #loop for each epoch
    epoch_tracker = 0
    total_loss = 0
    total = 0
    correct = 0
    batch_tracker = 0
    #we will add th loss for each batch in the epoch and then divide by the number of batches
    for ep in range(epoch):
        epoch_total = 0
        epoch_correct = 0
        epoch_total_loss = 0
        #loop for each batch
        
        for batch_idx, (data, target) in enumerate(train_loader):

            #if first epoch and first batch then print the shape of data and label
            if ep == 0 and batch_idx == 0:
                print('Train data shape: ', data.shape)
                print('Train label shape: ', target.shape)
            #send data to gpu
            data, target = data.to(device), target.to(device)
            #set the gradients to zero
            optimizer.zero_grad()
            #get the output from the model
            output = model(data)
            #calculate the loss
            loss = criterion(output, target)
            #calculate the gradients
            loss.backward()
            #update the weights

            #we will calculate the gradient norm
            gradient_n = gradient_norm(model, data)
            #we will add the gradient norm to the tensorboard
            writer.add_scalar('Gradient Norm', gradient_n, batch_tracker)
            optimizer.step()
            #write the loss to tensorboard
            writer.add_scalar('Training loss', loss, global_step=batch_tracker)

            #calculate the total loss
            total_loss += loss.item()
            #total epoch loss sum
            epoch_total_loss += loss.item()


            #calculate the accuracy
            #get the max value from the output
            _, predicted = torch.max(output.data, 1)
            #calculate the total number of labels
            temp_total = target.size(0)
            #calculate the correct predictions
            temp_correct = (predicted == target).sum().item()
            #add the total and correct predictions
            total += temp_total
            epoch_total += temp_total
            correct += temp_correct
            epoch_correct += temp_correct
            #calculate the accuracy
            epoch_accuracy = 100 * epoch_correct / epoch_total
            #write the accuracy to tensorboard
            writer.add_scalar('Training accuracy', epoch_accuracy, global_step=batch_tracker)
            #print the loss and accuracy
            #and
            #print the gradient norm
            print('Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}\tAccuracy: {:.2f}%\tGradient Norm: {:.6f}'.format(
                ep, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item(),
                epoch_accuracy, gradient_n))
            

            
            #print the loss
            # if batch_idx % log_interval == 0:
            #     print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
            #         ep, batch_idx * len(data), len(train_loader.dataset),
            #         100. * batch_idx / len(train_loader), loss.item()))
                
        
            #write the epoch loss to tensorboard
            #first average the loss over the batches in the epoch
            batch_tracker += 1
        epoch_loss = total_loss / len(train_loader)

        #write the loss to tensorboard
        writer.add_scalar('Training - Epoch loss', epoch_loss, global_step=ep)
        #calculate the accuracy
        epoch_accuracy = 100 * correct / total
        #write the accuracy to tensorboard
        writer.add_scalar('Training - Epoch accuracy', epoch_accuracy, global_step=ep)

        # #save the model after every epoch, the name be experiment_id_epoch
        # #wew will save in the folder saved_models
        # torch.save(model.state_dict(), 'saved_models/'+experiment_id+'_'+str(ep)+'.pth')
        # #we will also save the optimizer
        # torch.save(optimizer.state_dict(), 'saved_models/'+experiment_id+'_'+str(ep)+'_optimizer.pth')
        #we will save the best model till now based on loss
        #check if first epoch, then save the model anyway
        if ep == 0:
            #save the model
            torch.save(model.state_dict(), 'saved_models/'+experiment_id+'_'+str(ep)+'.pth')
            #save the optimizer
            torch.save(optimizer.state_dict(), 'saved_models/'+experiment_id+'_'+str(ep)+'_optimizer.pth')
            #save the loss
            best_loss = epoch_total_loss
            #save the epoch
            best_epoch = ep
        else:
            #check if the loss is less than the best loss
            if epoch_total_loss < best_loss:
                #save the model
                torch.save(model.state_dict(), 'saved_models/'+experiment_id+'_'+str(best_epoch)+'.pth')
                #save the optimizer
                torch.save(optimizer.state_dict(), 'saved_models/'+experiment_id+'_'+str(best_epoch)+'_optimizer.pth')
                #save the loss
                best_loss = epoch_loss
                #save the epoch
                best_epoch = ep
       


        epoch_tracker += 1
    
    #print the accuracy
    total_accuracy = 100 * correct / total
    print('Accuracy: ', total_accuracy)

    #return the model
    return model
        

   

In [35]:
#call the train function
model = train(model, train_loader, optimizer, epoch=num_epochs)

Train data shape:  torch.Size([64, 1, 224, 224])
Train label shape:  torch.Size([64])
Accuracy:  95.26166666666667


In [36]:
#now we will test the model on usps test dataset
#now we have the trained model on MNIST, we will use the same model to test on USPS
#the usps test data is in dataloader : test_loader

#we will define a function to test the model
def test(model, test_loader):
    #set the model to evaluation mode
    model.eval()
    #set the total and correct to zero
    total = 0
    correct = 0
    #we will not calculate the gradients
    with torch.no_grad():
        #loop through the test data
        for data, target in test_loader:
            #send the data to gpu
            data, target = data.to(device), target.to(device)
            #get the output from the model
            output = model(data)
            #get the max value from the output
            _, predicted = torch.max(output.data, 1)
            #calculate the total number of labels
            total += target.size(0)
            #calculate the correct predictions
            correct += (predicted == target).sum().item()
    #calculate the accuracy
    accuracy = 100 * correct / total
    #print the accuracy
    print('Accuracy: ', accuracy)
    #return the accuracy
    return accuracy


In [37]:
#call the test function
test(model, test_loader)

Accuracy:  9.971197366616376


9.971197366616376

In [38]:
#save the model
#the name be experiment_id + _final
torch.save(model.state_dict(), 'saved_models/'+experiment_id+'_final.pth')

#save the optimizer
torch.save(optimizer.state_dict(), 'saved_models/'+experiment_id+'_final_optimizer.pth')

#close the tensorboard writer
writer.close()



In [51]:
#define a function to print precision, recall, f1 score and support of each class in a classification report
#input will be a model and a dataloader
def classification_report(model, test_loader):
    #number of classes is in the variable num_classes
    #we will builld a confusion matrix

    #now we build y_true and y_pred 
    #we will use the test_loader
    #we will set the model to evaluation mode
    # model.eval()
    #we will not calculate the gradients
    #build the y_true and y_pred
    y_true = []
    y_pred = []
    with torch.no_grad():
        #loop through the test data
        for data, target in test_loader:
            #send the data to gpu
            data, target = data.to(device), target.to(device)
            #get the output from the model
            output = model(data)
            #get the max value from the output
            _, predicted = torch.max(output.data, 1)
            #get the y_pred
            y_pred_new = predicted.cpu().numpy()
            #concate the y_pred
            y_pred = np.concatenate((y_pred, y_pred_new))
            #get the y_true
            y_true = target.cpu().numpy()
            #concate the y_true
            y_true = np.concatenate((y_true, y_true))
    #print the y_true and y_pred
    #now we will build the confusion matrix
    #we will use the sklearn confusion matrix
    # cm = confusion_matrix(y_true, y_pred)
    #now we will print the classification report
    # print(classification_report(y_true, y_pred))
    #now we will plot class wise precision, recall, f1 score and support
    # we use (classification_report(y_true, y_pred, labels=[list of classes], output_dict=True))
    #create a list of classes
    classes = [i for i in range(NUM_CLASSES)]
    # cr = classification_report(y_true, y_pred, labels=classes, output_dict=True)
    #now we will plot the precision, recall, f1 score and support
    #print 
    print(classification_report(y_true, y_pred))
    #now we will plot confusion matrix from ConfusionMatrixDisplay.from_estimator
    cm = confusion_matrix(y_true, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=classes)
    disp.plot()
    plt.show()

    #plot graphically 

    




    

In [52]:
#call classification_report
classification_report(model, test_loader)

TypeError: cannot unpack non-iterable numpy.float64 object