In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive; the CSV file
# lists and image folders used later in this notebook are read from there.
from google.colab import drive
drive.mount('/content/drive')


Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [2]:
# Pin the version that this notebook was originally run with (see the install
# log below) so a fresh runtime resolves the same ignite release.
!pip install pytorch-ignite==0.3.0

Collecting pytorch-ignite
[?25l  Downloading https://files.pythonhosted.org/packages/35/55/41e8a995876fd2ade29bdba0c3efefa38e7d605cb353c70f3173c04928b5/pytorch_ignite-0.3.0-py2.py3-none-any.whl (103kB)
[K     |███▏                            | 10kB 19.7MB/s eta 0:00:01[K     |██████▎                         | 20kB 1.7MB/s eta 0:00:01[K     |█████████▌                      | 30kB 2.2MB/s eta 0:00:01[K     |████████████▋                   | 40kB 2.5MB/s eta 0:00:01[K     |███████████████▉                | 51kB 2.0MB/s eta 0:00:01[K     |███████████████████             | 61kB 2.3MB/s eta 0:00:01[K     |██████████████████████▏         | 71kB 2.5MB/s eta 0:00:01[K     |█████████████████████████▎      | 81kB 2.7MB/s eta 0:00:01[K     |████████████████████████████▍   | 92kB 2.9MB/s eta 0:00:01[K     |███████████████████████████████▋| 102kB 2.8MB/s eta 0:00:01[K     |████████████████████████████████| 112kB 2.8MB/s 
Installing collected packages: pytorch-ignite
Successful

## Task:
### Covid-19 XRay-Image classification
### Dataset: http://ieee-dataport.org/open-access/covid19action-radiology-cxr
### Classes: COVID-19-positive, Pneumonia, Non-Pneumonia
### Model: Built on AlexNet

Note: In the dataset usage guide, the citation for data Source-4 should be corrected to https://www.kaggle.com/paultimothymooney/chest-xray-pneumonia

In [0]:
%matplotlib inline
from tqdm import tqdm
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
from torchvision import transforms, datasets
import torch.optim as optim
import random
import pandas as pd
import os
from ignite.handlers import Timer

from PIL import Image
from torchvision.transforms import ToTensor, ToPILImage
from torchvision import models
import io
from torch.utils.data import Dataset
import zipfile
from torch.utils import data

In [0]:
class RadiologyDataset(Dataset):
    """Chest X-ray dataset driven by a CSV file list.

    The CSV must provide the columns ``Image Name``, ``Data Source``,
    ``Non-Pneumonia``, ``Other Pneumonia`` and ``COVID-19`` (and ``Partition``
    when ``train`` or ``val`` is set).  The three label columns are joined into
    a ``"<Non-Pneumonia>_<Other Pneumonia>_<COVID-19>"`` string and mapped to an
    integer class through ``CLASS_MAP``.

    Images of ``Source-3`` and ``Source-4`` are read out of zip archives whose
    handles are opened lazily on first use and cached on the instance; all
    other sources are read as plain files below ``img_dir``.
    """

    # Label triplet -> class index (0: COVID-19, 1: other pneumonia, 2: non-pneumonia).
    CLASS_MAP = {"0_0_1": 0, "0_1_0": 1, "1_0_0": 2}
    # Rows carrying this triplet are dropped from the dataset.
    EXCLUDED_CLASS = '-1_-1_0'

    def __init__(self, txt_path='filelist.txt', img_dir='data/', transform=None, train=True, val=False, val_partition=5):
        """
        Load the file list and keep the rows belonging to the requested split.

        :param txt_path: path of the CSV file describing the images
        :param img_dir: directory prefix (with trailing separator) under which the
                        per-source folders / zip archives live
        :param transform: optional callable applied to every PIL image
        :param train: keep every row whose ``Partition`` differs from ``val_partition``
        :param val: (only consulted when ``train`` is false) keep the rows whose
                    ``Partition`` equals ``val_partition``
        :param val_partition: partition id held out for validation
        :raises ValueError: if a label triplet is neither in ``CLASS_MAP`` nor
                            equal to ``EXCLUDED_CLASS``
        """
        df = pd.read_csv(txt_path, encoding='utf-8', engine='python')
        if train:
            df = df[df.Partition != val_partition]
        elif val:
            df = df[df.Partition == val_partition]
        # With both flags false the complete file is used (test-set case).

        labels = (df['Non-Pneumonia'].astype(str) + '_'
                  + df['Other Pneumonia'].astype(str) + '_'
                  + df['COVID-19'].astype(str))
        keep = labels != self.EXCLUDED_CLASS
        df = df[keep].reset_index(drop=True)
        labels = labels[keep].reset_index(drop=True)

        # Fail at construction time instead of letting a string label reach the
        # DataLoader collate function in the middle of an epoch.
        unknown = sorted(set(labels) - set(self.CLASS_MAP))
        if unknown:
            raise ValueError('Unexpected label combination(s) in {}: {}'.format(txt_path, unknown))
        df['class'] = labels.map(self.CLASS_MAP).astype('int64')

        self.df = df
        self.img_names = df.index.values
        self.img_dir = img_dir
        self.transform = transform
        self.train = train
        self.val = val
        # Lazily opened zip archives; the *_selector flag tells whether the
        # matching handle is currently open.
        self.source3_image_selector = False
        self.source4_image_selector = False
        self.source3_zip_handle = None
        self.source4_zip_handle = None

    def get_image_from_zip(self, source, zip_name, zip_dir, name):
        """
        Read one image out of a zip archive.

        :param source: data source id; 'Source-3' and 'Source-4' reuse a cached handle
        :param zip_name: archive path relative to ``img_dir``
        :param zip_dir: directory inside the archive (with trailing '/')
        :param name: file name of the targeted image inside ``zip_dir``
        :return: a PIL image backed by an in-memory copy of the file
        """
        zip_path = self.img_dir + zip_name
        member = zip_dir + name

        if source == 'Source-3':
            if not self.source3_image_selector:
                self.source3_zip_handle = zipfile.ZipFile(zip_path, 'r')
                self.source3_image_selector = True
            payload = self.source3_zip_handle.read(member)
        elif source == 'Source-4':
            if not self.source4_image_selector:
                self.source4_zip_handle = zipfile.ZipFile(zip_path, 'r')
                self.source4_image_selector = True
            payload = self.source4_zip_handle.read(member)
        else:
            # Not cached: close the archive right away.  The bytes are already
            # in memory, so the image stays readable after the close.
            with zipfile.ZipFile(zip_path, 'r') as archive:
                payload = archive.read(member)

        return Image.open(io.BytesIO(payload))

    def get_image_from_folder(self, name):
        """
        Open an image stored as a regular file.

        :param name: path of the image relative to ``img_dir``
        :return: a PIL image
        """
        return Image.open(self.img_dir + name)

    def _close_archives(self):
        """Close whichever cached zip handles are open and clear their flags."""
        if self.source3_image_selector:
            self.source3_image_selector = False
            self.source3_zip_handle.close()
        if self.source4_image_selector:
            self.source4_image_selector = False
            self.source4_zip_handle.close()

    def __len__(self):
        """
        :return: number of samples in the data set
        """
        return len(self.img_names)

    def __getitem__(self, index):
        """
        Load one sample.

        :param index: positional index into the filtered file list
        :return: ``(X, y)`` where ``X`` is the (optionally transformed) image and
                 ``y`` the integer class label
        """
        row = self.df.iloc[index]
        source = row['Data Source']
        image_name = row['Image Name']

        if source == 'Source-3':
            zip_name = source + '/CheXpert-v1.0-small.zip'
            # The listed name encodes the in-archive path with '__' separators;
            # the first component gets the '-small' suffix used by the archive.
            parts = image_name.split('/')[1].split('__')
            parts[0] = parts[0] + '-small'
            zip_dir = '/'.join(parts[:-1]) + '/'
            X = self.get_image_from_zip(source, zip_name, zip_dir, parts[-1])

        elif source == 'Source-4':
            if self.train or self.val:
                zip_dir = 'chest-xray-pneumonia/chest_xray/train/'
            else:
                zip_dir = 'chest-xray-pneumonia/chest_xray/test/'
            zip_name = source + '/chest-xray-pneumonia.zip'
            X = self.get_image_from_zip(source, zip_name, zip_dir, image_name)

        elif source == 'Source-6':
            # The list carries no extension for this source: try .jpg, then .png.
            try:
                X = self.get_image_from_folder(source + '/' + image_name + '.jpg')
            except FileNotFoundError:
                X = self.get_image_from_folder(source + '/' + image_name + '.png')

        else:
            X = self.get_image_from_folder(source + '/' + image_name)

        # 'L' images are expanded to three channels after the transform; every
        # other non-RGB mode (RGBA, palette, ...) is converted up front so all
        # samples end up with three channels.
        is_grayscale = X.mode == 'L'
        if not is_grayscale and X.mode != 'RGB':
            X = X.convert('RGB')

        y = int(row['class'])

        if self.transform is not None:
            X = self.transform(X)
            if is_grayscale:
                # assumes the transform returned a (1, H, W) tensor
                X = X.repeat(3, 1, 1)

        # Release the cached archives once the last index has been served.  With
        # a shuffling sampler this can happen mid-epoch; the handles are then
        # simply reopened on the next access.
        if index == (self.__len__() - 1):
            self._close_archives()

        return X, y

In [None]:
# DataLoader parameters: training batches are shuffled, evaluation batches are not.
params = {'batch_size': 32,
          'shuffle': True,
          'num_workers': 8}

vparams = {'batch_size': 32,
           'shuffle': False,
           'num_workers': 8}

max_epochs = 100
n_splits = 5


# Training-time augmentation: resize, random 224x224 crop and a small random rotation.
# NOTE(review): `resample=` is the keyword of the torchvision release this
# notebook was written against; newer releases call it `interpolation=`.
train_transform = transforms.Compose([
    transforms.Resize(size=(256, 256)),
    transforms.RandomCrop(size=224),
    transforms.RandomRotation(10, resample=Image.BILINEAR),
    transforms.ToTensor(),
    transforms.Normalize((0.45,), (0.22,)),
])
# Evaluation uses a deterministic centre crop so validation/test metrics do not
# change between runs because of random cropping.
test_transform = transforms.Compose([
    transforms.Resize(size=(256, 256)),
    transforms.CenterCrop(size=224),
    transforms.ToTensor(),
    transforms.Normalize((0.45,), (0.22,)),
])

# Locations of the CSV file lists and of the image folders / archives on Drive.
txt_path = '/content/drive/My Drive/Covid-19-chest-xray-classification/'
img_dir =  '/content/drive/My Drive/Covid-19-chest-xray-classification/images/'

# Default split: partition 5 of the training list is held out for validation.
training_set = RadiologyDataset(txt_path + 'Train_Combined.csv', img_dir, transform = train_transform, train=True, val=False, val_partition=5)
trainLoader = data.DataLoader(training_set, **params)

validation_set = RadiologyDataset(txt_path + 'Train_Combined.csv', img_dir, transform = test_transform, train=False, val=True, val_partition=5)
validationLoader = data.DataLoader(validation_set, **vparams)

# With train=False and val=False the whole test list is used.
test_set = RadiologyDataset(txt_path + 'Test_Combined.csv', img_dir, transform = test_transform, train=False, val=False, val_partition=0)
testLoader = data.DataLoader(test_set, batch_size= 32, shuffle=False, num_workers=8)


use_gpu = torch.cuda.is_available()
if use_gpu:
    print('GPU is avaialble!')


In [0]:
def get_model():
    """Build the 3-class classifier on top of a pretrained AlexNet.

    The AlexNet weights are fetched through ``torch.hub`` and every loaded
    parameter is frozen.  Entries 4, 5 and 6 of ``classifier`` are then
    replaced by freshly created layers (4096 -> 1000 -> 100 -> 3, with ReLU
    and dropout in between), so the network ends in ``LogSoftmax(dim=1)``.

    :return: the modified AlexNet module
    """
    backbone = torch.hub.load('pytorch/vision:v0.6.0', 'alexnet', pretrained=True)

    # Freeze everything that came with the pretrained checkpoint.
    for weight in backbone.parameters():
        weight.requires_grad = False

    # Swap the tail of the classifier for the new head.
    head = backbone.classifier
    head[4] = nn.Linear(in_features=4096, out_features=1000, bias=True)
    head[5] = nn.ReLU(inplace=True)
    head[6] = nn.Sequential(
        nn.Dropout(p=0.3, inplace=False),
        nn.Linear(in_features=1000, out_features=100, bias=True),
        nn.ReLU(inplace=True),
        nn.Dropout(p=0.3, inplace=False),
        nn.Linear(in_features=100, out_features=3, bias=True),
        nn.LogSoftmax(dim=1),
    )
    return backbone


In [11]:
net = get_model()

# Report how many parameters the network holds in total and how many of them
# are left trainable (requires_grad) after freezing.
param_counts = [(p.numel(), p.requires_grad) for p in net.parameters()]
total_params = sum(count for count, _ in param_counts)
print(f'{total_params:,} total parameters.')
total_trainable_params = sum(count for count, trainable in param_counts if trainable)
print(f'{total_trainable_params:,} training parameters.')

Downloading: "https://github.com/pytorch/vision/archive/v0.6.0.zip" to /root/.cache/torch/hub/v0.6.0.zip
Downloading: "https://download.pytorch.org/models/alexnet-owt-4df8aa71.pth" to /root/.cache/torch/hub/checkpoints/alexnet-owt-4df8aa71.pth


HBox(children=(FloatProgress(value=0.0, max=244418560.0), HTML(value='')))


44,419,931 total parameters.
4,197,403 training parameters.


In [0]:
# Write the current weights of `net` next to the image data on Drive.
# NOTE(review): in top-to-bottom order this runs before any training, so the
# file would hold the initial (pretrained + new head) weights - confirm intent.
torch.save(net.state_dict(), img_dir+'Alexnetcxr_checkpoint.pt')

In [0]:
def test_accuracy_calculate():
    correct_pred = 0
    t.reset()

    actual_labels = []
    pred_lables = []
    with torch.set_grad_enabled(False):
        for data_ in testLoader:
            inputs,labels = data_
            if use_gpu:
                inputs, labels = inputs.cuda(),labels.cuda()
            t.resume()
            # Feedforward train data batch through model
            output = net(inputs) 
            # Predicted class is the one with maximum probability
            preds = torch.argmax(output,dim=1)
            correct_pred += torch.sum(preds==labels)
            t.pause()
            t.step()

            actual_labels = actual_labels + list(labels.cpu().detach().numpy())
            pred_lables = pred_lables + list(preds.cpu().detach().numpy())

    test_accuracy = correct_pred.item()/len(test_set)
    print('Testing accuracy = ',test_accuracy*100, 'Computational Time = ', t.value())
    return test_accuracy * 100, actual_labels, pred_lables

In [0]:
### Set the Timer used to measure training / inference compute time
t = Timer(average=True)
t.reset()
# Only synchronize when a CUDA device exists; the call raises on a CPU-only runtime.
if torch.cuda.is_available():
    torch.cuda.synchronize()

In [None]:
# Per-fold histories: one list of per-epoch values for every fold
# (kfold_test_acc holds a single test accuracy per fold).
kfold_train_acc = []
kfold_validation_acc = []
kfold_test_acc = []

kfold_train_loss = []
kfold_validation_loss = []
n_splits=5
for sp_ in range(n_splits):
    fold_ = sp_ + 1   # partitions are addressed 1..n_splits
    print("*"*100)
    print("Training for cross-fold number:", fold_)
    print("*"*100)
    ##############  Initialize the Network  ###########

    net = get_model()
    if use_gpu:
        net = net.cuda()
        net = nn.DataParallel(net)

    ##############  Define the Loss function  ##########
    # The model already ends in LogSoftmax; CrossEntropyLoss applies log-softmax
    # once more, which leaves log-probabilities unchanged.
    criterion = nn.CrossEntropyLoss()

    ##############  Define the Optimizer  #################
    optimizer = optim.SGD(net.parameters(), lr=0.1, momentum=0.5)

    ##############  Define the scheduler  #################
    # Lowers the learning rate when the validation loss stops decreasing.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min')

    ##############  Set the Loader -- For current k-fold  ###########
    training_set = RadiologyDataset(txt_path + 'Train_Combined.csv', img_dir, transform = train_transform, train=True, val=False, val_partition=fold_)
    trainLoader = data.DataLoader(training_set, **params)

    validation_set = RadiologyDataset(txt_path + 'Train_Combined.csv', img_dir, transform = test_transform, train=False, val=True, val_partition=fold_)
    validationLoader = data.DataLoader(validation_set, **vparams)


    #############   Declare the place holder for storing -- loss and accuracy for each epoch   ########
    train_loss = []
    train_acc = []

    validation_loss =[]
    validation_acc = []


    num_epochs = 50

    for epoch in range(num_epochs):
        t.reset()  ## --- restart timer

        #####################################
        #######          Train      #########
        #####################################
        net.train()   # enable dropout for the optimisation pass
        running_loss = 0.0
        running_corr = 0
        # tqdm wraps the loader itself so the progress bar knows the batch count
        for i, data_ in enumerate(tqdm(trainLoader)):
            t.resume()   # ---- timer on
            inputs,labels = data_
            if use_gpu:
                inputs, labels = inputs.cuda(),labels.cuda()
            # Initializing model gradients to zero
            optimizer.zero_grad()

            # Data feed-forward through the network
            outputs = net(inputs)
            # Predicted class is the one with maximum probability
            preds = torch.argmax(outputs,dim=1)
            # Finding the loss
            loss = criterion(outputs, labels)

            loss.backward()
            optimizer.step()
            t.pause()   # ---- timer off

            # Accumulate plain Python numbers: summing the loss tensors would
            # keep every batch's autograd history reachable for the whole epoch.
            running_loss += loss.item()
            running_corr += torch.sum(preds==labels).item()

        # Mean batch loss and sample accuracy (guarded against an empty split)
        epoch_loss = running_loss/max(len(trainLoader), 1)
        epoch_acc = running_corr/max(len(training_set), 1)

        train_loss.append(epoch_loss) #Saving the loss over epochs for plotting the graph
        train_acc.append(epoch_acc) #Saving the accuracy over epochs for plotting the graph

        #####################################
        #######     Validation      #########
        #####################################

        net.eval()   # disable dropout while measuring validation performance
        val_running_loss = 0.0
        val_running_corr = 0
        with torch.no_grad():
            for j, data_ in enumerate(tqdm(validationLoader)):
                inputs,labels = data_
                if use_gpu:
                    inputs, labels = inputs.cuda(),labels.cuda()
                t.resume()   # ---- timer on
                outputs = net(inputs)
                preds = torch.argmax(outputs,dim=1)
                loss = criterion(outputs, labels)
                t.pause()   # ---- timer off
                val_running_loss += loss.item()
                val_running_corr += torch.sum(preds==labels).item()

        val_epoch_loss = val_running_loss/max(len(validationLoader), 1)
        val_epoch_acc = val_running_corr/max(len(validation_set), 1)

        validation_loss.append(val_epoch_loss) #Saving the loss over epochs for plotting the graph
        validation_acc.append(val_epoch_acc) #Saving the accuracy over epochs for plotting the graph

        #########   Scheduler to update the learning rate   #######
        scheduler.step(val_epoch_loss)

        t.step()   #-- timer average

        print('Epoch {:.0f}/{:.0f} : Training loss: {:.4f} | Training Accuracy: {:.4f} : Validation loss: {:.4f} | Validation Accuracy: {:.4f} | Computational Time: {:.4f} sec'\
              .format(epoch+1, num_epochs, epoch_loss, epoch_acc*100, val_epoch_loss, val_epoch_acc*100, t.value()))



    ############  Calculate Test Accuracy  #######
    test_acc, test_labels, pred_labels = test_accuracy_calculate()


    kfold_train_acc.append(train_acc)
    kfold_validation_acc.append(validation_acc)
    kfold_test_acc.append(test_acc)

    kfold_train_loss.append(train_loss)
    kfold_validation_loss.append(validation_loss)



In [None]:
# Evaluate the current `net` on the test set once more; the labels and
# predictions returned here (note the name `preds_labels`) feed the metrics cell below.
test_acc, test_labels, preds_labels = test_accuracy_calculate()

In [None]:
from sklearn.metrics import accuracy_score,  multilabel_confusion_matrix

# Overall accuracy plus one one-vs-rest confusion matrix per class.  `labels`
# is passed explicitly so cm[i] always belongs to class i, even when a class
# is absent from both the targets and the predictions.
acc = accuracy_score(test_labels, preds_labels)*100
cm = multilabel_confusion_matrix(test_labels, preds_labels, labels=[0, 1, 2])


for i in range(3):
    print("#"*20)
    if i == 0:
        print("Covid Vs Non-Covid")
    elif i == 1:
        print("Pneumonia Vs Non-Pneumonia")
    else:
        print("Normal Vs Non-Normal")
    print("#"*20)
    tn, fp, fn, tp = cm[i].ravel()
    print('CONFUSION MATRIX --------')
    print(cm[i])

    print('\nTEST METRICS ------------')
    # Report 0 instead of nan when a class has no predicted / no actual samples.
    precision = tp/(tp+fp)*100 if (tp+fp) > 0 else 0.0
    recall = tp/(tp+fn)*100 if (tp+fn) > 0 else 0.0
    f1_score = 2*precision*recall/(precision+recall) if (precision+recall) > 0 else 0.0
    # NOTE: the accuracy printed here is the overall multi-class accuracy,
    # not a per-class value.
    print('Accuracy: {}%'.format(acc))
    print('Precision: {}%'.format(precision))
    print('Recall: {}%'.format(recall))
    print('F1-score: {}'.format(f1_score))



In [None]:
def plot_train_model(train_loss, validation_loss, train_acc, validation_acc):
    """Draw the learning curves of one training run.

    A 15x5 figure with two panels is created: losses on the left, accuracies
    on the right, each plotted against the epoch index.

    :param train_loss: per-epoch training loss values
    :param validation_loss: per-epoch validation loss values
    :param train_acc: per-epoch training accuracy values
    :param validation_acc: per-epoch validation accuracy values
    """
    fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=[15,5])

    # Left panel: loss curves
    loss_ax.plot(range(len(train_loss)), train_loss, 'r-', label='Training Loss')
    loss_ax.plot(range(len(validation_loss)), validation_loss, 'b-', label='Validation Loss')
    loss_ax.legend(loc='upper right')
    loss_ax.set_xlabel('Epochs')
    loss_ax.set_ylabel('Loss')

    # Right panel: accuracy curves
    acc_ax.plot(range(len(train_acc)), train_acc, 'g-', label='Training Accuracy')
    acc_ax.plot(range(len(validation_acc)), validation_acc, 'y-', label='Validation Accuracy')
    acc_ax.legend(loc='lower right')
    acc_ax.set_xlabel('Epochs')
    acc_ax.set_ylabel('Accuracy')

In [None]:
# Draw one learning-curve figure per cross-validation fold, using the
# per-fold histories collected in the kfold_* lists.
for sp_ in range(n_splits):
    ############  Plot - Learning Curve   ########
    plot_train_model(kfold_train_loss[sp_], kfold_validation_loss[sp_], kfold_train_acc[sp_], kfold_validation_acc[sp_])


In [0]:
def model_summary(model):
    """Print the direct children of ``model`` with their trainable parameter counts.

    For every direct child module the number of parameters with
    ``requires_grad=True`` is counted recursively, so container children
    (e.g. ``nn.Sequential``) are reported with the sum over their layers.
    Trainable parameters registered directly on ``model`` itself are listed in
    an extra row.  A grand total is printed at the end.

    :param model: the ``torch.nn.Module`` to summarise
    """
    print("model_summary")
    print()
    print("Layer_name"+"\t"*7+"Number of Parameters")
    print("="*100)

    total_params = 0
    for child in model.children():
        print()
        param = sum(p.numel() for p in child.parameters() if p.requires_grad)
        print(str(child)+"\t"*3+str(param))
        total_params += param

    # Parameters owned by `model` itself rather than by one of its children.
    own_params = sum(p.numel() for p in model.parameters(recurse=False) if p.requires_grad)
    if own_params:
        print()
        print("(direct parameters)"+"\t"*3+str(own_params))
        total_params += own_params

    print("="*100)
    print(f"Total Params:{total_params}")

# Print the summary for the current network; the captured output follows.
model_summary(net)

model_summary

Layer_name							Number of Parameters

AlexNet(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(11, 11), stride=(4, 4), padding=(2, 2))
    (1): ReLU(inplace=True)
    (2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(64, 192, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (4): ReLU(inplace=True)
    (5): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(192, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU(inplace=True)
    (8): Conv2d(384, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): ReLU(inplace=True)
    (10): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (avgpool): AdaptiveAvgPool2d(output_size=(6, 6))
  (classifier): Sequential(
    (0): Dropout(p=0.5, inplace=False)
    (1): Li