In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install pytorch-ignite



## Task:
### Covid-19 XRay-Image classification
### Dataset: http://ieee-dataport.org/open-access/covid19action-radiology-cxr
### Class: Covid-19-positive, Pneumonia, Non-Pneumonia
### Model: Developed on LeNet

Note: Dataset usage guide - citation for the data Source-4 is corrected as https://www.kaggle.com/paultimothymooney/chest-xray-pneumonia

In [1]:
%matplotlib inline
from tqdm import tqdm
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
from torchvision import transforms, datasets
import torch.optim as optim
import random
import pandas as pd
import os
from ignite.handlers import Timer

from PIL import Image
from torchvision.transforms import ToTensor, ToPILImage
from torchvision import models
import io
from torch.utils.data import Dataset
import zipfile
from torch.utils import data

In [2]:
class RadiologyDataset(Dataset):

    def __init__(self, txt_path='filelist.txt', img_dir='data/', transform=None, train=True, val=False, val_partition=5):
        """
        Initialize data set as a list of IDs corresponding to each item of data set

        :param img_dir: path to image files as a uncompressed tar archive
        :param txt_path: a text file containing names of all of images line by line
        :param transform: apply some transforms like cropping, rotating, etc on input image
        """
        
        df = pd.read_csv(txt_path, encoding='utf-8', engine='python')
        #df['class'] = df['Non-Pneumonia'].astype(str)+'_'+df['Other Pneumonia'].astype(str)+'_'+df['COVID-19'].astype(str)
        #df['class'].replace({"0_0_1": 0, "0_1_0": 1, "1_0_0": 2, "-1_-1_0": 3}, inplace=True)
        if train == True:
            df = df[df.Partition != val_partition].reset_index().drop(['index'], axis=1)
        elif val == True:
            df = df[df.Partition == val_partition].reset_index().drop(['index'], axis=1)
        else:
            df = df
        df['class'] = df['Non-Pneumonia'].astype(str) + '_' + df['Other Pneumonia'].astype(str) + '_' + df['COVID-19'].astype(str)
        df = df[df['class'] != '-1_-1_0'].reset_index().drop(['index'], axis=1)
        df['class'].replace({"0_0_1": 0, "0_1_0": 1, "1_0_0": 2}, inplace=True)
        self.df = df
        self.img_names = df.index.values
        self.img_dir = img_dir
        self.transform = transform
        self.train = train
        self.val = val
        self.source3_image_selector = False
        self.source4_image_selector = False
        self.source3_zip_handle = None
        self.source4_zip_handle = None

    def get_image_from_zip(self, source, zip_name, zip_dir, name):
        """
        Gets a image by a name gathered from file list csv file

        :param name: name of targeted image
        :return: a PIL image
        """
        if source == 'Source-3':
            if self.source3_image_selector == False:
                self.source3_zip_handle = zipfile.ZipFile(self.img_dir + zip_name, 'r')
                self.source3_image_selector = True
            archive = self.source3_zip_handle
        elif source == 'Source-4':
            if self.source4_image_selector == False:
                self.source4_zip_handle = zipfile.ZipFile(self.img_dir + zip_name, 'r')
                self.source4_image_selector = True
            archive = self.source4_zip_handle
        else:
            archive = zipfile.ZipFile(self.img_dir + zip_name, 'r')

        image = archive.read(zip_dir + name)
        image = Image.open(io.BytesIO(image))
        #archive.close()
        
        return image

    def get_image_from_folder(self, name):
        """
        gets a image by a name gathered from file list text file

        :param name: name of targeted image
        :return: a PIL image
        """

        image = Image.open(self.img_dir + name)
        return image

    def __len__(self):
        """
        Return the length of data set using list of IDs

        :return: number of samples in data set
        """
        return len(self.img_names)

    def __getitem__(self, index):
        """
        Generate one item of data set.

        :param index: index of item in IDs list

        :return: a sample of data as a dict
        """
        source = self.df.iloc[index]['Data Source']
        image_name = self.df.iloc[index]['Image Name']

        if source == 'Source-3':
            zip_name = source + '/CheXpert-v1.0-small.zip'
            temp = image_name.split('/')[1].split('__')
            temp[0] = temp[0]+'-small'
            zip_dir = '/'.join(  temp[:-1]) + '/'
            image_name = temp[-1]
            X = self.get_image_from_zip(source, zip_name, zip_dir, image_name)
        
        elif source == 'Source-4':
            if self.train == True or self.val == True:
                zip_dir = 'chest-xray-pneumonia/chest_xray/train/'
            else:
                zip_dir = 'chest-xray-pneumonia/chest_xray/test/'
            zip_name = source + '/chest-xray-pneumonia.zip'
            X = self.get_image_from_zip(source, zip_name, zip_dir, image_name)

        elif source == 'Source-6':
            try:
                X = self.get_image_from_folder(source + '/' + image_name+'.jpg')
            except FileNotFoundError:
                X = self.get_image_from_folder(source + '/' + image_name+'.png')
          
          
        else:
            X = self.get_image_from_folder(source + '/' + image_name)
        
        flag = False
        #if X.mode == 'L':
        #  flag = True
        #if X.mode == 'RGBA':
        #  X = X.convert('RGB')
        X = X.convert('L')


        # Get you label here using available pandas functions
        y = self.df.iloc[index]['class']

        if self.transform is not None:
            X = self.transform(X)
            if flag:
                X = transforms.Compose([transforms.Lambda(lambda x: x.repeat(3, 1, 1) )])(X)

        if index == (self.__len__() - 1):
            if self.source3_image_selector:  # close tarfile opened in __init__
                self.source3_image_selector = False
                self.source3_zip_handle.close()                
            if self.source4_image_selector:  # close tarfile opened in __init__
                self.source4_image_selector = False
                self.source4_zip_handle.close()

        return X, y

In [3]:
# Parameters
params = {'batch_size': 32,
          'shuffle': True,
          'num_workers': 8}

vparams = {'batch_size': 32,
          'shuffle': False,
          'num_workers': 8}

max_epochs = 100
n_splits = 5


train_transform = transforms.Compose([transforms.Resize(size = (256, 256)),\
                                      transforms.RandomRotation(10, resample=Image.BILINEAR), \
                                      transforms.ToTensor(), \
                                      transforms.Normalize((0.45,), (0.22,)) ])
test_transform = transforms.Compose([ transforms.Resize(size = (256, 256)),\
                                      transforms.ToTensor(), \
                                      transforms.Normalize((0.45,), (0.22,)) ])

# Generators
txt_path = '/content/drive/My Drive/Covid-19-chest-xray-classification/'
img_dir =  '/content/drive/My Drive/Covid-19-chest-xray-classification/images/'

training_set = RadiologyDataset(txt_path + 'Train_Combined.csv', img_dir, transform = train_transform, train=True, val=False, val_partition=5)
trainLoader = data.DataLoader(training_set, **params)

validation_set = RadiologyDataset(txt_path + 'Train_Combined.csv', img_dir, transform = test_transform, train=False, val=True, val_partition=5)
validationLoader = data.DataLoader(validation_set, **vparams)

test_set = RadiologyDataset(txt_path + 'Test_Combined.csv', img_dir, transform = test_transform, train=False, val=False, val_partition=0)
testLoader = data.DataLoader(test_set, **vparams)


use_gpu = torch.cuda.is_available()
if use_gpu:
    print('GPU is avaialble!')


GPU is avaialble!


In [0]:
print(len(training_set), len(validation_set), len(test_set))

In [4]:
class LeNet(nn.Module):
    def __init__(self):
        super(LeNet, self).__init__()
        self.conv1 = nn.Conv2d(1, 6, kernel_size=3, stride= 1, padding=1)
        self.pool1 = nn.MaxPool2d(kernel_size=2,stride=2)
        self.conv2 = nn.Conv2d(6, 16, kernel_size=3, stride= 1, padding=1)
        self.pool2 = nn.MaxPool2d(kernel_size=2,stride=2) 
        self.conv3 = nn.Conv2d(6, 32, kernel_size=3, stride= 1, padding=1)
        self.pool3 = nn.MaxPool2d(kernel_size=2,stride=2) 
        self.conv4 = nn.Conv2d(6, 48, kernel_size=3, stride= 1, padding=1)
        self.pool4 = nn.MaxPool2d(kernel_size=2,stride=2)   
        self.conv5 = nn.Conv2d(6, 64, kernel_size=3, stride= 1, padding=1)
        self.pool5 = nn.MaxPool2d(kernel_size=4,stride=4)      
        self.fc1 = nn.Linear(1024, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 3)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = self.pool1(x)
        x = F.relu(self.conv2(x))
        x = self.pool2(x)
        x = F.relu(self.conv3(x))
        x = self.pool3(x)
        x = F.relu(self.conv4(x))
        x = self.pool4(x)
        x = F.relu(self.conv5(x))
        x = self.pool5(x)
        x = x.view(-1, 1024)
        x = F.relu(self.fc1(x)) 
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return F.log_softmax(x,dim=1)

In [5]:
def get_model():
    net = LeNet()
    return net


In [6]:
net = get_model()

# Find total parameters and trainable parameters
total_params = sum(p.numel() for p in net.parameters())
print(f'{total_params:,} total parameters.')
total_trainable_params = sum(
    p.numel() for p in net.parameters() if p.requires_grad)
print(f'{total_trainable_params:,} training parameters.')

142,279 total parameters.
142,279 training parameters.


In [7]:
class EarlyStopping:
    """Early stops the training if validation loss doesn't improve after a given patience."""
    def __init__(self, patience=7, verbose=False, delta=0, save_dir = './' , fold_ = ''):
        """
        Args:
            patience (int): How long to wait after last time validation loss improved.
                            Default: 7
            verbose (bool): If True, prints a message for each validation loss improvement. 
                            Default: False
            delta (float): Minimum change in the monitored quantity to qualify as an improvement.
                            Default: 0
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.val_acc_max = -np.Inf
        self.train_acc_max = -np.Inf
        self.delta = delta
        self.fold_ = fold_
        self.save_dir = save_dir

    def __call__(self, val_acc, train_acc, model):

        score = val_acc
        tscore = train_acc

        if self.best_score is None:
            self.best_score = score
            self.best_tscore = tscore
            self.save_checkpoint(val_acc, model)
        elif score <= self.best_score + self.delta:
            self.counter += 1
            print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
            if tscore > self.best_tscore:
                self.best_tscore = tscore
                self.save_tcheckpoint(train_acc, model)
                self.counter = 0
        else:
            self.best_score = score
            self.save_checkpoint(val_acc, model)
            self.counter = 0

    def save_checkpoint(self, val_acc, model):
        '''Saves model when validation accuracy improves.'''
        if self.verbose:
            print(f'Validation accuracy increased ({self.val_acc_max:.6f} --> {val_acc:.6f}).  Saving model ...')
        torch.save(model.state_dict(), self.save_dir+'lenet_checkpoint'+str(self.fold_)+'.pt')
        self.val_acc_max = val_acc

    def save_tcheckpoint(self, train_acc, model):
        '''Saves model when train accuracy improves.'''
        if self.verbose:
            print(f'Train accuracy increased ({self.train_acc_max:.6f} --> {train_acc:.6f}).  Saving model ...')
        torch.save(model.state_dict(), self.save_dir+'lenet_checkpoint'+str(self.fold_)+'.pt')
        self.train_acc_max = train_acc

In [8]:
def test_accuracy_calculate():
    correct_pred = 0
    t.reset()

    actual_labels = []
    pred_lables = []
    with torch.set_grad_enabled(False):
        for data_ in testLoader:
            inputs,labels = data_
            if use_gpu:
                inputs, labels = inputs.cuda(),labels.cuda()
            t.resume()
            # Feedforward train data batch through model
            output = net(inputs) 
            # Predicted class is the one with maximum probability
            preds = torch.argmax(output,dim=1)
            correct_pred += torch.sum(preds==labels)
            t.pause()
            t.step()

            actual_labels = actual_labels + list(labels.cpu().detach().numpy())
            pred_lables = pred_lables + list(preds.cpu().detach().numpy())

    test_accuracy = correct_pred.item()/len(test_set)
    print('Testing accuracy = ',test_accuracy*100, 'Computational Time = ', t.value())
    return test_accuracy * 100, actual_labels, pred_lables

In [9]:
### Set the Timer to compute the inference 
t = Timer(average=True)
t.reset()
torch.cuda.synchronize()

In [0]:
kfold_train_acc = []
kfold_validation_acc = []
kfold_test_acc = []

kfold_train_loss = []
kfold_validation_loss = []
n_splits =1
for sp_ in range(n_splits):
    fold_ = sp_ + 1
    print("*"*100)
    print("Training for cross-fold number:", fold_)
    print("*"*100)
    ##############  Initialize the Network  ###########

    #net = get_model()
    net = LeNet()
    if use_gpu:
        net = net.cuda()
        net = nn.DataParallel(net)

    ##############  Define the Loss function  ##########
    criterion = nn.CrossEntropyLoss()  

    ##############  Define the Optimizer  #################
    optimizer = optim.SGD(net.parameters(), lr=0.1, momentum=0.5)
    #optimizer = optim.Adam(net.parameters())

    ##############  Define the scheduler  (if require warm restart to set the learning rate)   #################
    #scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=0.1, steps_per_epoch=len(trainLoader), epochs=num_epochs, anneal_strategy='linear') #steps_per_epoch=len(trainLoader), epochs=num_epochs,
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min')

    ##############  Set the Loader -- For current k-fold  ###########
    training_set = RadiologyDataset(txt_path + 'Train_Combined.csv', img_dir, transform = train_transform, train=True, val=False, val_partition=fold_)
    trainLoader = data.DataLoader(training_set, **params)

    validation_set = RadiologyDataset(txt_path + 'Train_Combined.csv', img_dir, transform = test_transform, train=False, val=True, val_partition=fold_)
    validationLoader = data.DataLoader(validation_set, **vparams)


    #############   Declare the place holder for storing -- loss and accuracy for each epoch   ########
    train_loss = []
    train_acc = []

    validation_loss =[]
    validation_acc = []

    #############  Initialize the Early-Stopping criterea of training   #########
    early_stopping = EarlyStopping(patience=20, verbose=True, delta=0, save_dir = img_dir , fold_ = str(fold_))

    num_epochs = 20

    for epoch in range(num_epochs):
        t.reset()  ## --- restart timer

        #####################################
        #######          Train      #########   
        ##################################### 
        running_loss = 0.0 
        running_corr = 0
        for i,data_ in tqdm(enumerate(trainLoader)):
            #print(fold_, epoch, i)
            t.resume()   # ---- timer on
            inputs,labels = data_
            if use_gpu:
                inputs, labels = inputs.cuda(),labels.cuda() 
            # Initializing model gradients to zero
            optimizer.zero_grad() 

            
            # Data feed-forward through the network
            outputs = net(inputs)
            # Predicted class is the one with maximum probability
            preds = torch.argmax(outputs,dim=1)
            # Finding the loss
            loss = criterion(outputs, labels)

            loss.backward()
            optimizer.step() 
            #scheduler.step()
            t.pause()   # ---- timer off

            # Accumulating the loss for each batch
            running_loss += loss 
            # Accumulate number of correct predictions
            running_corr += torch.sum(preds==labels)    
            

        epoch_loss = running_loss.item()/(i+1)   #Total loss for one epoch
        epoch_acc = running_corr.item()/len(training_set)

        train_loss.append(epoch_loss) #Saving the loss over epochs for plotting the graph
        train_acc.append(epoch_acc) #Saving the accuracy over epochs for plotting the graph

        #####################################
        #######     Validation      #########   
        ##################################### 

        val_running_loss = 0.0 
        val_running_corr = 0
        ### Validation 
        with torch.no_grad():   #torch.set_grad_enabled(False):
            for j, data_ in tqdm(enumerate(validationLoader)):
                inputs,labels = data_
                if use_gpu:
                    inputs, labels = inputs.cuda(),labels.cuda() 
                t.resume()   # ---- timer on
                outputs = net(inputs)
                preds = torch.argmax(outputs,dim=1)
                loss = criterion(outputs, labels)
                t.pause()   # ---- timer off
                val_running_loss += loss 
                val_running_corr += torch.sum(preds==labels) 

        val_epoch_loss = val_running_loss.item()/(j+1)   #Total loss for one epoch
        val_epoch_acc = val_running_corr.item()/len(validation_set)

        validation_loss.append(val_epoch_loss) #Saving the loss over epochs for plotting the graph
        validation_acc.append(val_epoch_acc) #Saving the accuracy over epochs for plotting the graph    

        #########   Scheduler to update the learning rate   #######
        scheduler.step(val_epoch_loss)    
        
        #########   Check for early-stopping based on validation accuracy and training accuracy   ###########
        early_stopping(val_epoch_acc, epoch_acc, net)
        if early_stopping.early_stop:
            print("Early stopping")
            break
            
        t.step()   #-- timer average

        #if (epoch) % 5 == 0:
        print('Epoch {:.0f}/{:.0f} : Training loss: {:.4f} | Training Accuracy: {:.4f} : Validation loss: {:.4f} | Validation Accuracy: {:.4f} | Computational Time: {:.4f} sec'\
              .format(epoch+1, num_epochs, epoch_loss, epoch_acc*100, val_epoch_loss, val_epoch_acc*100, t.value()))
        
    # load the last checkpoint with the best model
    net.load_state_dict(torch.load(img_dir+'lenet_checkpoint'+str(fold_)+'.pt'))


    ############  Calculate Test Accuracy  #######
    test_acc, test_labels, pred_labels= test_accuracy_calculate()


    kfold_train_acc.append(train_acc)
    kfold_validation_acc.append(validation_acc)
    kfold_test_acc.append(test_acc)

    kfold_train_loss.append(train_loss)
    kfold_validation_loss.append(validation_loss)



In [0]:
#test_acc, test_labels, preds_labels = test_accuracy_calculate()

In [0]:
from sklearn.metrics import accuracy_score,  multilabel_confusion_matrix

acc = accuracy_score(test_labels, np.round(preds_labels))*100
cm = multilabel_confusion_matrix(test_labels, np.round(preds_labels))


for i in range(3):
    print("#"*20)
    if i == 0:
        print("Covid Vs Non-Covid")
    elif i == 1:
        print("Pneumonia Vs Non-Pneumonia")
    else:
        print("Normal Vs Non-Normal")
    print("#"*20)
    tn, fp, fn, tp = cm[i].ravel()
    print('CONFUSION MATRIX --------')
    print(cm[i])

    print('\nTEST METRICS ------------')
    precision = tp/(tp+fp)*100
    recall = tp/(tp+fn)*100
    print('Accuracy: {}%'.format(acc))
    print('Precision: {}%'.format(precision))
    print('Recall: {}%'.format(recall))
    print('F1-score: {}'.format(2*precision*recall/(precision+recall)))



In [10]:
def model_summary(model):
    print("model_summary")
    print()
    print("Layer_name"+"\t"*7+"Number of Parameters")
    print("="*100)
    model_parameters = [layer for layer in model.parameters() if layer.requires_grad]
    layer_name = [child for child in model.children()]
    j = 0
    total_params = 0
    for i in layer_name:
        print()
        param = 0
        try:
            if i.bias is not None:
                if len(i.bias) == 0:
                    param =model_parameters[j].numel()
                    j = j+1
                else:
                    param =model_parameters[j].numel()+model_parameters[j+1].numel()
                    j = j+2

        except:
            bias = False  
            param = 0

        print(str(i)+"\t"*3+str(param))
        total_params+=param
    print("="*100)
    print(f"Total Params:{total_params}")       

model_summary(net)

model_summary

Layer_name							Number of Parameters

Conv2d(1, 6, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))			60

MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)			0

Conv2d(6, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))			880

MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)			0

Conv2d(6, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))			1760

MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)			0

Conv2d(6, 48, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))			2640

MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)			0

Conv2d(6, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))			3520

MaxPool2d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)			0

Linear(in_features=1024, out_features=120, bias=True)			123000

Linear(in_features=120, out_features=84, bias=True)			10164

Linear(in_features=84, out_features=3, bias=True)			255
Tota