# COVID detection from Chest X-Ray
---
This notebook trains a binary classifier that separates normal from COVID-19 cases using chest X-ray images.
It uses the COVIDx dataset; for details, see the [COVID-Net open source initiative](https://github.com/lindawangg/COVID-Net).

In [None]:
# Connect data: mount Google Drive at /content/gdrive so that files stored
# there (the dataset archive, saved checkpoints) are reachable from Colab.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Extract the gzipped COVIDx archive from Drive into the current directory.
! tar -xzf /content/gdrive/MyDrive/covidx.tar.gz

In [None]:
# List the working directory to check what the extraction produced.
! ls

covidx	gdrive	sample_data


In [None]:
import os
import cv2
import numpy as np
from PIL import Image
import torch
import torch.nn as nn
from torchvision import transforms
from torchvision.models import resnet18
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
from tqdm import trange
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

## Dataloader

In [None]:
class ImageLoader(Dataset):
  """Dataset that loads, resizes and standardizes chest X-ray images.

  Labels are parsed from file names of the form covidx_(label)_(name).
  Label 2 (COVID) is remapped to 1 so that the task is binary.
  """

  def __init__(self, path, file_list, aug_list=None, aug_prob=None):
    """
    Args:
      path (str): path where images stored
      file_list (List[str]): list of images in current split
      aug_list (List[Callable]): torchvision transforms to sample from
      aug_prob (float): Probability of applying a random aug; must be set
        whenever aug_list is non-empty
    """
    self.path = path
    self.file_list = file_list
    self.aug_list = aug_list
    self.aug_prob = aug_prob

  def __len__(self):
    """ Number of images in this split """
    return len(self.file_list)

  @staticmethod
  def _label_from_name(file_name):
    """ Return the binary label encoded in a covidx_(label)_(name) file name """
    # Only the base name is parsed, so underscores in directory names
    # cannot shift the position of the label field.
    label = int(os.path.basename(file_name).split('_')[1])
    # COVID is 2 in file names; fold it onto 1
    return 1 if label == 2 else label

  def __getitem__(self, idx):
    """
    Preprocess and return a single sample & label
    Args:
      idx (int): index into file_list
    Returns:
      Tuple[torch.Tensor, torch.Tensor]: standardized float image of shape
      (3, 256, 256) and label tensor of shape (1,)
    """
    file_name = self.file_list[idx]
    img_name = os.path.join(self.path, file_name)
    # Context manager closes the underlying file handle once pixels are read
    with Image.open(img_name) as raw:
      # Resize to 256, 256 with LANCZOS interpolation
      img = np.array(raw.resize((256, 256), Image.LANCZOS))
    if img.ndim < 3:
      img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)
    if img.shape[2] == 4:
      img = img[:, :, :3]  # drop alpha channel
    label = torch.Tensor([self._label_from_name(file_name)])

    # convert to Tensor and change order to channels first
    img = torch.Tensor(img).permute(2, 0, 1)

    # select and apply random augmentation (if passed).
    # torch's RNG is used instead of numpy's because DataLoader workers get
    # distinct torch seeds, while numpy's global state may be duplicated.
    if self.aug_list and torch.rand(1).item() < self.aug_prob:
      aug_idx = torch.randint(len(self.aug_list), (1,)).item()
      img = self.aug_list[aug_idx](img)

    # per-image standardization; clamp std so constant images do not yield NaN
    std = torch.std(img).clamp_min(1e-8)
    img = (img - torch.mean(img)) / std
    return img, label

In [None]:
def get_data_loaders(categories, path, file_lists,
                     augment, aug_prob, batch_size, num_workers=1):
  """
  Wrapper function to return dataloader(s)
  Args:
    categories (List[str]): names of processes for which dataloader needed
    path (str): path where images stored
    file_lists (List[List[str]]): list of file lists, aligned with categories
    augment (boolean): whether to apply augmentation (train split only)
    aug_prob (float): Probability of applying random aug
    batch_size (int): batch size
    num_workers (int): worker processes per DataLoader
  Returns:
    List[torch.utils.data.DataLoader]: one loader per category, in order
  """
  loaders = []
  for i, category in enumerate(categories):
    is_train = category == 'train'
    if is_train and augment:
      # ImageLoader picks at most one of these per sample
      aug_list = [
          transforms.RandomAffine(0, translate=(0.2, 0.2)),
          transforms.RandomHorizontalFlip(p=1),
          transforms.RandomRotation(degrees=(-10, 10), fill=(0,)),
          transforms.GaussianBlur((17, 17), (11, 11))
      ]
    else:
      aug_list = None
    loader = DataLoader(
        ImageLoader(path, file_lists[i], aug_list, aug_prob),
        batch_size,
        # reshuffle training samples every epoch; keep val/test order fixed
        shuffle=is_train,
        num_workers=num_workers
        )
    loaders.append(loader)
  return loaders

## Train/val/test loop

In [None]:
def learn(model, loader, optimizer, loss_fn, process):
  """
  Main function for single epoch of train, val or test
  Args:
    model (torch.nn.Module): network producing one logit per sample
    loader (torch.utils.data.DataLoader): yields (image batch, label batch)
    optimizer (torch.optim.Optimizer): only stepped when process == 'train'
    loss_fn (Callable): loss on (logits, labels); assumed to sum over the
      batch, since the reported loss is divided by the number of samples
    process (str): 'train' enables gradient updates; any other value runs
      evaluation without gradients
  Returns:
    Tuple[float, float, float]: accuracy, F1 score and mean per-sample loss
  """
  is_train = process == 'train'
  # Run on whatever device the model already lives on
  device = next(model.parameters()).device
  model.train(is_train)

  all_labels = []
  all_preds = []
  running_loss = 0.0
  num_samples = 0
  with trange(len(loader), desc=process, ncols=100) as t:
    for img_batch, labels in loader:
      img_batch = img_batch.to(device)
      targets = labels.to(device)
      if is_train:
        optimizer.zero_grad()
        preds = model(img_batch)
        loss = loss_fn(preds, targets)
        loss.backward()
        optimizer.step()
      else:
        with torch.no_grad():
          preds = model(img_batch)
          loss = loss_fn(preds, targets)
      hard_preds = torch.round(torch.sigmoid(preds.detach()))
      # flatten to plain 1-D lists of scalars for the sklearn metrics
      all_labels += labels.reshape(-1).tolist()
      all_preds += hard_preds.reshape(-1).cpu().tolist()
      # accumulate a Python float so no tensors are kept alive across batches
      running_loss += loss.item()
      # count real samples: the last batch may be smaller than the others
      num_samples += labels.shape[0]
      t.set_postfix(loss=running_loss / num_samples)
      t.update()
  acc = accuracy_score(all_labels, all_preds)
  f1 = f1_score(all_labels, all_preds)
  final_loss = running_loss / num_samples if num_samples else 0.0
  return acc, f1, final_loss

In [None]:
def get_splits(all_names, train_size, val_size, test_size, all_labels):
  """
  Stratified split of file names into train, val and test lists
  Args:
    all_names (List[str]): file names to split
    train_size (float): train fraction (not used; train gets the remainder)
    val_size (float): validation fraction of the full set
    test_size (float): test fraction of the full set
    all_labels (List[int]): labels aligned with all_names, used to stratify
  Returns:
    Tuple of three lists: train names, val names, test names
  """
  # Fraction held out of training, and the share of that hold-out used as test
  holdout_frac = val_size + test_size
  test_share = test_size / holdout_frac

  # First cut: train vs. hold-out
  first_cut = train_test_split(
      all_names, all_labels,
      test_size=holdout_frac, stratify=all_labels, random_state=0)
  trn_names, holdout_names, _, holdout_labels = first_cut

  # Second cut: hold-out into val vs. test
  val_names, tst_names = train_test_split(
      holdout_names,
      test_size=test_share, stratify=holdout_labels, random_state=0)
  return trn_names, val_names, tst_names

In [None]:
def perform_learning(model, optimizer, loss_fn, path, all_names, batch_size,
                     splits, num_epochs):
  """
  Split the data, then run the train/val epochs and a final test pass
  Args:
    model: network handed to learn()
    optimizer: optimizer handed to learn()
    loss_fn: loss handed to learn()
    path (str): path where images stored
    all_names (List[str]): file names in format covidx_(label)_(name)
    batch_size (int): batch size for all three loaders
    splits (Sequence[float]): (train, val, test) fractions
    num_epochs (int): number of train + val epochs
  """
  # label is the second '_'-separated field of each file name
  all_labels = [int(fname.split('_')[1]) for fname in all_names]
  train_size, val_size, test_size = splits
  split_names = get_splits(all_names, train_size, val_size, test_size,
                           all_labels)
  trn_names, val_names, tst_names = split_names

  loaders = get_data_loaders(
      ['train', 'val', 'test'],
      path,
      [trn_names, val_names, tst_names],
      augment=True,
      aug_prob=0.5,
      batch_size=batch_size,
  )
  train_loader, val_loader, test_loader = loaders

  epoch_num = 0
  while epoch_num < num_epochs:
    # one optimisation pass over the training split
    train_stats = learn(model, train_loader, optimizer, loss_fn, 'train')
    trn_acc, trn_f1, trn_loss = train_stats
    print(f'Training Epoch {epoch_num} - Loss: {trn_loss} ; Accuracy: {trn_acc}'
          f' ; F1 Score: {trn_f1}')
    # followed by an evaluation pass over the validation split
    val_stats = learn(model, val_loader, optimizer, loss_fn, 'val')
    val_acc, val_f1, val_loss = val_stats
    print(f'Validation Epoch {epoch_num} - Loss: {val_loss} ; Accuracy: {val_acc}'
          f' ; F1 Score: {val_f1}')
    epoch_num += 1

  # single evaluation on the held-out test split after all epochs
  test_stats = learn(model, test_loader, optimizer, loss_fn, 'test')
  tst_acc, tst_f1, tst_loss = test_stats
  print(f'Test - Loss: {tst_loss} ; Accuracy: {tst_acc}'
        f' ; F1 Score: {tst_f1}')

## Let's run!

In [None]:
# Directory holding the images.
# NOTE(review): the archive above was extracted to ./covidx, while this reads
# from Drive — confirm which copy is intended.
path = '/content/gdrive/MyDrive/COVIDx'
# Work on a 500-file subset; os.listdir returns entries in arbitrary order,
# so the exact subset is not guaranteed to be reproducible.
all_names = os.listdir(path)[:500]

# Hyperparameters
lr = 1e-4  # Adam learning rate
wt_dec = 1e-4  # Adam weight decay
num_epochs = 5
batch_size = 32  # also read as a global by learn()
splits = [0.8, 0.1, 0.1]  # train / val / test fractions

# ResNet-18 with pretrained weights; the final layer is replaced by a
# single-logit head for binary classification.
model = resnet18(pretrained=True)
model.fc = nn.Linear(512, 1)
model = model.cuda()

# The loss is summed over each batch; learn() normalizes it for reporting.
loss_fn = nn.BCEWithLogitsLoss(reduction='sum')
optimizer = Adam(model.parameters(), lr=lr, weight_decay=wt_dec)

perform_learning(model, optimizer, loss_fn, path, all_names, batch_size,
                 splits, num_epochs)

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth


  0%|          | 0.00/44.7M [00:00<?, ?B/s]

  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)
train: 100%|████████████████████████████████████████████| 13/13 [02:44<00:00, 12.67s/it, loss=0.555]


Training Epoch 0 - Loss: 0.555163676922138 ; Accuracy: 0.6575 ; F1 Score: 0.5418060200668897


val: 100%|████████████████████████████████████████████████| 2/2 [00:20<00:00, 10.01s/it, loss=0.181]


Validation Epoch 0 - Loss: 0.18111154437065125 ; Accuracy: 0.96 ; F1 Score: 0.9


train: 100%|█████████████████████████████████████████████| 13/13 [00:12<00:00,  1.04it/s, loss=0.18]


Training Epoch 1 - Loss: 0.1795494006230281 ; Accuracy: 0.9425 ; F1 Score: 0.8700564971751412


val: 100%|████████████████████████████████████████████████| 2/2 [00:01<00:00,  1.20it/s, loss=0.141]


Validation Epoch 1 - Loss: 0.14081236720085144 ; Accuracy: 0.94 ; F1 Score: 0.8695652173913043


train: 100%|███████████████████████████████████████████| 13/13 [00:12<00:00,  1.04it/s, loss=0.0886]


Training Epoch 2 - Loss: 0.08855653726137601 ; Accuracy: 0.97 ; F1 Score: 0.9302325581395349


val: 100%|████████████████████████████████████████████████| 2/2 [00:01<00:00,  1.17it/s, loss=0.275]


Validation Epoch 2 - Loss: 0.27523982524871826 ; Accuracy: 0.88 ; F1 Score: 0.7857142857142858


train: 100%|███████████████████████████████████████████| 13/13 [00:12<00:00,  1.04it/s, loss=0.0635]


Training Epoch 3 - Loss: 0.06347772708305946 ; Accuracy: 0.98 ; F1 Score: 0.9545454545454545


val: 100%|████████████████████████████████████████████████| 2/2 [00:01<00:00,  1.16it/s, loss=0.187]


Validation Epoch 3 - Loss: 0.18677273392677307 ; Accuracy: 0.94 ; F1 Score: 0.88


train: 100%|███████████████████████████████████████████| 13/13 [00:12<00:00,  1.04it/s, loss=0.0535]


Training Epoch 4 - Loss: 0.05348442150996281 ; Accuracy: 0.985 ; F1 Score: 0.9642857142857143


val: 100%|███████████████████████████████████████████████| 2/2 [00:01<00:00,  1.16it/s, loss=0.0846]


Validation Epoch 4 - Loss: 0.0845804512500763 ; Accuracy: 0.96 ; F1 Score: 0.9


test: 100%|███████████████████████████████████████████████| 2/2 [00:19<00:00,  9.75s/it, loss=0.222]

Test - Loss: 0.2221614420413971 ; Accuracy: 0.9 ; F1 Score: 0.7058823529411764





In [None]:
# Save the trained weights (state_dict only) to Drive.
# NOTE(review): this path uses 'My Drive' while other cells use 'MyDrive' —
# confirm both resolve to the same folder.
torch.save(model.state_dict(), '/content/gdrive/My Drive/covidx_500subset.pt')

In [None]:
# confusion matrix

In [None]:
# Install Captum, which provides the Grad-CAM attribution used below.
! pip install captum

In [None]:
# Interpretation: Grad-CAM overlays for up to 10 COVID-positive test images
import captum.attr
import matplotlib.pyplot as plt

# Rebuild the same splits as in training (get_splits uses a fixed random_state)
all_labels = [int(name.split('_')[1]) for name in all_names]
train_size, val_size, test_size = splits
trn_names, val_names, tst_names = get_splits(all_names, train_size, val_size,
                                              test_size, all_labels)
# batch_size=1 so every attribution map belongs to exactly one image
_, _, test_loader = get_data_loaders(
    ['train', 'val', 'test'],
    path, [trn_names, val_names, tst_names],
    augment=True,
    aug_prob=0.5,
    batch_size=1
    )

# Attribute in eval mode so BatchNorm uses its running statistics
model.eval()
# Grad-CAM on the last residual stage; built once and reused for every image
gcObj = captum.attr.LayerGradCam(model.forward, model.layer4)

cnt = 0
for batch_num, sample in enumerate(test_loader):
  X, label = sample

  # only visualise COVID-positive samples
  if label.item() == 0:
    continue
  # The model has a single output logit, so the only valid target index is 0
  # (using the class label 1 as target raises IndexError).
  attr = gcObj.attribute(X.cuda(), target=0)
  attr = torch.abs(attr)
  # Upsample the coarse layer4 map to the input resolution (width, height)
  attrRescaled = Image.fromarray(attr.detach().cpu()
                                 .numpy()[0, 0, :, :]).resize(
                                 (X.shape[3], X.shape[2]))
  attr_map = np.array(attrRescaled)
  # Collapse the three channels into one grayscale image for display
  img = X[0].mean(dim=0).numpy()
  plt.imshow(img, cmap='gray')
  plt.imshow(attr_map, cmap='jet', alpha=0.3)
  plt.title('Overlayed Attributions')
  plt.axis('off')
  plt.colorbar()
  plt.show()
  cnt += 1
  if cnt >= 10:
    break


sys.settrace() should not be used when the debugger is being used.
This may cause the debugger to stop working correctly.
If this is needed, please check: 
http://pydev.blogspot.com/2007/06/why-cant-pydev-debugger-work-with.html
to see how to restore the debug tracing back correctly.
Call Location:
  File "/usr/lib/python3.7/bdb.py", line 332, in set_trace
    sys.settrace(self.trace_dispatch)



> <ipython-input-34-2072b74a31b7>(26)<module>()
-> attr = gcObj.attribute(X.cuda(), int(label.item()))
(Pdb) X.shape
torch.Size([1, 3, 256, 256])
(Pdb) attr = gcObj.attribute(X[0].cuda(), int(label.item()))
*** RuntimeError: Expected 4-dimensional input for 4-dimensional weight [64, 3, 7, 7], but got 3-dimensional input of size [3, 256, 256] instead
(Pdb) attr = gcObj.attribute(X.cuda(), int(label.item()))
*** IndexError: index 1 is out of bounds for dimension 1 with size 1
(Pdb) model.layer4
Sequential(
  (0): BasicBlock(
    (conv1): Conv2d(256, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (downsample): Sequential(
      (0): Conv2d(256, 512, kernel_size=


sys.settrace() should not be used when the debugger is being used.
This may cause the debugger to stop working correctly.
If this is needed, please check: 
http://pydev.blogspot.com/2007/06/why-cant-pydev-debugger-work-with.html
to see how to restore the debug tracing back correctly.
Call Location:
  File "/usr/lib/python3.7/bdb.py", line 357, in set_quit
    sys.settrace(None)



BdbQuit: ignored

In [None]:
# Number of file names in the test split
len(tst_names)

50