In [None]:
import os

# Restrict CUDA to GPUs 0 and 1. This has to be an environment variable: a plain
# Python assignment (`CUDA_VISIBLE_DEVICES=0,1`) only creates an unused tuple.
# It must also be set before any library initialises CUDA, which is why this
# cell runs ahead of the import cell.
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"

In [None]:
import os
import shutil
import opendatasets as od
import pandas as pd
import numpy as np
from PIL import Image
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import torch
from torch import nn, optim
from torch.utils.data import DataLoader, Dataset
from torch.utils.data.sampler import SubsetRandomSampler
from torchvision.datasets import ImageFolder
import torchvision.transforms as T
from torchvision.utils import make_grid
from torchmetrics.functional import accuracy
import pytorch_lightning as pl
from tqdm import tqdm
from pytorch_lightning.callbacks import TQDMProgressBar

In [None]:
# Root folder of the dataset, relative to this notebook. The cells below read
# `train_labels.csv` and the `train/` and `test/` image folders from it.
datapath = '../data/histopathologic-cancer-detection'

In [None]:
# Read the label table that ships with the training images and preview it.
labels_csv_path = os.path.join(datapath, 'train_labels.csv')
cancer_labels = pd.read_csv(labels_csv_path)
cancer_labels.head()

In [None]:
# Class balance: number of rows in the label table for each label value.
cancer_labels['label'].value_counts()

In [None]:
# Report how many files each of the original image folders contains.
for split_folder, message in (
    ('train', 'No. of images in train dataset: '),
    ('test', 'No. of images in test dataset:'),
):
    print(message, len(os.listdir(os.path.join(datapath, split_folder))))

In [None]:
# Setting seed to make the results replicable
np.random.seed(0)

# os.listdir returns entries in arbitrary, filesystem-dependent order, so the
# listing is sorted first; otherwise the same seed could pick different files.
train_imgs_orig = sorted(os.listdir(os.path.join(datapath, 'train')))

# Draw WITHOUT replacement. np.random.choice samples with replacement by
# default, which would yield duplicate file names: fewer unique images than
# requested, and the same image could end up in both the train and test splits.
sample_size = min(10000, len(train_imgs_orig))
selected_image_list = list(np.random.choice(train_imgs_orig, sample_size, replace=False))
print(len(selected_image_list))

In [None]:
# Show 20 randomly drawn images from the selection in a 2 x 10 grid, each
# titled with the label recorded for it in `cancer_labels`.
fig = plt.figure(figsize=(25, 6))
preview_files = np.random.choice(selected_image_list, 20)
for position, file_name in enumerate(preview_files, start=1):
    ax = fig.add_subplot(2, 10, position, xticks=[], yticks=[])
    plt.imshow(Image.open(os.path.join(datapath, 'train', file_name)))
    # The label table is keyed by the file name without its extension.
    image_id = file_name.split('.')[0]
    id_matches = cancer_labels['id'] == image_id
    lab = cancer_labels.loc[id_matches, 'label'].values[0]
    ax.set_title(f'Label: {lab}')

In [None]:
# Re-seed, shuffle the selection in place, then cut it into an 8000-image
# training part and a testing part holding everything after that.
np.random.seed(0)
np.random.shuffle(selected_image_list)

n_train_images = 8000
cancer_train_idx, cancer_test_idx = (
    selected_image_list[:n_train_images],
    selected_image_list[n_train_images:],
)

print("Number of images in the downsampled training dataset: ", len(cancer_train_idx))
print("Number of images in the downsampled testing dataset: ", len(cancer_test_idx))

In [None]:
# Copy the training split into its own folder.
train_dataset_dir = os.path.join(datapath, 'train_dataset')
# makedirs(..., exist_ok=True) instead of mkdir: the cell can be re-run
# (e.g. after a kernel restart) without failing with FileExistsError.
os.makedirs(train_dataset_dir, exist_ok=True)
for fname in cancer_train_idx:
    src = os.path.join(datapath, 'train', fname)
    dst = os.path.join(train_dataset_dir, fname)
    shutil.copyfile(src, dst)
print('No. of images in downsampled training dataset: ', len(os.listdir(train_dataset_dir)))

In [None]:
# Copy the held-out split into its own folder. The files come from the original
# `train` folder because only those images have labels in `train_labels.csv`.
test_dataset_dir = os.path.join(datapath, 'test_dataset')
# makedirs(..., exist_ok=True) instead of mkdir: the cell can be re-run
# (e.g. after a kernel restart) without failing with FileExistsError.
os.makedirs(test_dataset_dir, exist_ok=True)
for fname in cancer_test_idx:
    src = os.path.join(datapath, 'train', fname)
    dst = os.path.join(test_dataset_dir, fname)
    shutil.copyfile(src, dst)
print('No. of images in downsampled testing dataset: ', len(os.listdir(test_dataset_dir)))

In [None]:
# Training-time preprocessing: crop the central 32x32 patch, apply random
# horizontal/vertical flips as augmentation, then convert the image to a tensor.
data_T_train = T.Compose([
    T.CenterCrop(32), 
    T.RandomHorizontalFlip(), 
    T.RandomVerticalFlip(), 
    T.ToTensor(),
])

In [None]:
# Evaluation-time preprocessing: same central 32x32 crop and tensor conversion
# as for training, but without the random flips so results are deterministic.
data_T_test = T.Compose([
    T.CenterCrop(32), 
    T.ToTensor(), 
])

In [None]:
selected_image_labels = pd.DataFrame()

# Build an id -> label lookup once instead of scanning the whole label table
# for every selected image. drop_duplicates keeps the first row per id, which
# is the row a per-image `.values[0]` lookup would have picked.
label_by_id = cancer_labels.drop_duplicates(subset='id').set_index('id')['label']

# Image ids are the file names without their extension.
id_list = [img_name.split('.')[0] for img_name in selected_image_list]
# .loc with a list raises KeyError if any selected image has no label row.
label_list = label_by_id.loc[id_list].tolist()

In [None]:
# Attach the collected ids and labels as the two columns of the frame, then preview it.
selected_image_labels = selected_image_labels.assign(id=id_list, label=label_list)
selected_image_labels.head()

In [None]:
# Mapping from image id to label for every selected image (train and test parts alike).
img_label_dict = dict(zip(selected_image_labels['id'], selected_image_labels['label']))

PyTorch's built-in image-folder loader (torchvision's `ImageFolder`) expects the images to be arranged in one subfolder per class. Because all of our training images sit in a single folder with no class subfolders, we cannot use it directly. So, we will write our own custom `Dataset` class to load the images and look up their labels.

In [None]:
class LoadCancerDataset(Dataset):
    """Dataset over a flat folder of images whose labels come from a dict.

    Args:
        data_folder: Directory containing the image files (no class subfolders).
        transform: Callable applied to each opened PIL image. When omitted, a
            centre crop to 32x32 followed by tensor conversion is used.
        dict_labels: Mapping from image id (the file name up to its first '.')
            to the label. Every file in ``data_folder`` must have an entry.

    Raises:
        KeyError: if a file in ``data_folder`` has no entry in ``dict_labels``.
    """

    def __init__(self, data_folder, transform=None, dict_labels=None):
        # Defaults are created per instance rather than in the signature, so no
        # mutable dict or transform object is shared between instances.
        if transform is None:
            transform = T.Compose([T.CenterCrop(32), T.ToTensor()])
        if dict_labels is None:
            dict_labels = {}

        self.data_folder = data_folder
        # Sorted so that index -> file is stable across runs and machines
        # (os.listdir gives no ordering guarantee).
        self.list_image_files = sorted(os.listdir(data_folder))
        self.transform = transform
        self.dict_labels = dict_labels
        # Labels aligned with list_image_files; also fails fast on unlabeled files.
        self.labels = [dict_labels[name.split('.')[0]] for name in self.list_image_files]

    def __len__(self):
        """Number of image files found in the folder."""
        return len(self.list_image_files)

    def __getitem__(self, idx):
        """Return ``(transformed_image, label)`` for the file at position ``idx``."""
        img_path = os.path.join(self.data_folder, self.list_image_files[idx])
        image = self.transform(Image.open(img_path))
        # Labels were resolved once in __init__, in the same order as the files.
        return image, self.labels[idx]

In [None]:
%%time 
# Build the training dataset from the downsampled training folder (with augmentation).
train_dataset_dir = os.path.join(datapath, 'train_dataset')
train_set = LoadCancerDataset(
    data_folder=train_dataset_dir,
    transform=data_T_train,
    dict_labels=img_label_dict,
)

In [None]:
# Build the held-out dataset from the downsampled testing folder (no augmentation).
test_dataset_dir = os.path.join(datapath, 'test_dataset')
test_set = LoadCancerDataset(
    data_folder=test_dataset_dir,
    transform=data_T_test,
    dict_labels=img_label_dict,
)

In [None]:
# Both loaders share batch size, worker count and pinned memory; only the
# training loader shuffles.
batch_size = 256
loader_options = dict(batch_size=batch_size, num_workers=4, pin_memory=True)
train_dataloader = DataLoader(train_set, shuffle=True, **loader_options)
test_dataloader = DataLoader(test_set, **loader_options)

In [None]:
class CNNImageClassifier(pl.LightningModule):
    """Small CNN for two-class classification of 3-channel 32x32 images.

    Architecture: conv(3->3) + ReLU + 2x2 max-pool, conv(3->6) + ReLU, then six
    fully connected layers (1536 -> 1000 -> 500 -> 250 -> 120 -> 60 -> 2) with
    ReLU between them. The output is a pair of raw logits per image; training
    uses cross-entropy loss and the Adam optimizer.

    Args:
        learning_rate: Learning rate passed to Adam.
    """

    def __init__(self, learning_rate=1e-3):
        super().__init__()
        self.learning_rate = learning_rate
        self.loss = nn.CrossEntropyLoss()
        self.save_hyperparameters()

        self.conv_layer1 = nn.Conv2d(in_channels=3, out_channels=3, kernel_size=3, stride=1, padding=1)
        self.relu1 = nn.ReLU()
        self.pool = nn.MaxPool2d(kernel_size=2)
        self.conv_layer2 = nn.Conv2d(in_channels=3, out_channels=6, kernel_size=3, stride=1, padding=1)
        self.relu2 = nn.ReLU()
        # 32x32 input -> one 2x2 pool -> 16x16 feature maps with 6 channels.
        self.fully_connected_1 = nn.Linear(in_features=16 * 16 * 6, out_features=1000)
        self.fully_connected_2 = nn.Linear(in_features=1000, out_features=500)
        self.fully_connected_3 = nn.Linear(in_features=500, out_features=250)
        self.fully_connected_4 = nn.Linear(in_features=250, out_features=120)
        self.fully_connected_5 = nn.Linear(in_features=120, out_features=60)
        self.fully_connected_6 = nn.Linear(in_features=60, out_features=2)

    def forward(self, x):
        """Map a batch of images ``(N, 3, 32, 32)`` to logits ``(N, 2)``."""
        x = self.pool(self.relu1(self.conv_layer1(x)))
        x = self.relu2(self.conv_layer2(x))
        # Flatten per sample. Unlike view(-1, 1536), this can never silently
        # fold part of one sample into another when the input size is wrong.
        x = torch.flatten(x, start_dim=1)
        # Non-linearity between the dense layers: without it the six stacked
        # Linear layers are mathematically equivalent to a single linear map.
        x = torch.relu(self.fully_connected_1(x))
        x = torch.relu(self.fully_connected_2(x))
        x = torch.relu(self.fully_connected_3(x))
        x = torch.relu(self.fully_connected_4(x))
        x = torch.relu(self.fully_connected_5(x))
        # Final layer stays linear: CrossEntropyLoss expects raw logits.
        return self.fully_connected_6(x)

    def _loss_and_accuracy(self, batch):
        """Run the model on an ``(inputs, targets)`` batch; return ``(loss, accuracy)``."""
        inputs, targets = batch
        outputs = self(inputs)
        # The class's own accuracy helper is used rather than
        # torchmetrics.functional.accuracy, whose newer releases require an
        # explicit `task` argument and reject the bare two-argument call.
        return self.loss(outputs, targets), self.binary_accuracy(outputs, targets)

    def training_step(self, batch, batch_idx):
        """Compute and log the loss and accuracy for one training batch."""
        loss, train_accuracy = self._loss_and_accuracy(batch)
        self.log('train_accuracy', train_accuracy, prog_bar=True)
        self.log('train_loss', loss)
        return {'loss': loss, 'train_accuracy': train_accuracy}

    def test_step(self, batch, batch_idx):
        """Compute and log the loss and accuracy for one test batch."""
        loss, test_accuracy = self._loss_and_accuracy(batch)
        self.log('test_accuracy', test_accuracy)
        self.log('test_loss', loss)
        return {'test_loss': loss, 'test_accuracy': test_accuracy}

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters with the configured learning rate."""
        return optim.Adam(params=self.parameters(), lr=self.learning_rate)

    # Calculate accuracy for each batch at a time
    def binary_accuracy(self, outputs, targets):
        """Fraction of samples whose arg-max logit equals the target class."""
        _, predicted = torch.max(outputs, 1)
        correct_results_sum = (predicted == targets).sum().float()
        return correct_results_sum / targets.shape[0]

    def predict_step(self, batch, batch_idx):
        """Return logits for a batch given as a tensor or an ``(inputs, targets)`` pair."""
        # Loaders built on a labeled dataset yield (inputs, targets); only the
        # inputs may be fed to forward().
        inputs = batch[0] if isinstance(batch, (list, tuple)) else batch
        return self(inputs)


In [None]:
# Smoke test: fast_dev_run pushes a single batch through the training loop on
# one GPU to catch wiring errors before the long run.
model = CNNImageClassifier()
trainer = pl.Trainer(
    accelerator='gpu',
    devices=1,
    fast_dev_run=True,
)
trainer.fit(model, train_dataloaders=train_dataloader)

In [None]:
# Full training run on one GPU for 500 epochs, logging every 25 steps and
# writing a checkpoint every 10 epochs under `ckpt_dir`.
ckpt_dir = 'cnn_model_ckpts'
ckpt_callback = pl.callbacks.ModelCheckpoint(every_n_epochs=10)

trainer_options = {
    'default_root_dir': ckpt_dir,
    'accelerator': 'gpu',
    'devices': 1,
    'log_every_n_steps': 25,
    'max_epochs': 500,
    'callbacks': [ckpt_callback],
}

# Fresh model, so the weights from the smoke test are not carried over.
model = CNNImageClassifier()
trainer = pl.Trainer(**trainer_options)
trainer.fit(model, train_dataloaders=train_dataloader)

In [None]:
# Evaluate on the held-out split. The keyword is `dataloaders`: the older
# `test_dataloaders` name was deprecated and later removed from Trainer.test.
# The model is passed explicitly so the in-memory weights are evaluated, i.e.
# the same weights the manual prediction loop further down uses.
trainer.test(model, dataloaders=test_dataloader)

In [None]:
# Collect one score per test image, in dataset order (the test loader does not shuffle).
# Use the GPU when available instead of unconditionally calling .cuda().
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
model.eval()

preds = []
# Inference only: no_grad avoids building the autograd graph for every batch.
with torch.no_grad():
    for data, _ in test_dataloader:
        logits = model(data.to(device))
        # Score = class-1 logit minus class-0 logit. The model emits two
        # unnormalised logits, so the sign of the class-1 logit alone says
        # nothing about the predicted class; the sign of the margin does
        # (margin < 0 <=> class 0 has the larger logit).
        margin = logits[:, 1] - logits[:, 0]
        preds.extend(margin.cpu().numpy())

In [None]:
# One row per test image: file name, true label and the model score from the loop above.
test_preds = pd.DataFrame(
    {
        'imgs': test_set.list_image_files,
        'labels': test_set.labels,
        'preds': preds,
    }
)

In [None]:
# Keep only the image id, i.e. the part of the file name before the first '.'.
test_preds['imgs'] = test_preds['imgs'].map(lambda file_name: file_name.split('.')[0])

In [None]:
# Preview the table after stripping the file extension from `imgs`.
test_preds.head()

In [None]:
# Hard class prediction from the score: 0 where the score is negative, 1 otherwise.
score_is_negative = test_preds['preds'] < 0
test_preds['predictions'] = (~score_is_negative).astype('int64')
test_preds.shape

In [None]:
# Preview the table again, now including the hard `predictions` column.
test_preds.head()

In [None]:
# Overall accuracy: share of test images whose predicted class equals the true label.
n_correct = int((test_preds['labels'] == test_preds['predictions']).sum())
n_correct / len(test_preds)