In [2]:
import os 
import shutil 
import numpy as np 
import pandas as pd 
import opendatasets as od 

from PIL import Image
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt

import torch 
from torch import nn, optim 
from torch.utils.data import DataLoader, Dataset 
from torch.utils.data.sampler import SubsetRandomSampler
from torchvision.datasets import ImageFolder 
from torchvision import transforms as T 
from torchvision.utils import make_grid 
from torchvision.models import resnet50 
from torchmetrics.functional import accuracy

import pytorch_lightning as pl 

We use the data downloaded in Chapter 2

In [3]:
# Refer to the book Chapter 2 for downloading cancer dataset 
# I suppose you have downloaded the dataset and split it in Chapter 2

In [4]:
datapath = '../data/histopathologic-cancer-detection'
cancer_labels = pd.read_csv(os.path.join(datapath, 'train_labels.csv'))
cancer_labels.head()

Unnamed: 0,id,label
0,f38a6374c348f90b587e046aac6079959adf3835,0
1,c18f2d887b7ae4f6742ee445113fa1aef383ed77,1
2,755db6279dae599ebb4d39a9123cce439965282d,0
3,bc3f0c64fb968ff4a8bd33af6971ecae77c75e08,0
4,068aba587a4950175d04c680d38943fd488d6a9d,0


In [5]:
cancer_train_idx = os.listdir(os.path.join(datapath, 'train_dataset'))
cancer_test_idx = os.listdir(os.path.join(datapath, 'test_dataset'))
selected_image_list = cancer_train_idx + cancer_test_idx
print(len(selected_image_list))

9761


In [6]:
print("Number of images in the downsampled training dataset: ", len(cancer_train_idx))
print("Number of images in the downsampled testing dataset: ", len(cancer_test_idx))

Number of images in the downsampled training dataset:  7772
Number of images in the downsampled testing dataset:  1989


In [7]:
selected_image_labels = pd.DataFrame()

id_list = []
label_list = []
for img in selected_image_list:
    label_tuple = cancer_labels.loc[cancer_labels['id'] == img.split('.')[0]]
    id_list.append(label_tuple['id'].values[0])
    label_list.append(label_tuple['label'].values[0])

In [8]:
selected_image_labels['id'] = id_list 
selected_image_labels['label'] = label_list 
selected_image_labels.head()

img_class_dict = {k: v for k, v in zip(selected_image_labels.id, selected_image_labels.label)}

In [9]:
data_T_train = T.Compose([
    T.Resize(224), 
    T.RandomHorizontalFlip(), 
    T.ToTensor()
])

data_T_test = T.Compose([
    T.Resize(224), 
    T.ToTensor(), 
])

In [10]:
class LoadCancerDataset(Dataset):
    def __init__(self, datafolder, transform=T.Compose([T.CenterCrop(32), T.ToTensor()]), labels_dict={}):
        self.datafolder = datafolder 
        self.image_files_list = [s for s in os.listdir(datafolder)]
        self.transform = transform 
        self.labels_dict = labels_dict 
        self.labels = [labels_dict[i.split('.')[0]] for i in self.image_files_list]
    
    def __len__(self):
        return len(self.image_files_list)
    
    def __getitem__(self, idx):
        img_name = os.path.join(self.datafolder, self.image_files_list[idx])
        image = Image.open(img_name)
        image = self.transform(image)
        img_name_short = self.image_files_list[idx].split('.')[0]
        
        label = self.labels_dict[img_name_short]
        return image, label

In [11]:
cancer_train_set = LoadCancerDataset(
    datafolder=os.path.join(datapath, 'train_dataset'), 
    transform=data_T_train, 
    labels_dict=img_class_dict, 
)

cancer_test_set = LoadCancerDataset(
    datafolder=os.path.join(datapath, 'test_dataset'),
    transform=data_T_test, 
    labels_dict=img_class_dict,
)

In [12]:
batch_size = 128 
cancer_train_loader = DataLoader(cancer_train_set, batch_size, num_workers=4, pin_memory=True, shuffle=True)
cancer_test_loader = DataLoader(cancer_test_set, batch_size, num_workers=4, pin_memory=True)

In [13]:
resnet50(pretrained=True)



ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

In [14]:
class CancerImageClassifier(pl.LightningModule):
    
    def __init__(self, learning_rate=1e-3):
        super().__init__()
        
        self.learning_rate = learning_rate
        self.loss = nn.CrossEntropyLoss()

        self.pretrain_model = resnet50(pretrained=True)
        self.pretrain_model.eval()

        for param in self.pretrain_model.parameters():
            param.requires_grad = False 
        
        self.pretrain_model.fc = nn.Linear(2048, 2)
    
    def forward(self, input):
        output = self.pretrain_model(input)
        return output 
    
    def training_step(self, batch, batch_idx):
        inputs, targets = batch 
        outputs = self(inputs)

        preds = torch.argmax(outputs, dim=1)
        train_accuracy = accuracy(preds, targets)
        loss = self.loss(outputs, targets)

        self.log('train_accuracy', train_accuracy, prog_bar=True)
        self.log('train_loss', loss)
        
        return {'loss': loss, 'train_accuracy': train_accuracy}
    
    def test_step(self, batch, batch_idx):
        inputs, targets = batch 
        outputs = self(inputs)
        
        preds = torch.argmax(outputs, dim=1)
        test_accuracy = accuracy(preds, targets)
        loss = self.loss(outputs, targets)
        
        self.log('test_accuracy', test_accuracy, prog_bar=True)
        return {'test_loss': loss, 'test_accuracy': test_accuracy}
    
    def test_epoch_end(self, outputs):
        test_outs = []
        for test_out in outputs:
            out = test_out['test_accuracy']
            test_outs.append(out)
        total_test_accuracy = torch.stack(test_outs).mean()
        self.log('total_test_accuracy', total_test_accuracy, on_step=False, on_epoch=True)
        return total_test_accuracy
    
    def configure_optimizers(self):
        params = self.parameters()
        optimizer = optim.Adam(params=params, lr=self.learning_rate)
        return optimizer

In [15]:
model = CancerImageClassifier()

trainer = pl.Trainer(fast_dev_run=True, accelerator='gpu', devices=1)
trainer.fit(model, cancer_train_loader)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Running in `fast_dev_run` mode: will run the requested loop using 1 batch(es). Logging and checkpointing is suppressed.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6]

  | Name           | Type             | Params
----------------------------------------------------
0 | loss           | CrossEntropyLoss | 0     
1 | pretrain_model | ResNet           | 23.5 M
----------------------------------------------------
4.1 K     Trainable params
23.5 M    Non-trainable params
23.5 M    Total params
94.049    Total estimated model params size (MB)
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_steps=1` reached.


In [16]:
ckpt_dir = 'transfer_learning_Cancer_Detection_ckpts' 
ckpt_callback = pl.callbacks.ModelCheckpoint(every_n_epochs=10)

model = CancerImageClassifier()
trainer = pl.Trainer(default_root_dir=ckpt_dir, accelerator='gpu' , devices=1, callbacks=[ckpt_callback], log_every_n_steps=30, max_epochs=10)
trainer.fit(model, cancer_train_loader)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6]

  | Name           | Type             | Params
----------------------------------------------------
0 | loss           | CrossEntropyLoss | 0     
1 | pretrain_model | ResNet           | 23.5 M
----------------------------------------------------
4.1 K     Trainable params
23.5 M    Non-trainable params
23.5 M    Total params
94.049    Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=10` reached.


In [19]:
trainer.test(model, cancer_test_loader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6]


Testing: 0it [00:00, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
      test_accuracy         0.8667672276496887
   total_test_accuracy      0.8676828742027283
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'test_accuracy': 0.8667672276496887,
  'total_test_accuracy': 0.8676828742027283}]

In [27]:
# of course you can test model prediction and calculateing accuracy manually
model.eval()
preds = []
for batch in cancer_test_loader:
    inputs, targets = batch 
    targets = targets.cuda()
    outputs = model(inputs)
    preds.append(torch.argmax(outputs, dim=1).cpu().numpy())
preds = np.concatenate(preds)

In [28]:
test_preds = pd.DataFrame(
    {
        'imgs': cancer_test_set.image_files_list,
        'labels': cancer_test_set.labels,
        'preds': preds,
    }
)
test_preds['images'] = test_preds['imgs'].apply(lambda x: x.split('.')[0])
test_preds.head()

Unnamed: 0,imgs,labels,preds,images
0,bd2483c53a131c1461961f7c9a956815c2bc57cf.tif,1,1,bd2483c53a131c1461961f7c9a956815c2bc57cf
1,348fb8796541e89e8b68a9c79d2448219408842e.tif,0,0,348fb8796541e89e8b68a9c79d2448219408842e
2,efd3aa82d00592883153920afb5fa059d935c867.tif,0,1,efd3aa82d00592883153920afb5fa059d935c867
3,5f68ea08a11d32c6336797b51b860ae08ec0770a.tif,0,0,5f68ea08a11d32c6336797b51b860ae08ec0770a
4,a882824a42481564aff537df4cc09f33e915c528.tif,0,0,a882824a42481564aff537df4cc09f33e915c528
