<a href="https://www.kaggle.com/code/sarthaksshukla/cancer-images-classification?scriptVersionId=170078438" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
# %pip installs into the running kernel's environment; the version is pinned to
# the release this notebook was executed with so re-runs stay reproducible.
%pip install torchsummary==1.5.1

Collecting torchsummary
  Obtaining dependency information for torchsummary from https://files.pythonhosted.org/packages/7d/18/1474d06f721b86e6a9b9d7392ad68bed711a02f3b61ac43f13c719db50a6/torchsummary-1.5.1-py3-none-any.whl.metadata
  Downloading torchsummary-1.5.1-py3-none-any.whl.metadata (296 bytes)
Downloading torchsummary-1.5.1-py3-none-any.whl (2.8 kB)
Installing collected packages: torchsummary
Successfully installed torchsummary-1.5.1


In [2]:
import os,random,shutil
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
from torch.utils.data import DataLoader
from torchvision import transforms,datasets
from fastprogress import master_bar,progress_bar
from torchsummary import summary
from tqdm import tqdm
from PIL import Image

In [3]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(device)

cuda


# Setting up the data paths

In [4]:
# Spatial size handed to transforms.Resize for every image.
image_shape = (96,96)
# Channels-first input size: three colour channels followed by the spatial size.
image_size = (3,*image_shape)

# Training configuration.
batch_size = 32
epochs = 30
steps_per_epoch = 1125

## Setting up the directories for different classes

In [5]:
# Read-only Kaggle inputs: the raw images and the CSV holding their labels.
source_path = "/kaggle/input/hcd-cropped/train"
labels_document_path = "/kaggle/input/hcd-cropped/train_labels.csv"

# Root of the class-per-folder dataset built by this notebook, with one
# sub-folder per split.
destination_dataset_path = "/kaggle/output/model_dataset"
train_path,valid_path = (
    os.path.join(destination_dataset_path,split) for split in ("train","valid")
)

In [6]:
def extract_labels_mapping(path):
    """Read a comma-separated label file into an ``{id: label}`` dictionary.

    The first line of the file is treated as a header and skipped. For every
    remaining non-blank line, the first comma-separated field becomes the key
    and the last field becomes the value (both kept as strings).

    Args:
        path: Location of the CSV file.

    Returns:
        dict mapping the first column to the last column. Empty when the file
        has no data rows.
    """
    mapping = {}
    with open(path) as file:
        next(file,None)  # skip the header; tolerate a completely empty file
        for line in file:
            line = line.strip()
            if not line:
                # A blank (e.g. trailing) line would otherwise register a bogus
                # '' -> '' entry and surface later as an extra empty class.
                continue
            fields = line.split(",")
            mapping[fields[0]] = fields[-1]
    return mapping

In [7]:
# Image id -> class label (as strings), parsed from the label CSV.
labels_mapping = extract_labels_mapping(path = labels_document_path)

In [8]:
# Distinct label values in sorted order; each one becomes a class folder.
distinct_labels = set(labels_mapping.values())
target_classes = sorted(distinct_labels)
print(target_classes)

['0', '1']


In [9]:
# Create the train/valid roots (and the dataset root as their parent).
# The previous guard tested `source_path`, i.e. the input folder, so it never
# fired; `exist_ok` makes the cell safe to re-run on an existing output tree.
for split_path in (train_path,valid_path):
    os.makedirs(split_path,exist_ok = True)

In [10]:
# One sub-folder per class inside each split. `exist_ok` keeps the cell
# re-runnable instead of raising FileExistsError on the second execution.
for target in target_classes:
    for split_path in (train_path,valid_path):
        os.makedirs(os.path.join(split_path,target),exist_ok = True)

In [11]:
def convert_tiff_to_png(tiff_file_path,destination_dir):
    """Re-save one image file as a PNG inside ``destination_dir``.

    The output keeps the source file's base name with its extension replaced
    by ``.png``.

    Args:
        tiff_file_path: Path of the image to read.
        destination_dir: Existing directory that receives the PNG.
    """
    file_name = os.path.basename(tiff_file_path)
    png_path = os.path.join(destination_dir,os.path.splitext(file_name)[0] + ".png")
    # The context manager releases the underlying file handle deterministically;
    # this function is called tens of thousands of times in a row.
    with Image.open(tiff_file_path) as image:
        image.save(png_path)

In [12]:
def construct_dataset(source_path,destination_path,mapping,num_images):
    """Export a random sample of labelled images into class sub-folders.

    ``num_images`` files are drawn without replacement from ``source_path``.
    Each one is converted to PNG and written to
    ``destination_path/<mapping[image id]>``, where the image id is the file
    name without its extension.

    Args:
        source_path: Directory containing the source images.
        destination_path: Split root that already contains one folder per class.
        mapping: dict from image id to class folder name.
        num_images: Number of images to export.

    Raises:
        ValueError: if fewer than ``num_images`` labelled files are available
            (raised by ``random.sample``).
    """
    # Only files with a known label are candidates, so a stray file cannot
    # abort the export half-way with a KeyError. Sorting removes the arbitrary
    # os.listdir order, making the draw repeatable under a fixed `random` seed.
    # os.path.splitext matches how convert_tiff_to_png derives the output name.
    candidates = sorted(
        name for name in os.listdir(source_path)
        if os.path.splitext(name)[0] in mapping
    )
    for image_name in tqdm(random.sample(candidates,num_images)):
        image_id = os.path.splitext(image_name)[0]
        convert_tiff_to_png(
            tiff_file_path = os.path.join(source_path,image_name),
            destination_dir = os.path.join(destination_path,mapping[image_id]),
        )

## Constructing the datasets and dataloader objects

In [13]:
# Export 50000 randomly drawn source images into the per-class training folders.
construct_dataset(
    source_path = source_path,
    destination_path = train_path,
    mapping = labels_mapping,
    num_images = 50000,
)

100%|██████████| 50000/50000 [03:48<00:00, 218.90it/s]


In [14]:
# Draw the validation images only from files that were NOT exported to the
# training split. Sampling both splits independently from the same folder lets
# the same image land in train and valid, which inflates validation metrics.
train_image_ids = {
    os.path.splitext(file_name)[0]
    for target in target_classes
    for file_name in os.listdir(os.path.join(train_path,target))
}
held_out_images = sorted(
    file_name for file_name in os.listdir(source_path)
    if os.path.splitext(file_name)[0] not in train_image_ids
)
for image_name in tqdm(random.sample(held_out_images,20000)):
    image_id = os.path.splitext(image_name)[0]
    convert_tiff_to_png(
        tiff_file_path = os.path.join(source_path,image_name),
        destination_dir = os.path.join(valid_path,labels_mapping[image_id]),
    )

100%|██████████| 20000/20000 [01:15<00:00, 263.28it/s]


In [15]:
def _preprocessing_steps():
    """Return fresh instances of the steps shared by both splits.

    Order: convert to tensor, normalise per channel, resize to `image_shape`.
    NOTE(review): the mean/std values match the commonly used ImageNet channel
    statistics — confirm they are appropriate for this dataset.
    """
    return [
        transforms.ToTensor(),
        transforms.Normalize(
            mean = [0.485, 0.456, 0.406],
            std = [0.229, 0.224, 0.225]
        ),
        transforms.Resize(image_shape,antialias = True),
    ]

# Training adds random horizontal/vertical flips on top of the shared steps.
train_transforms = transforms.Compose(
    _preprocessing_steps() + [
        transforms.RandomHorizontalFlip(),
        transforms.RandomVerticalFlip(),
    ]
)

# Validation uses the shared steps only, without random augmentation.
valid_transforms = transforms.Compose(_preprocessing_steps())

In [16]:
# Class-per-folder datasets over the exported PNGs, one per split, each with
# its own transform pipeline.
train_dataset = datasets.ImageFolder(
    root = train_path,
    transform = train_transforms,
)
valid_dataset = datasets.ImageFolder(
    root = valid_path,
    transform = valid_transforms,
)

In [17]:
# Both loaders share the same settings: shuffled batches of `batch_size`,
# pinned host memory, and the final incomplete batch dropped.
_loader_options = dict(
    batch_size = batch_size,
    shuffle = True,
    pin_memory = True,
    drop_last = True,
)

train_dataloader = DataLoader(dataset = train_dataset,**_loader_options)
valid_dataloader = DataLoader(dataset = valid_dataset,**_loader_options)

# Building the model

In [18]:
class Conv(nn.Module):
    """3x3 convolution followed by batch normalisation and ReLU.

    With kernel 3, stride 1 and padding 1 the spatial size of the input is
    preserved; only the channel count changes from ``in_channels`` to
    ``out_channels``.
    """

    def __init__(self,in_channels,out_channels,**kwargs):
        super().__init__(**kwargs)
        convolution = nn.Conv2d(in_channels,out_channels,kernel_size = 3,stride = 1,padding = 1)
        normalisation = nn.BatchNorm2d(out_channels)
        activation = nn.ReLU()
        self.__model = nn.Sequential(convolution,normalisation,activation)

    def forward(self,inputs):
        """Apply conv -> batch norm -> ReLU to ``inputs``."""
        return self.__model(inputs)

In [19]:
class ConvStack(nn.Module):
    """A chain of ``Conv`` blocks applied one after another.

    The first ``n_conv - 1`` blocks keep ``in_channels`` channels; the last
    block maps ``in_channels`` to ``out_channels``.
    """

    def __init__(self,in_channels,out_channels,n_conv = 4,**kwargs):
        super().__init__(**kwargs)
        blocks = []
        for _ in range(n_conv - 1):
            blocks.append(Conv(in_channels = in_channels,out_channels = in_channels))
        blocks.append(Conv(in_channels = in_channels,out_channels = out_channels))
        self.__model = nn.Sequential(*blocks)

    def forward(self,inputs):
        """Run ``inputs`` through every block in order."""
        return self.__model(inputs)

In [20]:
class ResnetLayer(nn.Module):
    """``num_layers`` channel-preserving ``ConvStack``s wrapped in one skip connection.

    The input is added to the output of the whole sequence of stacks and the
    sum is passed through a ReLU.
    """

    def __init__(self,channels,num_layers = 4,n_conv = 4,**kwargs):
        super().__init__(**kwargs)
        stacks = []
        for _ in range(num_layers):
            stacks.append(ConvStack(in_channels = channels,out_channels = channels,n_conv = n_conv))
        self.__model = nn.Sequential(*stacks)

    def forward(self,inputs):
        """Return ``relu(inputs + stacks(inputs))``."""
        transformed = self.__model(inputs)
        return F.relu(inputs + transformed)

In [21]:
class Pooling(nn.Module):
    """Down-sampling block: doubles the channels and halves the spatial size.

    A 3x3 convolution maps ``channels`` to ``2 * channels``, a 2x2 average pool
    with stride 2 reduces height and width, then batch norm and ReLU follow.
    """

    def __init__(self,channels,**kwargs):
        super().__init__(**kwargs)
        widened = 2 * channels
        self.__model = nn.Sequential(
            nn.Conv2d(channels,widened,kernel_size = 3,stride = 1,padding = 1),
            nn.AvgPool2d(kernel_size = 2,stride = 2),
            nn.BatchNorm2d(widened),
            nn.ReLU(),
        )

    def forward(self,inputs):
        """Apply conv -> average pool -> batch norm -> ReLU."""
        return self.__model(inputs)

In [22]:
class FullyConnected(nn.Module):
    """Classification head: flatten, two hidden linear layers, softmax output.

    Args:
        channels: Number of features per sample after flattening (input size
            of the first linear layer).
        num_classes: Size of the output vector.
        units: Width of both hidden layers.

    The final layer is a softmax over the last dimension, so ``forward``
    returns class probabilities rather than raw scores.
    """

    def __init__(self,channels,num_classes,units = 4096,**kwargs):
        super().__init__(**kwargs)
        head = [nn.Flatten()]
        head += [nn.Linear(channels,units),nn.ReLU()]
        head += [nn.Linear(units,units),nn.ReLU()]
        head += [nn.Linear(units,num_classes),nn.Softmax(dim = -1)]
        self.__model = nn.Sequential(*head)

    def forward(self,inputs):
        """Return per-class probabilities for each sample in ``inputs``."""
        return self.__model(inputs)

In [23]:
class Resnet(nn.Module):
    """Residual CNN classifier assembled from the blocks defined above.

    An input ``Conv`` lifts the image to 8 channels. Then ``ResnetLayer`` +
    ``Pooling`` stages are stacked, each halving height and width and doubling
    the channels, until either spatial dimension reaches 1. A
    ``FullyConnected`` head turns the remaining features into class
    probabilities.

    Args:
        image_shape: ``(channels, height, width)`` of one input image.
        num_classes: Number of output classes.
        n_conv: Convolutions per ``ConvStack``.
        units: Hidden width of the fully connected head.
        num_layers: ``ConvStack``s per ``ResnetLayer``.
    """

    def __init__(self,image_shape,num_classes,n_conv = 4,units = 4096,num_layers = 4,**kwargs):
        super().__init__(**kwargs)
        channels,height,width = image_shape
        self.__input = nn.Sequential(
            Conv(in_channels = channels,out_channels = 8),
        )
        channels = 8
        stages = []
        # Stop as soon as EITHER dimension is 1: pooling a size-1 dimension with
        # a 2x2 window would produce an empty tensor. `//= 2` mirrors the floor
        # behaviour of AvgPool2d(kernel_size = 2,stride = 2).
        while height > 1 and width > 1:
            stages.append(ResnetLayer(channels = channels,num_layers = num_layers,n_conv = n_conv))
            stages.append(Pooling(channels = channels))
            height //= 2
            width //= 2
            channels *= 2

        # The head flattens (channels, height, width); for square inputs this is
        # just `channels`, for non-square inputs one dimension may still be > 1.
        flat_features = channels * height * width
        stages.append(FullyConnected(channels = flat_features,num_classes = num_classes,units = units))
        self.__model = nn.Sequential(*stages)

    def forward(self,inputs):
        """Return class probabilities for a batch of images."""
        output = self.__input(inputs)
        return self.__model(output)

In [24]:
# The number of outputs follows the classes found in the label file instead of
# a hard-coded 2, so the model stays in sync with the dataset folders.
model = Resnet(image_shape = image_size,num_classes = len(target_classes))

In [25]:
# Move the model to the device selected at the top of the notebook.
model = model.to(device)

In [26]:
# Print torchsummary's per-layer table (output shape and parameter count) for
# one input of size `image_size`.
summary(model,image_size)

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1            [-1, 8, 96, 96]             224
       BatchNorm2d-2            [-1, 8, 96, 96]              16
              ReLU-3            [-1, 8, 96, 96]               0
              Conv-4            [-1, 8, 96, 96]               0
            Conv2d-5            [-1, 8, 96, 96]             584
       BatchNorm2d-6            [-1, 8, 96, 96]              16
              ReLU-7            [-1, 8, 96, 96]               0
              Conv-8            [-1, 8, 96, 96]               0
            Conv2d-9            [-1, 8, 96, 96]             584
      BatchNorm2d-10            [-1, 8, 96, 96]              16
             ReLU-11            [-1, 8, 96, 96]               0
             Conv-12            [-1, 8, 96, 96]               0
           Conv2d-13            [-1, 8, 96, 96]             584
      BatchNorm2d-14            [-1, 8,

# Training the model

In [27]:
class Trainer:
    """Step-based training loop with optional validation.

    Both dataloaders are wrapped in endless generators, so an "epoch" is a
    fixed number of steps rather than one pass over the data.

    The model is treated as returning class probabilities (as the
    softmax-terminated network defined in this notebook does); the loss is
    therefore the negative log-likelihood of those probabilities.

    Args:
        model: ``nn.Module`` mapping a batch of images to class probabilities.
        device: Device the model and every batch are moved to.
        train_dataloader: Iterable of ``(images, labels)`` training batches.
        valid_dataloader: Optional iterable of validation batches.
    """

    def __init__(self,model,device,train_dataloader,valid_dataloader = None):
        self.__model = model.to(device)
        self.__device = device
        self.__train_dataloader = self.__cycle_batches(train_dataloader)
        self.__valid_dataloader = self.__cycle_batches(valid_dataloader) if valid_dataloader is not None else None

    def __cycle_batches(self,dataloader):
        """Yield device-resident ``(images, labels)`` batches forever.

        Raises:
            ValueError: on the first ``next()`` after a full pass over
                ``dataloader`` produced no batch (otherwise the loop would
                spin forever).
        """
        while True:
            produced_batch = False
            for images,labels in dataloader:
                produced_batch = True
                yield images.to(self.__device),labels.to(self.__device)
            if not produced_batch:
                raise ValueError("dataloader produced no batches")

    @torch.no_grad()
    def __compute_accuracy(self,probs,labels):
        """Return the batch accuracy in percent as a Python float."""
        preds = torch.argmax(probs,dim = -1)
        acc = torch.sum(preds == labels) / len(labels)
        return float(acc * 100)

    def __compute_loss(self,probs,labels):
        """Negative log-likelihood of the true class under ``probs``.

        F.cross_entropy applies log-softmax internally, so feeding it outputs
        that already went through softmax applies softmax twice and flattens
        the gradients. Taking the log of the probabilities and using nll_loss
        gives the intended cross-entropy. The clamp avoids log(0).
        """
        log_probs = torch.log(probs.clamp_min(1e-12))
        return F.nll_loss(log_probs,labels)

    @torch.no_grad()
    def __valid_step(self,valid_steps_per_epoch):
        """Average loss and accuracy over ``valid_steps_per_epoch`` batches.

        The model is switched to eval mode for the duration (so BatchNorm uses
        its running statistics and does not update them from validation data)
        and restored to its previous mode afterwards.
        """
        was_training = self.__model.training
        self.__model.eval()
        losses,accuracy = [],[]
        try:
            for _ in range(valid_steps_per_epoch):
                images,labels = next(self.__valid_dataloader)
                probs = self.__model(images)
                loss = self.__compute_loss(probs,labels)
                losses.append(round(loss.item(),3))
                accuracy.append(self.__compute_accuracy(probs,labels))
        finally:
            self.__model.train(was_training)
        return round(sum(losses) / len(losses),4),round(sum(accuracy) / len(accuracy),4)

    def __train_step(self,optimizer):
        """Run one optimisation step; return ``(loss, accuracy_percent)``."""
        self.__model.train()
        images,labels = next(self.__train_dataloader)
        probs = self.__model(images)
        loss = self.__compute_loss(probs,labels)
        optimizer.zero_grad(set_to_none = True)
        loss.backward()
        optimizer.step()
        return round(loss.item(),3),self.__compute_accuracy(probs,labels)

    def train(self,epochs,steps_per_epoch,valid_steps_per_epoch,optimizer = None,lr = 3e-4):
        """Train for ``epochs`` epochs of ``steps_per_epoch`` steps each.

        Args:
            epochs: Number of epochs.
            steps_per_epoch: Training batches processed per epoch.
            valid_steps_per_epoch: Validation batches evaluated after each
                epoch; ignored when no validation dataloader was given.
            optimizer: Optimizer to use; Adam with learning rate ``lr`` when
                omitted.
            lr: Learning rate for the default optimizer.

        The learning rate is reduced on plateau of the validation loss, or of
        the training loss when validation is not run.
        """
        master_progress_bar = master_bar(range(epochs))
        if optimizer is None:
            optimizer = torch.optim.Adam(self.__model.parameters(),lr = lr)
        lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer = optimizer,mode = "min",factor = 1 / 3,patience = 3)
        run_validation = self.__valid_dataloader is not None and valid_steps_per_epoch > 0

        for _ in master_progress_bar:
            train_losses,train_accu = [],[]
            for _ in progress_bar(range(steps_per_epoch),parent = master_progress_bar):
                train_loss,train_acc = self.__train_step(optimizer)
                train_losses.append(train_loss)
                train_accu.append(train_acc)
                master_progress_bar.child.comment = f"Loss: {train_loss}, Acc: {train_acc}"

            train_loss = round(sum(train_losses) / len(train_losses),4)
            train_acc = round(sum(train_accu) / len(train_accu),4)

            # Without validation there is no valid_loss; fall back to the
            # training loss so the scheduler call never hits an undefined name.
            monitored_loss = train_loss
            if run_validation:
                valid_loss,valid_acc = self.__valid_step(valid_steps_per_epoch)
                master_progress_bar.write(f"Train loss: {train_loss} - train acc: {train_acc} - valid loss: {valid_loss} - valid acc: {valid_acc}")
                monitored_loss = valid_loss

            lr_scheduler.step(monitored_loss)

In [28]:
# Bind the model, target device and both dataloaders into one training driver.
trainer = Trainer(
    model = model,
    device = device,
    train_dataloader = train_dataloader,
    valid_dataloader = valid_dataloader,
)

In [29]:
# Use the `steps_per_epoch` constant from the configuration cell rather than
# repeating its value here, so the two cannot drift apart.
trainer.train(epochs = epochs,steps_per_epoch = steps_per_epoch,valid_steps_per_epoch = 200)