In [1]:
import torch.nn as nn
import torchvision
import torch
import pytorch_lightning as pl
import torch.optim as optim
from torch.utils.data import DataLoader, random_split
from torchvision.models import resnet34
import pandas as pd
import os
from torch.utils.data import Dataset
from PIL import Image  
from torchvision import transforms


train_data = pd.read_csv("/home/woody/iwso/iwso092h/ucb_kaggle/data/train.csv")
image_folder = r'/home/woody/iwso/iwso092h/ucb_kaggle/train_thumbnails'

2023-10-29 15:34:34.232440: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-10-29 15:34:34.272260: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
def preprocess_dataframe(dataframe, image_folder):
    # Get a list of image files in the folder
    image_files = os.listdir(image_folder)

    # Create a set of image IDs from the image files
    image_ids_in_folder = {int(filename.split('_')[0]) for filename in image_files}

    # Filter the dataframe to keep only rows with image IDs present in the folder
    dataframe_filtered = dataframe[dataframe['image_id'].isin(image_ids_in_folder)]
    
    # train_df, val_df = train_test_split(dataframe_filtered, test_size=0.2, stratify=dataframe_filtered.label)
    
    dataframe = pd.get_dummies(dataframe_filtered, columns=['label'])
    # val_op = pd.get_dummies(val_df, columns=['label'])
    
    return dataframe.reset_index(drop=True)

preprocess_data = preprocess_dataframe(train_data, image_folder)

In [28]:
class ImageClassificationDataModule(pl.LightningDataModule):
    def __init__(self, custom_dataset, batch_size=32):
        super().__init__()
        self.custom_dataset = custom_dataset
        self.batch_size = batch_size

    def setup(self, stage=None):
        num_data = len(self.custom_dataset)
        train_size = int(0.8 * num_data)
        val_size = num_data - train_size
        self.train_data, self.val_data = random_split(self.custom_dataset, [train_size, val_size])

    def train_dataloader(self):
        return DataLoader(self.train_data, batch_size=self.batch_size, shuffle=True, num_workers=47)

    def val_dataloader(self):
        return DataLoader(self.val_data, batch_size=self.batch_size, num_workers=47)
    
class ImageClassificationModel(pl.LightningModule):
    def __init__(self, num_classes=5):
        super(ImageClassificationModel, self).__init__()
        self.resnet34 = resnet34(pretrained=True)
        self.resnet34.fc = nn.Linear(self.resnet34.fc.in_features, num_classes)

    def forward(self, x):
        return self.resnet34(x)

    def training_step(self, batch, batch_idx):
        inputs, labels = batch
        outputs = self(inputs)
        loss = nn.CrossEntropyLoss()(outputs, labels)
        return loss

    def training_epoch_end(self, outputs):
        # Log training loss at the end of each epoch
        avg_loss = torch.stack(outputs).mean()
        self.log('train_loss', avg_loss, on_step=False, on_epoch=True)

    def validation_step(self, batch, batch_idx):
        inputs, labels = batch
        outputs = self(inputs)
        loss = nn.CrossEntropyLoss()(outputs, labels)
        return loss

    def validation_epoch_end(self, outputs):
        # Log validation loss at the end of each epoch
        avg_loss = torch.stack(outputs).mean()
        self.log('val_loss', avg_loss, on_step=False, on_epoch=True)

    def configure_optimizers(self):
        optimizer = optim.Adam(self.parameters(), lr=0.001)
        return optimizer


In [20]:
from pytorch_lightning.callbacks import Callback
from sklearn.metrics import balanced_accuracy_score

class BalancedAccuracyCallback(Callback):
    def on_validation_end(self, trainer, pl_module):
        val_preds = []
        val_labels = []
        for batch in trainer.datamodule.val_dataloader():
            inputs, labels = batch
            outputs = pl_module(inputs)
            _, predicted = torch.max(outputs, 1)  # Assuming outputs are logits, get the predicted class index
            val_preds.extend(predicted.cpu().tolist())
            val_labels.extend(labels.cpu().argmax(dim=1).tolist())  # Convert one-hot encoded labels to class indices

        balanced_acc = balanced_accuracy_score(val_labels, val_preds)
        trainer.logger.experiment.add_scalar("val_balanced_acc", balanced_acc, global_step=trainer.global_step)


In [21]:
class CustomCancerDataset(Dataset):
    def __init__(self, metadata_df, image_folder, transform=None):
        self.metadata_df = metadata_df
        self.image_folder = image_folder
        self.transform = transforms.Compose(
                [transforms.Resize((224, 224)),
                 transforms.ToTensor(), 
                 transforms.Normalize(mean=[0.48828688, 0.42932517, 0.49162089], std=[0.41380908, 0.37492874, 0.41795654])]
            )

    def __len__(self):
        return len(self.metadata_df)

    def __getitem__(self, idx):
        image_ids = self.metadata_df.image_id[idx]  
        image_name = os.path.join(self.image_folder, "{}_thumbnail.png".format(image_ids))
        # print(image_name)
        image = Image.open(image_name)
        label_CC  = self.metadata_df.label_CC[idx].astype(int)  
        label_EC  = self.metadata_df.label_EC[idx].astype(int)    
        label_HGSC  = self.metadata_df.label_HGSC[idx].astype(int)    
        label_LGSC  = self.metadata_df.label_LGSC[idx].astype(int)    
        label_MC  = self.metadata_df.label_MC[idx].astype(int)    
        
        if self.transform:
            image = self.transform(image)

        return image, torch.tensor([label_CC, label_EC, label_HGSC, label_LGSC, label_MC], dtype=torch.float)

In [29]:
model = ImageClassificationModel(num_classes=5)

# from datasets import CustomCancerDataset
image_folder = r'/home/woody/iwso/iwso092h/ucb_kaggle/train_thumbnails'
custom_dataset = CustomCancerDataset(metadata_df=preprocess_data, image_folder=image_folder)

In [30]:
data_module = ImageClassificationDataModule(custom_dataset, batch_size=32)

In [31]:
from pytorch_lightning.loggers import TensorBoardLogger  # Import the TensorBoardLogger
logger = TensorBoardLogger("logs", name="image_classification_logs")  # Specify the log directory

In [32]:
trainer = pl.Trainer(
    max_epochs=10, 
    accelerator = 'cpu',
    logger=logger, 
    log_every_n_steps=1, 
    callbacks=[BalancedAccuracyCallback()]
)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [33]:
trainer.fit(model, data_module)

NotImplementedError: Support for `training_epoch_end` has been removed in v2.0.0. `ImageClassificationModel` implements this method. You can use the `on_train_epoch_end` hook instead. To access outputs, save them in-memory as instance attributes. You can find migration examples in https://github.com/Lightning-AI/lightning/pull/16520.