In [72]:

import torch.nn as nn
import torchvision
import torch
import pytorch_lightning as pl
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision.models import resnet34

In [50]:
import pandas as pd
import os

# Directory that is later listed for image files and handed to the dataset.
# NOTE(review): machine-specific absolute Windows path; anyone else running
# this notebook has to edit it by hand.
image_folder = r'C:\Users\ADE17\Desktop\Masters\Projects\Ovarian_Cancer_Kaggle\train_thumbnails'

# Training metadata table, resolved relative to the current working directory.
train_data = pd.read_csv(filepath_or_buffer="data/train.csv")

In [75]:
def preprocess_dataframe(dataframe, image_folder):
    """Keep rows whose image exists in ``image_folder`` and one-hot encode ``label``.

    An image ID is read from each directory entry as the integer that precedes
    the first underscore of the entry's name, with any file extension ignored
    (so both ``123_thumbnail.png`` and ``123.png`` yield ``123``). Entries that
    do not start with an integer (for example ``Thumbs.db`` or ``.DS_Store``)
    are skipped instead of aborting the whole run with ``ValueError``.

    Args:
        dataframe: pandas DataFrame with an ``image_id`` column and a ``label``
            column. It is not modified.
        image_folder: Path of the directory whose entries are matched against
            ``image_id``.

    Returns:
        A new DataFrame containing only the rows whose ``image_id`` was found
        in the folder, with ``label`` replaced by ``label_<value>`` indicator
        columns (via ``pd.get_dummies``). The original index is preserved.

    Raises:
        OSError: If ``image_folder`` cannot be listed.
        KeyError: If ``image_id`` or ``label`` is missing from ``dataframe``.
    """
    image_ids_in_folder = set()
    for filename in os.listdir(image_folder):
        stem = os.path.splitext(filename)[0]
        try:
            image_ids_in_folder.add(int(stem.split('_')[0]))
        except ValueError:
            # Not named by a numeric image ID (OS metadata file, sub-folder,
            # stray download, ...): ignore it rather than crash.
            continue

    # Boolean-mask filtering returns a new frame, so the caller's data is untouched.
    dataframe_filtered = dataframe[dataframe['image_id'].isin(image_ids_in_folder)]

    return pd.get_dummies(dataframe_filtered, columns=['label'])

In [76]:
# Restrict the metadata to images present on disk and one-hot encode the labels.
preprocess_data = preprocess_dataframe(
    dataframe=train_data,
    image_folder=image_folder,
)

In [73]:
class ImageClassificationDataModule(pl.LightningDataModule):
    """Lightning data module that splits one dataset into train and validation parts.

    Args:
        custom_dataset: Any sized, indexable dataset accepted by
            ``torch.utils.data.random_split`` and ``DataLoader``.
        batch_size: Batch size used by both data loaders.
        train_fraction: Share of the samples assigned to the training split;
            the remainder becomes the validation split.
        seed: Optional integer seed for the split. ``None`` (the default) draws
            from PyTorch's global RNG, so the split differs between runs; pass
            an integer to get the same split every time.
    """

    def __init__(self, custom_dataset, batch_size=32, train_fraction=0.8, seed=None):
        super().__init__()
        self.custom_dataset = custom_dataset
        self.batch_size = batch_size
        self.train_fraction = train_fraction
        self.seed = seed
        # Filled in by setup(); None means "not split yet".
        self.train_data = None
        self.val_data = None

    def setup(self, stage=None):
        """Create the train/validation subsets once.

        Lightning may invoke ``setup`` several times (once per stage). Splitting
        again on a later call would reshuffle samples between the two subsets,
        so an existing split is kept as is.
        """
        if self.train_data is not None and self.val_data is not None:
            return

        num_data = len(self.custom_dataset)
        train_size = int(self.train_fraction * num_data)
        val_size = num_data - train_size
        lengths = [train_size, val_size]

        # Referenced through the already-imported `torch` package: the bare
        # name `random_split` is not imported anywhere in this notebook.
        if self.seed is None:
            subsets = torch.utils.data.random_split(self.custom_dataset, lengths)
        else:
            generator = torch.Generator().manual_seed(self.seed)
            subsets = torch.utils.data.random_split(
                self.custom_dataset, lengths, generator=generator
            )
        self.train_data, self.val_data = subsets

    def train_dataloader(self):
        """Return a shuffling loader over the training subset."""
        return DataLoader(self.train_data, batch_size=self.batch_size, shuffle=True)

    def val_dataloader(self):
        """Return a non-shuffling loader over the validation subset."""
        return DataLoader(self.val_data, batch_size=self.batch_size)

In [74]:
class ImageClassificationModel(pl.LightningModule):
    """ResNet-34 image classifier trained with cross-entropy loss and Adam.

    The torchvision ResNet-34 is created with pretrained weights and its final
    fully connected layer is replaced by a fresh ``nn.Linear`` with
    ``num_classes`` outputs.

    Args:
        num_classes: Number of output logits of the replaced ``fc`` layer.
        lr: Learning rate passed to ``optim.Adam``.
    """

    def __init__(self, num_classes=5, lr=0.001):
        super().__init__()
        self.lr = lr
        # NOTE(review): torchvision >= 0.13 deprecates `pretrained=` in favour
        # of `weights=`. The keyword is kept because the installed torchvision
        # version is not visible from this notebook; migrate once confirmed.
        self.resnet34 = resnet34(pretrained=True)
        self.resnet34.fc = nn.Linear(self.resnet34.fc.in_features, num_classes)
        # Built once instead of on every step; it holds no parameters or buffers.
        self.criterion = nn.CrossEntropyLoss()

    def forward(self, x):
        """Return the backbone's logits for the input batch ``x``."""
        return self.resnet34(x)

    def _compute_loss(self, batch):
        """Unpack an ``(inputs, labels)`` batch and return its cross-entropy loss."""
        inputs, labels = batch
        outputs = self(inputs)
        return self.criterion(outputs, labels)

    def training_step(self, batch, batch_idx):
        """Log the batch loss as ``train_loss`` and return it for backpropagation."""
        loss = self._compute_loss(batch)
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        """Log the batch loss as ``val_loss``; nothing is returned."""
        loss = self._compute_loss(batch)
        self.log('val_loss', loss)

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters, using ``self.lr``."""
        return optim.Adam(self.parameters(), lr=self.lr)

In [78]:
# Build the classifier with five output classes. The constructor requests
# pretrained ResNet-34 weights, which triggers the download shown below when
# they are not already cached.
model = ImageClassificationModel(num_classes=5)

Downloading: "https://download.pytorch.org/models/resnet34-b627a593.pth" to C:\Users\ADE17/.cache\torch\hub\checkpoints\resnet34-b627a593.pth
100%|██████████| 83.3M/83.3M [00:32<00:00, 2.67MB/s]


In [80]:
from datasets import CustomCancerDataset

# Reuse the `image_folder` defined in the data-loading cell instead of
# re-assigning a second hard-coded copy of the same path here; one definition
# means the folder that was filtered on and the folder that is read from
# cannot drift apart.
custom_dataset = CustomCancerDataset(metadata_df=preprocess_data, image_folder=image_folder)

In [81]:
# Wrap the dataset so Lightning can split it and build the data loaders.
data_module = ImageClassificationDataModule(
    custom_dataset=custom_dataset,
    batch_size=32,
)