In [1]:
import torch.nn as nn
import torchvision
import torch
import pytorch_lightning as pl
import torch.optim as optim
from torch.utils.data import Dataset
from PIL import Image  
from torchvision import transforms
from torch.utils.data import DataLoader, random_split
from torchvision.models import resnet34
import pandas as pd
import os



In [2]:
# Location of the training thumbnails and the competition metadata table.
image_folder = r'/kaggle/input/UBC-OCEAN/train_thumbnails'
train_data = pd.read_csv("/kaggle/input/UBC-OCEAN/train.csv")

In [3]:
def preprocess_dataframe(dataframe, image_folder):
    """Keep only rows whose image exists on disk and one-hot encode the label.

    Args:
        dataframe: Metadata frame with an ``image_id`` column and, optionally,
            a ``label`` column.
        image_folder: Directory whose file names start with ``<image_id>_``.

    Returns:
        A new frame restricted to the image ids found in ``image_folder`` and
        re-indexed 0..n-1. If a ``label`` column is present it is replaced by
        ``label_<class>`` indicator columns; otherwise the columns are
        returned unchanged.
    """
    image_ids_in_folder = set()
    for filename in os.listdir(image_folder):
        try:
            image_ids_in_folder.add(int(filename.split('_')[0]))
        except ValueError:
            # Deliberately skip entries that are not named "<numeric id>_..."
            # (hidden files, checkpoints) instead of aborting the whole run.
            continue

    # Filtering leaves gaps in the index; reset it so positional lookups
    # (0..len-1, as issued by a DataLoader) still address valid rows.
    dataframe_filtered = dataframe[dataframe['image_id'].isin(image_ids_in_folder)].reset_index(drop=True)

    # Frames without a label column (e.g. test metadata) have nothing to
    # encode; get_dummies would raise KeyError on the missing column.
    if 'label' in dataframe_filtered.columns:
        dataframe_filtered = pd.get_dummies(dataframe_filtered, columns=['label'])
    return dataframe_filtered

In [4]:
# Training metadata restricted to available thumbnails, labels one-hot encoded.
preprocess_data = preprocess_dataframe(dataframe=train_data, image_folder=image_folder)

In [5]:
class CustomCancerDataset(Dataset):
    """Training dataset yielding ``(image, one_hot_label)`` pairs.

    Args:
        metadata_df: Frame with an ``image_id`` column and one indicator
            column per entry of ``LABEL_COLUMNS``.
        image_folder: Directory containing ``<image_id>_thumbnail.png`` files.
        transform: Optional callable applied to the loaded PIL image. When
            omitted, a resize -> tensor -> normalise pipeline is used.
    """

    # Order of the indicator columns; position i is class index i.
    LABEL_COLUMNS = ('label_CC', 'label_EC', 'label_HGSC', 'label_LGSC', 'label_MC')

    def __init__(self, metadata_df, image_folder, transform=None):
        self.metadata_df = metadata_df
        self.image_folder = image_folder
        if transform is None:
            # NOTE(review): mean/std are presumably per-channel statistics of
            # the thumbnails -- confirm where these constants come from.
            transform = transforms.Compose(
                [transforms.Resize((224, 224)),
                 transforms.ToTensor(),
                 transforms.Normalize(mean=[0.48828688, 0.42932517, 0.49162089], std=[0.41380908, 0.37492874, 0.41795654])]
            )
        # Honour a caller-supplied transform instead of always overwriting it.
        self.transform = transform

    def __len__(self):
        """Return the number of metadata rows."""
        return len(self.metadata_df)

    def _image_path(self, idx):
        """Return the thumbnail path for the row at *position* ``idx``."""
        # .iloc is positional, so a frame with a non-contiguous index still works.
        image_id = self.metadata_df['image_id'].iloc[idx]
        return os.path.join(self.image_folder, "{}_thumbnail.png".format(image_id))

    def _label_tensor(self, idx):
        """Return the indicator columns of row ``idx`` as a float32 tensor."""
        values = [float(self.metadata_df[column].iloc[idx]) for column in self.LABEL_COLUMNS]
        return torch.tensor(values, dtype=torch.float32)

    def __getitem__(self, idx):
        """Load one sample.

        Returns:
            ``(image, label)`` where ``label`` is a float32 tensor with one
            entry per ``LABEL_COLUMNS`` item. Returning a tensor (rather than
            a Python list) lets the default collate function build a single
            ``(batch, num_classes)`` target tensor.
        """
        # Force three channels so the 3-channel Normalize also works for
        # RGBA / palette PNGs, and close the file handle once decoded.
        with Image.open(self._image_path(idx)) as raw_image:
            image = raw_image.convert('RGB')

        if self.transform is not None:
            image = self.transform(image)

        return image, self._label_tensor(idx)

In [6]:
# Pair the filtered training metadata with its thumbnail directory.
image_folder = r'/kaggle/input/UBC-OCEAN/train_thumbnails'
custom_dataset = CustomCancerDataset(preprocess_data, image_folder)

In [7]:
class ImageClassificationDataModule(pl.LightningDataModule):
    """Splits one dataset 80/20 into training and validation loaders.

    Args:
        custom_dataset: Map-style dataset to split.
        batch_size: Batch size used by both loaders.
        seed: Seed for the random split, so the same samples land in the
            validation set on every run.
    """

    def __init__(self, custom_dataset, batch_size=32, seed=42):
        super().__init__()
        self.custom_dataset = custom_dataset
        self.batch_size = batch_size
        self.seed = seed
        # Populated by setup(); None means "not split yet".
        self.train_data = None
        self.val_data = None

    def setup(self, stage=None):
        """Create the train/validation subsets once.

        setup() can be invoked more than once on the same instance (e.g. for
        a later validate run). Re-drawing the split each time would let
        training samples leak into validation, so an existing split is kept.
        """
        if self.train_data is not None and self.val_data is not None:
            return

        num_data = len(self.custom_dataset)
        train_size = int(0.8 * num_data)
        val_size = num_data - train_size
        # A dedicated generator keeps the split reproducible without touching
        # the global RNG state.
        split_generator = torch.Generator().manual_seed(self.seed)
        self.train_data, self.val_data = random_split(
            self.custom_dataset, [train_size, val_size], generator=split_generator
        )

    def train_dataloader(self):
        """Return the shuffled training loader."""
        return DataLoader(self.train_data, batch_size=self.batch_size, shuffle=True)

    def val_dataloader(self):
        """Return the validation loader (original order)."""
        return DataLoader(self.val_data, batch_size=self.batch_size)

In [8]:
# Data module that serves the train/validation loaders to the trainer.
data_module = ImageClassificationDataModule(custom_dataset=custom_dataset, batch_size=32)

In [9]:
class ImageClassificationModel(pl.LightningModule):
    """ResNet-34 classifier whose final layer is replaced for ``num_classes`` outputs.

    Args:
        num_classes: Number of output logits.
    """

    def __init__(self, num_classes=5):
        super().__init__()
        # NOTE(review): `pretrained=` is deprecated in newer torchvision in
        # favour of `weights=`; kept because the installed version is unknown.
        self.resnet34 = resnet34(pretrained=True)
        self.resnet34.fc = nn.Linear(self.resnet34.fc.in_features, num_classes)
        # Built once instead of on every step.
        self.criterion = nn.CrossEntropyLoss()

    def forward(self, x):
        """Return the class logits for an image batch."""
        return self.resnet34(x)

    @staticmethod
    def _class_indices(labels):
        """Convert a batch of labels to a 1-D long tensor of class indices.

        Accepts any of the following:
          * a list/tuple of per-class tensors, which is what the default
            collate function produces when a dataset returns its label as a
            Python list of indicator values;
          * a ``(batch, num_classes)`` one-hot tensor;
          * a ``(batch,)`` tensor that already holds class indices.
        """
        if isinstance(labels, (list, tuple)):
            labels = torch.stack([torch.as_tensor(column) for column in labels], dim=1)
        if labels.dim() > 1:
            # Cast first: indicator columns may arrive as bool tensors.
            labels = labels.float().argmax(dim=1)
        return labels.long()

    def _shared_step(self, batch, log_name):
        """Compute and log the cross-entropy loss for one batch."""
        inputs, labels = batch
        outputs = self(inputs)
        targets = self._class_indices(labels).to(outputs.device)
        loss = self.criterion(outputs, targets)
        self.log(log_name, loss)
        return loss

    def training_step(self, batch, batch_idx):
        """Return the training loss for ``batch`` (logged as ``train_loss``)."""
        return self._shared_step(batch, 'train_loss')

    def validation_step(self, batch, batch_idx):
        """Log the validation loss for ``batch`` as ``val_loss``."""
        self._shared_step(batch, 'val_loss')

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters (lr=0.001)."""
        optimizer = optim.Adam(self.parameters(), lr=0.001)
        return optimizer

In [10]:
# Five outputs: one logit per label_* indicator column.
model = ImageClassificationModel(5)



In [11]:
# Use the GPU when one is visible; otherwise fall back to the CPU so the
# notebook still runs end-to-end on a machine without CUDA instead of failing
# when the trainer is constructed.
trainer = pl.Trainer(
    max_epochs=10,
    accelerator='cuda' if torch.cuda.is_available() else 'cpu',
)

In [None]:
# Run the training / validation loop using the loaders from the data module.
trainer.fit(model, datamodule=data_module)

  rank_zero_warn(


In [None]:
class ImageClassificationTestDataset(Dataset):
    """Unlabelled dataset yielding transformed thumbnails for inference.

    Args:
        metadata_df: Frame with an ``image_id`` column.
        image_folder: Directory containing ``<image_id>_thumbnail.png`` files.
        transform: Optional callable applied to the loaded PIL image. When
            omitted, a resize -> tensor -> normalise pipeline is used.
    """

    def __init__(self, metadata_df, image_folder, transform=None):
        self.metadata_df = metadata_df
        self.image_folder = image_folder
        if transform is None:
            # NOTE(review): mean/std are presumably per-channel statistics of
            # the thumbnails -- confirm where these constants come from.
            transform = transforms.Compose(
                [transforms.Resize((224, 224)),
                 transforms.ToTensor(),
                 transforms.Normalize(mean=[0.48828688, 0.42932517, 0.49162089], std=[0.41380908, 0.37492874, 0.41795654])]
            )
        # Honour a caller-supplied transform instead of always overwriting it.
        self.transform = transform

    def __len__(self):
        """Return the number of metadata rows."""
        return len(self.metadata_df)

    def _image_path(self, idx):
        """Return the thumbnail path for the row at *position* ``idx``."""
        # .iloc is positional, so a frame with a non-contiguous index still works.
        image_id = self.metadata_df['image_id'].iloc[idx]
        return os.path.join(self.image_folder, "{}_thumbnail.png".format(image_id))

    def __getitem__(self, idx):
        """Load and transform the image at position ``idx``."""
        # Force three channels so the 3-channel Normalize also works for
        # RGBA / palette PNGs, and close the file handle once decoded.
        with Image.open(self._image_path(idx)) as raw_image:
            image = raw_image.convert('RGB')

        if self.transform is not None:
            image = self.transform(image)

        return image

In [None]:
# Test-set metadata, restricted to the images present in the thumbnail folder.
test_image_folder = '/kaggle/input/UBC-OCEAN/test_thumbnails'
test_df = pd.read_csv('/kaggle/input/UBC-OCEAN/test.csv')
test_data = preprocess_dataframe(dataframe=test_df, image_folder=test_image_folder)

In [None]:
# Unshuffled loader so predictions come back in metadata row order.
test_dataset = ImageClassificationTestDataset(test_data, test_image_folder)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=32)

In [None]:
# Put the network into evaluation mode before predicting.
model.eval()

# Predicted class index for every test image, in loader order.
predictions = []

# Gradients are not needed for inference.
with torch.no_grad():
    for images in test_dataloader:
        # Run each batch on whichever device currently holds the model.
        logits = model(images.to(model.device))
        # Index of the largest logit in each row.
        predicted = torch.max(logits, dim=1).indices
        predictions.extend(predicted.tolist())

# Alias kept because the next cell reads `predictions_list`.
predictions_list = predictions

In [None]:
# Show the predicted class indices (one integer per test image).
print(predictions_list)