In [1]:
#importing all the libraries

import os
import numpy as np
import pandas as pd
import torch
import glob
from PIL import Image
from torch.utils.data import Dataset
from torchvision import transforms
from sklearn.model_selection import train_test_split

In [2]:
class SkinCancerDataset(Dataset):
    """Map-style dataset yielding ``(image, label)`` pairs from a metadata table.

    Each row of the table names one image (``<image_id>.jpg`` under
    ``root_dir``) and carries its diagnosis in the ``dx`` column.

    Args:
        csv_file: Path to a metadata CSV, or an already-loaded
            ``pandas.DataFrame`` (e.g. one split of the metadata).
        root_dir: Directory containing the ``.jpg`` images.
        transform: Optional callable applied to the RGB PIL image.

    Raises (from ``__getitem__``):
        ValueError: if a row's ``dx`` value is neither an integer-like value
            nor one of the codes in ``CLASS_TO_IDX``.
    """

    # HAM10000 diagnosis codes mapped to class indices in alphabetical order.
    # The raw metadata stores `dx` as these string codes, so a plain int()
    # conversion cannot be used on it.
    CLASS_TO_IDX = {
        'akiec': 0,
        'bcc': 1,
        'bkl': 2,
        'df': 3,
        'mel': 4,
        'nv': 5,
        'vasc': 6,
    }

    def __init__(self, csv_file, root_dir, transform = None):
        if isinstance(csv_file, pd.DataFrame):
            # A filtered split keeps the parent frame's index; reset it so the
            # stored table is a clean 0..n-1 copy independent of the caller's.
            self.annotation = csv_file.reset_index(drop=True)
        else:
            self.annotation = pd.read_csv(csv_file)
        self.root_dir = root_dir
        self.transform = transform

    def __len__(self):
        """Return the number of rows (images) in the metadata table."""
        return len(self.annotation)

    def _encode_label(self, dx):
        """Convert a raw ``dx`` value to an integer class index."""
        # Already-encoded labels (ints, numeric strings) pass straight through.
        try:
            return int(dx)
        except (TypeError, ValueError):
            pass
        try:
            return self.CLASS_TO_IDX[dx]
        except (KeyError, TypeError):
            raise ValueError(
                f"Unknown diagnosis label {dx!r}; expected an integer or one of "
                f"{sorted(self.CLASS_TO_IDX)}"
            ) from None

    def __getitem__(self, index):
        """Load the image for row ``index`` and return ``(image, label_tensor)``."""
        row = self.annotation.iloc[index]

        # Prefer the named column; fall back to the second column for tables
        # that do not use the `image_id` header.
        img_id = row['image_id'] if 'image_id' in row.index else row.iloc[1]
        img_name = os.path.join(self.root_dir, img_id + '.jpg')
        image = Image.open(img_name).convert('RGB')

        y_label = torch.tensor(self._encode_label(row['dx']), dtype=torch.long)

        if self.transform:
            image = self.transform(image)

        return image, y_label

In [3]:
# Fixed seed so the train/test/validation split is identical on every
# Restart & Run All; without it each run produces a different partition.
SPLIT_SEED = 42

meta = pd.read_csv("/kaggle/input/skin-cancer-mnist-ham10000/HAM10000_metadata.csv")

# Split on lesion ids rather than on rows: one lesion_id can own several
# image rows, and all images of a lesion must land in the same split.
unique_lesion_ids = meta['lesion_id'].unique()

# 70% of lesions for training, the remaining 30% halved into test/validation.
train_ids, temp_ids = train_test_split(
    unique_lesion_ids, test_size=0.3, random_state=SPLIT_SEED
)
test_ids, validation_ids = train_test_split(
    temp_ids, test_size=0.5, random_state=SPLIT_SEED
)

train_meta = meta[meta['lesion_id'].isin(train_ids)]
test_meta = meta[meta['lesion_id'].isin(test_ids)]
validation_meta = meta[meta['lesion_id'].isin(validation_ids)]

# Every image row must belong to exactly one split.
total_images = len(train_meta) + len(test_meta) + len(validation_meta)
assert total_images == len(meta), "splits do not partition the metadata table"

# Percentages are by image count, so they only approximate the 70/15/15
# lesion-level proportions.
for split_name, split_meta in (
    ("train", train_meta),
    ("test", test_meta),
    ("validation", validation_meta),
):
    print(f"{split_name} ({100 * len(split_meta) / total_images}%):\n{split_meta.head()}\n")


train (70.11482775836245%):
     lesion_id      image_id   dx dx_type   age     sex localization
2  HAM_0002730  ISIC_0026769  bkl   histo  80.0    male        scalp
3  HAM_0002730  ISIC_0025661  bkl   histo  80.0    male        scalp
6  HAM_0002761  ISIC_0029176  bkl   histo  60.0    male         face
7  HAM_0002761  ISIC_0029068  bkl   histo  60.0    male         face
8  HAM_0005132  ISIC_0025837  bkl   histo  70.0  female         back

test (14.837743384922616%):
      lesion_id      image_id   dx dx_type   age   sex localization
0   HAM_0000118  ISIC_0027419  bkl   histo  80.0  male        scalp
1   HAM_0000118  ISIC_0025030  bkl   histo  80.0  male        scalp
4   HAM_0001466  ISIC_0031633  bkl   histo  75.0  male          ear
5   HAM_0001466  ISIC_0027850  bkl   histo  75.0  male          ear
13  HAM_0001949  ISIC_0025767  bkl   histo  70.0  male        trunk

validation (15.047428856714928%):
      lesion_id      image_id   dx dx_type   age     sex     localization
20  HAM_0006

In [4]:
from torchvision import transforms

# Values shared by both pipelines, kept in one place so the training and
# validation preprocessing cannot drift apart.
IMAGE_SIZE = (224, 224)
NORMALIZE_MEAN = [0.485, 0.456, 0.406]   # commonly used ImageNet channel means
NORMALIZE_STD = [0.229, 0.224, 0.225]    # commonly used ImageNet channel stds

# Training pipeline: resize, random flips/rotation for augmentation, then
# tensor conversion and per-channel normalization.
_train_steps = [
    transforms.Resize(IMAGE_SIZE),
    transforms.RandomHorizontalFlip(),
    transforms.RandomVerticalFlip(),
    transforms.RandomRotation(20),       # rotation range of +/- 20 degrees
    transforms.ToTensor(),
    transforms.Normalize(mean=NORMALIZE_MEAN, std=NORMALIZE_STD),
]
train_transforms = transforms.Compose(_train_steps)

# Validation pipeline: deterministic — the same resize, tensor conversion and
# normalization as training, without any random augmentation.
_val_steps = [
    transforms.Resize(IMAGE_SIZE),
    transforms.ToTensor(),
    transforms.Normalize(mean=NORMALIZE_MEAN, std=NORMALIZE_STD),
]
val_transforms = transforms.Compose(_val_steps)