In [25]:
import os
import shutil

# Install the Kaggle API token under ~/.kaggle with owner-only permissions.
kaggle_dir = os.path.expanduser("~/.kaggle")
os.makedirs(kaggle_dir, exist_ok=True)

kaggle_token_src = "kaggle.json"
kaggle_token_dst = os.path.join(kaggle_dir, "kaggle.json")

if os.path.exists(kaggle_token_src):
    # A token in the working directory always refreshes the installed copy.
    shutil.copy(kaggle_token_src, kaggle_token_dst)
elif not os.path.exists(kaggle_token_dst):
    # Neither a local token nor a previously installed one: fail with a clear message.
    raise FileNotFoundError(
        f"No '{kaggle_token_src}' in the working directory and no token at '{kaggle_token_dst}'"
    )
# else: the token was installed by an earlier run, so re-running this cell is a no-op.

# Restrict the token to owner read/write.
os.chmod(kaggle_token_dst, 0o600)

In [26]:
import kaggle

# Authenticate the Kaggle API client before any dataset call.
kaggle.api.authenticate()

# The metadata CSV inside 'data' is used as the marker that the dataset is present.
metadata_csv = os.path.join('data', 'HAM10000_metadata.csv')
if os.path.exists('data') and os.path.exists(metadata_csv):
    print("Dataset already exists, skipping download")
else:
    kaggle.api.dataset_download_files(
        'kmader/skin-cancer-mnist-ham10000',
        path='data',
        unzip=True,
    )
    print("Dataset downloaded")

Dataset already exists, skipping download


In [27]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, random_split
from torchvision import transforms, models
import pandas as pd
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt

In [28]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cuda')

In [29]:
# --- Data locations (relative to the notebook's working directory) ---
_data_root = "data"
CSV_PATH = os.path.join(_data_root, "HAM10000_metadata.csv")
# NOTE(review): only the "part_1" image directory is referenced here. If the
# metadata CSV also lists images stored in another directory, loading those
# rows will fail - confirm against the downloaded dataset layout.
IMAGE_DIR = os.path.join(_data_root, "HAM10000_images_part_1")

# --- Hyperparameters ---
BATCH_SIZE, IMAGE_SIZE = 32, 224  # samples per batch / square input side in pixels
LR = 1e-2                         # learning rate
EPOCHS = 10                       # number of training epochs

In [30]:
# The label column contains multiple classes. Collapse them into a binary
# classification problem by mapping every diagnosis code to one of two targets:
#   0 -> Benign
#   1 -> Malignant
CLASS_MAPPING = {
    **dict.fromkeys(('nv', 'bkl', 'df', 'vasc'), 0),
    **dict.fromkeys(('mel', 'bcc', 'akiec'), 1),
}

In [31]:
class SkinCancerDataset(Dataset):
  """Binary-label image dataset driven by a metadata CSV.

  The CSV must provide an ``image_id`` column (file stem of a ``.jpg`` inside
  ``image_dir``) and a ``dx`` column whose values are keys of ``CLASS_MAPPING``.
  A derived integer ``target`` column is added to ``self.data``.

  Args:
    csv_path: Path to the metadata CSV.
    image_dir: Directory containing ``<image_id>.jpg`` files.
    transform: Optional callable applied to the loaded RGB PIL image; its
      return value is what ``__getitem__`` yields as the sample.

  Raises:
    ValueError: If the CSV contains ``dx`` values absent from ``CLASS_MAPPING``.
  """

  def __init__(self, csv_path, image_dir, transform = None):
    self.data = pd.read_csv(csv_path)
    self.image_dir = image_dir
    self.transform = transform
    self.data['target'] = self.data['dx'].map(CLASS_MAPPING)

    # Unmapped diagnoses become NaN after .map(); fail early instead of
    # letting NaN labels reach torch.tensor(..., dtype=torch.long).
    unmapped = self.data.loc[self.data['target'].isna(), 'dx'].unique()
    if len(unmapped) > 0:
      raise ValueError(
        f"Unmapped 'dx' labels in {csv_path}: {sorted(map(str, unmapped))}"
      )
    self.data['target'] = self.data['target'].astype('int64')

  def __len__(self):
    """Return the number of rows in the metadata table."""
    return len(self.data)

  def __getitem__(self, idx):
    """Return ``(image, label)`` for the row at integer position ``idx``.

    ``image`` is the transformed sample when a transform was supplied,
    otherwise the RGB PIL image; ``label`` is a 0-dim ``torch.long`` tensor.
    """
    # .iloc is position-only, so select the column by label first and then
    # the row by position.
    image_id = self.data['image_id'].iloc[idx]
    label = int(self.data['target'].iloc[idx])

    img_path = os.path.join(self.image_dir, image_id + '.jpg')
    # convert() loads the pixel data, so the file can be closed right away;
    # forcing RGB guarantees three channels for the downstream transform.
    with Image.open(img_path) as img:
      image = img.convert('RGB')

    if self.transform is not None:
      # Keep the transform's return value: it is the sample handed to the model.
      image = self.transform(image)

    return image, torch.tensor(label, dtype = torch.long)

In [32]:
# Preprocessing pipeline applied to every PIL image before it reaches the model.
_pipeline_steps = [
    # Resize to a fixed square input of IMAGE_SIZE x IMAGE_SIZE pixels.
    transforms.Resize(size = (IMAGE_SIZE, IMAGE_SIZE)),
    # Random augmentation: horizontal flip and a rotation of up to 10 degrees.
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(degrees = 10),
    # Convert to a tensor, then normalise each of the three channels.
    transforms.ToTensor(),
    transforms.Normalize(
        mean = [0.485, 0.456, 0.406],
        std = [0.229, 0.224, 0.225],
    ),
]
transform = transforms.Compose(_pipeline_steps)

In [33]:
# Build the full dataset and carve out an 80/20 train/test split.
dataset_full = SkinCancerDataset(CSV_PATH, IMAGE_DIR, transform = transform)
train_size = int(0.8 * len(dataset_full))
test_size = len(dataset_full) - train_size  # remainder, so the two sizes always sum to len(dataset_full)

# Seed the split so that Restart & Run All yields the same train/test
# membership every time (an unseeded random_split changes on each run).
SPLIT_SEED = 42
split_generator = torch.Generator().manual_seed(SPLIT_SEED)
# NOTE(review): both splits wrap dataset_full and therefore share its
# transform, so any random augmentation in it is also applied to test samples.
train_dataset, test_dataset = random_split(
    dataset_full, [train_size, test_size], generator = split_generator
)

In [34]:
# Batched iterators over the two splits: training batches are reshuffled each
# epoch, test batches keep a fixed order.
train_loader = DataLoader(
    dataset = train_dataset,
    batch_size = BATCH_SIZE,
    shuffle = True,
)
test_loader = DataLoader(
    dataset = test_dataset,
    batch_size = BATCH_SIZE,
    shuffle = False,
)