<a href="https://colab.research.google.com/github/Euan-Kearney/cancer-cam/blob/main/model_0_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Uninstalls existing kaggle files to resolve potential API issues
!pip uninstall -y kaggle kagglesdk

Found existing installation: kaggle 1.7.4.5
Uninstalling kaggle-1.7.4.5:
  Successfully uninstalled kaggle-1.7.4.5
[0m

In [2]:
%%shell
pip install -q kagglesdk
pip install -q kaggle
pip install -q opendatasets

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m160.4/160.4 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.5/75.5 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h



In [3]:
from google.colab import userdata
import json
import os

# Retrieve kaggle key and import json
kaggle_json_str = userdata.get('KAGGLE_JSON')
kaggle_dict = json.loads(kaggle_json_str)
os.makedirs('/root/.kaggle', exist_ok=True)
with open('/root/.kaggle/kaggle.json', 'w') as f:
    json.dump(kaggle_dict, f)

!chmod 600 /root/.kaggle/kaggle.json
!kaggle datasets download -d kmader/skin-cancer-mnist-ham10000 -p /content/HAM10000 --unzip --quiet

Dataset URL: https://www.kaggle.com/datasets/kmader/skin-cancer-mnist-ham10000
License(s): CC-BY-NC-SA-4.0


In [4]:
from sklearn.model_selection import train_test_split
from pathlib import Path
import pandas as pd
import shutil

base = 'HAM10000_organised'
splits = ['train', 'validate', 'test']
os.makedirs(base, exist_ok=True)

classes = [
    'melanoma',
    'BCC',
    'SCC',
    'low_risk'
]

# Setup organised directories to allow for the creation of dataframes
for split in splits:
    for cls in classes:
        path = os.path.join(base, split, cls)
        os.makedirs(path, exist_ok=True)

os.makedirs(base, exist_ok=True)

"""
Maps HAM10000 classes to the 4 cancer-cam classes
"""
def map_classes(cls):
  if cls == 'mel':
    return 'melanoma'
  elif cls == 'bcc':
    return 'BCC'
  elif cls == 'akiec':
    return 'SCC'
  elif cls == 'bkl':
    return 'low_risk'
  elif cls == 'df':
    return 'low_risk'
  elif cls == 'nv':
    return 'low_risk'
  elif cls == 'vasc':
    return 'low_risk'
"""
Copies images from HAM10000 folder to the HAM10000_organised folder
"""
def copy_images(split, dataframe):
  for index, row in dataframe.iterrows():
    image_name = row['image_id'] + '.jpg'
    cls = map_classes(row['dx'])
    part_1_source = os.path.join('HAM10000/HAM10000_images_part_1/', image_name)
    part_2_source = os.path.join('HAM10000/HAM10000_images_part_2/', image_name)
    destination = os.path.join('HAM10000_organised/', split, cls, image_name)
    # Checks if image already exists in new dir, ceasing the for loop if so
    if os.path.exists(destination):
      break
    # Checks if the image exists in the part_1 HAM10000 dir, or the part_2 folder
    if os.path.exists(part_1_source):
      shutil.copyfile(part_1_source, destination)
    elif os.path.exists(part_2_source):
      shutil.copyfile(part_2_source, destination)

all_data = pd.read_csv('HAM10000/HAM10000_metadata.csv')
train_df, temp_df = train_test_split(all_data, test_size=0.2)
validate_df, test_df = train_test_split(temp_df, test_size=0.5)

for split in splits:
  if split == 'train':
    copy_images(split, train_df)
  elif split == 'validate':
    copy_images(split, validate_df)
  else:
    copy_images(split, test_df)




In [5]:
# Verifies number of images in each dir
def count_images(dir_path):
  for dirpath, dirnames, filenames in os.walk(dir_path):
    print(f"There are {len(dirnames)} directories and {len(filenames)} images in '{dirpath}'.")
count_images('HAM10000_organised/')

There are 3 directories and 0 images in 'HAM10000_organised/'.
There are 4 directories and 0 images in 'HAM10000_organised/test'.
There are 0 directories and 46 images in 'HAM10000_organised/test/BCC'.
There are 0 directories and 31 images in 'HAM10000_organised/test/SCC'.
There are 0 directories and 817 images in 'HAM10000_organised/test/low_risk'.
There are 0 directories and 108 images in 'HAM10000_organised/test/melanoma'.
There are 4 directories and 0 images in 'HAM10000_organised/train'.
There are 0 directories and 410 images in 'HAM10000_organised/train/BCC'.
There are 0 directories and 268 images in 'HAM10000_organised/train/SCC'.
There are 0 directories and 6437 images in 'HAM10000_organised/train/low_risk'.
There are 0 directories and 897 images in 'HAM10000_organised/train/melanoma'.
There are 4 directories and 0 images in 'HAM10000_organised/validate'.
There are 0 directories and 58 images in 'HAM10000_organised/validate/BCC'.
There are 0 directories and 28 images in 'HAM100

In [6]:
import torch
import torchvision

from torch import nn
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

# Removes non image files from image directories
!rm -R HAM10000_organised/train/.ipynb_checkpoints
!rm -R HAM10000_organised/validate/.ipynb_checkpoints
!rm -R HAM10000_organised/test/.ipynb_checkpoints

# Lighweight B0 model has its weight's and image transformations retrieved
weights = torchvision.models.EfficientNet_B0_Weights.DEFAULT
auto_transforms = weights.transforms()

train_data = datasets.ImageFolder("HAM10000_organised/train", transform=auto_transforms)
validate_data = datasets.ImageFolder("HAM10000_organised/validate", transform=auto_transforms)
test_data = datasets.ImageFolder("HAM10000_organised/test", transform=auto_transforms)

train_dataloader = DataLoader(
    train_data,
    batch_size=64,
    shuffle=True,
    num_workers=2,
    pin_memory=True
)

validate_dataloader = DataLoader(
    validate_data,
    batch_size=64,
    shuffle=False,
    num_workers=2,
    pin_memory=True
)

test_dataloader = DataLoader(
    test_data,
    batch_size=62,
    shuffle=False,
    num_workers=2,
    pin_memory=True
)





rm: cannot remove 'HAM10000_organised/train/.ipynb_checkpoints': No such file or directory
rm: cannot remove 'HAM10000_organised/validate/.ipynb_checkpoints': No such file or directory
rm: cannot remove 'HAM10000_organised/test/.ipynb_checkpoints': No such file or directory


In [7]:
# Uses GPU for training if available, otherwise uses CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
model = torchvision.models.efficientnet_b0(weights=weights).to(device)

Downloading: "https://download.pytorch.org/models/efficientnet_b0_rwightman-7f5810bc.pth" to /root/.cache/torch/hub/checkpoints/efficientnet_b0_rwightman-7f5810bc.pth


100%|██████████| 20.5M/20.5M [00:00<00:00, 128MB/s] 


In [8]:
from torchsummary import summary

# Freeze existing layers to ensure they remain unaffected by further training
for param in model.features.parameters():
    param.requires_grad = False

#
model.classifier = torch.nn.Sequential(
    torch.nn.Dropout(p=0.2, inplace=True),
    torch.nn.Linear(in_features=1280,
                    out_features=4,
                    bias=True)).to(device)

loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)