In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!unzip -d ./dataverse_files /content/drive/MyDrive/dataverse_files.zip

!unzip -d ./dataverse_files/HAM10000_images_part_1 /content/dataverse_files/HAM10000_images_part_1.zip
!unzip -d ./dataverse_files/HAM10000_images_part_2 /content/dataverse_files/HAM10000_images_part_2.zip
!unzip -d ./dataverse_files /content/dataverse_files/ISIC2018_Task3_Test_Images.zip

# 1. Preprocessing

In [3]:
import pandas as pd
import os
import shutil

In [4]:
# move training and test images into folders

# create train and test folder
if not os.path.exists('dataverse_files/train'):
    os.mkdir('dataverse_files/train')
    
if not os.path.exists('dataverse_files/test'):
    os.mkdir('dataverse_files/test')
    
for img_name in os.listdir('dataverse_files/HAM10000_images_part_1/'):
    src_path = os.path.join('dataverse_files/HAM10000_images_part_1', img_name)
    tgt_path = 'dataverse_files/train'
    shutil.move(src_path, tgt_path)
    
for img_name in os.listdir('dataverse_files/HAM10000_images_part_2/'):
    src_path = os.path.join('dataverse_files/HAM10000_images_part_2', img_name)
    tgt_path = 'dataverse_files/train'
    shutil.move(src_path, tgt_path)

for img_name in os.listdir('dataverse_files/ISIC2018_Task3_Test_Images/'):
    src_path = os.path.join('dataverse_files/ISIC2018_Task3_Test_Images', img_name)
    tgt_path = 'dataverse_files/test'
    shutil.move(src_path, tgt_path)

In [5]:
train_df = pd.read_csv('dataverse_files/HAM10000_metadata')
test_df = pd.read_csv('dataverse_files/ISIC2018_Task3_Test_NatureMedicine_AI_Interaction_Benefit.csv')

In [6]:
# convert to numerical representation
from sklearn.preprocessing import LabelEncoder

lb = LabelEncoder()
lb.fit_transform(train_df['dx'])
cls2lbl = {cls: idx for idx, cls in enumerate(lb.classes_)}
lbl2cls = {idx: cls for idx, cls in enumerate(lb.classes_)}
train_df['label'] = [cls2lbl[cls] for cls in train_df['dx']]

In [7]:
# extract label in test set
test_df['label'] = test_df[['prob_h_dx_akiec', 
                            'prob_h_dx_bcc',
                            'prob_h_dx_bkl', 
                            'prob_h_dx_df', 
                            'prob_h_dx_mel', 
                            'prob_h_dx_nv',
                            'prob_h_dx_vasc']].values.argmax(axis=1)

# 2. Data loader

In [13]:
from PIL import Image
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision.transforms import ToTensor, RandomAffine, Resize, CenterCrop, Normalize

class MyDataset(Dataset):
    def __init__(self, train=True):
        self.meta = train_df if train else test_df
        self.root_dir = 'dataverse_files/train' if train else 'dataverse_files/test'
    def __len__(self):
        return len(self.meta)
    def __getitem__(self, idx):
        img_name = self.meta['image_id'].iloc[idx] + '.jpg'
        img_path = os.path.join(self.root_dir, img_name)

        image = Image.open(img_path)
        image = Resize((224, 224))(image)
        image = CenterCrop(224)(image)
        image = RandomAffine(degrees=(-10, 10), translate=(0.05, 0.1), scale=(0.9, 1.1))(image)
        image = ToTensor()(image)
        image = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])(image)
        
        label = self.meta['label'].iloc[idx]
        return image, label

train_dataset = MyDataset(train=True)
test_dataset = MyDataset(train=False)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=100, shuffle=False)

# 3. Models

In [14]:
import torch.nn as nn
from torchvision.models import resnet18

model = resnet18()
model.fc = nn.Linear(in_features=512, out_features=len(lbl2cls))
model

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

# 4. Train

In [16]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

n_epochs = 10
lr = 0.001
optimizer = torch.optim.Adam(params=model.parameters(), lr=lr)
loss_func = torch.nn.CrossEntropyLoss()
best_test_acc = 0

for i in range(n_epochs):
    for idx, (image, label) in enumerate(train_loader):
        image = image.to(device)
        label = label.to(device)
        
        y_pred = model(image)
        loss = loss_func(y_pred, label)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if (idx + 1) % 10 == 0:
            print(f'Iteration: {i+1}/{n_epochs}, Step: {idx+1}/{len(train_loader)}, Loss: {loss.item():.4f}')
    
    model.eval()
    with torch.no_grad():
        n_instances = len(test_loader)
        n_correct = 0
        for image, label in test_loader:
            image = image.to(device)
            label = label.to(device)
            
            y_pred = model(image)
            pred = torch.argmax(y_pred, dim=1)
            n_correct += (pred == label).sum().item()
        test_acc = n_correct / n_instances
        print(f'Iteration: {i+1}/{n_epochs}, Test accuracy: {test_acc:.4f}')
        
        if test_acc > best_test_acc:
            best_test_acc = test_acc
            torch.save(model, 'best_model.pth')
        
    model.train()
    
print(f'Best test accuracy: {best_test_acc:.4f}')

Iteration: 1/10, Step: 10/313, Loss: 1.4733
Iteration: 1/10, Step: 20/313, Loss: 1.1430
Iteration: 1/10, Step: 30/313, Loss: 0.9174
Iteration: 1/10, Step: 40/313, Loss: 0.9154
Iteration: 1/10, Step: 50/313, Loss: 0.7123
Iteration: 1/10, Step: 60/313, Loss: 1.1752
Iteration: 1/10, Step: 70/313, Loss: 1.0660
Iteration: 1/10, Step: 80/313, Loss: 0.7440
Iteration: 1/10, Step: 90/313, Loss: 0.7250
Iteration: 1/10, Step: 100/313, Loss: 1.1006
Iteration: 1/10, Step: 110/313, Loss: 0.8034
Iteration: 1/10, Step: 120/313, Loss: 1.0822
Iteration: 1/10, Step: 130/313, Loss: 1.2607
Iteration: 1/10, Step: 140/313, Loss: 0.9362
Iteration: 1/10, Step: 150/313, Loss: 0.9111
Iteration: 1/10, Step: 160/313, Loss: 0.8648
Iteration: 1/10, Step: 170/313, Loss: 1.1544
Iteration: 1/10, Step: 180/313, Loss: 0.9275
Iteration: 1/10, Step: 190/313, Loss: 0.7070
Iteration: 1/10, Step: 200/313, Loss: 0.6877
Iteration: 1/10, Step: 210/313, Loss: 0.8753
Iteration: 1/10, Step: 220/313, Loss: 0.7197
Iteration: 1/10, St

In [17]:
!cp /content/best_model.pth /content/drive/MyDrive

# 5. Evaluate

In [24]:
best_model = torch.load('best_model.pth')
best_model.eval()
with torch.no_grad():
    y_preds = []
    y_trues = []
    for image, label in test_loader:
        image = image.to(device)
        label = label.to(device)
        
        y_pred = model(image)
        pred = torch.argmax(y_pred, dim=1)

        y_trues.append(label.cpu().numpy())
        y_preds.append(pred.cpu().numpy())


In [27]:
import numpy as np
from sklearn.metrics import recall_score

y_preds = np.concatenate(y_preds)
y_trues = np.concatenate(y_trues)

scores = recall_score(y_trues, y_preds, average=None)
scores

array([0.21875   , 0.48324022, 0.30205656, 0.        , 0.27906977,
       0.93622985, 0.38607595])