In [1]:
# Attach Google Drive to the Colab filesystem so files stored there can be
# read (and written) through the mount point below.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [None]:
# Extract the outer archive from the mounted Drive into ./dataverse_files.
# NOTE(review): the target is relative while the nested archives below are read
# from /content/dataverse_files, so this presumably expects the working
# directory to be /content -- confirm before running elsewhere.
!unzip -d ./dataverse_files /content/drive/MyDrive/dataverse_files.zip

# Extract the nested image archives. The two HAM10000 parts each get their own
# sub-folder; the test-image archive is extracted straight into ./dataverse_files.
!unzip -d ./dataverse_files/HAM10000_images_part_1 /content/dataverse_files/HAM10000_images_part_1.zip
!unzip -d ./dataverse_files/HAM10000_images_part_2 /content/dataverse_files/HAM10000_images_part_2.zip
!unzip -d ./dataverse_files /content/dataverse_files/ISIC2018_Task3_Test_Images.zip

# 1. Preprocessing

In [5]:
import pandas as pd
import os
import shutil

In [6]:
# Gather the extracted images into one folder per split:
# both HAM10000 parts go to `train`, the ISIC 2018 test images go to `test`.


def move_images(src_dir, dst_dir):
    """Move every entry found in ``src_dir`` into the directory ``dst_dir``.

    Args:
        src_dir: Directory whose entries are moved. It must exist; a missing
            directory raises ``FileNotFoundError`` from ``os.listdir``.
        dst_dir: Destination directory. It is created (including missing
            parents) when absent, and left untouched when it already exists.
    """
    # exist_ok=True makes the folder creation safe to repeat on a re-run.
    os.makedirs(dst_dir, exist_ok=True)
    for img_name in os.listdir(src_dir):
        shutil.move(os.path.join(src_dir, img_name), dst_dir)


# (source folder, destination folder) pairs, processed in order.
image_moves = [
    ('dataverse_files/HAM10000_images_part_1', 'dataverse_files/train'),
    ('dataverse_files/HAM10000_images_part_2', 'dataverse_files/train'),
    ('dataverse_files/ISIC2018_Task3_Test_Images', 'dataverse_files/test'),
]

for src_dir, dst_dir in image_moves:
    move_images(src_dir, dst_dir)

In [7]:
# Read the metadata table of each split into its own DataFrame.
train_meta_path = 'dataverse_files/HAM10000_metadata'
test_meta_path = 'dataverse_files/ISIC2018_Task3_Test_NatureMedicine_AI_Interaction_Benefit.csv'

train_df, test_df = (pd.read_csv(path) for path in (train_meta_path, test_meta_path))

In [8]:
# Turn the diagnosis strings in `dx` into integer class ids.
from sklearn.preprocessing import LabelEncoder

lb = LabelEncoder()
lb.fit(train_df['dx'])  # only the learned `classes_` are used below

# Lookup tables in both directions; an id is the position of the class in `lb.classes_`.
lbl2cls = dict(enumerate(lb.classes_))
cls2lbl = {name: label for label, name in lbl2cls.items()}

# Every value of `dx` is a key of `cls2lbl` because the encoder was fitted on this column.
train_df['label'] = train_df['dx'].map(cls2lbl)

In [9]:
# Derive one integer label per test row: the position (within the list below)
# of the column holding the largest value in that row.
# NOTE(review): this assumes the column order matches the class ids assigned to
# train_df['label'], and that the prob_h_dx_* columns hold the ground truth
# rather than some other per-row prediction -- confirm against the CSV docs.
prob_columns = [
    'prob_h_dx_akiec',
    'prob_h_dx_bcc',
    'prob_h_dx_bkl',
    'prob_h_dx_df',
    'prob_h_dx_mel',
    'prob_h_dx_nv',
    'prob_h_dx_vasc',
]
class_scores = test_df[prob_columns].values
test_df['label'] = class_scores.argmax(axis=1)

# 2. Data loader

In [17]:
from PIL import Image
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision.transforms import ToTensor, RandomAffine, Resize, CenterCrop, Normalize

class MyDataset(Dataset):
    """Image/label dataset backed by the notebook's metadata DataFrames.

    Each item is a pair ``(image, label)``: ``image`` is the preprocessed
    tensor of the JPEG named ``<image_id>.jpg`` inside the split's folder,
    and ``label`` is the value of the ``label`` column for the same row.

    Args:
        train: When true, rows come from ``train_df`` and images from
            ``dataverse_files/train``; otherwise ``test_df`` and
            ``dataverse_files/test`` are used.
    """

    def __init__(self, train=True):
        self.meta = train_df if train else test_df
        self.root_dir = 'dataverse_files/train' if train else 'dataverse_files/test'
        # Build the preprocessing steps once here instead of re-creating every
        # transform object on each __getitem__ call. They are applied in order.
        self.transforms = (
            Resize((299, 299)),
            CenterCrop(299),
            # Augmentation step, currently disabled:
            # RandomAffine(degrees=(-10, 10), translate=(0.05, 0.1), scale=(0.9, 1.1)),
            ToTensor(),
            Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        )

    def __len__(self):
        """Return the number of rows in the split's metadata table."""
        return len(self.meta)

    def __getitem__(self, idx):
        """Load, preprocess and return the ``(image, label)`` pair at position ``idx``."""
        img_name = self.meta['image_id'].iloc[idx] + '.jpg'
        img_path = os.path.join(self.root_dir, img_name)

        # Close the file as soon as the pixels are read, and force three
        # channels so the 3-channel Normalize statistics always apply.
        with Image.open(img_path) as img_file:
            image = img_file.convert('RGB')

        for transform in self.transforms:
            image = transform(image)

        label = self.meta['label'].iloc[idx]
        return image, label

# One dataset per split, each wrapped in its own loader.
train_dataset, test_dataset = MyDataset(train=True), MyDataset(train=False)

# Training batches are reshuffled every epoch; test batches keep the table order.
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=100, shuffle=False)

# 3. Models

In [18]:
import torch.nn as nn
from torchvision.models import inception_v3

# Create an Inception v3 network with default arguments, then replace its
# final fully-connected layer with one that has an output per class.
n_classes = len(lbl2cls)

model = inception_v3()
classifier_head = nn.Linear(2048, n_classes)
model.fc = classifier_head

# Display the resulting architecture as the cell output.
model

Inception3(
  (Conv2d_1a_3x3): BasicConv2d(
    (conv): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), bias=False)
    (bn): BatchNorm2d(32, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
  )
  (Conv2d_2a_3x3): BasicConv2d(
    (conv): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(32, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
  )
  (Conv2d_2b_3x3): BasicConv2d(
    (conv): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (bn): BatchNorm2d(64, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
  )
  (maxpool1): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  (Conv2d_3b_1x1): BasicConv2d(
    (conv): Conv2d(64, 80, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(80, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
  )
  (Conv2d_4a_3x3): BasicConv2d(
    (conv): Conv2d(80, 192, kernel_size=(3, 3), stri

# 4. Train

In [19]:
# Train the model for a fixed number of epochs, evaluate on the test loader
# after every epoch, and keep the checkpoint with the best test accuracy.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

n_epochs = 10
lr = 0.001
optimizer = torch.optim.Adam(params=model.parameters(), lr=lr)
loss_func = torch.nn.CrossEntropyLoss()
best_test_acc = 0

# Make sure training starts in training mode even if this cell is re-run after
# an interrupted evaluation left the model in eval mode. The `[0]` indexing
# below is only valid for the training-mode output.
model.train()

for i in range(n_epochs):
    for idx, (image, label) in enumerate(train_loader):
        image = image.to(device)
        label = label.to(device)

        # The training-mode output is indexed with [0] while the eval-mode
        # output further down is used as-is (see the torchvision Inception v3
        # docs: main and auxiliary logits are returned together in training).
        y_pred = model(image)[0]
        loss = loss_func(y_pred, label)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (idx + 1) % 10 == 0:
            print(f'Iteration: {i+1}/{n_epochs}, Step: {idx+1}/{len(train_loader)}, Loss: {loss.item():.4f}')

    model.eval()
    n_correct = 0
    n_instances = 0
    with torch.no_grad():
        for image, label in test_loader:
            image = image.to(device)
            label = label.to(device)

            y_pred = model(image)
            pred = torch.argmax(y_pred, dim=1)
            n_correct += (pred == label).sum().item()
            # Count samples, not batches: len(test_loader) is the number of
            # batches and would inflate the accuracy by the batch size.
            n_instances += label.size(0)

    # Guard against an empty test loader instead of dividing by zero.
    test_acc = n_correct / n_instances if n_instances else 0.0
    print(f'Iteration: {i+1}/{n_epochs}, Test accuracy: {test_acc:.4f}')

    # NOTE(review): the checkpoint is selected on the test set, so the best
    # test accuracy reported below is an optimistic estimate.
    if test_acc > best_test_acc:
        best_test_acc = test_acc
        # Saves the whole module object (not just a state_dict); kept so code
        # that loads 'best_model.pth' as a full model keeps working.
        torch.save(model, 'best_model.pth')

    # Back to training mode for the next epoch.
    model.train()

print(f'Best test accuracy: {best_test_acc:.4f}')

Iteration: 1/10, Step: 10/313, Loss: 1.8915
Iteration: 1/10, Step: 20/313, Loss: 1.2862
Iteration: 1/10, Step: 30/313, Loss: 0.9231
Iteration: 1/10, Step: 40/313, Loss: 0.7849
Iteration: 1/10, Step: 50/313, Loss: 1.1240
Iteration: 1/10, Step: 60/313, Loss: 1.1732
Iteration: 1/10, Step: 70/313, Loss: 1.1472
Iteration: 1/10, Step: 80/313, Loss: 0.9072
Iteration: 1/10, Step: 90/313, Loss: 0.7213
Iteration: 1/10, Step: 100/313, Loss: 0.9384
Iteration: 1/10, Step: 110/313, Loss: 0.8257
Iteration: 1/10, Step: 120/313, Loss: 0.6709
Iteration: 1/10, Step: 130/313, Loss: 0.6229
Iteration: 1/10, Step: 140/313, Loss: 0.9555
Iteration: 1/10, Step: 150/313, Loss: 0.8267
Iteration: 1/10, Step: 160/313, Loss: 0.6357
Iteration: 1/10, Step: 170/313, Loss: 0.5327
Iteration: 1/10, Step: 180/313, Loss: 0.6272
Iteration: 1/10, Step: 190/313, Loss: 0.9773
Iteration: 1/10, Step: 200/313, Loss: 0.8413
Iteration: 1/10, Step: 210/313, Loss: 0.8176
Iteration: 1/10, Step: 220/313, Loss: 0.7755
Iteration: 1/10, St

In [20]:
# Copy the saved checkpoint to the mounted Google Drive folder.
# NOTE(review): the training cell saves to the relative path 'best_model.pth',
# so reading it from /content presumably relies on /content being the working
# directory -- confirm.
!cp /content/best_model.pth /content/drive/MyDrive