In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!unzip -d ./dataverse_files /content/drive/MyDrive/dataverse_files.zip

!unzip -d ./dataverse_files/HAM10000_images_part_1 /content/dataverse_files/HAM10000_images_part_1.zip
!unzip -d ./dataverse_files/HAM10000_images_part_2 /content/dataverse_files/HAM10000_images_part_2.zip
!unzip -d ./dataverse_files /content/dataverse_files/ISIC2018_Task3_Test_Images.zip

[1;30;43m流式输出内容被截断，只能显示最后 5000 行内容。[0m
  inflating: ./dataverse_files/HAM10000_images_part_2/ISIC_0032350.jpg  
  inflating: ./dataverse_files/HAM10000_images_part_2/ISIC_0032351.jpg  
  inflating: ./dataverse_files/HAM10000_images_part_2/ISIC_0032352.jpg  
  inflating: ./dataverse_files/HAM10000_images_part_2/ISIC_0032353.jpg  
  inflating: ./dataverse_files/HAM10000_images_part_2/ISIC_0032354.jpg  
  inflating: ./dataverse_files/HAM10000_images_part_2/ISIC_0032355.jpg  
  inflating: ./dataverse_files/HAM10000_images_part_2/ISIC_0032356.jpg  
  inflating: ./dataverse_files/HAM10000_images_part_2/ISIC_0032357.jpg  
  inflating: ./dataverse_files/HAM10000_images_part_2/ISIC_0032358.jpg  
  inflating: ./dataverse_files/HAM10000_images_part_2/ISIC_0032359.jpg  
  inflating: ./dataverse_files/HAM10000_images_part_2/ISIC_0032360.jpg  
  inflating: ./dataverse_files/HAM10000_images_part_2/ISIC_0032361.jpg  
  inflating: ./dataverse_files/HAM10000_images_part_2/ISIC_0032362.jpg  
  inflatin

# 1. Preprocessing

In [3]:
import pandas as pd
import os
import shutil

In [4]:
# move training and test images into folders

# create train and test folder
if not os.path.exists('dataverse_files/train'):
    os.mkdir('dataverse_files/train')
    
# if not os.path.exists('dataverse_files/test'):
#     os.mkdir('dataverse_files/test')
    
for img_name in os.listdir('dataverse_files/HAM10000_images_part_1/'):
    src_path = os.path.join('dataverse_files/HAM10000_images_part_1', img_name)
    tgt_path = 'dataverse_files/train'
    shutil.move(src_path, tgt_path)
    
for img_name in os.listdir('dataverse_files/HAM10000_images_part_2/'):
    src_path = os.path.join('dataverse_files/HAM10000_images_part_2', img_name)
    tgt_path = 'dataverse_files/train'
    shutil.move(src_path, tgt_path)

# for img_name in os.listdir('dataverse_files/ISIC2018_Task3_Test_Images/'):
#     src_path = os.path.join('dataverse_files/ISIC2018_Task3_Test_Images', img_name)
#     tgt_path = 'dataverse_files/test'
#     shutil.move(src_path, tgt_path)

In [5]:
from sklearn.model_selection import train_test_split
train_df = pd.read_csv('dataverse_files/HAM10000_metadata')

train, test = train_test_split(train_df, test_size=0.2)

In [6]:
# convert to numerical representation
from sklearn.preprocessing import LabelEncoder

lb = LabelEncoder()
lb.fit_transform(train_df['dx'])
cls2lbl = {cls: idx for idx, cls in enumerate(lb.classes_)}
lbl2cls = {idx: cls for idx, cls in enumerate(lb.classes_)}
train['label'] = [cls2lbl[cls] for cls in train['dx']]
test['label'] = [cls2lbl[cls] for cls in test['dx']]

# 2. Data loader

In [14]:
from PIL import Image
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision.transforms import ToTensor, RandomAffine, Resize, CenterCrop, Normalize

class MyDataset(Dataset):
    def __init__(self, mode='train'):
        self.meta = train if mode == 'train' else test
        self.root_dir = 'dataverse_files/train'
    def __len__(self):
        return len(self.meta)
    def __getitem__(self, idx):
        img_name = self.meta['image_id'].iloc[idx] + '.jpg'
        img_path = os.path.join(self.root_dir, img_name)

        image = Image.open(img_path)
        image = Resize((299, 299))(image)
        image = RandomAffine(degrees=(-10, 10), translate=(0.05, 0.1), scale=(0.9, 1.1))(image)
        image = ToTensor()(image)
        image = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])(image)
        
        label = self.meta['label'].iloc[idx]
        return image, label

train_dataset = MyDataset(mode='train')
test_dataset = MyDataset(mode='test')

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=100, shuffle=False)

# 3. Models

In [21]:
import torch.nn as nn
from torchvision.models import inception_v3

model = inception_v3(pretrained=False)
# for param in model.parameters():
#     param.requires_grad = False

model.fc = nn.Sequential(nn.Linear(2048, 7))
model

Inception3(
  (Conv2d_1a_3x3): BasicConv2d(
    (conv): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), bias=False)
    (bn): BatchNorm2d(32, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
  )
  (Conv2d_2a_3x3): BasicConv2d(
    (conv): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(32, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
  )
  (Conv2d_2b_3x3): BasicConv2d(
    (conv): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (bn): BatchNorm2d(64, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
  )
  (maxpool1): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  (Conv2d_3b_1x1): BasicConv2d(
    (conv): Conv2d(64, 80, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(80, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
  )
  (Conv2d_4a_3x3): BasicConv2d(
    (conv): Conv2d(80, 192, kernel_size=(3, 3), stri

# 4. Train

In [22]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

n_epochs = 10
lr = 0.001
optimizer = torch.optim.Adam(params=model.parameters(), lr=lr)
loss_func = torch.nn.CrossEntropyLoss()
best_test_acc = 0

for i in range(n_epochs):
    for idx, (image, label) in enumerate(train_loader):
        image = image.to(device)
        label = label.to(device)
        
        y_pred = model(image)[0]
        loss = loss_func(y_pred, label)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if (idx + 1) % 10 == 0:
            print(f'Iteration: {i+1}/{n_epochs}, Step: {idx+1}/{len(train_loader)}, Loss: {loss.item():.4f}')
    
    model.eval()
    with torch.no_grad():
        n_instances = len(test_loader)
        n_correct = 0
        for image, label in test_loader:
            image = image.to(device)
            label = label.to(device)
            
            y_pred = model(image)
            pred = torch.argmax(y_pred, dim=1)
            n_correct += (pred == label).sum().item()
        test_acc = n_correct / n_instances
        print(f'Iteration: {i+1}/{n_epochs}, Test accuracy: {test_acc:.4f}')
        
        if test_acc > best_test_acc:
            best_test_acc = test_acc
            torch.save(model, 'best_model.pth')
        
    model.train()
    
print(f'Best test accuracy: {best_test_acc:.4f}')

Iteration: 1/10, Step: 10/251, Loss: 0.9878
Iteration: 1/10, Step: 20/251, Loss: 1.1875
Iteration: 1/10, Step: 30/251, Loss: 1.2116
Iteration: 1/10, Step: 40/251, Loss: 1.3940
Iteration: 1/10, Step: 50/251, Loss: 0.8560
Iteration: 1/10, Step: 60/251, Loss: 1.0241
Iteration: 1/10, Step: 70/251, Loss: 0.6704
Iteration: 1/10, Step: 80/251, Loss: 0.7352
Iteration: 1/10, Step: 90/251, Loss: 0.7494
Iteration: 1/10, Step: 100/251, Loss: 0.6752
Iteration: 1/10, Step: 110/251, Loss: 0.8036
Iteration: 1/10, Step: 120/251, Loss: 1.2026
Iteration: 1/10, Step: 130/251, Loss: 0.8095
Iteration: 1/10, Step: 140/251, Loss: 1.1851
Iteration: 1/10, Step: 150/251, Loss: 0.5914
Iteration: 1/10, Step: 160/251, Loss: 0.7091
Iteration: 1/10, Step: 170/251, Loss: 0.9440
Iteration: 1/10, Step: 180/251, Loss: 0.7660
Iteration: 1/10, Step: 190/251, Loss: 0.9185
Iteration: 1/10, Step: 200/251, Loss: 0.7161
Iteration: 1/10, Step: 210/251, Loss: 1.2359
Iteration: 1/10, Step: 220/251, Loss: 1.0019
Iteration: 1/10, St

In [23]:
!cp /content/best_model.pth /content/drive/MyDrive

# 5. Evaluate

In [24]:
best_model = torch.load('best_model.pth')
best_model.eval()
with torch.no_grad():
    y_preds = []
    y_trues = []
    for image, label in test_loader:
        image = image.to(device)
        label = label.to(device)
        
        y_pred = model(image)[0]
        pred = torch.argmax(y_pred, dim=1)

        y_trues.append(label.cpu().numpy())
        y_preds.append(pred.cpu().numpy())


In [25]:
import numpy as np
from sklearn.metrics import recall_score

y_preds = np.concatenate(y_preds)
y_trues = np.concatenate(y_trues)

scores = recall_score(y_trues, y_preds, average=None)
scores

array([0.24358974, 0.35849057, 0.37850467, 0.        , 0.27876106,
       0.96837349, 0.5       ])

In [26]:
from sklearn.metrics import accuracy_score

test_acc = accuracy_score(y_trues, y_preds)
test_acc

0.7483774338492262