# Import

In [1]:
# system
import sys
import os 
import time

# data
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cv2 as cv
from sklearn.metrics import accuracy_score

# deep learning 
import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader 

# custom helpers
import trainer
import metrics
import process

# for formatting
from pprint import pprint
import warnings
warnings.filterwarnings('ignore')

# Notebook Toggles

In [2]:
COMPUTE_DISTRIBUTION = False

## Hyperparameters 

In [3]:
device = torch.device('cuda:0')
epochs = 100
learning_rate = 3e-4
weight_decay = 1e-4
step_period = 15
lr_decay = 0.95
num_workers = 4
batch_size = 32

# Data

Data is provided by "The ISIC 2020 Challenge". The dataset is annotated for binary classification of skin lesions for melanoma detection. More information may be found at "https://challenge2020.isic-archive.com/"

Data is sources from 2000 patients and includes 33,126 dermoscopic training images

## Loading and Statistics

In [4]:
def toPath(root, image_id):
    return os.path.join(root, image_id + '.npy')

def toLabel(key, mapping):
    return mapping[key]

### HAM

In [5]:
# paths
ham_path = '/usr/local/faststorage/ezimmer/data/'

# parse data
metadata = pd.read_csv('./data/HAM/metadata.csv')
headers = metadata.head()

# verify data
label_mapping = {label : idx for idx, label in enumerate(sorted(np.unique(metadata['dx'])))}
data_file = {ID : {'image' : None, 'label' : None} for ID in metadata['image_id']}

for ID in metadata['image_id']:
    data_file[ID]['image'] = toPath(ham_path, ID)
    data_file[ID]['label'] = toLabel(metadata[metadata['image_id'] == ID]['dx'].values[0], label_mapping)

# get class distribution
class_counts = {idx : 0 for idx in range(len(np.unique(metadata['dx'])))}
for ID in data_file.keys():
    class_counts[data_file[ID]['label']] += 1
    
ncls = len(class_counts.keys())

In [6]:
print("HAM 10000 Metadata")
print('------------------------------------------------------------------')
print(headers, '\n')
print("Number of Classes")
print(ncls, '\n')
print('Unique Labels')
print(label_mapping, '\n')
print("Class Balance")
print(class_counts)

HAM 10000 Metadata
------------------------------------------------------------------
     lesion_id      image_id   dx dx_type   age   sex localization
0  HAM_0000118  ISIC_0027419  bkl   histo  80.0  male        scalp
1  HAM_0000118  ISIC_0025030  bkl   histo  80.0  male        scalp
2  HAM_0002730  ISIC_0026769  bkl   histo  80.0  male        scalp
3  HAM_0002730  ISIC_0025661  bkl   histo  80.0  male        scalp
4  HAM_0001466  ISIC_0031633  bkl   histo  75.0  male          ear 

Number of Classes
7 

Unique Labels
{'akiec': 0, 'bcc': 1, 'bkl': 2, 'df': 3, 'mel': 4, 'nv': 5, 'vasc': 6} 

Class Balance
{0: 327, 1: 514, 2: 1099, 3: 115, 4: 1113, 5: 6705, 6: 142}


In [7]:
if COMPUTE_DISTRIBUTION: 
    ys, xs = [], []
    for ID in data_file.keys():
        img = cv.imread(data_file[ID]['image'])
        y, x, _ = img.shape
        ys.append(y)
        xs.append(x)
    
    print("Dataset Image Size Distribution")
    print("Num Patients", len(ys), len(xs))
    print("Unique Values", np.unique(ys), np.unique(xs))
    plt.scatter(y,x)
    plt.title("Image dimensions")
    plt.show()

## Experiment, Dataset, Dataloader

Since the dataset is imbalanced, partition train/validation/test by number of classes and then wrap in an oversampler

In [8]:
# generate partitioned exp
partitioned_data = process.generateExperiment(data_file, ncls)

# generate datasets
train_sets = [process.SkinSet(partitioned_data[cls]['train']) for cls in range(ncls)]
val_sets = [process.SkinSet(partitioned_data[cls]['validation']) for cls in range(ncls)]
test_set = process.SkinSet(partitioned_data['test'])

# oversampler on training and validation
train_set = process.Oversampler(train_sets)
val_set = process.Oversampler(val_sets)

# Loaders
train_loader = DataLoader(train_set, batch_size=batch_size, num_workers=num_workers, pin_memory=True, shuffle=True)
val_loader = DataLoader(val_set, batch_size=batch_size, num_workers=num_workers)
test_loader = DataLoader(test_set, batch_size=batch_size, num_workers=num_workers)

# Model

Randomly initialized ResNet-50

In [9]:
def generateModel(init=None, device=None):
    model = torchvision.models.resnet50(pretrained=False)
    model.fc.out_features=2
        
    if init is not None:
        model.apply(init)
        
    if device is not None:
        model = model.to(device)
    
    return model

def init_weights(m):
    if (type(m) == nn.Conv2d or type(m) == nn.Linear):
        nn.init.kaiming_normal_(m.weight)
    
model = generateModel(init_weights, device)

# Optimization and Criteria

Optimizer: AdamW for super convergence and fast training

Scheduler: Step LR decay 

In [10]:
optimizer = optim.AdamW(model.parameters())
scheduler = optim.lr_scheduler.StepLR(optimizer, step_period, lr_decay)
criterion = nn.CrossEntropyLoss().to(device)

# Metrics and Aggregation

In [11]:
# Metric storage
train_stats = metrics.Aggregator()
val_stats = metrics.Aggregator()
test_stats = metrics.Aggregator()

# add stats
train_stats.addStat('loss')
val_stats.addStat('loss')
train_stats.addStat('acc', accuracy_score)
val_stats.addStat('acc', accuracy_score)
test_stats.addStat('acc', accuracy_score)

# Training 

In [14]:
best_stat = 0
best_model = 0
for epoch in range(epochs):
    
    t = time.time()
    
    # train
    preds, labels, t_loss = trainer.train(model, criterion, optimizer, scheduler, train_loader, device)
    train_stats.logStat('loss', (t_loss,))
    train_stats.logStat('acc', (labels, preds))
    
    # validate
    preds, labels, v_loss = trainer.train(model, criterion, optimizer, scheduler, val_loader, device)
    val_stats.logStat('loss', (v_loss,))
    val_stats.logStat('acc', (labels, preds))
    
    if val_stats.getStats('acc')[-1] > best_stat:
        best_stat = val_stats.getStats('acc')[-1]
        best_model = model.state_dict()
    
    t = time.time() - t
        
    print("Epoch:", epoch)
    print("--------------------------------")
    print("Time:", t)
    print("Training Loss:       ", t_loss)
    print("Validation Loss:     ", v_loss)
    print("Training Accuracy:   ", train_stats.getStats('acc')[-1])
    print("Validation Accuracy: ", val_stats.getStats('acc')[-1])
    
    
metrics.Plotter.plot(train_stats.getStats('loss'), val_stats.getStats('loss'), 'Epochs, Loss', 'Losses')
metrics.Plotter.plot(train_stats.getStats('acc'), val_stats.getStats('acc'), 'Epochs, Accuracy', 'Accuracies')

Epoch: 0
--------------------------------
Time: 35.20189571380615
Training Loss:        1.1452278
Validation Loss:      3.699743
Training Accuracy:    0.5826759061833688
Validation Accuracy:  0.655223880597015


KeyboardInterrupt: 