In [1]:
import os
import zipfile
from google.colab import files

! pip install -q kaggle
files.upload()
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! mkdir /content/data
!kaggle datasets download -d andrewmvd/isic-2019

with zipfile.ZipFile('/content/isic-2019.zip', 'r') as zip_ref:
    zip_ref.extractall('/content/data')

In [None]:
# from google.colab import drive
# drive.mount('/content/gdrive')

# from google.colab import files

# with open('example.txt', 'w') as f:
#   f.write('some content')

# files.download('example.txt')

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader, Subset

import torchvision
from torchvision import datasets, models, transforms

import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score, balanced_accuracy_score
from PIL import Image
import matplotlib.pyplot as plt

import os
import copy
import time
import pickle
import gc
from tqdm.notebook import tqdm
from joblib import Parallel, delayed
!pip install pretrainedmodels
import pretrainedmodels



In [4]:
# Ground-truth table shipped with the dataset; the path is relative to the
# notebook's working directory.
df = pd.read_csv('data/ISIC_2019_Training_GroundTruth.csv')

# Working directory at load time, used later to build absolute data paths.
maindir = os.getcwd()

In [5]:
# --- Label encoding -------------------------------------------------------
# Every column after the first is treated as a per-class indicator; the class
# name of a row is the column holding its maximum value. The columns this cell
# adds are excluded so that re-running the cell gives the same result.
class_cols = [c for c in df.columns[1:] if c not in ('label_', 'label', 'fold')]
df['label_'] = df[class_cols].idxmax(axis=1)

# Integer code per class name, numbered in order of first appearance.
mapping = {name: code for code, name in enumerate(df['label_'].unique())}
df['label'] = df['label_'].map(mapping)
df['fold'] = -1

# --- Stratified fold assignment -------------------------------------------
fold_n = 10
random_state = 111

# `random_state` only has an effect when shuffle=True; recent scikit-learn
# releases raise a ValueError if a seed is passed while shuffle is False.
skf = StratifiedKFold(n_splits=fold_n, shuffle=True, random_state=random_state)
fold_col = df.columns.get_loc('fold')
for fold, (train_idx, test_idx) in enumerate(skf.split(df['image'], df['label']), start=1):
    # split() yields positional indices, so write through iloc rather than
    # relying on the index labels happening to match positions.
    df.iloc[test_idx, fold_col] = fold

# --- Export ----------------------------------------------------------------
# In debug mode only the first 10 rows of every (label, fold) group are kept,
# which gives a small file for quick end-to-end runs.
debug = False
folded_csv = 'data/ISIC_2019_Training_GroundTruth_folded.csv'
export_df = df.groupby(['label', 'fold']).head(10) if debug else df
export_df.to_csv(folded_csv, index=False)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")



In [6]:
def upsampler(df):
    """Balance classes by repeating the rows of under-represented labels.

    For every distinct value of ``df['label_']`` the ratio
    ``largest_class_count / class_count`` is rounded to an integer ``k`` and
    the rows of that class end up present ``k`` times in the result.

    Args:
        df: DataFrame with a ``label_`` column.

    Returns:
        A DataFrame consisting of the original rows followed by the repeated
        rows (original index values are kept, so the index contains
        duplicates). When no class needs repeating, ``df`` itself is returned.
    """
    counts = df['label_'].value_counts()
    n_upsampler = (1 / (counts / counts.max())).round()

    # Collect every piece first and concatenate once: DataFrame.append no
    # longer exists in pandas 2.x, and growing a frame inside a loop copies
    # the accumulated data on every iteration.
    pieces = [df]
    for label in n_upsampler.index:
        extra_copies = int(n_upsampler[label] - 1)
        if extra_copies > 0:
            pieces.extend([df[df['label_'] == label]] * extra_copies)

    if len(pieces) == 1:
        return df
    return pd.concat(pieces)

In [7]:
class MyCustomDataset(Dataset):
    """Image dataset backed by the folded ground-truth CSV.

    The rows served depend on ``dtype``:

    * ``'train'`` – every row whose ``fold`` differs from ``fold``, passed
      through ``upsampler`` to repeat minority-class rows.
    * ``'valid'`` – only the rows whose ``fold`` equals ``fold``.
    * anything else – 10 randomly sampled rows per ``label_`` value
      (``fold`` is ignored).

    Args:
        folder: Directory containing the ``<image>.jpg`` files.
        fold: Fold number held out for validation.
        dtype: Split selector, see above.
        transform: Optional callable applied to the loaded PIL image.
        csv_path: Location of the folded ground-truth CSV. It must provide
            the ``image``, ``label``, ``label_`` and ``fold`` columns.
    """

    def __init__(self, folder, fold, dtype, transform=None,
                 csv_path='data/ISIC_2019_Training_GroundTruth_folded.csv'):
        self.dtype = dtype

        # Read the table once; each branch below only selects rows from it.
        folds_df = pd.read_csv(csv_path)
        if self.dtype == 'train':
            self.df = upsampler(folds_df[folds_df['fold'] != fold])
        elif self.dtype == 'valid':
            self.df = folds_df[folds_df['fold'] == fold]
        else:
            self.df = folds_df.groupby(['label_']).sample(10)

        self.folder     = folder
        self.transform  = transform
        # Classes are derived from the selected rows, not from the whole CSV.
        self.classes    = self.df['label_'].unique()
        self.n_classes  = len(self.classes)

    def __len__(self):
        """Number of rows (after any upsampling) in this split."""
        return len(self.df)

    def __getitem__(self, index):
        """Return ``(image, label, index)`` for the row at position ``index``."""
        row = self.df.iloc[index]
        path = os.path.join(self.folder, row['image'] + '.jpg')

        # Close the file handle deterministically and force three channels so
        # a non-RGB file cannot break the 3-channel normalisation downstream.
        with Image.open(path) as img:
            x = img.convert('RGB')
        if self.transform: x = self.transform(x)
        y = torch.tensor(row['label'], dtype=torch.long)

        return x, y, index

In [8]:
# Per-channel statistics used for normalisation (the values commonly used
# with ImageNet-pretrained backbones).
mean = [0.485, 0.456, 0.406]
std  = [0.229, 0.224, 0.225]
# Spatial size fed to the network.
size = (512, 512)


def _eval_transform():
    """Deterministic resize -> tensor -> normalise pipeline for 'valid' and 'test'."""
    return transforms.Compose([
        # interpolation=2 is Pillow's integer code for bilinear resampling.
        transforms.Resize(size, interpolation=2),
        transforms.ToTensor(),
        transforms.Normalize(mean=mean, std=std, inplace=False),
    ])


tmfs = {
    'train': transforms.Compose([
        transforms.ColorJitter(brightness=0.5, contrast=0.5, saturation=0.5, hue=0.1),
        # interpolation=3 is Pillow's integer code for bicubic resampling.
        transforms.RandomPerspective(distortion_scale=0.3, p=0.5, interpolation=3, fill=0),
        # Library defaults are used for the remaining options: the old
        # `resample` keyword is not accepted by current torchvision releases.
        transforms.RandomRotation(45),
        # The crop already outputs `size`, so no further Resize is needed.
        transforms.RandomResizedCrop(size, scale=(0.5, 1.0), ratio=(0.75, 1.3333333333333333), interpolation=2),
        transforms.RandomVerticalFlip(p=0.5),
        transforms.ToTensor(),
        transforms.Normalize(mean=mean, std=std, inplace=False),
    ]),
    'valid': _eval_transform(),
    'test': _eval_transform(),
}

In [9]:
def train_model(model, dataloader, dataset_sizes, criterion, optimizer, scheduler, num_epochs=10):
    """Train ``model`` and return it loaded with its best validation weights.

    Each epoch runs a ``'train'`` phase followed by a ``'valid'`` phase. The
    weights of the epoch with the highest validation accuracy are kept and
    loaded back into the model before returning.

    Batches are moved to the module-level ``device``.

    Args:
        model: Network to optimise; its output is passed to ``criterion``
            together with the labels.
        dataloader: Mapping with ``'train'`` and ``'valid'`` iterables that
            yield ``(inputs, labels, index)`` batches.
        dataset_sizes: Mapping with the number of samples per phase, used to
            average the loss and accuracy.
        criterion: Loss function called as ``criterion(outputs, labels)``.
        optimizer: Optimiser stepped once per training batch.
        scheduler: Learning-rate scheduler stepped once per epoch, after the
            training phase.
        num_epochs: Number of epochs to run.

    Returns:
        ``model`` with the best-scoring weights loaded.
    """
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        for phase in ['train', 'valid']:
            is_train = phase == 'train'
            # train(False) is what eval() does internally.
            model.train(is_train)

            # Plain Python accumulators: they remain valid even when a loader
            # yields no batches, and avoid keeping a device tensor around.
            running_loss     = 0.0
            running_corrects = 0

            for inputs, labels, _ in tqdm(dataloader[phase]):
                inputs = inputs.to(device)
                labels = labels.to(device)

                # Gradients only exist (and need clearing) while training.
                if is_train:
                    optimizer.zero_grad()

                with torch.set_grad_enabled(is_train):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    if is_train:
                        loss.backward()
                        optimizer.step()

                # Weight by batch size so the epoch figure is a per-sample mean.
                running_loss += loss.item() * inputs.size(0)
                running_corrects += (preds == labels).sum().item()

            if is_train:
                scheduler.step()

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects / dataset_sizes[phase]

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))

            # Snapshot the weights whenever validation accuracy improves.
            if phase == 'valid' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())

        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:.4f}'.format(best_acc))

    model.load_state_dict(best_model_wts)
    return model

In [10]:
def main():
    """Train one InceptionV4 classifier per cross-validation fold.

    For every fold from 1 to ``fold_n`` the function builds the train/valid
    datasets and loaders, replaces the final linear layer of an
    ImageNet-pretrained ``inceptionv4`` with one sized for the number of
    classes in the training split, and trains it with SGD and a step
    learning-rate schedule.

    Relies on the module-level ``maindir``, ``fold_n``, ``tmfs`` and
    ``device``.

    Returns:
        dict mapping ``'Fold_<k>'`` to the trained model for fold ``k``.
    """
    datadir = os.path.join(maindir, 'data/ISIC_2019_Training_Input/ISIC_2019_Training_Input')
    phases = ['train', 'valid']
    out_models = {}
    for fold in range(1, fold_n+1):
        print(f'Fold : {fold}')
        transformed_dataset = {e: MyCustomDataset(datadir, fold, e, transform=tmfs[e]) for e in phases}
        dataset_sizes       = {e: len(transformed_dataset[e]) for e in phases}
        # Only the training loader is shuffled; sample order has no effect on
        # the validation metrics.
        dataloader          = {e: DataLoader(transformed_dataset[e], batch_size=16,
                                             shuffle=(e == 'train'), num_workers=0)
                               for e in phases}
        n_classes           = transformed_dataset['train'].n_classes

        # Alternative backbone that was tried: torchvision's resnet18, with
        # `model.fc` replaced by nn.Linear(model.fc.in_features, n_classes).

        model_name = 'inceptionv4'
        model = pretrainedmodels.__dict__[model_name](num_classes=1000, pretrained='imagenet')
        # Swap the 1000-way ImageNet head for one matching our class count.
        num_ftrs = model.last_linear.in_features
        model.last_linear  = nn.Linear(num_ftrs, n_classes)

        model     = model.to(device)
        criterion = nn.CrossEntropyLoss()

        optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

        # Multiply the learning rate by 0.1 every 7 epochs.
        exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)

        model = train_model(model, dataloader, dataset_sizes, criterion, optimizer, exp_lr_scheduler)
        # NOTE(review): every fold's model stays on `device` until the
        # function returns - consider moving finished models to CPU if
        # accelerator memory becomes a problem.
        out_models[f'Fold_{fold}'] = model

    return out_models

In [None]:
# Train-or-load guard: reuse previously trained fold models when a cache file
# exists, otherwise train them and write the cache.
# The file name (including its spelling) is kept as-is so caches produced by
# earlier runs are still found.
cache_file = 'trianed_cv_models.pickle'

if os.path.exists(cache_file):
    # SECURITY: unpickling executes arbitrary code - only load a cache file
    # that this notebook produced itself.
    with open(cache_file, 'rb') as handle:
        out_models = pickle.load(handle)
else:
    out_models = main()
    with open(cache_file, 'wb') as handle:
        pickle.dump(out_models, handle, protocol=pickle.HIGHEST_PROTOCOL)

Fold : 1
Epoch 0/9
----------


HBox(children=(FloatProgress(value=0.0, max=5902.0), HTML(value='')))