# PADUFES20



Libraries

In [2]:
import pandas as pd
import numpy as np
import pickle
import os
import sys
import time
import gc
import warnings
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.models as models
import torchvision.transforms as transforms
from torchvision.models.feature_extraction import get_graph_node_names
from torchvision.models.feature_extraction import create_feature_extractor
import copy
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [15, 7]

from torch.utils.data import Dataset, DataLoader
from PIL import Image
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score

from efficientnet_pytorch import EfficientNet
import torchextractor as tx

from itertools import chain, combinations

def get_combs(l):
    return list(chain.from_iterable(combinations(l, r) for r in range(1, len(l)+1)))

In [3]:
sys.path.append('..')

from utils.train import train
from utils.metrics import get_scores, get_metrics
from utils.dataset import get_data_loader
from utils.models import get_model, BaseMetaModel, MetaModel

# Dataset

In [4]:
df = pd.read_csv('train_meta.csv')
df.head()

Unnamed: 0,img_id,patient_id,lesion_id,biopsed,diagnostic,diagnostic_number,age,smoke_False,smoke_True,drink_False,...,hurt_UNK,changed_False,changed_True,changed_UNK,bleed_False,bleed_True,bleed_UNK,elevation_False,elevation_True,elevation_UNK
0,PAT_1516_1765_530.png,PAT_1516,1765,False,NEV,3,8,0,0,0,...,0,1,0,0,1,0,0,1,0,0
1,PAT_46_881_939.png,PAT_46,881,True,BCC,1,55,1,0,1,...,0,0,1,0,0,1,0,0,1,0
2,PAT_1545_1867_547.png,PAT_1545,1867,False,ACK,0,77,0,0,0,...,0,1,0,0,1,0,0,1,0,0
3,PAT_1989_4061_934.png,PAT_1989,4061,False,ACK,0,75,0,0,0,...,0,1,0,0,1,0,0,1,0,0
4,PAT_1549_1882_230.png,PAT_1549,1882,False,SEK,5,53,0,0,0,...,0,1,0,0,1,0,0,0,1,0


In [5]:
open_file = open('train_idcs', "rb")
train_folds = pickle.load(open_file)
open_file.close()

open_file = open('val_idcs', "rb")
val_folds = pickle.load(open_file)
open_file.close()

open_file = open('test_idcs', "rb")
test_idcs = pickle.load(open_file)
open_file.close()

## Metadata columns

In [6]:
meta_prefixes = ['smoke', 'drink', 'background_father',
       'background_mother', 'age', 'pesticide', 'gender',
       'skin_cancer_history', 'cancer_history', 'has_piped_water',
       'has_sewage_system', 'fitspatrick', 'region', 'diameter_1',
       'diameter_2', 'itch', 'grew', 'hurt', 'changed', 'bleed',
       'elevation']

In [7]:
# meta_prefixes = ['smoke', 'drink', 'background', 'age', 'pesticide', 'gender',
#        'skin_cancer_history', 'cancer_history', 'has_piped_water',
#        'has_sewage_system', 'fitspatrick', 'region', 'diameter_1',
#        'diameter_2', 'itch', 'grew', 'hurt', 'changed', 'bleed',
#        'elevation']

In [8]:
history_meta  = ['background_father','background_mother', 'age', 'gender', 'skin_cancer_history',
                 'cancer_history', 'fitspatrick']

habits_meta = ['smoke', 'drink', 'pesticide', 'has_piped_water', 'has_sewage_system']

lesion_meta = ['region', 'diameter_1', 'diameter_2', 'itch', 'grew', 'hurt', 'changed', 'bleed', 'elevation']
meta_types_dict = {'history': history_meta, 'habits': habits_meta, 'lesion': lesion_meta} 

In [9]:
metacols_dict = {meta_type: [col  for prefix in meta_types_dict[meta_type] for col in df.columns if col.startswith(prefix)] for meta_type in  ['history', 'habits', 'lesion']}
metacols_dict

{'history': ['background_father_POMERANIA',
  'background_father_GERMANY',
  'background_father_BRAZIL',
  'background_father_NETHERLANDS',
  'background_father_ITALY',
  'background_father_POLAND',
  'background_father_UNK',
  'background_father_PORTUGAL',
  'background_father_BRASIL',
  'background_father_CZECH',
  'background_father_AUSTRIA',
  'background_father_SPAIN',
  'background_father_ISRAEL',
  'background_mother_POMERANIA',
  'background_mother_ITALY',
  'background_mother_GERMANY',
  'background_mother_BRAZIL',
  'background_mother_UNK',
  'background_mother_POLAND',
  'background_mother_NORWAY',
  'background_mother_PORTUGAL',
  'background_mother_NETHERLANDS',
  'background_mother_FRANCE',
  'background_mother_SPAIN',
  'age',
  'gender_FEMALE',
  'gender_MALE',
  'skin_cancer_history_True',
  'skin_cancer_history_False',
  'cancer_history_True',
  'cancer_history_False',
  'fitspatrick_3.0',
  'fitspatrick_1.0',
  'fitspatrick_2.0',
  'fitspatrick_4.0',
  'fitspatrick_5

In [10]:
# metacols_dict = {prefix: [col for col in df.columns if col.startswith(prefix)] for prefix in meta_prefixes}
# #metacols_dict

# Training

In [11]:
# usemeta = ['age', 'gender', 'region']
usemeta = ['history', 'habits', 'lesion']

In [12]:
metacombs = get_combs(usemeta)
len(metacombs)

7

In [13]:
metacombs

[('history',),
 ('habits',),
 ('lesion',),
 ('history', 'habits'),
 ('history', 'lesion'),
 ('habits', 'lesion'),
 ('history', 'habits', 'lesion')]

In [14]:
combcols_dict = {comb: [col for meta in comb for col in metacols_dict[meta]] for comb in metacombs}
combcols_dict

{('history',): ['background_father_POMERANIA',
  'background_father_GERMANY',
  'background_father_BRAZIL',
  'background_father_NETHERLANDS',
  'background_father_ITALY',
  'background_father_POLAND',
  'background_father_UNK',
  'background_father_PORTUGAL',
  'background_father_BRASIL',
  'background_father_CZECH',
  'background_father_AUSTRIA',
  'background_father_SPAIN',
  'background_father_ISRAEL',
  'background_mother_POMERANIA',
  'background_mother_ITALY',
  'background_mother_GERMANY',
  'background_mother_BRAZIL',
  'background_mother_UNK',
  'background_mother_POLAND',
  'background_mother_NORWAY',
  'background_mother_PORTUGAL',
  'background_mother_NETHERLANDS',
  'background_mother_FRANCE',
  'background_mother_SPAIN',
  'age',
  'gender_FEMALE',
  'gender_MALE',
  'skin_cancer_history_True',
  'skin_cancer_history_False',
  'cancer_history_True',
  'cancer_history_False',
  'fitspatrick_3.0',
  'fitspatrick_1.0',
  'fitspatrick_2.0',
  'fitspatrick_4.0',
  'fitspatric

In [15]:
torch.cuda.device_count()

4

In [16]:
# model_names = ['resnet18']
#model_names = ['resnet34', 'resnet50', 'resnet101', 'resnet152', 'effnetb0', 'effnetb1',
             #  'effnetb2', 'effnetb3', 'effnetb4', 'effnetb5']
model_names = ['resnext50', 'vgg11', 'vit_b_32']

In [17]:
# fusion_methods = ['concat', 'metanet', 'metablock']
fusion_methods = ['concat']

In [18]:
data_dir      = 'imgs'
batch_size    = 32
num_workers   = 16
input_size    = 224

train_transform = transforms.Compose([transforms.RandomResizedCrop(input_size),
                                transforms.RandomHorizontalFlip(),
                                transforms.ToTensor(),
                                transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])])

val_transform   = transforms.Compose([transforms.Resize((input_size, input_size)),
                                transforms.ToTensor(),
                                transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])])

In [19]:
#n_classes = len(set(train_labels))
n_epochs  = 100


lr        = 1e-3 # Learning rate
device    = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

n_samples = df.diagnostic_number.value_counts().sort_index().values

weights = [1 - (x / sum(n_samples)) for x in n_samples]
weights = torch.FloatTensor(weights).to(device)

criterion = nn.CrossEntropyLoss(weight=weights).to(device)

saved_models_folder      = 'saved_comb_models'
saved_scores_folder      = 'saved_comb_scores'
saved_base_models_folder = 'saved_basemodels'
saved_base_scores_folder = 'saved_basescores'

cuda


In [20]:
n_samples

array([608, 704,  43, 204, 160, 196])

In [21]:
#folds = [0, 1, 2, 3, 4]
#folds = [1, 2, 3, 4]
folds  = [0]
n_reducer_block = 256

for fold in folds:
    for metacomb in metacombs:
        metacomb_name = '_'.join(list(metacomb))
        
        metadata_cols = combcols_dict[metacomb]
        
        # Dataloaders
        train_idcs = train_folds[fold]
        val_idcs   = val_folds[fold]
        train_imgs = df.loc[train_idcs, 'img_id'].values
        val_imgs   = df.loc[val_idcs, 'img_id'].values
        test_imgs  = df.loc[test_idcs, 'img_id'].values

        train_paths = [f'{os.path.join(data_dir, img)}' for img in train_imgs]
        val_paths   = [f'{os.path.join(data_dir, img)}' for img in val_imgs]
        test_paths  = [f'{os.path.join(data_dir, img)}' for img in test_imgs]

        train_labels = df.loc[train_idcs, 'diagnostic_number'].values
        val_labels   = df.loc[val_idcs, 'diagnostic_number'].values
        test_labels  = df.loc[test_idcs, 'diagnostic_number'].values

        train_metadata = df.loc[train_idcs, metadata_cols].values
        val_metadata   = df.loc[val_idcs, metadata_cols].values
        test_metadata  = df.loc[test_idcs, metadata_cols].values
        train_dataloader = get_data_loader(train_paths, train_labels, metadata=train_metadata, transform=train_transform, batch_size=batch_size, num_workers=num_workers)
        val_dataloader   = get_data_loader(val_paths, val_labels, metadata=val_metadata, transform=val_transform, batch_size=batch_size, num_workers=num_workers)
        test_dataloader  = get_data_loader(test_paths, test_labels, metadata=test_metadata, transform=val_transform, batch_size=batch_size, num_workers=num_workers) 

        # Training
        n_classes  = len(set(train_labels))
        n_metadata = train_metadata.shape[1]

        for model_name in model_names:

            base_save_path = f'best_base_{model_name}_w_{fold}'
            base_model     = BaseMetaModel(get_model(model_name, n_classes=n_classes, pretrained=True)).to(device)
            base_model.load_state_dict(torch.load(os.path.join(saved_base_models_folder, base_save_path)))

            for fusion_method in fusion_methods:
                print(f'{"*"*79}\n{model_name.upper()} {metacomb_name.upper()} FOLD {fold} {fusion_method.upper()}\n{"*"*79}\n')

                save_path = f'{model_name}_{fusion_method}_{metacomb_name}_{fold}'
                model     = MetaModel(base_model, n_classes, n_metadata=n_metadata, fusion_method=fusion_method, n_reducer_block=n_reducer_block).to(device)

                optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)
                scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=7)

                train(model, train_dataloader, val_dataloader, optimizer, scheduler, criterion, device, n_epochs,
                  saved_models_folder, saved_scores_folder, save_path, printfreq=10)

                del model
                gc.collect()
                torch.cuda.empty_cache()
#             del base_model
#             gc.collect()
#             torch.cuda.empty_cache()

*******************************************************************************
RESNEXT50 HISTORY FOLD 0 CONCAT
*******************************************************************************

 Epoch    Train Loss    Val Loss    Train Acc    Val Acc    Best      lr      Time [min]
-----------------------------------------------------------------------------------------
    10      0.7117       0.8493       0.7485      0.7074     ***    1.0e-03       0.1
    20      0.5734       1.1672       0.7958      0.6278            1.0e-04       0.1
Training stopped early
-----------------------------------------------------------------------------------------
Total time [min] for 25 Epochs: 3.1
*******************************************************************************
VGG11 HISTORY FOLD 0 CONCAT
*******************************************************************************

 Epoch    Train Loss    Val Loss    Train Acc    Val Acc    Best      lr      Time [min]
-----------------------------

    30      0.5596       0.7871       0.8198      0.7188     ***    1.0e-04       0.1
    40      0.5243       0.9222       0.8039      0.6392            1.0e-05       0.1
Training stopped early
-----------------------------------------------------------------------------------------
Total time [min] for 45 Epochs: 5.7
*******************************************************************************
VGG11 HISTORY_HABITS FOLD 0 CONCAT
*******************************************************************************

 Epoch    Train Loss    Val Loss    Train Acc    Val Acc    Best      lr      Time [min]
-----------------------------------------------------------------------------------------
    10      1.1977       0.9089       0.6316      0.7131     ***    1.0e-03       0.2
    20      0.9117       1.3578       0.6927      0.6449            1.0e-03       0.1
    30      0.6836       1.4012       0.7568      0.4631            1.0e-03       0.1
    40      0.5127       0.8017       0.8226  

Training stopped early
-----------------------------------------------------------------------------------------
Total time [min] for 58 Epochs: 7.7
*******************************************************************************
VGG11 HISTORY_HABITS_LESION FOLD 0 CONCAT
*******************************************************************************

 Epoch    Train Loss    Val Loss    Train Acc    Val Acc    Best      lr      Time [min]
-----------------------------------------------------------------------------------------
    10      1.0560       1.0452       0.6432      0.6648            1.0e-03       0.1
    20      0.7578       1.6386       0.7269      0.5142            1.0e-03       0.1
    30      0.4992       0.9409       0.8240      0.7585            1.0e-03       0.1
    40      0.4940       0.9039       0.8274      0.6733            1.0e-03       0.1
    50      0.4004       0.7647       0.8603      0.7642            1.0e-04       0.1
    60      0.3451       0.7394       0

# Testing

In [22]:
fusion_methods

['concat']

In [29]:
folds            = [0]
#model_names      = ['resnet18]
# model_names      = ['resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152', 'effnetb0', 'effnetb1',
#                'effnetb2', 'effnetb3', 'effnetb4', 'effnetb5']
# model_names = ['resnet18', 'resnet50', 'effnetb3', 'resnext50', 'vgg11', 'vit_b_32']
model_names = ['resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152', 'resnext50', 
               'effnetb0', 'effnetb1','effnetb2', 'effnetb3', 'effnetb4', 'effnetb5', 'vgg11', 'vit_b_32']

#fusion_methods   = ['concat', 'metanet', 'metablock']
fusion_methods   = ['concat']

In [30]:
#usemeta = ['age', 'gender', 'region']
usemeta = ['history', 'habits', 'lesion']
metacombs = get_combs(usemeta)
metacombs

[('history',),
 ('habits',),
 ('lesion',),
 ('history', 'habits'),
 ('history', 'lesion'),
 ('habits', 'lesion'),
 ('history', 'habits', 'lesion')]

In [31]:
#folds = [0, 1, 2, 3, 4]
#folds = [1, 2, 3, 4]
folds  = [0]
n_reducer_block = 256

all_metrics_dict = dict()
for fold in folds:
    fold_dict = dict()
    for metacomb in metacombs:
        metacomb_dict = dict()
        metacomb_name = '_'.join(list(metacomb))
        
        metadata_cols = combcols_dict[metacomb]
        
        # Dataloaders
        test_imgs  = df.loc[test_idcs, 'img_id'].values

        test_paths  = [f'{os.path.join(data_dir, img)}' for img in test_imgs]

        test_labels  = df.loc[test_idcs, 'diagnostic_number'].values

        test_metadata  = df.loc[test_idcs, metadata_cols].values
        test_dataloader  = get_data_loader(test_paths, test_labels, metadata=test_metadata, transform=val_transform, batch_size=batch_size, num_workers=num_workers) 

        # Training
        n_classes  = 6
        n_metadata = test_metadata.shape[1]

        for model_name in model_names:
            model_dict = dict()
            base_model     = BaseMetaModel(get_model(model_name, n_classes=n_classes, pretrained=True)).to(device)

            for fusion_method in fusion_methods:
                print(f'{"*"*79}\n{model_name.upper()} {metacomb_name.upper()} FOLD {fold} {fusion_method.upper()}\n{"*"*79}\n')
                
                save_path = f'best_{model_name}_{fusion_method}_{metacomb_name}_{fold}'
                
                model = MetaModel(base_model, n_classes, n_metadata=n_metadata, fusion_method=fusion_method, n_reducer_block=n_reducer_block).to(device)
                model.load_state_dict(torch.load(os.path.join(saved_models_folder, save_path)))

                y_true, y_prob, y_pred = get_scores(model, test_dataloader, batch_size, device)
                np.save(f'test_scores/y_true_{model_name}_{fusion_method}_{metacomb_name}_{fold}', y_true)
                np.save(f'test_scores/y_prob_{model_name}_{fusion_method}_{metacomb_name}_{fold}', y_prob)
                np.save(f'test_scores/y_pred_{model_name}_{fusion_method}_{metacomb_name}_{fold}', y_pred)
                metrics_dict = get_metrics(y_true, y_prob, y_pred)
                
                del model
                gc.collect()
                torch.cuda.empty_cache()
                model_dict[fusion_method] = metrics_dict
            del base_model
            gc.collect()
            torch.cuda.empty_cache()
            metacomb_dict[model_name] = model_dict
        fold_dict[metacomb_name] = metacomb_dict
    all_metrics_dict[fold] = fold_dict

*******************************************************************************
RESNET18 HISTORY FOLD 0 CONCAT
*******************************************************************************

*******************************************************************************
RESNET34 HISTORY FOLD 0 CONCAT
*******************************************************************************

*******************************************************************************
RESNET50 HISTORY FOLD 0 CONCAT
*******************************************************************************

*******************************************************************************
RESNET101 HISTORY FOLD 0 CONCAT
*******************************************************************************

*******************************************************************************
RESNET152 HISTORY FOLD 0 CONCAT
*******************************************************************************

**************************************

Loaded pretrained weights for efficientnet-b5
*******************************************************************************
EFFNETB5 LESION FOLD 0 CONCAT
*******************************************************************************

*******************************************************************************
VGG11 LESION FOLD 0 CONCAT
*******************************************************************************

*******************************************************************************
VIT_B_32 LESION FOLD 0 CONCAT
*******************************************************************************

*******************************************************************************
RESNET18 HISTORY_HABITS FOLD 0 CONCAT
*******************************************************************************

*******************************************************************************
RESNET34 HISTORY_HABITS FOLD 0 CONCAT
*******************************************************************

Loaded pretrained weights for efficientnet-b2
*******************************************************************************
EFFNETB2 HABITS_LESION FOLD 0 CONCAT
*******************************************************************************

Loaded pretrained weights for efficientnet-b3
*******************************************************************************
EFFNETB3 HABITS_LESION FOLD 0 CONCAT
*******************************************************************************

Loaded pretrained weights for efficientnet-b4
*******************************************************************************
EFFNETB4 HABITS_LESION FOLD 0 CONCAT
*******************************************************************************

Loaded pretrained weights for efficientnet-b5
*******************************************************************************
EFFNETB5 HABITS_LESION FOLD 0 CONCAT
*******************************************************************************

************************

In [32]:
import shutil
shutil.make_archive('test_scores', 'zip', 'test_scores')

'/home/gabriel/skin/PADUFES20/test_scores.zip'

In [33]:
import json

with open('metrics_combs_allmodels_concat_grouped_final.json', 'w') as outfile:
    json.dump(all_metrics_dict, outfile)
    
all_metrics_dict

{0: {'history': {'resnet18': {'concat': {'precision': 0.6853411187896432,
     'recall': 0.6727688787185355,
     'f1-score': 0.669865624037287,
     'support': 437,
     'accuracy': 0.6727688787185355,
     'balanced_accuracy': 0.5582073233749586,
     'auc': 0.8852411965368403}},
   'resnet34': {'concat': {'precision': 0.6589858814099683,
     'recall': 0.6796338672768879,
     'f1-score': 0.6613556884226673,
     'support': 437,
     'accuracy': 0.6796338672768879,
     'balanced_accuracy': 0.5872672773211848,
     'auc': 0.8896146124116102}},
   'resnet50': {'concat': {'precision': 0.6954203931577283,
     'recall': 0.7048054919908466,
     'f1-score': 0.6904772600618676,
     'support': 437,
     'accuracy': 0.7048054919908466,
     'balanced_accuracy': 0.5976799465477339,
     'auc': 0.9027460043583655}},
   'resnet101': {'concat': {'precision': 0.6524416688850675,
     'recall': 0.6498855835240275,
     'f1-score': 0.6376482089989813,
     'support': 437,
     'accuracy': 0.6498

In [34]:
pd.DataFrame(all_metrics_dict[0]['age']['resnet18'])

KeyError: 'age'

In [None]:
metacomb_names = ['_'.join(list(metacomb)) for metacomb in metacombs]
metacomb_names

In [None]:
pd.DataFrame({metacomb: all_metrics_dict[0][metacomb]['resnet18']['concat'] for metacomb in metacomb_names})

In [None]:
pd.DataFrame({metacomb: all_metrics_dict[0][metacomb]['resnet18']['metanet'] for metacomb in metacomb_names})

In [None]:
pd.DataFrame({metacomb: all_metrics_dict[0][metacomb]['resnet18']['metablock'] for metacomb in metacomb_names})