# PADUFES20



Libraries

In [1]:
import pandas as pd
import numpy as np
import pickle
import os
import sys
import time
import gc
import warnings
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.models as models
import torchvision.transforms as transforms
from torchvision.models.feature_extraction import get_graph_node_names
from torchvision.models.feature_extraction import create_feature_extractor
import copy
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [15, 7]

from torch.utils.data import Dataset, DataLoader
from PIL import Image
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score

from efficientnet_pytorch import EfficientNet
import torchextractor as tx

from itertools import chain, combinations

def get_combs(l):
    return list(chain.from_iterable(combinations(l, r) for r in range(1, len(l)+1)))

In [2]:
sys.path.append('..')

from utils.train import train
from utils.metrics import get_scores, get_metrics
from utils.dataset import get_data_loader
from utils.models import get_model, BaseMetaModel, MetaModel

# Dataset

In [3]:
df = pd.read_csv('train_meta.csv')
df.head()

Unnamed: 0,img_id,patient_id,lesion_id,biopsed,diagnostic,diagnostic_number,age,smoke_False,smoke_True,drink_False,...,hurt_UNK,changed_False,changed_True,changed_UNK,bleed_False,bleed_True,bleed_UNK,elevation_False,elevation_True,elevation_UNK
0,PAT_1516_1765_530.png,PAT_1516,1765,False,NEV,3,8,0,0,0,...,0,1,0,0,1,0,0,1,0,0
1,PAT_46_881_939.png,PAT_46,881,True,BCC,1,55,1,0,1,...,0,0,1,0,0,1,0,0,1,0
2,PAT_1545_1867_547.png,PAT_1545,1867,False,ACK,0,77,0,0,0,...,0,1,0,0,1,0,0,1,0,0
3,PAT_1989_4061_934.png,PAT_1989,4061,False,ACK,0,75,0,0,0,...,0,1,0,0,1,0,0,1,0,0
4,PAT_1549_1882_230.png,PAT_1549,1882,False,SEK,5,53,0,0,0,...,0,1,0,0,1,0,0,0,1,0


In [4]:
open_file = open('train_idcs', "rb")
train_folds = pickle.load(open_file)
open_file.close()

open_file = open('val_idcs', "rb")
val_folds = pickle.load(open_file)
open_file.close()

open_file = open('test_idcs', "rb")
test_idcs = pickle.load(open_file)
open_file.close()

## Metadata columns

In [5]:
meta_prefixes = ['smoke', 'drink', 'background_father',
       'background_mother', 'age', 'pesticide', 'gender',
       'skin_cancer_history', 'cancer_history', 'has_piped_water',
       'has_sewage_system', 'fitspatrick', 'region', 'diameter_1',
       'diameter_2', 'itch', 'grew', 'hurt', 'changed', 'bleed',
       'elevation']

In [6]:
# meta_prefixes = ['smoke', 'drink', 'background', 'age', 'pesticide', 'gender',
#        'skin_cancer_history', 'cancer_history', 'has_piped_water',
#        'has_sewage_system', 'fitspatrick', 'region', 'diameter_1',
#        'diameter_2', 'itch', 'grew', 'hurt', 'changed', 'bleed',
#        'elevation']

In [7]:
# # history_meta  = ['background_father','background_mother', 'age', 'gender', 'skin_cancer_history',
# #                  'cancer_history', 'fitspatrick']

# # habits_meta = ['smoke', 'drink', 'pesticide', 'has_piped_water', 'has_sewage_system']

# # lesion_meta = ['region', 'diameter_1', 'diameter_2', 'itch', 'grew', 'hurt', 'changed', 'bleed', 'elevation']
# meta_types_dict = {'history': history_meta, 'habits': habits_meta, 'lesion': lesion_meta} 

In [8]:
# metacols_dict = {meta_type: [col  for prefix in meta_types_dict[meta_type] for col in df.columns if col.startswith(prefix)] for meta_type in  ['history', 'habits', 'lesion']}
# metacols_dict

In [9]:
metacols_dict = {prefix: [col for col in df.columns if col.startswith(prefix)] for prefix in meta_prefixes}
#metacols_dict

# Training

In [10]:
usemeta = ['age', 'gender', 'region']

In [11]:
metacombs = get_combs(usemeta)
len(metacombs)

7

In [12]:
metacombs

[('age',),
 ('gender',),
 ('region',),
 ('age', 'gender'),
 ('age', 'region'),
 ('gender', 'region'),
 ('age', 'gender', 'region')]

In [13]:
combcols_dict = {comb: [col for meta in comb for col in metacols_dict[meta]] for comb in metacombs}
#combcols_dict

In [14]:
torch.cuda.device_count()

4

In [15]:
# model_names = ['resnet18']
# model_names = ['resnet34', 'resnet50', 'resnet101', 'resnet152', 'effnetb0', 'effnetb1',
#                'effnetb2', 'effnetb3', 'effnetb4', 'effnetb5']

model_names = ['resnext50', 'vgg11', 'vit_b_32']

In [16]:
# fusion_methods = ['concat', 'metanet', 'metablock']
fusion_methods = ['concat']

In [17]:
data_dir      = 'imgs'
batch_size    = 32
num_workers   = 16
input_size    = 224

train_transform = transforms.Compose([transforms.RandomResizedCrop(input_size),
                                transforms.RandomHorizontalFlip(),
                                transforms.ToTensor(),
                                transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])])

val_transform   = transforms.Compose([transforms.Resize((input_size, input_size)),
                                transforms.ToTensor(),
                                transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])])

In [18]:
#n_classes = len(set(train_labels))
n_epochs  = 100


lr        = 1e-3 # Learning rate
device    = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

n_samples = df.diagnostic_number.value_counts().sort_index().values

weights = [1 - (x / sum(n_samples)) for x in n_samples]
weights = torch.FloatTensor(weights).to(device)

criterion = nn.CrossEntropyLoss(weight=weights).to(device)

saved_models_folder      = 'saved_comb_models'
saved_scores_folder      = 'saved_comb_scores'
saved_base_models_folder = 'saved_basemodels'
saved_base_scores_folder = 'saved_basescores'

cuda


In [19]:
# metacombs = [('gender', 'region')]

In [21]:
#folds = [0, 1, 2, 3, 4]
#folds = [1, 2, 3, 4]
folds  = [0]
n_reducer_block = 256

for fold in folds:
    for metacomb in metacombs:
        metacomb_name = '_'.join(list(metacomb))
        
        metadata_cols = combcols_dict[metacomb]
        
        # Dataloaders
        train_idcs = train_folds[fold]
        val_idcs   = val_folds[fold]
        train_imgs = df.loc[train_idcs, 'img_id'].values
        val_imgs   = df.loc[val_idcs, 'img_id'].values
        test_imgs  = df.loc[test_idcs, 'img_id'].values

        train_paths = [f'{os.path.join(data_dir, img)}' for img in train_imgs]
        val_paths   = [f'{os.path.join(data_dir, img)}' for img in val_imgs]
        test_paths  = [f'{os.path.join(data_dir, img)}' for img in test_imgs]

        train_labels = df.loc[train_idcs, 'diagnostic_number'].values
        val_labels   = df.loc[val_idcs, 'diagnostic_number'].values
        test_labels  = df.loc[test_idcs, 'diagnostic_number'].values

        train_metadata = df.loc[train_idcs, metadata_cols].values
        val_metadata   = df.loc[val_idcs, metadata_cols].values
        test_metadata  = df.loc[test_idcs, metadata_cols].values
        train_dataloader = get_data_loader(train_paths, train_labels, metadata=train_metadata, transform=train_transform, batch_size=batch_size, num_workers=num_workers)
        val_dataloader   = get_data_loader(val_paths, val_labels, metadata=val_metadata, transform=val_transform, batch_size=batch_size, num_workers=num_workers)
        test_dataloader  = get_data_loader(test_paths, test_labels, metadata=test_metadata, transform=val_transform, batch_size=batch_size, num_workers=num_workers) 

        # Training
        n_classes  = len(set(train_labels))
        n_metadata = train_metadata.shape[1]

        for model_name in model_names:

            base_save_path = f'best_base_{model_name}_w_{fold}'
            base_model     = BaseMetaModel(get_model(model_name, n_classes=n_classes, pretrained=True)).to(device)
            base_model.load_state_dict(torch.load(os.path.join(saved_base_models_folder, base_save_path)))

            for fusion_method in fusion_methods:
                print(f'{"*"*79}\n{model_name.upper()} {metacomb_name.upper()} FOLD {fold} {fusion_method.upper()}\n{"*"*79}\n')

                save_path = f'{model_name}_{fusion_method}_{metacomb_name}_{fold}'
                model     = MetaModel(base_model, n_classes, n_metadata=n_metadata, fusion_method=fusion_method, n_reducer_block=n_reducer_block).to(device)

                optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)
                scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=7)

                train(model, train_dataloader, val_dataloader, optimizer, scheduler, criterion, device, n_epochs,
                  saved_models_folder, saved_scores_folder, save_path, printfreq=10)

                del model
                gc.collect()
                torch.cuda.empty_cache()

*******************************************************************************
RESNEXT50 AGE_GENDER_REGION FOLD 0 CONCAT
*******************************************************************************

 Epoch    Train Loss    Val Loss    Train Acc    Val Acc    Best      lr      Time [min]
-----------------------------------------------------------------------------------------
    10      0.7272       1.0596       0.7446      0.6165            1.0e-03       0.1
    20      0.5251       0.8958       0.8025      0.7472            1.0e-04       0.1
    30      0.5503       0.9260       0.8054      0.7216            1.0e-04       0.1
    40      0.5814       0.9009       0.7995      0.7188            1.0e-05       0.1
Training stopped early
-----------------------------------------------------------------------------------------
Total time [min] for 48 Epochs: 5.7
*******************************************************************************
VGG11 AGE_GENDER_REGION FOLD 0 CONCAT
*******

### Combi faltante

In [24]:
model_names = ['vit_b_32']
metacombs = [('gender', 'region'), ('age', 'gender', 'region')]

In [25]:
#folds = [0, 1, 2, 3, 4]
#folds = [1, 2, 3, 4]
folds  = [0]
n_reducer_block = 256

for fold in folds:
    for metacomb in metacombs:
        metacomb_name = '_'.join(list(metacomb))
        
        metadata_cols = combcols_dict[metacomb]
        
        # Dataloaders
        train_idcs = train_folds[fold]
        val_idcs   = val_folds[fold]
        train_imgs = df.loc[train_idcs, 'img_id'].values
        val_imgs   = df.loc[val_idcs, 'img_id'].values
        test_imgs  = df.loc[test_idcs, 'img_id'].values

        train_paths = [f'{os.path.join(data_dir, img)}' for img in train_imgs]
        val_paths   = [f'{os.path.join(data_dir, img)}' for img in val_imgs]
        test_paths  = [f'{os.path.join(data_dir, img)}' for img in test_imgs]

        train_labels = df.loc[train_idcs, 'diagnostic_number'].values
        val_labels   = df.loc[val_idcs, 'diagnostic_number'].values
        test_labels  = df.loc[test_idcs, 'diagnostic_number'].values

        train_metadata = df.loc[train_idcs, metadata_cols].values
        val_metadata   = df.loc[val_idcs, metadata_cols].values
        test_metadata  = df.loc[test_idcs, metadata_cols].values
        train_dataloader = get_data_loader(train_paths, train_labels, metadata=train_metadata, transform=train_transform, batch_size=batch_size, num_workers=num_workers)
        val_dataloader   = get_data_loader(val_paths, val_labels, metadata=val_metadata, transform=val_transform, batch_size=batch_size, num_workers=num_workers)
        test_dataloader  = get_data_loader(test_paths, test_labels, metadata=test_metadata, transform=val_transform, batch_size=batch_size, num_workers=num_workers) 

        # Training
        n_classes  = len(set(train_labels))
        n_metadata = train_metadata.shape[1]

        for model_name in model_names:

            base_save_path = f'best_base_{model_name}_w_{fold}'
            base_model     = BaseMetaModel(get_model(model_name, n_classes=n_classes, pretrained=True)).to(device)
            base_model.load_state_dict(torch.load(os.path.join(saved_base_models_folder, base_save_path)))

            for fusion_method in fusion_methods:
                print(f'{"*"*79}\n{model_name.upper()} {metacomb_name.upper()} FOLD {fold} {fusion_method.upper()}\n{"*"*79}\n')

                save_path = f'{model_name}_{fusion_method}_{metacomb_name}_{fold}'
                model     = MetaModel(base_model, n_classes, n_metadata=n_metadata, fusion_method=fusion_method, n_reducer_block=n_reducer_block).to(device)

                optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)
                scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=7)

                train(model, train_dataloader, val_dataloader, optimizer, scheduler, criterion, device, n_epochs,
                  saved_models_folder, saved_scores_folder, save_path, printfreq=10)

                del model
                gc.collect()
                torch.cuda.empty_cache()

*******************************************************************************
VIT_B_32 GENDER_REGION FOLD 0 CONCAT
*******************************************************************************

 Epoch    Train Loss    Val Loss    Train Acc    Val Acc    Best      lr      Time [min]
-----------------------------------------------------------------------------------------
    10      0.7871       0.8792       0.7345      0.6960     ***    1.0e-03       0.2
    20      0.6328       0.8893       0.7747      0.7017            1.0e-03       0.1
    30      0.5667       0.9522       0.7978      0.6051            1.0e-03       0.1
    40      0.5152       1.0347       0.8128      0.6250            1.0e-03       0.1
    50      0.4637       1.2883       0.8383      0.6136            1.0e-04       0.1
Training stopped early
-----------------------------------------------------------------------------------------
Total time [min] for 53 Epochs: 6.8
********************************************

# Testing

In [20]:
folds            = [0]
#model_names      = ['resnet18]
# model_names      = ['resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152', 'effnetb0', 'effnetb1',
#                'effnetb2', 'effnetb3', 'effnetb4', 'effnetb5']
# model_names = ['resnet18', 'resnet50', 'effnetb3', 'resnext50', 'vgg11', 'vit_b_32']
model_names = ['resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152', 'resnext50', 
               'effnetb0', 'effnetb1','effnetb2', 'effnetb3', 'effnetb4', 'effnetb5', 'vgg11', 'vit_b_32']



#fusion_methods   = ['concat', 'metanet', 'metablock']
fusion_methods   = ['concat']

In [21]:
usemeta = ['age', 'gender', 'region']
metacombs = get_combs(usemeta)
metacombs

[('age',),
 ('gender',),
 ('region',),
 ('age', 'gender'),
 ('age', 'region'),
 ('gender', 'region'),
 ('age', 'gender', 'region')]

In [22]:
#folds = [0, 1, 2, 3, 4]
#folds = [1, 2, 3, 4]
folds  = [0]
n_reducer_block = 256

all_metrics_dict = dict()
for fold in folds:
    fold_dict = dict()
    for metacomb in metacombs:
        metacomb_dict = dict()
        metacomb_name = '_'.join(list(metacomb))
        
        metadata_cols = combcols_dict[metacomb]
        
        # Dataloaders
        test_imgs  = df.loc[test_idcs, 'img_id'].values

        test_paths  = [f'{os.path.join(data_dir, img)}' for img in test_imgs]

        test_labels  = df.loc[test_idcs, 'diagnostic_number'].values

        test_metadata  = df.loc[test_idcs, metadata_cols].values
        test_dataloader  = get_data_loader(test_paths, test_labels, metadata=test_metadata, transform=val_transform, batch_size=batch_size, num_workers=num_workers) 

        # Training
        n_classes  = 6
        n_metadata = test_metadata.shape[1]

        for model_name in model_names:
            model_dict = dict()
            base_model     = BaseMetaModel(get_model(model_name, n_classes=n_classes, pretrained=True)).to(device)

            for fusion_method in fusion_methods:
                print(f'{"*"*79}\n{model_name.upper()} {metacomb_name.upper()} FOLD {fold} {fusion_method.upper()}\n{"*"*79}\n')
                
                save_path = f'best_{model_name}_{fusion_method}_{metacomb_name}_{fold}'
                
                model = MetaModel(base_model, n_classes, n_metadata=n_metadata, fusion_method=fusion_method, n_reducer_block=n_reducer_block).to(device)
                model.load_state_dict(torch.load(os.path.join(saved_models_folder, save_path)))

                y_true, y_prob, y_pred = get_scores(model, test_dataloader, batch_size, device)
                print(y_true.shape, y_prob.shape, y_pred.shape)
                np.save(f'test_scores/y_true_{model_name}_{fusion_method}_{metacomb_name}_{fold}', y_true)
                np.save(f'test_scores/y_prob_{model_name}_{fusion_method}_{metacomb_name}_{fold}', y_prob)
                np.save(f'test_scores/y_pred_{model_name}_{fusion_method}_{metacomb_name}_{fold}', y_pred)
                metrics_dict = get_metrics(y_true, y_prob, y_pred)
                
                del model
                gc.collect()
                torch.cuda.empty_cache()
                model_dict[fusion_method] = metrics_dict
            del base_model
            gc.collect()
            torch.cuda.empty_cache()
            metacomb_dict[model_name] = model_dict
        fold_dict[metacomb_name] = metacomb_dict
    all_metrics_dict[fold] = fold_dict

*******************************************************************************
RESNET18 AGE FOLD 0 CONCAT
*******************************************************************************

(437,) (437, 6) (437,)
*******************************************************************************
RESNET34 AGE FOLD 0 CONCAT
*******************************************************************************

(437,) (437, 6) (437,)
*******************************************************************************
RESNET50 AGE FOLD 0 CONCAT
*******************************************************************************

(437,) (437, 6) (437,)
*******************************************************************************
RESNET101 AGE FOLD 0 CONCAT
*******************************************************************************

(437,) (437, 6) (437,)
*******************************************************************************
RESNET152 AGE FOLD 0 CONCAT
***********************************************

(437,) (437, 6) (437,)
Loaded pretrained weights for efficientnet-b2
*******************************************************************************
EFFNETB2 REGION FOLD 0 CONCAT
*******************************************************************************

(437,) (437, 6) (437,)
Loaded pretrained weights for efficientnet-b3
*******************************************************************************
EFFNETB3 REGION FOLD 0 CONCAT
*******************************************************************************

(437,) (437, 6) (437,)
Loaded pretrained weights for efficientnet-b4
*******************************************************************************
EFFNETB4 REGION FOLD 0 CONCAT
*******************************************************************************

(437,) (437, 6) (437,)
Loaded pretrained weights for efficientnet-b5
*******************************************************************************
EFFNETB5 REGION FOLD 0 CONCAT
*****************************************

(437,) (437, 6) (437,)
*******************************************************************************
RESNET34 GENDER_REGION FOLD 0 CONCAT
*******************************************************************************

(437,) (437, 6) (437,)
*******************************************************************************
RESNET50 GENDER_REGION FOLD 0 CONCAT
*******************************************************************************

(437,) (437, 6) (437,)
*******************************************************************************
RESNET101 GENDER_REGION FOLD 0 CONCAT
*******************************************************************************

(437,) (437, 6) (437,)
*******************************************************************************
RESNET152 GENDER_REGION FOLD 0 CONCAT
*******************************************************************************

(437,) (437, 6) (437,)
*******************************************************************************
RESNEXT50 G

In [23]:
y_prob.shape

(437, 6)

In [None]:
all_metrics_dict = dict()
fold = '0'

for model_name in model_names:
    model_dict = dict()
    for fusion_method in fusion_methods:
        y_true_path = f'test_scores/y_true_{model_name}_{fusion_method}_{fold}.npy'
        y_prob_path = f'test_scores/y_prob_{model_name}_{fusion_method}_{fold}.npy'
        y_pred_path = f'test_scores/y_pred_{model_name}_{fusion_method}_{fold}.npy'

        y_true = np.load(y_true_path)
        y_prob = np.load(y_true_path)
        y_pred = np.load(y_true_path)
        
        print(y_true.shape, y_prob.shape, y_pred.shape)

        metrics_dict = get_metrics(y_true, y_prob, y_pred)
        model_dict[fusion_method] = metrics_dict
    all_metrics_dict[model_name]  = model_dict

In [23]:
import shutil
shutil.make_archive('test_scores', 'zip', 'test_scores')

'/home/gabriel/skin/PADUFES20/test_scores.zip'

In [24]:
import json

with open('metrics_combs_allmodels_concat_final.json', 'w') as outfile:
    json.dump(all_metrics_dict, outfile)
    
all_metrics_dict

{0: {'age': {'resnet18': {'concat': {'precision': 0.6513079052606302,
     'recall': 0.665903890160183,
     'f1-score': 0.651958053627099,
     'support': 437,
     'accuracy': 0.665903890160183,
     'balanced_accuracy': 0.5624184235809896,
     'auc': 0.8764033605797077}},
   'resnet34': {'concat': {'precision': 0.650874357568034,
     'recall': 0.665903890160183,
     'f1-score': 0.6478911336683435,
     'support': 437,
     'accuracy': 0.665903890160183,
     'balanced_accuracy': 0.5664371901322185,
     'auc': 0.8699099224292421}},
   'resnet50': {'concat': {'precision': 0.6982029388699772,
     'recall': 0.7025171624713958,
     'f1-score': 0.694146353154887,
     'support': 437,
     'accuracy': 0.7025171624713958,
     'balanced_accuracy': 0.5967702373979953,
     'auc': 0.8939539922077723}},
   'resnet101': {'concat': {'precision': 0.6515228474021939,
     'recall': 0.665903890160183,
     'f1-score': 0.6508299210304375,
     'support': 437,
     'accuracy': 0.665903890160183

In [25]:
pd.DataFrame(all_metrics_dict[0]['age']['resnet18'])

Unnamed: 0,concat
accuracy,0.665904
auc,0.876403
balanced_accuracy,0.562418
f1-score,0.651958
precision,0.651308
recall,0.665904
support,437.0


In [26]:
metacomb_names = ['_'.join(list(metacomb)) for metacomb in metacombs]
metacomb_names

['age',
 'gender',
 'region',
 'age_gender',
 'age_region',
 'gender_region',
 'age_gender_region']

In [27]:
pd.DataFrame({metacomb: all_metrics_dict[0][metacomb]['resnet18']['concat'] for metacomb in metacomb_names})

Unnamed: 0,age,gender,region,age_gender,age_region,gender_region,age_gender_region
precision,0.651308,0.663859,0.681149,0.646527,0.660583,0.669627,0.663081
recall,0.665904,0.684211,0.681922,0.665904,0.665904,0.688787,0.677346
f1-score,0.651958,0.663418,0.66187,0.647514,0.643711,0.67224,0.665991
support,437.0,437.0,437.0,437.0,437.0,437.0,437.0
accuracy,0.665904,0.684211,0.681922,0.665904,0.665904,0.688787,0.677346
balanced_accuracy,0.562418,0.550601,0.555068,0.561176,0.566506,0.587679,0.563369
auc,0.876403,0.882626,0.875867,0.87771,0.87525,0.881471,0.881707


In [28]:
pd.DataFrame({metacomb: all_metrics_dict[0][metacomb]['resnet18']['metanet'] for metacomb in metacomb_names})

KeyError: 'metanet'

In [None]:
pd.DataFrame({metacomb: all_metrics_dict[0][metacomb]['resnet18']['metablock'] for metacomb in metacomb_names})