In [1]:
import sys
sys.path.append('../')

from dataset_data.constants.var_types import VAR_TYPES

import pandas as pd
import numpy as np
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import accuracy_score

from tensorflow.keras.models import load_model

In [None]:
# Dataset Names: one entry per key of VAR_TYPES, in the order .keys() yields them
dsNames = [name for name in VAR_TYPES.keys()]

In [2]:
# Models: the saved Keras model of every dataset, keyed by dataset name
dict_models = dict()

for dsName in dsNames:
    # Each model is stored as ./models/<dataset name>.h5
    model_file = f'./models/{dsName}.h5'
    dict_models[dsName] = load_model(model_file)

In [3]:
def _delete_chars(text, unwanted):
    """Return `text` with every occurrence of each character in `unwanted` removed."""
    for ch in unwanted:
        text = text.replace(ch, '')
    return text


# best_params.txt is split on single spaces into six unnamed fields (0-5);
# field 5 is discarded right away.
best_param_info = pd.read_csv('./best_params.txt', sep=' ', header=None).drop(columns=[5])

# Fields 2-4 still carry tuple punctuation: "(" and "," on field 2, "," on field 3, ")" on field 4.
# The cleaned values remain strings.
for field, unwanted in ((2, '(,'), (3, ','), (4, ')')):
    best_param_info[field] = best_param_info[field].apply(
        lambda x, unwanted=unwanted: _delete_chars(x, unwanted)
    )

best_param_info.columns = ['dsName', 'AUC', 'LR', 'Epochs', 'Neurons']

In [4]:
experiment_data_path = '../dataset_data/experiments_data'

# Train / validation / test splits, each keyed by dataset name
dict_trainDS = {}
dict_validDS = {}
dict_testDS = {}

for dsName in dsNames:
    # Datasets with at least one categorical variable are read from the "...OHDATASET.csv"
    # files; the others from the plain "...DATASET.csv" files.
    oh_tag = 'OH' if len(VAR_TYPES[dsName]['categorical']) > 0 else ''

    # Read order (train, test, validation) matches the original cell
    for split_store, split_tag in ((dict_trainDS, 'TRAIN'),
                                   (dict_testDS, 'TEST'),
                                   (dict_validDS, 'VALIDATION')):
        split_file = f'{experiment_data_path}/{dsName}_{split_tag}{oh_tag}DATASET.csv'
        # The saved index column ('Unnamed: 0') becomes the frame's index
        split_store[dsName] = pd.read_csv(split_file, index_col='Unnamed: 0')
        

In [5]:
# Path of all full datasets
data_path = "../dataset_data/data"
full_datasets_paths = ["/NORM_BCW.csv", "/NORM_Ecoli.csv", "/NORM_Iris.csv", 
                       "/NORM_ISOLET.csv", "/NORM_SDD.csv", "/NORM_PBC.csv", 
                       "/NORM_CMSC.csv", "/NORM_MagicGT.csv", "/NORM_Wine.csv", 
                       "/OH_BalanceScale.csv", "/OH_CarEvaluation.csv", 
                       "/OH_HayesRoth.csv", "/OH_Chess.csv", "/OH_Lymphography.csv", 
                       "/OH_Nursery.csv", "/OH_SoybeanSmall.csv", "/OH_TicTacToe.csv", 
                       "/OH_NORM_DefaultOfCCC.csv", "/OH_NORM_StudentPerf.csv", 
                       "/OH_NORM_Adult.csv", "/OH_NORM_InternetAdv.csv", 
                       "/OH_NORM_StatlogGC.csv"]

# Full datasets with 'output' rewritten as a binary 0/1 label, keyed by dataset name
dict_fullDS = {}

for fullDSpath in full_datasets_paths:
    # "/OH_NORM_Adult.csv" -> "Adult": drop the extension, keep the last "_"-separated token
    dsName = fullDSpath.split('.')[0].split('_')[-1]

    df_full = pd.read_csv(data_path+fullDSpath)

    # Original author's note: because of a bug related to different versions of Pandas and/or
    # Numpy, value_counts().index may return different results when there are ties. This does
    # not affect results, only this report, so the reference class is detected automatically:
    # it is the class whose one-vs-rest AUC on the validation rows, scored with the model's
    # output column 1, equals the AUC stored in best_param_info (both rounded to 2 decimals).
    total_auc = round(best_param_info[best_param_info['dsName']==dsName]['AUC'].tolist()[0], 2)

    # Index labels of the validation split, used positionally on df_full.
    # NOTE(review): this relies on df_full keeping the default 0..n-1 index from read_csv.
    valid_idx = list(dict_validDS[dsName].index)
    df_valid = df_full.iloc[valid_idx]

    # Model scores for the validation rows; column 1 is the score tested against each class
    all_pred = dict_models[dsName].predict(df_valid.drop(columns=['output']))
    all_pred_prob = pd.DataFrame(all_pred)[1]

    # Only the validation labels are scored, so slice them once instead of relabelling
    # the whole frame for every candidate class.
    valid_labels = df_valid['output']

    for reference_class in df_full['output'].unique():
        # One-vs-rest target: 1 where the row belongs to the candidate class, 0 elsewhere
        is_reference = (valid_labels == reference_class).astype(int)

        fpr, tpr, _ = roc_curve(is_reference, all_pred_prob)
        auc_test = auc(fpr, tpr)

        # The right class reproduces the recorded AUC
        if round(auc_test, 2) == total_auc:
            break
    else:
        # No class matched: stop here rather than relabel with an arbitrary class
        raise AssertionError(
            f'{dsName}: no class of "output" reproduces the recorded AUC ({total_auc}) '
            'on the validation rows'
        )

    # Give the right binary labels: the detected class becomes 0, every other class 1
    df_full['output'] = (df_full['output'] != reference_class).astype('int64')

    dict_fullDS[dsName] = df_full

In [None]:
# Factual datasets, keyed by dataset name: rows listed in <name>_CFDATASET_0.csv ...
dict_fact0DS = {}
# ... and rows listed in <name>_CFDATASET_1.csv
dict_fact1DS = {}

for dsName in dsNames:
    for fact_store, file_tag in ((dict_fact0DS, 0), (dict_fact1DS, 1)):
        # Only the index of the saved file is used; the rows themselves are taken
        # from the relabelled full dataset.
        saved_frame = pd.read_csv(f'{experiment_data_path}/{dsName}_CFDATASET_{file_tag}.csv',
                                  index_col='Unnamed: 0')
        fact_store[dsName] = dict_fullDS[dsName].loc[list(saved_frame.index)]

In [6]:
def calc_acc_auc(dsName, df):
    """Score the stored model of a dataset on the rows of `df`.

    Parameters
    ----------
    dsName : key used to look up the model in `dict_models` and the labelled
        full dataset in `dict_fullDS`.
    df : DataFrame holding the model inputs plus an 'output' column. Its index
        labels select the ground-truth rows from `dict_fullDS[dsName]`; the
        'output' column of `df` itself is only dropped, never read.

    Returns
    -------
    tuple
        (accuracy, AUC), each rounded to two decimals.
    """
    features = df.drop(columns=['output'])

    # Ground truth comes from the relabelled full dataset
    labels = dict_fullDS[dsName].loc[list(df.index)]['output']
    # Two-column target: column 0 is the label, column 1 the label with 0 and 1 swapped
    targets = pd.concat([labels, labels.map({0:1, 1:0})], axis=1)

    scores = dict_models[dsName].predict(features)

    # Accuracy compares the position of the row-wise maximum in targets and scores
    true_pos = np.argmax(targets.to_numpy(), axis=1)
    pred_pos = np.argmax(scores, axis=1)
    acc = accuracy_score(true_pos, pred_pos)

    # AUC uses the second target column against the model's output column 1
    fpr, tpr, _ = roc_curve(targets.iloc[:,1:], pd.DataFrame(scores)[1])
    auc_m = auc(fpr, tpr)

    return round(acc, 2), round(auc_m, 2)

In [7]:
# One summary record per dataset: sizes, class balance, hyper-parameters and scores
rows_data = []
for dsName in dsNames:
    full_df = dict_fullDS[dsName]

    # Class balance of the relabelled full dataset
    n_0_class = full_df[full_df['output'] == 0].shape[0]
    n_1_class = full_df[full_df['output'] == 1].shape[0]
    n_both = n_0_class + n_1_class

    # (accuracy, AUC) of every subset, evaluated in the order total, train, valid, test, fac0, fac1
    scores = {}
    for subset, store in (('total', dict_fullDS), ('train', dict_trainDS),
                          ('valid', dict_validDS), ('test', dict_testDS),
                          ('fac0', dict_fact0DS), ('fac1', dict_fact1DS)):
        scores[subset] = calc_acc_auc(dsName, store[dsName])

    # Every column except 'output' counts as a feature
    n_features = full_df.shape[1] - 1
    n_numerical = len(VAR_TYPES[dsName]['numerical'])

    # Hyper-parameter record(s) of this dataset; the first match is reported
    ds_params = best_param_info[best_param_info['dsName']==dsName]

    rows_data.append({
        'Dataset': dsName,
        'Rows (total)': full_df.shape[0],
        'Rows (train)': dict_trainDS[dsName].shape[0],
        'Rows (valid)': dict_validDS[dsName].shape[0],
        'Rows (test)': dict_testDS[dsName].shape[0],
        'Rows (fac0)': dict_fact0DS[dsName].shape[0],
        'Rows (fac1)': dict_fact1DS[dsName].shape[0],
        # Stored as fractions of the rows labelled 0 or 1
        'Percentage 0 Class': n_0_class/n_both,
        'Percentage 1 Class': n_1_class/n_both,
        'Columns Total': n_features,
        'Columns Numerical': n_numerical,
        'Columns Categorical': n_features-n_numerical,
        'Neurons': ds_params['Neurons'].values[0],
        'Epochs': ds_params['Epochs'].values[0],
        'LR': ds_params['LR'].values[0],
        'AUC (total)': scores['total'][1],
        'AUC (train)': scores['train'][1],
        'AUC (valid)': scores['valid'][1],
        'AUC (test)': scores['test'][1],
        'Acuracy (total)': scores['total'][0],
        'Acuracy (train)': scores['train'][0],
        'Acuracy (valid)': scores['valid'][0],
        'Acuracy (test)': scores['test'][0],
        'Acuracy (fac0)': scores['fac0'][0],
        'Acuracy (fac1)': scores['fac1'][0],
    })



In [8]:
# Summary table: one row per dataset, ordered by dataset name, with a fresh 0..n-1 index
summary_table = pd.DataFrame(rows_data).sort_values('Dataset').reset_index(drop=True)
summary_table

Unnamed: 0,Dataset,Rows (total),Rows (train),Rows (valid),Rows (test),Rows (fac0),Rows (fac1),Percentage 0 Class,Percentage 1 Class,Columns Total,...,AUC (total),AUC (train),AUC (valid),AUC (test),Acuracy (total),Acuracy (train),Acuracy (valid),Acuracy (test),Acuracy (fac0),Acuracy (fac1)
0,Adult,32561,19536,6513,6512,100,100,0.75919,0.24081,107,...,0.92,0.92,0.91,0.91,0.86,0.87,0.86,0.85,0.72,0.87
1,BCW,198,118,41,39,41,100,0.762626,0.237374,32,...,0.94,1.0,0.85,0.77,0.92,0.99,0.88,0.74,0.88,0.9
2,BalanceScale,625,374,126,125,100,100,0.4608,0.5392,20,...,1.0,1.0,1.0,1.0,1.0,1.0,0.99,0.98,0.98,0.99
3,CMSC,540,323,109,108,37,100,0.914815,0.085185,18,...,0.96,1.0,0.98,0.79,0.98,1.0,0.95,0.94,0.97,0.95
4,CarEvaluation,1728,1036,346,346,100,100,0.700231,0.299769,21,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.99,0.98,1.0
5,Chess,28056,16833,5612,5611,100,100,0.900342,0.099658,40,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
6,DefaultOfCCC,30000,17999,6001,6000,100,100,0.7788,0.2212,90,...,0.79,0.79,0.77,0.79,0.82,0.83,0.82,0.83,0.68,0.84
7,Ecoli,336,200,68,68,100,100,0.425595,0.574405,7,...,0.99,0.99,1.0,1.0,0.97,0.98,0.94,0.97,0.95,0.96
8,HayesRoth,132,78,28,26,96,36,0.386364,0.613636,15,...,0.95,0.95,0.96,0.94,0.87,0.87,0.86,0.88,0.83,0.97
9,ISOLET,7797,4678,1560,1559,100,100,0.038476,0.961524,617,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [11]:
# Total number of factual rows (fac0 + fac1) over all datasets
factual_counts = pd.DataFrame(rows_data)[['Rows (fac0)', 'Rows (fac1)']]
factual_counts.sum(axis=1).sum()

3925

In [9]:
# Write the summary, ordered by dataset name, to an Excel workbook in the working directory
report = pd.DataFrame(rows_data).sort_values('Dataset')
report.to_excel('model_performance_analysis.xlsx')