In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import accuracy_score

from tensorflow.keras.models import load_model

from constants.var_types import VAR_TYPES

In [2]:
# Path of all full datasets
full_datasets_paths = ["./data/NORM_BCW.csv", "./data/NORM_Ecoli.csv", "./data/NORM_Iris.csv",
                       "./data/NORM_ISOLET.csv", "./data/NORM_SDD.csv", "./data/NORM_PBC.csv",
                       "./data/NORM_CMSC.csv", "./data/NORM_MagicGT.csv", "./data/NORM_Wine.csv",
                       "./data/OH_BalanceScale.csv", "./data/OH_CarEvaluation.csv",
                       "./data/OH_HayesRoth.csv", "./data/OH_Chess.csv", "./data/OH_Lymphography.csv",
                       "./data/OH_Nursery.csv", "./data/OH_SoybeanSmall.csv", "./data/OH_TicTacToe.csv",
                       "./data/OH_NORM_DefaultOfCCC.csv", "./data/OH_NORM_StudentPerf.csv",
                       "./data/OH_NORM_Adult.csv", "./data/OH_NORM_InternetAdv.csv",
                       "./data/OH_NORM_StatlogGC.csv"]

# Full datasets keyed by short dataset name
# (e.g. "./data/OH_NORM_Adult.csv" is stored under "Adult")
dict_fullDS = {}

for fullDSpath in full_datasets_paths:
    # Take the file name without its directory and extension, then keep the part
    # after the last underscore (drops the NORM_/OH_ prefixes). Splitting from
    # the right keeps this correct even if the directory part contains dots.
    file_stem = fullDSpath.rsplit('/', 1)[-1].rsplit('.', 1)[0]
    dsName = file_stem.split('_')[-1]

    df_full = pd.read_csv(fullDSpath)

    # Binarize the target: the majority class becomes 1, every other class 0
    most_common_class = df_full['output'].value_counts().index[0]
    df_full['output'] = (df_full['output'] == most_common_class).astype(int)

    dict_fullDS[dsName] = df_full


In [3]:
# Names of every dataset that has an entry in VAR_TYPES
dsNames = list(VAR_TYPES.keys())

# Per-dataset frames, all keyed by dataset name
dict_trainDS = {}   # train split
dict_testDS = {}    # test split
dict_fact0DS = {}   # factual rows whose indexes are listed in <name>_CFDATASET_0.csv
dict_fact1DS = {}   # factual rows whose indexes are listed in <name>_CFDATASET_1.csv


def _read_indexed(csv_path):
    """Read a CSV using its 'Unnamed: 0' column as the index."""
    return pd.read_csv(csv_path, index_col='Unnamed: 0')


for dsName in dsNames:
    # Datasets that declare categorical variables use the *OHDATASET files,
    # the remaining ones use the plain *DATASET files.
    if len(VAR_TYPES[dsName]['categorical']) > 0:
        train_csv = f'./experiments_data/{dsName}_TRAINOHDATASET.csv'
        test_csv = f'./experiments_data/{dsName}_TESTOHDATASET.csv'
    else:
        train_csv = f'./experiments_data/{dsName}_TRAINDATASET.csv'
        test_csv = f'./experiments_data/{dsName}_TESTDATASET.csv'

    dict_trainDS[dsName] = _read_indexed(train_csv)
    dict_testDS[dsName] = _read_indexed(test_csv)

    # Only the index of each CFDATASET file is used; the rows themselves are
    # taken from the full dataset loaded earlier.
    fact0_rows = list(_read_indexed(f'./experiments_data/{dsName}_CFDATASET_0.csv').index)
    dict_fact0DS[dsName] = dict_fullDS[dsName].loc[fact0_rows]

    fact1_rows = list(_read_indexed(f'./experiments_data/{dsName}_CFDATASET_1.csv').index)
    dict_fact1DS[dsName] = dict_fullDS[dsName].loc[fact1_rows]

In [4]:
# best_params.txt is read as space-separated text without a header row, so the
# columns are numbered 0..5; column 5 is not used.
raw_param_info = pd.read_csv('./best_params.txt', sep=' ', header=None)
best_param_info = raw_param_info.drop(columns=[5])

# Characters to strip from each text column
# (presumably the parameters are written as a tuple "(lr, epochs, neurons)" -- TODO confirm)
chars_to_strip = {2: ('(', ','), 3: (',',), 4: (')',)}


def _strip_chars(text, chars):
    """Return `text` with every occurrence of each string in `chars` removed."""
    for ch in chars:
        text = text.replace(ch, '')
    return text


for col, chars in chars_to_strip.items():
    best_param_info[col] = best_param_info[col].apply(lambda x, chars=chars: _strip_chars(x, chars))

best_param_info.columns = ['dsName', 'AUC', 'LR', 'Epochs', 'Neurons']

In [5]:
# Trained model of every dataset, loaded from ./models and keyed by dataset name
dict_models = {dsName: load_model(f'./models/{dsName}.h5') for dsName in dsNames}

In [6]:
def calc_acc_auc(dsName, df):
    """Score the model stored for `dsName` on `df`.

    Parameters
    ----------
    dsName : str
        Key of the model to use in the module-level ``dict_models``.
    df : pandas.DataFrame
        Data to score. The ``'output'`` column holds the true 0/1 labels and
        every other column is passed to the model as a feature.

    Returns
    -------
    tuple
        ``(accuracy, auc)``, each rounded to two decimals. Accuracy uses a 0.5
        threshold on the class-1 score; AUC is the area under the ROC curve of
        that score.
    """
    X = df.drop(columns=['output'])
    y = df['output']

    # `predict_proba` was deprecated and later removed from tf.keras models;
    # `predict` returns the same per-class scores and exists in every version.
    # verbose=0 keeps the call silent, as `predict_proba` was by default.
    y_pred = dict_models[dsName].predict(X, verbose=0)
    # Column 1 is used as the score of class 1
    # (assumes the model outputs one column per class -- TODO confirm).
    y_pred = pd.DataFrame(y_pred)[1]
    y_pred_bin = (y_pred > 0.5).astype(int).tolist()

    acc = accuracy_score(y, y_pred_bin)

    fpr, tpr, _ = roc_curve(y, y_pred)
    auc_m = auc(fpr, tpr)

    return round(acc, 2), round(auc_m, 2)

In [7]:
# One summary record per dataset: sizes, class balance, tuned hyper-parameters
# and the model scores on each subset.
rows_data = []
for dsName in dsNames:
    full_df = dict_fullDS[dsName]

    # Class counts in the full (binarized) dataset
    n_class0 = len(full_df[full_df['output'] == 0])
    n_class1 = len(full_df[full_df['output'] == 1])

    acc_total, auc_total = calc_acc_auc(dsName, full_df)
    acc_train, auc_train = calc_acc_auc(dsName, dict_trainDS[dsName])
    acc_test, auc_test = calc_acc_auc(dsName, dict_testDS[dsName])
    # Only the accuracy is reported for the factual subsets
    acc_fact0DS = calc_acc_auc(dsName, dict_fact0DS[dsName])[0]
    acc_fact1DS = calc_acc_auc(dsName, dict_fact1DS[dsName])[0]

    # Rows of the best-parameter table that belong to this dataset (first match is used)
    ds_params = best_param_info[best_param_info['dsName'] == dsName]

    # NOTE: the 'Acuracy' spelling is kept on purpose; these keys become the
    # column headers of the displayed and exported table.
    rows_data.append({
        'Dataset': dsName,
        'Rows (total)': full_df.shape[0],
        'Rows (train)': dict_trainDS[dsName].shape[0],
        'Rows (test)': dict_testDS[dsName].shape[0],
        'Rows (fac0)': dict_fact0DS[dsName].shape[0],
        'Rows (fac1)': dict_fact1DS[dsName].shape[0],
        'Class 0/1 ratio': round(n_class0/n_class1, 2),
        'Columns': full_df.shape[1],
        'Neurons': ds_params['Neurons'].values[0],
        'Epochs': ds_params['Epochs'].values[0],
        'LR': ds_params['LR'].values[0],
        'AUC (total)': auc_total,
        'AUC (train)': auc_train,
        'AUC (test)': auc_test,
        'Acuracy (total)': acc_total,
        'Acuracy (train)': acc_train,
        'Acuracy (test)': acc_test,
        'Acuracy (fac0)': acc_fact0DS,
        'Acuracy (fac1)': acc_fact1DS,
    })











In [8]:
# Summary table: one row per dataset, ordered by dataset name with a fresh 0..n-1 index
summary_table = pd.DataFrame(rows_data)
summary_table.sort_values(by='Dataset').reset_index(drop=True)

Unnamed: 0,Dataset,Rows (total),Rows (train),Rows (test),Rows (fac0),Rows (fac1),Class 0/1 ratio,Columns,Neurons,Epochs,LR,AUC (total),AUC (train),AUC (test),Acuracy (total),Acuracy (train),Acuracy (test),Acuracy (fac0),Acuracy (fac1)
0,Adult,32561,26048,6513,100,100,0.32,108,172,50,0.0001,0.92,0.92,0.91,0.86,0.87,0.86,1.0,1.0
1,BCW,198,157,41,45,100,0.31,33,26,500,0.001,1.0,1.0,0.97,0.98,1.0,0.9,1.0,1.0
2,BalanceScale,625,499,126,100,100,1.17,21,8,50,0.01,1.0,1.0,1.0,1.0,1.0,0.98,1.0,1.0
3,CMSC,540,431,109,4,100,0.09,19,37,100,0.0001,0.94,0.95,0.9,0.92,0.93,0.91,1.0,1.0
4,CarEvaluation,1728,1382,346,100,100,0.43,22,8,50,0.01,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
5,Chess,28056,22444,5612,100,100,0.11,41,48,100,0.01,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
6,DefaultOfCCC,30000,23999,6001,100,100,0.28,91,181,50,0.0001,0.79,0.8,0.77,0.82,0.83,0.82,1.0,1.0
7,Ecoli,336,268,68,100,100,1.35,8,3,500,0.01,0.99,0.99,1.0,0.97,0.97,0.99,1.0,1.0
8,HayesRoth,132,104,28,80,37,1.59,16,12,100,0.001,0.97,0.98,0.97,0.89,0.9,0.82,1.0,1.0
9,ISOLET,7797,6237,1560,100,100,24.99,618,494,100,0.01,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [9]:
# Export the summary sorted by dataset name. The index is reset after sorting so
# the sheet's index column runs 0..n-1 (matching the table displayed above)
# instead of carrying the pre-sort row positions.
pd.DataFrame(rows_data).sort_values('Dataset').reset_index(drop=True).to_excel('model_performance_analysis.xlsx')