In [166]:
import json
import pandas as pd
import numpy as np
import re
from sklearn.metrics import roc_auc_score, f1_score
from io import StringIO

# Baseline

In [167]:
# list the datasets that should be used in the current run
datasets = ["hungarian", "cleveland", "switzerland", "long-beach-va"]

In [168]:
# custom function to read the dataset into a csv formated string
# the name is used as a delimiter here because it is the last feature and common among all entrys
def read_raw_data(file_path:str):
    with open(file_path) as file:
        data_string = file.read()
        data_string = data_string.replace("\n"," ")
        data_string = re.sub("[a-zA-Z]+ ","name\n", data_string)
        data_string = data_string.replace(" ",",")
        return data_string

In [169]:
# read the data from the specified datasets into the df
df = pd.DataFrame()
for dataset in datasets:
    dataset_df = pd.read_csv(StringIO(read_raw_data("./Data/"+ dataset +".data")), header=None, sep=",")
    dataset_df['dataset'] = dataset
    df = pd.concat([df,dataset_df ], ignore_index=True)

In [170]:
df.columns=["id", "ccf", "age", "sex", "painloc", "painexer" , "relrest" , "pncaden" , "cp", "trestbps", "htn", "chol", "smoke", "cigs", "years", "fbs", "dm", "famhist", "restecg", "ekgmo", "ekgday", "ekgyr", "dig", "prop", "nitr", "pro", "diuretic", "proto", "thaldur", "thaltime", "met", "thalach", "thalrest", "tpeakbps", "tpeakbpd", "dummy", "trestbpd", "exang", "xhypo", "oldpeak", "slope", "rldv5", "rldv5e", "ca", "restckm", "exerckm", "restef", "restwm", "exeref", "exerwm", "thal", "thalsev", "thalpul", "earlobe", "cmo", "cday", "cyr", "num", "lmt", "ladprox", "laddist", "diag", "cxmain", "ramus", "om1", "om2", "rcaprox", "rcadist", "lvx1", "lvx2", "lvx3", "lvx4", "lvf", "cathef", "junk", "name", "dataset"]

In [171]:
# the uci encodes missing values as -9 we use np.float('nan') for better profiling reports
df = df.replace(-9, float('nan'))

In [172]:
y_true = df['num']
y_true[y_true > 1] = 1
y_pred = [1] * len(y_true)
roc_auc_score(y_true, y_pred)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_true[y_true > 1] = 1


0.5

In [173]:
f1_score(y_true, y_pred)

0.7101865136298422

In [174]:
def calculate_metrics(filepath:str): 
    with open(filepath, 'r') as f:
        data = json.load(f)
    # extract data form json into usable df 
    df = pd.json_normalize(data, record_path =['measurements'])
    # save minimum percentage to be dropped in extra variable
    df["drop_columns"] = df["best_params.drop_columns__minimum_percentage_to_be_dropped"]
    # calculate main values of interest
    df["auc_std"] = df.apply(lambda row: np.std(row["auc"]), axis =1)
    df["pre_std"] = df.apply(lambda row: np.std(row["classification_report.1.precision"]), axis =1)
    df["pre_mean"] = df.apply(lambda row: np.mean(row["classification_report.1.precision"]), axis =1)
    df["acc_mean"] = df.apply(lambda row: np.mean(row["classification_report.accuracy"]), axis =1)
    df["rec_std"] = df.apply(lambda row: np.std(row["classification_report.1.recall"]), axis =1)
    df["rec_mean"] = df.apply(lambda row: np.mean(row["classification_report.1.recall"]), axis =1)
    df["f1_std"] = df.apply(lambda row: np.std(row["classification_report.1.f1-score"]), axis =1)
    df["f1_mean"] = df.apply(lambda row: np.mean(row["classification_report.1.f1-score"]), axis =1)
    df["type2"] = df.apply(lambda row: np.sum(row["confusion_matrix.(1, 0)"]), axis =1)
    # Confidence Intervalls
    df["auc_confl"] = df["auc_mean"]- 1.96 * (df["auc_std"] / np.sqrt(10))
    df["auc_confu"] = df["auc_mean"]+ 1.96 * (df["auc_std"] / np.sqrt(10))
    df["f1_confl"] = df["f1_mean"]- 1.96 * (df["f1_std"] / np.sqrt(10))
    df["f1_confu"] = df["f1_mean"]+ 1.96 * (df["f1_std"] / np.sqrt(10))
    return df

In [175]:
df_XGBoost = calculate_metrics("outputs/output xgboost.json")
# only display variables important for the model evaluation
df_XGBoost = df_XGBoost.sort_values(by = ["rec_mean", "rec_std", "auc_mean", "drop_columns", 'f1_mean'], ascending = False)
df_XGBoost[["rec_mean", "rec_std", "auc_mean", "drop_columns", 'f1_mean']].head(20)

Unnamed: 0,rec_mean,rec_std,auc_mean,drop_columns,f1_mean
7,0.805592,0.099109,76.971665,100,0.797337
6,0.801592,0.094049,75.280811,100,0.786392
18,0.785469,0.116458,75.352738,100,0.779663
9,0.785429,0.118263,75.475697,100,0.780468
15,0.781469,0.11929,75.402738,100,0.778468
12,0.781469,0.11929,75.277738,100,0.777656
0,0.781469,0.11929,75.027738,100,0.77625
3,0.779429,0.119897,75.050697,100,0.775643
8,0.779347,0.098439,76.540518,75,0.787538
1,0.777306,0.111754,75.438477,100,0.778476


In [176]:
# best model in detail 
XGBoost = df_XGBoost.loc[8]
XGBoost[["scaler", "estimator", "imputer", "sampler", "auc_mean", "auc_confl", "auc_confu", "type2", "f1_mean", "f1_confl", "f1_confu", "drop_columns", "pre_mean", "rec_mean"]]

scaler                  Normalizer
estimator            XGBClassifier
imputer              SimpleImputer
sampler         RandomUnderSampler
auc_mean                 76.540518
auc_confl                76.486675
auc_confu                 76.59436
type2                          109
f1_mean                   0.787538
f1_confl                  0.751974
f1_confu                  0.823103
drop_columns                    75
pre_mean                  0.814819
rec_mean                  0.779347
Name: 8, dtype: object

In [177]:
# loading KNeighbors 
df_KNeighbors = calculate_metrics("outputs/output KNN.json")
df_KNeighbors = df_KNeighbors.sort_values(by = ["rec_mean", "rec_std", "auc_mean", "drop_columns", 'f1_mean'], ascending = False)
df_KNeighbors[["rec_mean", "rec_std", "auc_mean", "drop_columns", 'f1_mean']].head(40)

Unnamed: 0,rec_mean,rec_std,auc_mean,drop_columns,f1_mean
18,0.858449,0.048486,76.285254,0,0.807729
15,0.831755,0.116541,80.398731,100,0.826626
19,0.828286,0.071276,75.770993,0,0.79551
6,0.828163,0.057378,74.658163,0,0.789032
12,0.809469,0.128226,77.281396,20,0.800056
20,0.808082,0.066057,75.626643,0,0.789785
9,0.807265,0.143362,78.180338,100,0.803177
17,0.797429,0.110121,79.798258,100,0.813584
11,0.787143,0.124669,78.79007,100,0.802811
8,0.78351,0.074452,76.010876,0,0.784084


In [178]:
KNeighbors = df_KNeighbors.loc[18]
KNeighbors[["scaler", "estimator", "imputer", "sampler", "auc_mean", "auc_confl", "auc_confu", "type2", "f1_mean", "f1_confl", "f1_confu", "drop_columns", "pre_mean", "rec_mean"]]

scaler                           str
estimator       KNeighborsClassifier
imputer                SimpleImputer
sampler                          str
auc_mean                   76.285254
auc_confl                  76.250001
auc_confu                  76.320507
type2                             70
f1_mean                     0.807729
f1_confl                    0.788437
f1_confu                    0.827021
drop_columns                       0
pre_mean                    0.768873
rec_mean                    0.858449
Name: 18, dtype: object

In [179]:
# loading Random Forest Classifier
df_Forest = calculate_metrics("outputs/output RandomForestClassifier.json")
df_Forest = df_Forest.sort_values(by = ["rec_mean", "rec_std", "auc_mean", "drop_columns", 'f1_mean'], ascending = False)
df_Forest[["rec_mean", "rec_std", "auc_mean", "drop_columns", 'f1_mean']].head(20)

Unnamed: 0,rec_mean,rec_std,auc_mean,drop_columns,f1_mean
18,0.844163,0.09718,78.409383,100,0.818713
9,0.844122,0.096034,78.160391,100,0.817078
3,0.842163,0.099078,78.19048,100,0.816477
0,0.842163,0.099078,78.06548,100,0.815951
15,0.840122,0.102677,78.332342,100,0.816883
12,0.840082,0.097832,78.205301,100,0.816678
10,0.813714,0.10267,77.889983,75,0.807204
19,0.807592,0.101768,78.080811,75,0.805459
16,0.805592,0.098013,78.105811,75,0.805394
1,0.805551,0.103401,78.103771,75,0.804986


In [180]:
Forest = df_Forest.loc[18]
Forest[["scaler", "estimator", "imputer", "sampler", "auc_mean", "auc_confl", "auc_confu", "type2", "f1_mean", "f1_confl", "f1_confu", "drop_columns", "pre_mean", "rec_mean"]]

scaler                             str
estimator       RandomForestClassifier
imputer                  SimpleImputer
sampler                            str
auc_mean                     78.409383
auc_confl                    78.349166
auc_confu                      78.4696
type2                               77
f1_mean                       0.818713
f1_confl                      0.785005
f1_confu                      0.852422
drop_columns                       100
pre_mean                      0.813039
rec_mean                      0.844163
Name: 18, dtype: object

In [181]:
# loading Decision Tree Classifier
df_Tree = calculate_metrics("outputs/output DecisionTrees.json")
df_Tree = df_Tree.sort_values(by = ["rec_mean", "rec_std", "auc_mean", "drop_columns", 'f1_mean', 'scaler'], ascending = False)
df_Tree[["rec_mean", "rec_std", "auc_mean", "drop_columns", 'f1_mean','scaler']].head(20)

Unnamed: 0,rec_mean,rec_std,auc_mean,drop_columns,f1_mean,scaler
16,0.852245,0.06239,76.703708,0,0.808715,str
13,0.852245,0.06239,76.703708,0,0.808715,StandardScaler
10,0.852245,0.06239,76.703708,0,0.808715,RobustScaler
7,0.852245,0.06239,76.703708,0,0.808715,PowerTransformer
3,0.852245,0.06239,76.703708,0,0.808715,MinMaxScaler
0,0.852245,0.06239,76.703708,0,0.808715,MaxAbsScaler
6,0.836204,0.072749,75.907765,0,0.799131,Normalizer
17,0.797755,0.092605,74.229219,0,0.776856,str
14,0.797755,0.092605,74.229219,0,0.776856,StandardScaler
11,0.797755,0.092605,74.229219,0,0.776856,RobustScaler


In [182]:
Tree = df_Tree.loc[16]
Tree[["scaler", "estimator", "imputer", "sampler", "auc_mean", "auc_confl", "auc_confu", "type2", "f1_mean", "f1_confl", "f1_confu", "drop_columns", "pre_mean", "rec_mean"]]

scaler                             str
estimator       DecisionTreeClassifier
imputer                  SimpleImputer
sampler                            str
auc_mean                     76.703708
auc_confl                    76.666607
auc_confu                     76.74081
type2                               73
f1_mean                       0.808715
f1_confl                      0.788895
f1_confu                      0.828535
drop_columns                         0
pre_mean                      0.778788
rec_mean                      0.852245
Name: 16, dtype: object

In [183]:
# Tree.to_excel(r'DTree.xlsx', index=False)

In [184]:
# loading SVC
df_SVC = calculate_metrics("outputs/output SVC.json")
df_SVC = df_SVC.sort_values(by = ["rec_mean", "rec_std", "auc_mean", "drop_columns", 'f1_mean'], ascending = False)
df_SVC[["rec_mean", "rec_std", "auc_mean", "drop_columns", 'f1_mean']].head(20)

Unnamed: 0,rec_mean,rec_std,auc_mean,drop_columns,f1_mean
9,0.81351,0.113184,77.870632,20.0,0.804905
0,0.809633,0.08811,77.935901,20.0,0.804629
3,0.801469,0.095761,77.527738,20.0,0.799262
17,0.795469,0.088218,79.468591,20.0,0.810375
15,0.789388,0.101353,77.173656,20.0,0.793365
6,0.78502,0.140378,76.793703,60.0,0.787453
16,0.777184,0.101452,78.051257,20.0,0.795842
10,0.775469,0.105006,76.849689,20.0,0.7876
2,0.771388,0.090258,78.508412,100.0,0.797818
1,0.771184,0.101451,77.007354,20.0,0.787595


In [185]:
SVC = df_SVC.loc[9]
SVC[["scaler", "estimator", "imputer", "sampler", "auc_mean", "auc_confl", "auc_confu", "type2", "f1_mean", "f1_confl", "f1_confu", "drop_columns", "pre_mean", "rec_mean"]]

scaler          PowerTransformer
estimator                    SVC
imputer            SimpleImputer
sampler                      str
auc_mean               77.870632
auc_confl              77.818455
auc_confu              77.922809
type2                       92.0
f1_mean                 0.804905
f1_confl                0.767937
f1_confu                0.841873
drop_columns                20.0
pre_mean                0.817674
rec_mean                 0.81351
Name: 9, dtype: object

In [186]:
# loading naïve bayes (bernoulli)
df_bernoulli = calculate_metrics("outputs/output BernoulliNB.json")
df_bernoulli = df_bernoulli.sort_values(by = ["rec_mean", "rec_std", "auc_mean", "drop_columns", 'f1_mean'], ascending = False)
df_bernoulli[["rec_mean", "rec_std", "auc_mean", "drop_columns", 'f1_mean']].head(20)

Unnamed: 0,rec_mean,rec_std,auc_mean,drop_columns,f1_mean
3,0.795714,0.088,76.230836,8,0.79079
0,0.793388,0.10522,76.840119,20,0.794019
6,0.793388,0.10522,76.840119,20,0.794019
18,0.793388,0.10522,76.840119,20,0.794019
9,0.779265,0.126946,77.518143,35,0.79342
4,0.775429,0.083407,76.579355,8,0.787495
15,0.775143,0.090321,79.068118,8,0.801905
10,0.769184,0.130019,77.389062,35,0.789491
11,0.769102,0.134596,77.38498,35,0.789022
5,0.767429,0.080123,76.179355,8,0.782852


In [187]:
bernoulli = df_bernoulli.loc[3]
bernoulli[["scaler", "estimator", "imputer", "sampler", "auc_mean", "auc_confl", "auc_confu", "type2", "f1_mean", "f1_confl", "f1_confu", "drop_columns", "pre_mean", "rec_mean"]]

scaler           MinMaxScaler
estimator         BernoulliNB
imputer         SimpleImputer
sampler                   str
auc_mean            76.230836
auc_confl           76.182989
auc_confu           76.278684
type2                     101
f1_mean               0.79079
f1_confl             0.764525
f1_confu             0.817056
drop_columns                8
pre_mean             0.803848
rec_mean             0.795714
Name: 3, dtype: object

In [188]:
# naïve bayes (categorical) is skipped because no model could be trained successfully 

In [189]:
# loading naïve bayes (complement)
df_complement = calculate_metrics("outputs/output CompleteNB.json")
df_complement = df_complement.sort_values(by = ["rec_mean", "rec_std", "auc_mean", "drop_columns", 'f1_mean'], ascending = False)
df_complement[["rec_mean", "rec_std", "auc_mean", "drop_columns", 'f1_mean']].head(7)

Unnamed: 0,rec_mean,rec_std,auc_mean,drop_columns,f1_mean
3,0.842408,0.032214,77.483213,100.0,0.811631
5,0.771388,0.053464,76.5389,100.0,0.78514
4,0.771388,0.053464,76.416949,100.0,0.784579
0,,,,,
1,,,,,
2,,,,,
6,,,,,


In [190]:
complement = df_complement.loc[3]
complement[["scaler", "estimator", "imputer", "sampler", "auc_mean", "auc_confl", "auc_confu", "type2", "f1_mean", "f1_confl", "f1_confu", "drop_columns", "pre_mean", "rec_mean"]]

scaler           MinMaxScaler
estimator        ComplementNB
imputer         SimpleImputer
sampler                   str
auc_mean            77.483213
auc_confl           77.439309
auc_confu           77.527117
type2                    78.0
f1_mean              0.811631
f1_confl             0.783947
f1_confu             0.839315
drop_columns            100.0
pre_mean             0.787182
rec_mean             0.842408
Name: 3, dtype: object

In [191]:
# loading naïve bayes (gaussian)
df_gaussian= calculate_metrics("outputs/output GaussianNB.json")
df_gaussian = df_gaussian.sort_values(by = ["rec_mean", "rec_std", "auc_mean", "drop_columns", 'f1_mean'], ascending = False)
df_gaussian[["rec_mean", "rec_std", "auc_mean", "drop_columns", 'f1_mean']].head(20)

Unnamed: 0,rec_mean,rec_std,auc_mean,drop_columns,f1_mean
6,0.528735,0.126961,70.455027,20,0.642916
18,0.526408,0.124253,70.835652,20,0.645028
7,0.522612,0.12538,70.526954,20,0.640321
19,0.514286,0.126286,70.232578,20,0.633862
8,0.512408,0.129901,69.888701,20,0.629968
20,0.510245,0.131807,70.027489,20,0.629638
12,0.37302,0.127526,65.163216,20,0.508464
14,0.36298,0.128042,64.783126,20,0.498257
13,0.356857,0.126501,64.605052,20,0.492803
0,0.284041,0.113199,61.711187,20,0.415911


In [192]:
gaussian = df_gaussian.loc[6]
gaussian[["scaler", "estimator", "imputer", "sampler", "auc_mean", "auc_confl", "auc_confu", "type2", "f1_mean", "f1_confl", "f1_confu", "drop_columns", "pre_mean", "rec_mean"]]

scaler             Normalizer
estimator          GaussianNB
imputer         SimpleImputer
sampler                   str
auc_mean            70.455027
auc_confl           70.417968
auc_confu           70.492087
type2                     233
f1_mean              0.642916
f1_confl              0.58873
f1_confu             0.697102
drop_columns               20
pre_mean             0.881112
rec_mean             0.528735
Name: 6, dtype: object

In [193]:
# loading naïve bayes (multinomial)
df_multinomial = calculate_metrics("outputs/output MultinomialNB.json")
df_multinomial = df_multinomial.sort_values(by = ["rec_mean", "rec_std", "auc_mean", "drop_columns", 'f1_mean'], ascending = False)
df_multinomial[["rec_mean", "rec_std", "auc_mean", "drop_columns", 'f1_mean']].head(5)

Unnamed: 0,rec_mean,rec_std,auc_mean,drop_columns,f1_mean
3,0.799592,0.067888,76.461299,100.0,0.792041
4,0.763306,0.054256,76.38177,100.0,0.782115
5,0.761306,0.058305,76.28177,100.0,0.780389
0,,,,,
1,,,,,


In [194]:
multinomial = df_multinomial.loc[3]
multinomial[["scaler", "estimator", "imputer", "sampler", "auc_mean", "auc_confl", "auc_confu", "type2", "f1_mean", "f1_confl", "f1_confu", "drop_columns", "pre_mean", "rec_mean"]]

scaler           MinMaxScaler
estimator       MultinomialNB
imputer         SimpleImputer
sampler                   str
auc_mean            76.461299
auc_confl           76.422843
auc_confu           76.499755
type2                    99.0
f1_mean              0.792041
f1_confl             0.762598
f1_confu             0.821484
drop_columns            100.0
pre_mean             0.791219
rec_mean             0.799592
Name: 3, dtype: object

In [195]:
# loading logistic regression
df_logistic = calculate_metrics("outputs/output LogisticRegression.json")
df_logistic = df_logistic.sort_values(by = ["rec_mean", "rec_std", "auc_mean", "drop_columns", 'f1_mean'], ascending = False)
df_logistic[["rec_mean", "rec_std", "auc_mean", "drop_columns", 'f1_mean']].head(20)

Unnamed: 0,rec_mean,rec_std,auc_mean,drop_columns,f1_mean
6,0.84849,0.077311,76.028148,0,0.802571
15,0.791551,0.087012,77.641575,20,0.797472
3,0.791469,0.089943,77.893591,20,0.79955
18,0.791388,0.097733,76.886461,20,0.792772
0,0.791388,0.085821,77.13951,20,0.794461
12,0.787429,0.090881,77.563502,35,0.795023
9,0.781347,0.102803,75.881371,20,0.78372
17,0.775265,0.108387,78.818143,20,0.799202
7,0.769551,0.050236,76.431819,0,0.783671
19,0.769143,0.09683,77.646167,20,0.791511


In [196]:
logistic = df_logistic.loc[6]
logistic[["scaler", "estimator", "imputer", "sampler", "auc_mean", "auc_confl", "auc_confu", "type2", "f1_mean", "f1_confl", "f1_confu", "drop_columns", "pre_mean", "rec_mean"]]

scaler                  Normalizer
estimator       LogisticRegression
imputer              SimpleImputer
sampler                        str
auc_mean                 76.028148
auc_confl                75.992772
auc_confu                76.063524
type2                           75
f1_mean                   0.802571
f1_confl                  0.779839
f1_confu                  0.825302
drop_columns                     0
pre_mean                  0.770346
rec_mean                   0.84849
Name: 6, dtype: object

# Combined review of all DFs

In [197]:
df = pd.concat([df_Tree, df_XGBoost, df_SVC, df_logistic, df_Forest, df_bernoulli, df_KNeighbors])

In [198]:
df[["acc_mean", "rec_mean", "rec_std", "auc_mean", 'f1_mean', "drop_columns", "estimator"]].sort_values('acc_mean', ascending=False)

Unnamed: 0,acc_mean,rec_mean,rec_std,auc_mean,f1_mean,drop_columns,estimator
15,0.807328,0.831755,0.116541,80.398731,0.826626,100.0,KNeighborsClassifier
17,0.798464,0.797429,0.110121,79.798258,0.813584,100.0,KNeighborsClassifier
17,0.795194,0.795469,0.088218,79.468591,0.810375,20.0,SVC
16,0.792896,0.779265,0.097708,79.383997,0.806949,100.0,KNeighborsClassifier
18,0.790637,0.844163,0.097180,78.409383,0.818713,100.0,RandomForestClassifier
...,...,...,...,...,...,...,...
13,,,,,,,SVC
14,,,,,,,SVC
18,,,,,,,SVC
19,,,,,,,SVC


In [199]:
df[["rec_mean", "rec_std", "auc_mean", 'f1_mean', "drop_columns", "estimator"]].sort_values('f1_mean', ascending=False)

Unnamed: 0,rec_mean,rec_std,auc_mean,f1_mean,drop_columns,estimator
15,0.831755,0.116541,80.398731,0.826626,100.0,KNeighborsClassifier
18,0.844163,0.097180,78.409383,0.818713,100.0,RandomForestClassifier
9,0.844122,0.096034,78.160391,0.817078,100.0,RandomForestClassifier
15,0.840122,0.102677,78.332342,0.816883,100.0,RandomForestClassifier
12,0.840082,0.097832,78.205301,0.816678,100.0,RandomForestClassifier
...,...,...,...,...,...,...
13,,,,,,SVC
14,,,,,,SVC
18,,,,,,SVC
19,,,,,,SVC


In [200]:
Tree = df_Tree.loc[16]

Tree[["scaler", "estimator", "imputer", "sampler", "auc_mean", "auc_confl", "auc_confu", "type2", "f1_mean", "f1_confl", "f1_confu", "drop_columns", "pre_mean", "rec_mean"]]

scaler                             str
estimator       DecisionTreeClassifier
imputer                  SimpleImputer
sampler                            str
auc_mean                     76.703708
auc_confl                    76.666607
auc_confu                     76.74081
type2                               73
f1_mean                       0.808715
f1_confl                      0.788895
f1_confu                      0.828535
drop_columns                         0
pre_mean                      0.778788
rec_mean                      0.852245
Name: 16, dtype: object