In [34]:
import json
import pandas as pd
import numpy as np
import statistics

In [35]:
def calculate_metrics(filepath:str): 
    with open(filepath, 'r') as f:
        data = json.load(f)
    # extract data form json into usable df 
    df = pd.json_normalize(data, record_path =['measurements'])
    # save minimum percentage to be dropped in extra variable
    df["drop_columns"] = df["best_params.drop_columns__minimum_percentage_to_be_dropped"]
    # calculate main values of interest
    df["auc_std"] = df.apply(lambda row: np.std(row["auc"]), axis =1)
    df["pre_mean"] = df.apply(lambda row: np.mean(row["classification_report.weighted avg.precision"]), axis =1)
    df["rec_mean"] = df.apply(lambda row: np.mean(row["classification_report.weighted avg.recall"]), axis =1)
    df["f1_std"] = df.apply(lambda row: np.std(row["classification_report.weighted avg.f1-score"]), axis =1)
    df["f1_mean"] = df.apply(lambda row: np.mean(row["classification_report.weighted avg.f1-score"]), axis =1)
    df["type2"] = df.apply(lambda row: np.sum(row["confusion_matrix.(1, 0)"]), axis =1)
    # Confidence Intervalls
    df["auc_confl"] = df["auc_mean"]- 1.96 * (df["auc_std"] / np.sqrt(10))
    df["auc_confu"] = df["auc_mean"]+ 1.96 * (df["auc_std"] / np.sqrt(10))
    df["f1_confl"] = df["f1_mean"]- 1.96 * (df["f1_std"] / np.sqrt(10))
    df["f1_confu"] = df["f1_mean"]+ 1.96 * (df["f1_std"] / np.sqrt(10))
    return df

In [36]:
df_XGBoost = calculate_metrics("outputs/output xgboost.json")
# only display variables important for the model evaluation
df_XGBoost = df_XGBoost.sort_values(by = ["auc_mean", "auc_std", "type2", "drop_columns"], ascending = False)
df_XGBoost[["auc_mean", "auc_std", "type2", "drop_columns"]].head(20)

Unnamed: 0,auc_mean,auc_std,type2,drop_columns
7,76.971665,0.074685,96,100
8,76.540518,0.08687,109,75
9,75.475697,0.080552,106,100
1,75.438477,0.070096,110,100
4,75.438477,0.070096,110,100
10,75.438477,0.070096,110,100
13,75.438477,0.070096,110,100
19,75.438477,0.070096,110,100
15,75.402738,0.075977,108,100
18,75.352738,0.076452,106,100


In [37]:
# best model in detail 
XGBoost = df_XGBoost.loc[8]
XGBoost[["scaler", "estimator", "imputer", "sampler", "auc_mean", "auc_confl", "auc_confu", "type2", "f1_mean", "f1_confl", "f1_confu", "drop_columns", "pre_mean", "rec_mean"]]

scaler                  Normalizer
estimator            XGBClassifier
imputer              SimpleImputer
sampler         RandomUnderSampler
auc_mean                 76.540518
auc_confl                76.486675
auc_confu                 76.59436
type2                          109
f1_mean                   0.759294
f1_confl                  0.700291
f1_confu                  0.818296
drop_columns                    75
pre_mean                  0.778677
rec_mean                  0.767316
Name: 8, dtype: object

In [38]:
# loading KNeighbors 
df_KNeighbors = calculate_metrics("outputs/output KNN.json")
df_KNeighbors = df_KNeighbors.sort_values(by = ["auc_mean", "auc_std", "type2", "drop_columns"], ascending = False)
df_KNeighbors[["auc_mean", "auc_std", "type2", "drop_columns"]].head(20)

Unnamed: 0,auc_mean,auc_std,type2,drop_columns
15,80.398731,0.097697,83,100
17,79.798258,0.090229,100,100
16,79.383997,0.09265,109,100
5,78.938589,0.063572,127,100
4,78.811013,0.065513,121,100
11,78.79007,0.090879,105,100
2,78.481906,0.065569,123,100
9,78.180338,0.088097,95,100
10,78.144126,0.095192,114,100
0,77.672623,0.075865,109,100


In [39]:
df_XGBoost = calculate_metrics("outputs/output xgboost.json")
# get a look of the final dataset
df_XGBoost.head()

Unnamed: 0,scaler,estimator,imputer,sampler,X_shape,one_hot_encoded_features,auc_mean,execution_time_in_seconds,auc,parameters.impute__strategy,...,auc_std,pre_mean,rec_mean,f1_std,f1_mean,type2,auc_confl,auc_confu,f1_confl,f1_confu
0,MaxAbsScaler,XGBClassifier,SimpleImputer,str,"[899, 43]","[cp, restecg, slope, ca, restwm]",75.027738,122.561587,"[0.714534594325535, 0.7675460428073669, 0.8188...","[mean, median, most_frequent]",...,0.076513,0.768763,0.753995,0.086839,0.744731,108,74.980315,75.075161,0.690908,0.798555
1,MaxAbsScaler,XGBClassifier,SimpleImputer,RandomOverSampler,"[899, 43]","[cp, restecg, slope, ca, restwm]",75.438477,123.505258,"[0.7349427575908413, 0.7797411647585863, 0.771...","[mean, median, most_frequent]",...,0.070096,0.775624,0.757341,0.08004,0.74822,110,75.395031,75.481923,0.698611,0.79783
2,MaxAbsScaler,XGBClassifier,SimpleImputer,RandomUnderSampler,"[899, 43]","[cp, restecg, slope, ca, restwm]",75.162469,107.387465,"[0.7553509208561474, 0.7899452463912394, 0.794...","[mean, median, most_frequent]",...,0.081176,0.765268,0.75397,0.091534,0.745038,114,75.112155,75.212783,0.688304,0.801771
3,MinMaxScaler,XGBClassifier,SimpleImputer,str,"[899, 43]","[cp, restecg, slope, ca, restwm]",75.050697,116.305835,"[0.714534594325535, 0.7675460428073669, 0.8188...","[mean, median, most_frequent]",...,0.075209,0.768886,0.753995,0.085831,0.744858,109,75.004082,75.097312,0.69166,0.798057
4,MinMaxScaler,XGBClassifier,SimpleImputer,RandomOverSampler,"[899, 43]","[cp, restecg, slope, ca, restwm]",75.438477,122.199511,"[0.7349427575908413, 0.7797411647585863, 0.771...","[mean, median, most_frequent]",...,0.070096,0.775624,0.757341,0.08004,0.74822,110,75.395031,75.481923,0.698611,0.79783


In [40]:
# loading Random Forest Classifier
df_Forest = calculate_metrics("outputs/output RandomForestClassifier.json")
df_Forest = df_Forest.sort_values(by = ["auc_mean", "auc_std", "type2", "drop_columns"], ascending = False)
df_Forest[["auc_mean", "auc_std", "type2", "drop_columns"]].head(20)

Unnamed: 0,auc_mean,auc_std,type2,drop_columns
18,78.409383,0.097154,77,100
15,78.332342,0.097413,79,100
12,78.205301,0.104771,79,100
3,78.19048,0.095344,78,100
9,78.160391,0.095387,77,100
16,78.105811,0.093092,96,75
1,78.103771,0.094111,96,75
19,78.080811,0.093026,95,75
0,78.06548,0.098726,78,100
4,78.041551,0.095939,99,75


In [41]:
Forest = df_Forest.loc[15]
Forest[["scaler", "estimator", "imputer", "sampler", "auc_mean", "auc_confl", "auc_confu", "type2", "f1_mean", "f1_confl", "f1_confu", "drop_columns", "pre_mean", "rec_mean"]]

scaler                  StandardScaler
estimator       RandomForestClassifier
imputer                  SimpleImputer
sampler                            str
auc_mean                     78.332342
auc_confl                    78.271965
auc_confu                    78.392719
type2                               79
f1_mean                       0.775015
f1_confl                      0.700043
f1_confu                      0.849987
drop_columns                       100
pre_mean                      0.815984
rec_mean                      0.789526
Name: 15, dtype: object

In [42]:
# loading Decision Tree Classifier
df_Tree = calculate_metrics("outputs/output DecisionTrees.json")
df_Tree = df_Tree.sort_values(by = ["auc_mean", "auc_std", "type2", "drop_columns"], ascending = False)
df_Tree[["auc_mean", "auc_std", "type2", "drop_columns", 'scaler']].head(20)

Unnamed: 0,auc_mean,auc_std,type2,drop_columns,scaler
0,76.703708,0.059859,73,0,MaxAbsScaler
3,76.703708,0.059859,73,0,MinMaxScaler
7,76.703708,0.059859,73,0,PowerTransformer
10,76.703708,0.059859,73,0,RobustScaler
13,76.703708,0.059859,73,0,StandardScaler
16,76.703708,0.059859,73,0,str
6,75.907765,0.056369,81,0,Normalizer
2,74.731234,0.04784,105,0,MaxAbsScaler
5,74.731234,0.04784,105,0,MinMaxScaler
9,74.731234,0.04784,105,0,PowerTransformer


In [43]:
Tree = df_Tree.loc[16]
Tree[["scaler", "estimator", "imputer", "sampler", "auc_mean", "auc_confl", "auc_confu", "type2", "f1_mean", "f1_confl", "f1_confu", "drop_columns", "pre_mean", "rec_mean"]]

scaler                             str
estimator       DecisionTreeClassifier
imputer                  SimpleImputer
sampler                            str
auc_mean                     76.703708
auc_confl                    76.666607
auc_confu                     76.74081
type2                               73
f1_mean                       0.770171
f1_confl                      0.733067
f1_confu                      0.807274
drop_columns                         0
pre_mean                      0.785349
rec_mean                      0.776305
Name: 16, dtype: object

In [44]:
# Tree.to_excel(r'DTree.xlsx', index=False)

In [45]:
# loading SVC
df_SVC = calculate_metrics("outputs/output SVC.json")
df_SVC = df_SVC.sort_values(by = ["auc_mean", "auc_std", "type2", "drop_columns"], ascending = False)
df_SVC[["auc_mean", "auc_std", "type2", "drop_columns"]].head(20)

Unnamed: 0,auc_mean,auc_std,type2,drop_columns
17,79.468591,0.065352,101.0,20.0
2,78.508412,0.076647,113.0,100.0
16,78.051257,0.084012,110.0,20.0
0,77.935901,0.073471,94.0,20.0
4,77.907379,0.083855,114.0,20.0
9,77.870632,0.084183,92.0,20.0
3,77.527738,0.0733,98.0,20.0
11,77.283387,0.075887,114.0,75.0
5,77.206434,0.071671,133.0,75.0
15,77.173656,0.078727,104.0,20.0


In [46]:
SVC = df_SVC.loc[9]
SVC[["scaler", "estimator", "imputer", "sampler", "auc_mean", "auc_confl", "auc_confu", "type2", "f1_mean", "f1_confl", "f1_confu", "drop_columns", "pre_mean", "rec_mean"]]

scaler          PowerTransformer
estimator                    SVC
imputer            SimpleImputer
sampler                      str
auc_mean               77.870632
auc_confl              77.818455
auc_confu              77.922809
type2                       92.0
f1_mean                 0.774087
f1_confl                0.716995
f1_confu                0.831179
drop_columns                20.0
pre_mean                0.800496
rec_mean                0.782909
Name: 9, dtype: object

In [47]:
# loading naïve bayes (bernoulli)
df_bernoulli = calculate_metrics("outputs/output BernoulliNB.json")
df_bernoulli = df_bernoulli.sort_values(by = ["auc_mean", "auc_std", "type2", "drop_columns"], ascending = False)
df_bernoulli[["auc_mean", "auc_std", "type2", "drop_columns"]].head(20)

Unnamed: 0,auc_mean,auc_std,type2,drop_columns
15,79.068118,0.071319,111,8
16,78.913029,0.067901,115,8
17,78.758947,0.069558,119,8
9,77.518143,0.095385,109,35
10,77.389062,0.094922,114,35
11,77.38498,0.097225,114,35
12,76.92271,0.099803,116,60
0,76.840119,0.091046,102,20
6,76.840119,0.091046,102,20
18,76.840119,0.091046,102,20


In [48]:
bernoulli = df_bernoulli.loc[3]
bernoulli[["scaler", "estimator", "imputer", "sampler", "auc_mean", "auc_confl", "auc_confu", "type2", "f1_mean", "f1_confl", "f1_confu", "drop_columns", "pre_mean", "rec_mean"]]

scaler           MinMaxScaler
estimator         BernoulliNB
imputer         SimpleImputer
sampler                   str
auc_mean            76.230836
auc_confl           76.182989
auc_confu           76.278684
type2                     101
f1_mean              0.757561
f1_confl             0.704142
f1_confu              0.81098
drop_columns                8
pre_mean             0.779828
rec_mean              0.76623
Name: 3, dtype: object

In [49]:
# naïve bayes (categorical) is skipped because no model could be trained successfully 

In [51]:
# loading naïve bayes (complement)
df_complement = calculate_metrics("outputs/output CompleteNB.json")
df_complement = df_complement.sort_values(by = ["auc_mean", "auc_std", "type2", "drop_columns"], ascending = False)
df_complement[["auc_mean", "auc_std", "type2", "drop_columns"]].head(7)

Unnamed: 0,auc_mean,auc_std,type2,drop_columns
3,77.483213,0.070835,78.0,100.0
5,76.5389,0.061275,113.0,100.0
4,76.416949,0.064465,113.0,100.0
0,,,,
1,,,,
2,,,,
6,,,,


In [52]:
complement = df_complement.loc[3]
complement[["scaler", "estimator", "imputer", "sampler", "auc_mean", "auc_confl", "auc_confu", "type2", "f1_mean", "f1_confl", "f1_confu", "drop_columns", "pre_mean", "rec_mean"]]

scaler           MinMaxScaler
estimator        ComplementNB
imputer         SimpleImputer
sampler                   str
auc_mean            77.483213
auc_confl           77.439309
auc_confu           77.527117
type2                    78.0
f1_mean              0.778347
f1_confl             0.735348
f1_confu             0.821347
drop_columns            100.0
pre_mean              0.78419
rec_mean              0.78186
Name: 3, dtype: object

In [53]:
# loading naïve bayes (gaussian)
df_gaussian= calculate_metrics("outputs/output GaussianNB.json")
df_gaussian = df_gaussian.sort_values(by = ["auc_mean", "auc_std", "type2", "drop_columns"], ascending = False)
df_gaussian[["auc_mean", "auc_std", "type2", "drop_columns"]].head(20)

Unnamed: 0,auc_mean,auc_std,type2,drop_columns
18,70.835652,0.074371,234,20
7,70.526954,0.064782,236,20
6,70.455027,0.059793,233,20
19,70.232578,0.070411,240,20
20,70.027489,0.072613,242,20
8,69.888701,0.06626,241,20
12,65.163216,0.053465,310,20
14,64.783126,0.048594,315,20
13,64.605052,0.052414,318,20
0,61.711187,0.046103,354,20


In [54]:
gaussian = df_gaussian.loc[20]
gaussian[["scaler", "estimator", "imputer", "sampler", "auc_mean", "auc_confl", "auc_confu", "type2", "f1_mean", "f1_confl", "f1_confu", "drop_columns", "pre_mean", "rec_mean"]]

scaler                         str
estimator               GaussianNB
imputer              SimpleImputer
sampler         RandomUnderSampler
auc_mean                 70.027489
auc_confl                69.982483
auc_confu                70.072495
type2                          242
f1_mean                   0.667017
f1_confl                  0.617693
f1_confu                   0.71634
drop_columns                    20
pre_mean                  0.755673
rec_mean                  0.681798
Name: 20, dtype: object

In [57]:
# loading naïve bayes (multinomial)
df_multinomial = calculate_metrics("outputs/output MultinomialNB.json")
df_multinomial = df_multinomial.sort_values(by = ["auc_mean", "auc_std", "type2", "drop_columns"], ascending = False)
df_multinomial[["auc_mean", "auc_std", "type2", "drop_columns"]].head(5)

Unnamed: 0,auc_mean,auc_std,type2,drop_columns
3,76.461299,0.062045,99.0,100.0
4,76.38177,0.065212,117.0,100.0
5,76.28177,0.061807,118.0,100.0
0,,,,
1,,,,


In [58]:
multinomial = df_multinomial.loc[3]
multinomial[["scaler", "estimator", "imputer", "sampler", "auc_mean", "auc_confl", "auc_confu", "type2", "f1_mean", "f1_confl", "f1_confu", "drop_columns", "pre_mean", "rec_mean"]]

scaler           MinMaxScaler
estimator       MultinomialNB
imputer         SimpleImputer
sampler                   str
auc_mean            76.461299
auc_confl           76.422843
auc_confu           76.499755
type2                    99.0
f1_mean              0.766352
f1_confl             0.728844
f1_confu             0.803859
drop_columns            100.0
pre_mean             0.772775
rec_mean             0.768539
Name: 3, dtype: object

In [59]:
# loading logistic regression
df_logistic = calculate_metrics("outputs/output LogisticRegression.json")
df_logistic = df_logistic.sort_values(by = ["auc_mean", "auc_std", "type2", "drop_columns"], ascending = False)
df_logistic[["auc_mean", "auc_std", "type2", "drop_columns"]].head(20)

Unnamed: 0,auc_mean,auc_std,type2,drop_columns
17,78.818143,0.081598,111,20
14,78.633972,0.081839,114,20
2,78.319126,0.066004,116,20
3,77.893591,0.077889,103,20
16,77.696192,0.072278,116,20
13,77.665045,0.079287,120,20
19,77.646167,0.089447,114,20
15,77.641575,0.069453,103,20
12,77.563502,0.064914,105,35
5,77.563004,0.069132,121,20


# Combined review of all DFs

In [60]:
df = pd.concat([df_Tree, df_XGBoost, df_SVC, df_logistic, df_Forest, df_bernoulli, df_KNeighbors])

In [61]:
df[["auc_mean", "auc_std", "type2", "drop_columns", "estimator"]].sort_values('type2')

Unnamed: 0,auc_mean,auc_std,type2,drop_columns,estimator
18,76.285254,0.056878,70.0,0.0,KNeighborsClassifier
16,76.703708,0.059859,73.0,0.0,DecisionTreeClassifier
13,76.703708,0.059859,73.0,0.0,DecisionTreeClassifier
0,76.703708,0.059859,73.0,0.0,DecisionTreeClassifier
7,76.703708,0.059859,73.0,0.0,DecisionTreeClassifier
...,...,...,...,...,...
13,,,,,SVC
14,,,,,SVC
18,,,,,SVC
19,,,,,SVC


In [62]:
df[["auc_mean", "auc_std",'f1_mean', "type2", "drop_columns", "estimator"]].sort_values('f1_mean', ascending=False)

Unnamed: 0,auc_mean,auc_std,f1_mean,type2,drop_columns,estimator
15,80.398731,0.097697,0.797751,83.0,100.0,KNeighborsClassifier
17,79.468591,0.065352,0.792116,101.0,20.0,SVC
17,79.798258,0.090229,0.791528,100.0,100.0,KNeighborsClassifier
15,79.068118,0.071319,0.787344,111.0,8.0,BernoulliNB
16,79.383997,0.092650,0.786896,109.0,100.0,KNeighborsClassifier
...,...,...,...,...,...,...
13,,,,,,SVC
14,,,,,,SVC
18,,,,,,SVC
19,,,,,,SVC


In [63]:
Tree = df_Tree.loc[16]

Tree[["scaler", "estimator", "imputer", "sampler", "auc_mean", "auc_confl", "auc_confu", "type2", "f1_mean", "f1_confl", "f1_confu", "drop_columns", "pre_mean", "rec_mean"]]

scaler                             str
estimator       DecisionTreeClassifier
imputer                  SimpleImputer
sampler                            str
auc_mean                     76.703708
auc_confl                    76.666607
auc_confu                     76.74081
type2                               73
f1_mean                       0.770171
f1_confl                      0.733067
f1_confu                      0.807274
drop_columns                         0
pre_mean                      0.785349
rec_mean                      0.776305
Name: 16, dtype: object

In [64]:
logistic = df_logistic.loc[6]
logistic[["scaler", "estimator", "imputer", "sampler", "auc_mean", "auc_confl", "auc_confu", "type2", "f1_mean", "f1_confl", "f1_confu", "drop_columns", "pre_mean", "rec_mean"]]

scaler                  Normalizer
estimator       LogisticRegression
imputer              SimpleImputer
sampler                        str
auc_mean                 76.028148
auc_confl                75.992772
auc_confu                76.063524
type2                           75
f1_mean                   0.763059
f1_confl                  0.726622
f1_confu                  0.799497
drop_columns                     0
pre_mean                  0.780859
rec_mean                   0.76965
Name: 6, dtype: object