In [None]:
import json
import pandas as pd
import numpy as np
import re
from sklearn.metrics import roc_auc_score, f1_score
from io import StringIO

# Baseline

In [None]:
# Names of the datasets to include in the current run; each name is later
# turned into the path "./Data/<name>.data".
datasets = [
    "hungarian",
    "cleveland",
    "switzerland",
    "long-beach-va",
]

In [None]:
# custom function to read the dataset into a CSV-formatted string
# the "name" token is used as the record delimiter here because it is the last feature and common among all entries
def read_raw_data(file_path:str):
    """Return the content of a raw data file rewritten as CSV text.

    All line breaks are first flattened into spaces. Every alphabetic word
    that is followed by a space is then replaced by ``name`` plus a newline,
    which ends one record. Finally every remaining space becomes a comma.
    """
    with open(file_path) as handle:
        raw_text = handle.read()
    # join the physical lines into one long space-separated stream
    flattened = " ".join(raw_text.split("\n"))
    # an alphabetic token marks the end of a record
    record_lines = re.sub("[a-zA-Z]+ ", "name\n", flattened)
    # the remaining spaces separate the fields of a record
    return ",".join(record_lines.split(" "))

In [None]:
# read the data from the specified datasets into the df
# Each dataset is parsed into its own frame and tagged with its name. The
# frames are concatenated once at the end instead of growing `df` inside the
# loop, which re-copied all previous rows on every iteration and relied on
# concatenating with an empty DataFrame.
dataset_frames = []
for dataset in datasets:
    dataset_df = pd.read_csv(StringIO(read_raw_data("./Data/"+ dataset +".data")), header=None, sep=",")
    dataset_df['dataset'] = dataset
    dataset_frames.append(dataset_df)
# pd.concat raises on an empty list, so keep the empty-frame result for an empty run
df = pd.concat(dataset_frames, ignore_index=True) if dataset_frames else pd.DataFrame()

In [None]:
# Assign the attribute names to the parsed columns, in file order; the final
# entry labels the "dataset" column that is added while loading.
df.columns = [
    "id", "ccf", "age", "sex", "painloc", "painexer", "relrest", "pncaden", "cp", "trestbps",
    "htn", "chol", "smoke", "cigs", "years", "fbs", "dm", "famhist", "restecg", "ekgmo",
    "ekgday", "ekgyr", "dig", "prop", "nitr", "pro", "diuretic", "proto", "thaldur", "thaltime",
    "met", "thalach", "thalrest", "tpeakbps", "tpeakbpd", "dummy", "trestbpd", "exang", "xhypo", "oldpeak",
    "slope", "rldv5", "rldv5e", "ca", "restckm", "exerckm", "restef", "restwm", "exeref", "exerwm",
    "thal", "thalsev", "thalpul", "earlobe", "cmo", "cday", "cyr", "num", "lmt", "ladprox",
    "laddist", "diag", "cxmain", "ramus", "om1", "om2", "rcaprox", "rcadist", "lvx1", "lvx2",
    "lvx3", "lvx4", "lvf", "cathef", "junk", "name",
    "dataset",
]

In [None]:
# UCI encodes missing values as -9; swap them for NaN so that profiling
# reports treat them as missing values.
df = df.replace(to_replace=-9, value=np.nan)

In [None]:
# Baseline: a classifier that always predicts the positive class (1).
# The target is binarised by capping every value above 1 at 1. clip() returns a
# new Series, so the `num` column of `df` is left untouched; the previous
# masked assignment wrote through a column taken from `df`, which triggers
# SettingWithCopyWarning and mutates `df` or not depending on the pandas version.
y_true = df['num'].clip(upper=1)
y_pred = [1] * len(y_true)
roc_auc_score(y_true, y_pred)

In [None]:
# F1 score of the same constant baseline prediction
f1_score(y_true=y_true, y_pred=y_pred)

In [None]:
def calculate_metrics(filepath:str, n_folds:int = 10, z_score:float = 1.96):
    """Load one experiment output file and derive summary metrics per run.

    The JSON file must contain a top-level ``measurements`` list. Every entry
    becomes one row, and nested keys are flattened into dot-separated column
    names by ``pd.json_normalize``.

    Parameters
    ----------
    filepath : str
        Path of the JSON file to read.
    n_folds : int, default 10
        Sample count ``n`` used in the standard error ``std / sqrt(n)`` of the
        confidence intervals. It should equal the number of values stored per
        run (e.g. the number of cross-validation folds).
    z_score : float, default 1.96
        Multiplier applied to the standard error.

    Returns
    -------
    pandas.DataFrame
        The flattened measurements plus the derived columns ``drop_columns``,
        ``*_std`` / ``*_mean`` summaries, ``type2`` and the confidence bounds
        ``auc_confl``, ``auc_confu``, ``f1_confl``, ``f1_confu``.

    Raises
    ------
    ValueError
        If ``n_folds`` is not positive.
    KeyError
        If an expected measurement column is missing from the file.
    """
    if n_folds <= 0:
        raise ValueError("n_folds must be a positive integer")

    with open(filepath, 'r', encoding="utf-8") as f:
        data = json.load(f)
    # extract data from json into a usable frame
    metrics = pd.json_normalize(data, record_path=['measurements'])
    # save minimum percentage to be dropped in an extra variable
    metrics["drop_columns"] = metrics["best_params.drop_columns__minimum_percentage_to_be_dropped"]

    # calculate main values of interest: (new column, source column, reducer)
    # NOTE(review): np.std is the population standard deviation (ddof=0).
    derived_columns = [
        ("auc_std", "auc", np.std),
        ("pre_std", "classification_report.1.precision", np.std),
        ("pre_mean", "classification_report.1.precision", np.mean),
        ("acc_mean", "classification_report.accuracy", np.mean),
        ("rec_std", "classification_report.1.recall", np.std),
        ("rec_mean", "classification_report.1.recall", np.mean),
        ("f1_std", "classification_report.1.f1-score", np.std),
        ("f1_mean", "classification_report.1.f1-score", np.mean),
        # presumably the false negatives (true 1, predicted 0) - confirm key order
        ("type2", "confusion_matrix.(1, 0)", np.sum),
    ]
    for target, source, reducer in derived_columns:
        # reduce the values stored in each cell to a single number
        metrics[target] = metrics[source].map(reducer)

    # every other mean is derived here; only fall back for auc_mean when the
    # file does not already provide it
    if "auc_mean" not in metrics.columns:
        metrics["auc_mean"] = metrics["auc"].map(np.mean)

    # Confidence intervals: mean +/- z * std / sqrt(n)
    root_n = np.sqrt(n_folds)
    metrics["auc_confl"] = metrics["auc_mean"] - z_score * (metrics["auc_std"] / root_n)
    metrics["auc_confu"] = metrics["auc_mean"] + z_score * (metrics["auc_std"] / root_n)
    metrics["f1_confl"] = metrics["f1_mean"] - z_score * (metrics["f1_std"] / root_n)
    metrics["f1_confu"] = metrics["f1_mean"] + z_score * (metrics["f1_std"] / root_n)
    return metrics

In [None]:
# Load the XGBoost runs and rank them: recall first, the other columns break ties.
# NOTE(review): ascending=False applies to every key, so among equal rec_mean a
# larger rec_std ranks higher - confirm this is intended.
df_XGBoost = calculate_metrics("outputs/output xgboost.json").sort_values(
    by=["rec_mean", "rec_std", "auc_mean", "drop_columns", "f1_mean"],
    ascending=False,
)
# only display variables important for the model evaluation
df_XGBoost[["rec_mean", "rec_std", "auc_mean", "drop_columns", "f1_mean"]].head(20)

In [None]:
# best model in detail
# .loc selects the run by its index label, not by its position in the sorted frame
XGBoost = df_XGBoost.loc[8]
XGBoost[
    [
        "scaler", "estimator", "imputer", "sampler",
        "auc_mean", "auc_confl", "auc_confu", "type2",
        "f1_mean", "f1_confl", "f1_confu",
        "drop_columns", "pre_mean", "rec_mean",
    ]
]

In [None]:
# loading KNeighbors, ranked by recall first with the other columns as tie-breakers
df_KNeighbors = calculate_metrics("outputs/output KNN.json").sort_values(
    by=["rec_mean", "rec_std", "auc_mean", "drop_columns", "f1_mean"],
    ascending=False,
)
df_KNeighbors[["rec_mean", "rec_std", "auc_mean", "drop_columns", "f1_mean"]].head(40)

In [None]:
# selected KNeighbors run in detail (picked by index label)
KNeighbors = df_KNeighbors.loc[18]
KNeighbors[
    [
        "scaler", "estimator", "imputer", "sampler",
        "auc_mean", "auc_confl", "auc_confu", "type2",
        "f1_mean", "f1_confl", "f1_confu",
        "drop_columns", "pre_mean", "rec_mean",
    ]
]

In [None]:
# loading Random Forest Classifier, ranked by recall first with the other columns as tie-breakers
df_Forest = calculate_metrics("outputs/output RandomForestClassifier.json").sort_values(
    by=["rec_mean", "rec_std", "auc_mean", "drop_columns", "f1_mean"],
    ascending=False,
)
df_Forest[["rec_mean", "rec_std", "auc_mean", "drop_columns", "f1_mean"]].head(20)

In [None]:
# selected Random Forest run in detail (picked by index label)
Forest = df_Forest.loc[18]
Forest[
    [
        "scaler", "estimator", "imputer", "sampler",
        "auc_mean", "auc_confl", "auc_confu", "type2",
        "f1_mean", "f1_confl", "f1_confu",
        "drop_columns", "pre_mean", "rec_mean",
    ]
]

In [None]:
# loading Decision Tree Classifier; unlike the other models, the scaler is
# used as an additional final tie-breaker and shown in the overview
df_Tree = calculate_metrics("outputs/output DecisionTrees.json").sort_values(
    by=["rec_mean", "rec_std", "auc_mean", "drop_columns", "f1_mean", "scaler"],
    ascending=False,
)
df_Tree[["rec_mean", "rec_std", "auc_mean", "drop_columns", "f1_mean", "scaler"]].head(20)

In [None]:
# selected Decision Tree run in detail (picked by index label)
Tree = df_Tree.loc[16]
Tree[
    [
        "scaler", "estimator", "imputer", "sampler",
        "auc_mean", "auc_confl", "auc_confu", "type2",
        "f1_mean", "f1_confl", "f1_confu",
        "drop_columns", "pre_mean", "rec_mean",
    ]
]

In [None]:
# Tree.to_excel(r'DTree.xlsx', index=False)

In [None]:
# loading SVC, ranked by recall first with the other columns as tie-breakers
df_SVC = calculate_metrics("outputs/output SVC.json").sort_values(
    by=["rec_mean", "rec_std", "auc_mean", "drop_columns", "f1_mean"],
    ascending=False,
)
df_SVC[["rec_mean", "rec_std", "auc_mean", "drop_columns", "f1_mean"]].head(20)

In [None]:
# selected SVC run in detail (picked by index label)
SVC = df_SVC.loc[9]
SVC[
    [
        "scaler", "estimator", "imputer", "sampler",
        "auc_mean", "auc_confl", "auc_confu", "type2",
        "f1_mean", "f1_confl", "f1_confu",
        "drop_columns", "pre_mean", "rec_mean",
    ]
]

In [None]:
# loading naïve bayes (bernoulli), ranked by recall first with the other columns as tie-breakers
df_bernoulli = calculate_metrics("outputs/output BernoulliNB.json").sort_values(
    by=["rec_mean", "rec_std", "auc_mean", "drop_columns", "f1_mean"],
    ascending=False,
)
df_bernoulli[["rec_mean", "rec_std", "auc_mean", "drop_columns", "f1_mean"]].head(20)

In [None]:
# selected Bernoulli naïve bayes run in detail (picked by index label)
bernoulli = df_bernoulli.loc[3]
bernoulli[
    [
        "scaler", "estimator", "imputer", "sampler",
        "auc_mean", "auc_confl", "auc_confu", "type2",
        "f1_mean", "f1_confl", "f1_confu",
        "drop_columns", "pre_mean", "rec_mean",
    ]
]

In [None]:
# naïve bayes (categorical) is skipped because no model could be trained successfully 

In [None]:
# loading naïve bayes (complement), ranked by recall first with the other columns as tie-breakers
df_complement = calculate_metrics("outputs/output CompleteNB.json").sort_values(
    by=["rec_mean", "rec_std", "auc_mean", "drop_columns", "f1_mean"],
    ascending=False,
)
df_complement[["rec_mean", "rec_std", "auc_mean", "drop_columns", "f1_mean"]].head(7)

In [None]:
# selected complement naïve bayes run in detail (picked by index label)
complement = df_complement.loc[3]
complement[
    [
        "scaler", "estimator", "imputer", "sampler",
        "auc_mean", "auc_confl", "auc_confu", "type2",
        "f1_mean", "f1_confl", "f1_confu",
        "drop_columns", "pre_mean", "rec_mean",
    ]
]

In [None]:
# loading naïve bayes (gaussian), ranked by recall first with the other columns as tie-breakers
df_gaussian = calculate_metrics("outputs/output GaussianNB.json").sort_values(
    by=["rec_mean", "rec_std", "auc_mean", "drop_columns", "f1_mean"],
    ascending=False,
)
df_gaussian[["rec_mean", "rec_std", "auc_mean", "drop_columns", "f1_mean"]].head(20)

In [None]:
# selected gaussian naïve bayes run in detail (picked by index label)
gaussian = df_gaussian.loc[6]
gaussian[
    [
        "scaler", "estimator", "imputer", "sampler",
        "auc_mean", "auc_confl", "auc_confu", "type2",
        "f1_mean", "f1_confl", "f1_confu",
        "drop_columns", "pre_mean", "rec_mean",
    ]
]

In [None]:
# loading naïve bayes (multinomial), ranked by recall first with the other columns as tie-breakers
df_multinomial = calculate_metrics("outputs/output MultinomialNB.json").sort_values(
    by=["rec_mean", "rec_std", "auc_mean", "drop_columns", "f1_mean"],
    ascending=False,
)
df_multinomial[["rec_mean", "rec_std", "auc_mean", "drop_columns", "f1_mean"]].head(5)

In [None]:
# selected multinomial naïve bayes run in detail (picked by index label)
multinomial = df_multinomial.loc[3]
multinomial[
    [
        "scaler", "estimator", "imputer", "sampler",
        "auc_mean", "auc_confl", "auc_confu", "type2",
        "f1_mean", "f1_confl", "f1_confu",
        "drop_columns", "pre_mean", "rec_mean",
    ]
]

In [None]:
# loading logistic regression, ranked by recall first with the other columns as tie-breakers
df_logistic = calculate_metrics("outputs/output LogisticRegression.json").sort_values(
    by=["rec_mean", "rec_std", "auc_mean", "drop_columns", "f1_mean"],
    ascending=False,
)
df_logistic[["rec_mean", "rec_std", "auc_mean", "drop_columns", "f1_mean"]].head(20)

In [None]:
# selected logistic regression run in detail (picked by index label)
logistic = df_logistic.loc[6]
logistic[
    [
        "scaler", "estimator", "imputer", "sampler",
        "auc_mean", "auc_confl", "auc_confu", "type2",
        "f1_mean", "f1_confl", "f1_confu",
        "drop_columns", "pre_mean", "rec_mean",
    ]
]

# Combined review of all DFs

In [None]:
# Stack the per-model result frames into one frame for a cross-model comparison.
# NOTE(review): this rebinds `df`, and the complement, gaussian and multinomial
# naïve bayes frames are not part of the list - confirm both are intended.
# The original row labels are kept, so labels repeat across models.
df = pd.concat(
    [
        df_Tree,
        df_XGBoost,
        df_SVC,
        df_logistic,
        df_Forest,
        df_bernoulli,
        df_KNeighbors,
    ]
)

In [None]:
# all runs of all models, best mean accuracy first
df[
    ["acc_mean", "rec_mean", "rec_std", "auc_mean", "f1_mean", "drop_columns", "estimator"]
].sort_values(by="acc_mean", ascending=False)

In [None]:
# all runs of all models, best mean F1 score first
df[
    ["rec_mean", "rec_std", "auc_mean", "f1_mean", "drop_columns", "estimator"]
].sort_values(by="f1_mean", ascending=False)

In [None]:
# show the selected Decision Tree run (index label 16) once more in detail
Tree = df_Tree.loc[16]

Tree[
    [
        "scaler", "estimator", "imputer", "sampler",
        "auc_mean", "auc_confl", "auc_confu", "type2",
        "f1_mean", "f1_confl", "f1_confu",
        "drop_columns", "pre_mean", "rec_mean",
    ]
]