# Setup
## Variables for configuration

In [None]:
# columns that are one-hot encoded inside the model pipeline
oneHotEncodedFeatures = ['cp','restecg', 'slope','ca', 'restwm']
# list the datasets that should be used in the current run
datasets = ["hungarian", "cleveland", "switzerland", "long-beach-va"]
# set to True to (re)generate the pandas-profiling HTML report; the profiling cell reads
# this flag, and without a definition a fresh "Restart & Run All" stops with a NameError
generate_pandas_profiling_reports = False

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import re
# never truncate the column list when a DataFrame is rendered with display()
pd.options.display.max_columns = None

# Import Data

In [None]:
# custom function that turns a raw dataset file into a CSV formatted string
def read_raw_data(file_path:str):
    """Read the file at ``file_path`` and return its content as a CSV formatted string.

    The whole file is flattened into one line first. Every alphabetic token that is
    followed by a blank is then treated as the end of an entry and replaced by
    ``name`` plus a line break (per the author's note the name is the last feature
    and common among all entries). Finally every remaining blank becomes a comma.
    """
    with open(file_path) as raw_file:
        flattened = " ".join(raw_file.read().split("\n"))
    one_entry_per_line = re.sub("[a-zA-Z]+ ", "name\n", flattened)
    return ",".join(one_entry_per_line.split(" "))

In [None]:
# read the data from the specified datasets into the df
from io import StringIO

# Parse every dataset once and concatenate a single time: growing `df` with pd.concat
# inside the loop copies all earlier rows again on every iteration and starts from an
# empty DataFrame, whose treatment during dtype resolution is deprecated in pandas.
dataset_frames = []
for dataset in datasets:
    dataset_df = pd.read_csv(StringIO(read_raw_data("./Data/"+ dataset +".data")), header=None, sep=",")
    # remember which source file every row came from
    dataset_df['dataset'] = dataset
    dataset_frames.append(dataset_df)
# pd.concat raises on an empty list, so keep the empty frame for an empty `datasets`
df = pd.concat(dataset_frames, ignore_index=True) if dataset_frames else pd.DataFrame()

In [None]:
# name the positional columns of df: the raw attributes in file order, followed by the
# 'dataset' column that was appended while loading
df.columns = [
    "id", "ccf", "age", "sex", "painloc", "painexer", "relrest", "pncaden",
    "cp", "trestbps", "htn", "chol", "smoke", "cigs", "years", "fbs",
    "dm", "famhist", "restecg", "ekgmo", "ekgday", "ekgyr", "dig", "prop",
    "nitr", "pro", "diuretic", "proto", "thaldur", "thaltime", "met", "thalach",
    "thalrest", "tpeakbps", "tpeakbpd", "dummy", "trestbpd", "exang", "xhypo", "oldpeak",
    "slope", "rldv5", "rldv5e", "ca", "restckm", "exerckm", "restef", "restwm",
    "exeref", "exerwm", "thal", "thalsev", "thalpul", "earlobe", "cmo", "cday",
    "cyr", "num", "lmt", "ladprox", "laddist", "diag", "cxmain", "ramus",
    "om1", "om2", "rcaprox", "rcadist", "lvx1", "lvx2", "lvx3", "lvx4",
    "lvf", "cathef", "junk", "name",
    "dataset",
]

In [None]:
# Optionally write an HTML profiling report that covers every column of df.
# The flag generate_pandas_profiling_reports has to be defined before this cell runs
# (e.g. in the configuration cell); pandas_profiling is only imported when it is set.
if generate_pandas_profiling_reports:
    from pandas_profiling import ProfileReport
    profile = ProfileReport(df, title='Pandas Profiling Report for all features')
    profile.to_file("Pandas Profiling Report for all features.html")

# Todo Correlated Features

## Treatment of different scales in the datasets
### met

In [None]:
# one box per dataset: the plot shows why "met" has to be processed
sns.boxplot(data=df, y="dataset", x="met")
plt.show()

In [None]:
# the "met" values of the switzerland dataset do not seem valid -> overwrite them
# with -9, the value the pipeline below treats as missing
df.loc[df["dataset"].eq("switzerland"), "met"] = -9

### rldv5e

In [None]:
# one box per dataset: the plot shows why "rldv5e" has to be processed
sns.boxplot(data=df, y="dataset", x="rldv5e")
plt.show()

# Drop columns

In [None]:
# columns whose values are not plausible (see the boxplot above)
# a list instead of a set keeps this cell consistent with the other drop cells and
# hands pandas the labels in a deterministic order
unrealistic_values_in_column = [
    'rldv5e'
]
df.drop(unrealistic_values_in_column, inplace=True, axis=1)

In [None]:
# columns that carry no information for a model
irrelevant_columns = [
    "id",       # an id is not relevant for a model
    "ccf",      # the social security number does not influence if you have a heart disease or not
    "pncaden",  # sum of painloc, painexer and relrest -> these features are already in the dataset -> duplicate
    # the date of the medical examination is irrelevant for the occurrence of a disease
    "ekgmo", "ekgday", "ekgyr",
    "cmo", "cday", "cyr",
    "name",     # constant
]
df.drop(columns=irrelevant_columns, inplace=True)

In [None]:
# columns without a usable explanation, grouped by the reason for dropping them
unexplained_columns = [
    # irrelevant according to the uci
    "restckm", "exerckm", "thalsev", "thalpul",
    # constant
    "earlobe",
    # it is not possible to gain information about what these features measure
    # -> could not be supplied to trained models -> drop
    # https://archive.ics.uci.edu/ml/datasets/Heart+Disease
    "lvx1", "lvx2", "lvx3", "lvx4", "lvf",
    # no description available -> from the name does not seem relevant
    "dummy",
    'junk',
]
df.drop(columns=unexplained_columns, inplace=True)

In [None]:
# columns the author groups as "hidden identifier"; each note spells out the
# vessel the abbreviation stands for
hidden_identifier = [
    'lmt',      # left main trunk
    'ladprox',  # proximal left anterior descending artery
    'laddist',  # distal left anterior descending artery
    'diag',     # diagonal branches
    'cxmain',   # circumflex
    'ramus',    # ramus intermedius
    'om1',      # first obtuse marginal branch
    'om2',      # second obtuse marginal branch
    'rcaprox',  # proximal right coronary artery
    'rcadist',  # distal right coronary artery
]
df.drop(columns=hidden_identifier, inplace=True)

# Train the different models

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
# from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from catboost import CatBoostClassifier
from  sklearn.naive_bayes import *

# Candidate estimators together with the hyper-parameter grid that is searched for each.
# Every entry is a dict with the keys "estimator" (the model instance) and "parameters"
# (the grid). Grid keys carry the prefix 'estimator__' because the training pipeline names
# its final step 'estimator'. Only entries that are not commented out take part in a run.
estimators=[
    # {"estimator": CatBoostClassifier(random_state=42, thread_count=-1, silent= True), "parameters": {'estimator__depth':[None] + [*range(1,200)],
    #                                                                                                  'estimator__n_estimators':range(10,1000, 100),
    #                                                                                                  'estimator__learning_rate':[0.001,0.01,0.1,0.2,0.3],
    #                                                                                                  'estimator__l2_leaf_reg':range(5,100, 5),
    #                                                                                                  'estimator__border_count':range(5,200, 5),
    #                                                                                                  'estimator__ctr_border_count':range(5,200, 5)
    #                                                                                                  }},
    # {"estimator": XGBClassifier(random_state=42, n_jobs=1), "parameters": {'estimator__max_depth': [None] + [*range(1,200)],
    #                                                                        'estimator__n_estimators': range(10,1000, 100),
    #                                                                         'estimator__learning_rate':[0.001,0.01,0.1,0.2,0.3]}},
    # {"estimator": SVC(random_state=42, tol=0.01), "parameters": {'estimator__C': [110,120,130,140,150],
    #                                                              'estimator__gamma': [0.0001, 0.001, 0.01, 0.1],
    #                                                              'estimator__degree': [3,4,5,6],
    #                                                              'estimator__kernel':['linear', 'rbf', 'poly', 'sigmoid'] }}, # '
    # {"estimator": BernoulliNB(), "parameters": {'estimator__alpha' : np.arange(0,20,0.1)}},
    # {"estimator": CategoricalNB(), "parameters": {'estimator__alpha' : np.arange(0,20,0.1)}},
    # {"estimator": ComplementNB(), "parameters": {'estimator__alpha' : np.arange(0,20,0.1),
    #                                              'estimator__norm':[True,False]}},
    # {"estimator": GaussianNB(), "parameters": {}},
    # {"estimator": MultinomialNB(), "parameters": {'estimator__alpha' : np.arange(0,20,0.1)}},
    # {"estimator": DecisionTreeClassifier(random_state=42), "parameters": {'estimator__criterion':['gini','entropy', 'log_loss'],
    #                                                                       'estimator__max_depth':[4,5,6,7,8,9,10,11,12,15,20,30,40,50,70,90,120,150],
    #                                                                       'estimator__min_samples_split': range(2,20),
    #                                                                       'estimator__min_samples_leaf': range(2,20)}},
    # active configuration: k-nearest neighbours
    {"estimator": KNeighborsClassifier(), "parameters": {'estimator__n_neighbors': range(2, 100,5),
                                                         'estimator__weights': ['uniform','distance'],
                                                         'estimator__p': [1,2]}},
    # {"estimator": RandomForestClassifier(random_state=42, n_jobs=1), "parameters": {'estimator__n_estimators':range(10,1000, 100)}},
    # {"estimator": SGDClassifier(max_iter=1000000), "parameters": {'estimator__loss':['log_loss', 'modified_huber', 'squared_hinge', 'perceptron', 'squared_error', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'],
    #                                               'estimator__penalty':['l1','l2','elasticnet'],
    #                                               'estimator__alpha' : np.arange(1,40,1)}}
]

In [None]:
from sklearn.preprocessing import *
# candidate scalers with their grids; the grid keys address the pipeline step 'scaler'
scalers = [
    dict(scaler=MaxAbsScaler(), parameters={}),
    dict(scaler=MinMaxScaler(), parameters={}),
    dict(scaler=Normalizer(), parameters={'scaler__norm': ['l1', 'l2', 'max']}),
    dict(scaler=PowerTransformer(), parameters={}),
    dict(
        scaler=RobustScaler(),
        parameters={
            'scaler__with_centering': [True, False],
            'scaler__with_scaling': [True, False],
        },
    ),
    # 'passthrough' leaves the data unscaled
    dict(scaler='passthrough', parameters={}),
    dict(
        scaler=StandardScaler(),
        parameters={
            'scaler__with_mean': [True, False],
            'scaler__with_std': [True, False],
        },
    ),
]

In [None]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer
# candidate imputers; -9 marks a missing value, the grid keys address the pipeline step 'impute'
imputers = [
    dict(
        imputer=SimpleImputer(missing_values=-9),
        parameters={'impute__strategy': ['mean', 'median', 'most_frequent']},
    ),
    # {"imputer": KNNImputer(missing_values = -9), "parameters": {'impute__n_neighbors': range(2, 10,1)}},
]

In [None]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

# Samplers the grid search may put into the pipeline step named 'sampler';
# 'passthrough' disables resampling. Both random samplers are seeded (42, the seed the
# estimator configurations in this notebook use) so repeated runs resample the same rows.
sampling = dict(sampler= [RandomOverSampler(random_state=42),RandomUnderSampler(random_state=42), 'passthrough'])

In [None]:
# thresholds (0, 10, ..., 100) tried for the 'drop_columns' pipeline step
general_parameters = dict(
    drop_columns__minimum_percentage_to_be_dropped=range(0, 101, 10),
)

In [None]:
# assumption: both dictionaries have the same structure
def merge_dict(dict1, dict2):
    """Fold ``dict2`` into ``dict1`` key by key and return ``dict1`` (modified in place).

    For every key of ``dict1``:
    * nested dicts are merged recursively,
    * a list is extended by the value of ``dict2`` (its items when that value is a list,
      otherwise the value itself),
    * any other value is turned into the two-element list ``[dict1 value, dict2 value]``.

    A key of ``dict1`` that is missing in ``dict2`` raises ``KeyError``.
    """
    for key in dict1:
        current = dict1[key]
        incoming = dict2[key]
        kind = type(current)
        if kind is dict:
            merged = merge_dict(current, incoming)
        elif kind is list:
            tail = incoming if type(incoming) is list else [incoming]
            merged = current + tail
        else:
            merged = [current, incoming]
        dict1[key] = merged
    return dict1

The columns `smoke` and `years` both describe whether a respondent smokes: `smoke` is a binary flag, while `years` holds the number of years the person has smoked. Because of the high number of missing values, neither column is useful on its own. However, missing `smoke` values can be filled in from the `years` column (and, in the same way, from `cigs`).

In [None]:
class DataframeSmokeTransformer:
    """Pipeline step that fills unknown ``smoke`` flags from ``years`` and ``cigs``.

    A row whose ``smoke`` value is unknown gets 0 when the helper column is 0 and 1
    when the helper column is greater than 0. ``years`` is consulted first, ``cigs``
    only for rows that are still unknown afterwards. Helper columns that are not part
    of the frame are skipped.
    """

    # Marker for "missing" in the data at this point of the pipeline; the imputers are
    # configured with the same value. NaN is treated as missing as well.
    missing_value = -9

    def transform(self, input_df, **transform_params):
        """Return a copy of ``input_df`` with the ``smoke`` column enriched.

        The frame is returned unchanged when it has no ``smoke`` column.
        """
        if 'smoke' not in input_df:
            return input_df
        # work on a copy so the caller's frame is left untouched
        input_df = input_df.copy(deep=True)
        # Each helper column is checked on its own. The former guard tested the single
        # string 'smoke, years', which is never a column name, so nothing was ever filled,
        # and the 'cigs' branch was guarded by 'years' instead of 'cigs'.
        for helper_column in ('years', 'cigs'):
            if helper_column not in input_df:
                continue
            smoke_unknown = input_df['smoke'].isna() | input_df['smoke'].eq(self.missing_value)
            input_df.loc[smoke_unknown & input_df[helper_column].eq(0), 'smoke'] = 0
            # a missing helper value (-9 / NaN) is not > 0, so such rows stay unknown
            input_df.loc[smoke_unknown & input_df[helper_column].gt(0), 'smoke'] = 1
        return input_df

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned from the data; returns ``self``."""
        return self

In [None]:
from sklearn.exceptions import NotFittedError
class DropColumnsBasedOnMinimumPercentageToBeDropped:
    """Pipeline step that removes columns with too many missing (-9) values.

    ``fit`` computes, per column, the percentage of entries equal to -9 and remembers
    the columns whose percentage does not exceed ``minimum_percentage_to_be_dropped``.
    ``transform`` keeps exactly those columns.
    """

    def __init__(self):
        # threshold in percent; 100 keeps every column
        self.minimum_percentage_to_be_dropped = 100
        self.fitted = False
        # after fit: dict mapping kept column -> percentage of missing values
        self.valuesToKeep = False

    def set_params(self, **params):
        """Set the threshold from ``minimum_percentage_to_be_dropped`` and return ``self``.

        The current threshold is kept when the key is not supplied (it used to be
        overwritten with ``None``, which broke the comparison in ``fit``).
        """
        if 'minimum_percentage_to_be_dropped' in params:
            self.minimum_percentage_to_be_dropped = params['minimum_percentage_to_be_dropped']
        return self

    def transform(self, input_df, **transform_params):
        """Return ``input_df`` restricted to the columns selected during ``fit``.

        Raises ``NotFittedError`` when ``fit`` has not been called.
        """
        if not self.fitted:
            raise NotFittedError("fit has to be called before transform")
        return input_df[input_df.columns.intersection(self.valuesToKeep.keys())]

    def fit(self, X, y=None, **fit_params):
        """Determine the columns to keep from the frame ``X`` and return ``self``."""
        # The percentage is relative to the rows handed to fit. Dividing by the length of
        # the global df understated the share whenever X was only a cross-validation fold.
        percentage_missing = ((X.eq(-9).sum()/len(X)*100).round(2)).to_dict()
        self.valuesToKeep = {key: val for key, val in percentage_missing.items() if val <= self.minimum_percentage_to_be_dropped}
        self.fitted = True
        return self

In [None]:
class FixCommonEncodingErrors:
    """Pipeline step that replaces implausible raw values with the missing marker -9.

    Handles the columns ``chol``, ``trestbps``, ``prop``, ``ca`` and ``proto``; all of
    them have to be present in the frame.
    """

    # Raw 'proto' readings that are translated to protocol codes.
    # NOTE(review): the original code mapped 50 twice (to 2 and then to 1), which made the
    # second assignment dead code. 25 -> 1 follows the descending pattern of the other
    # entries and is assumed to be the intended mapping - confirm against the data
    # description.
    proto_recoding = {200: 9, 175: 8, 150: 7, 130: 6, 125: 5, 100: 4, 75: 3, 50: 2, 25: 1}

    def transform(self, input_df, **transform_params):
        """Return a cleaned deep copy of ``input_df``; the input itself is not modified."""
        input_df = input_df.copy(deep=True)
        # if cholesterol is 0 it was not measured
        input_df.loc[input_df['chol'] == 0,'chol'] =  -9
        # a resting blood pressure of 0 is treated as not measured (the row is kept)
        input_df.loc[input_df['trestbps'] == 0,'trestbps'] =  -9
        # 'prop' is a binary variable; the mask is built from input_df itself instead of
        # the global df, so the step only depends on the frame it receives
        input_df.loc[~input_df['prop'].isin([0,1]),'prop' ] = -9

        # values above 3 are not accepted for 'ca'
        input_df.loc[input_df['ca'] >3 ,'ca'] =  -9
        # transform proto to possible values
        input_df['proto'] = input_df['proto'].replace(self.proto_recoding)

        # everything outside the codes 1..12 is treated as missing
        input_df.loc[~input_df['proto'].isin([*range(1,13)]), 'proto'] = -9

        return input_df

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned from the data; returns ``self``."""
        return self

In [None]:
import pickle
import os
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, roc_auc_score


def classification_report_with_auc_score(y_true, y_pred):
    """Scoring callback that also records the metrics of every call in ``temp.pickle``.

    Computes the confusion matrix, the macro ROC-AUC and the classification report for
    one pair of label arrays. When ``temp.pickle`` already exists, the new values are
    merged into / appended to the stored report; otherwise a new report is started.
    The report is written back to ``temp.pickle`` on every call.

    Returns the ROC-AUC of this call (a scorer has to return a number).
    """
    # one entry per confusion-matrix cell, keyed by the cell position rendered as text
    cell_counts = {
        str(position): count
        for position, count in np.ndenumerate(confusion_matrix(y_true, y_pred))
    }
    current_roc_auc_score = roc_auc_score(y_true, y_pred, average='macro')

    if not os.path.exists('temp.pickle'):
        report = {
            'classification_report': classification_report(y_true, y_pred, output_dict=True),
            "auc": [current_roc_auc_score],
            'confusion_matrix': cell_counts,
        }
    else:
        with open("temp.pickle", "rb") as temp_file:
            report = pickle.load(temp_file)
        report['classification_report'] = merge_dict(
            report['classification_report'],
            classification_report(y_true, y_pred, output_dict=True),
        )
        report["auc"].append(current_roc_auc_score)
        report['confusion_matrix'] = merge_dict(report['confusion_matrix'], cell_counts)

    with open('temp.pickle', 'wb') as temp_file:
        pickle.dump(report, temp_file, protocol=pickle.HIGHEST_PROTOCOL)
    return current_roc_auc_score

In [None]:
from typing import Callable
from imblearn.base import BaseSampler
import json
class CustomEncoder(json.JSONEncoder):
    """JSON encoder for the value types that end up in the result file.

    numpy integers / floats become Python numbers, numpy arrays and ``range`` objects
    become lists, and sampler instances are written as their class name. Everything
    else is delegated to ``json.JSONEncoder``.
    """

    def default(self, obj):
        if isinstance(obj, np.integer):
            converted = int(obj)
        elif isinstance(obj, np.floating):
            converted = float(obj)
        elif isinstance(obj, np.ndarray):
            converted = obj.tolist()
        elif type(obj) is range:
            converted = list(obj)
        elif isinstance(obj, BaseSampler):
            converted = type(obj).__name__
        else:
            converted = super().default(obj)
        return converted

In [None]:

from imblearn.over_sampling import RandomOverSampler
import time
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from imblearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder


# Build one pipeline per scaler / estimator / imputer combination, evaluate it with a
# nested cross-validation (grid search inside cross_val_score) and append one
# measurement per combination to output.json.

def _component_name(component):
    """Readable name of a pipeline component; string placeholders such as 'passthrough' are kept as they are."""
    return component if isinstance(component, str) else component.__class__.__name__


def _remove_temp_report():
    """Delete temp.pickle, the file in which the scorer accumulates its per-call report."""
    if os.path.exists('temp.pickle'):
        os.remove('temp.pickle')


pipelines = []
# features: every column except the target 'num' and the origin marker 'dataset'
X = df.loc[:,(df.columns!= 'num') & (df.columns != 'dataset') ].copy(deep=True)
y = df.loc[: , (df.columns== 'num')].values.ravel().copy()
# binarise the target: every value above 1 becomes 1
y[y>1]=1
for scaler in scalers:
    for estimator in estimators:
        for imputer in imputers:
            # names shared by the console output and the stored measurement
            combination = {
                "scaler": _component_name(scaler.get('scaler')),
                "estimator": _component_name(estimator.get('estimator')),
                "imputer": _component_name(imputer.get('imputer')),
            }
            # a report left behind by an interrupted run must not leak into this combination
            _remove_temp_report()
            startTime = time.time()
            parameters =  scaler.get("parameters") | estimator.get("parameters") | imputer.get('parameters') | general_parameters | sampling
            oneHotEncoder = ColumnTransformer(
                transformers=[
                        # only encode those configured columns that survived the drop step
                        ('oneHotEncoder', OneHotEncoder(handle_unknown='infrequent_if_exist'), lambda X : [value for value in oneHotEncodedFeatures if value in X.columns]),
                    ], remainder='passthrough')
            pipeline = Pipeline(steps=[
                ('fix_encoding_errors', FixCommonEncodingErrors()),
                ('transform_smoke', DataframeSmokeTransformer()),
                ('drop_columns', DropColumnsBasedOnMinimumPercentageToBeDropped()),
                ('oneHotEncoder', oneHotEncoder),
                ('impute', imputer.get('imputer')),
                ('scaler', scaler.get('scaler')),
                # placeholder: the grid key 'sampler' replaces this step with each candidate
                ('sampler', RandomOverSampler()),
                ('estimator', estimator.get("estimator"))
            ])
            # create the grid search instance
            grid_search_estimator = GridSearchCV(pipeline, parameters, scoring='roc_auc', cv=10, error_score='raise', n_jobs=-1, verbose= 0)
            # defined before the try block so the finally branch always has a record to store
            output_dict = dict(combination)
            try:
                # outer cross-validation; the scorer writes its per-call report to temp.pickle
                auc_best = cross_val_score(grid_search_estimator, X, y, cv=10, scoring=make_scorer(classification_report_with_auc_score), error_score='raise', verbose = 2, n_jobs=1)
                # refit on all data to obtain the best parameters of this combination
                grid_search_estimator.fit(X, y)
                # the imputer name is taken from the imputer entry (it used to be looked up in
                # the estimator entry, which always printed NoneType)
                print(f"auc for {combination['scaler']}, {combination['estimator']},{combination['imputer']} and = {auc_best.mean() * 100.0}")
                display(grid_search_estimator.best_params_)
                executionTime = (time.time() - startTime)
            except Exception as e:
                print(f'Skipping the combination of {combination["scaler"]}, {combination["estimator"]},{combination["imputer"]} because:')
                print(str(e))
                output_dict["reason"] = str(e)
            else:
                output_dict["X_shape"] = X.shape
                output_dict["oneHotEncodedFeatures"] = oneHotEncodedFeatures
                output_dict["parameters"] = parameters
                output_dict["auc mean"] = auc_best.mean() * 100
                output_dict["Execution time in seconds"] = executionTime
                output_dict["best_params"] = grid_search_estimator.best_params_
                # temp.pickle was written by this notebook's own scorer during cross_val_score
                with open("temp.pickle", "rb") as temp_file:
                    report = pickle.load(temp_file)

                    output_dict["auc"] = report['auc']
                    output_dict["classification_report"] = report['classification_report']
                    output_dict["confusion_matrix"] = report['confusion_matrix']

            finally:
                try:
                    with open("output.json", "r") as file:
                        file_dict = json.load(file)
                        # a file without the 'measurements' key starts a new list as well
                        measurements  = file_dict.get('measurements') or []
                except Exception as e:
                    # no readable result file yet -> start a new list
                    measurements = []
                measurements.append(output_dict)
                with open("output.json", "w") as file:
                    json.dump({"measurements": measurements}, file, cls= CustomEncoder)
                _remove_temp_report()



        print("-----------------------------------------------------------------")