In [None]:
# Variables for configuration
# Write pandas-profiling HTML reports of the data frames (imports pandas_profiling when enabled).
generate_pandas_profiling_reports = False
# When False, the notebook raises SystemExit before the section on the UCI-preprocessed data.
process_preprocessed_data_of_uci = False
# Draw the seaborn pair plots in the visualization section.
print_pair_plots = False
# Drop the "met" column before the column cleanup.
drop_correlated_features = False
# Drop every row that still contains at least one NaN after the column cleanup.
drop_nan= True
# Binarize the target "num" (values >= 1 become 1) and label-encode it.
encode_labels = True
# Columns whose percentage of missing values is above this threshold are dropped.
minimumPercentageMissingToBeDropped = 12
# Columns that the training pipeline sends through a OneHotEncoder.
oneHotEncodedFeatures = ['cp','restecg']

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import re
import matplotlib as mpl
mpl.rc('image', cmap='coolwarm')

# set pandas to show all columns of the df when using the display function
pd.set_option('display.max_columns', None)

In [None]:
# list the datasets that should be used in the current run
datasets = ["hungarian", "cleveland", "switzerland", "long-beach-va"]

In [None]:
# Helper that turns one raw data file into CSV-formatted text.
# Every record ends with an alphabetic token (the name field), which is the
# last feature and common to all entries, so it serves as the record delimiter.
def readRawData(filePath:str):
    """Return the content of ``filePath`` as CSV text with one record per line.

    Line breaks are turned into spaces, every run of ASCII letters followed by
    a space is replaced by ``name`` plus a line break (closing the record), and
    the remaining spaces become commas.

    Args:
        filePath: Path of the raw, space-separated data file.

    Returns:
        The converted file content as a single string.
    """
    with open(filePath) as rawFile:
        flattened = " ".join(rawFile.read().split("\n"))
    oneRecordPerLine = re.sub("[a-zA-Z]+ ", "name\n", flattened)
    return ",".join(oneRecordPerLine.split(" "))

In [None]:
# read the data from the specified datasets into the df
from io import StringIO
# Parse every file first and concatenate once at the end: calling pd.concat
# inside the loop copies all previously loaded rows again on every iteration
# and needs an empty placeholder frame to start from.
dataset_frames = []
for dataset in datasets:
    dataset_df = pd.read_csv(StringIO(readRawData("./Data/"+ dataset +".data")), header=None, sep=",")
    # remember where each row came from; later cells group and rescale by it
    dataset_df['dataset'] = dataset
    dataset_frames.append(dataset_df)
# pd.concat rejects an empty list, so keep the empty frame for that case
df = pd.concat(dataset_frames, ignore_index=True) if dataset_frames else pd.DataFrame()

In [None]:
# Attribute names in file order, followed by the "dataset" column that was
# added while loading.
df.columns = [
    "id", "ccf", "age", "sex", "painloc", "painexer", "relrest", "pncaden",
    "cp", "trestbps", "htn", "chol", "smoke", "cigs", "years", "fbs",
    "dm", "famhist", "restecg", "ekgmo", "ekgday", "ekgyr", "dig", "prop",
    "nitr", "pro", "diuretic", "proto", "thaldur", "thaltime", "met", "thalach",
    "thalrest", "tpeakbps", "tpeakbpd", "dummy", "trestbpd", "exang", "xhypo", "oldpeak",
    "slope", "rldv5", "rldv5e", "ca", "restckm", "exerckm", "restef", "restwm",
    "exeref", "exerwm", "thal", "thalsev", "thalpul", "earlobe", "cmo", "cday",
    "cyr", "num", "lmt", "ladprox", "laddist", "diag", "cxmain", "ramus",
    "om1", "om2", "rcaprox", "rcadist", "lvx1", "lvx2", "lvx3", "lvx4",
    "lvf", "cathef", "junk", "name",
    "dataset",
]

In [None]:
# read the "new" data file into dfNew
from io import StringIO
# NOTE(review): the file path does not depend on the loop variable, so the
# content of "./Data/new.data" ends up in dfNew once per entry of `datasets`,
# each copy carrying a different dataset label. This looks like a copy-paste
# slip - confirm whether a single copy was intended. The resulting frame is
# kept as it was; the file is now only read and parsed once instead of once
# per dataset.
new_data_df = pd.read_csv(StringIO(readRawData("./Data/"+ 'new' +".data")), header=None, sep=",")
new_frames = []
for dataset in datasets:
    new_frames.append(new_data_df.assign(dataset=dataset))
# pd.concat rejects an empty list, so keep the empty frame for that case
dfNew = pd.concat(new_frames, ignore_index=True) if new_frames else pd.DataFrame()

In [None]:
# -9 marks an unfilled cell in the raw files; use NaN so pandas treats it as missing
df = df.replace(to_replace=-9, value=np.nan)
df.describe()

In [None]:
# -9 marks an unfilled cell in the raw files; use NaN so pandas treats it as missing
dfNew = dfNew.replace(to_replace=-9, value=np.nan)
dfNew.describe()

# Data cleanup

In [None]:
if generate_pandas_profiling_reports:
    from pandas_profiling import ProfileReport
    profile = ProfileReport(df, title='Pandas Profiling Report for all features')
    profile.to_file("Pandas Profiling Report for all features.html")

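The columns smoke and years both describe whether a respondent smokes or not. Smoke does this by being binary coded, while years describes the number of years a person has smoked. Due to the high number of missing values, the columns are of little use on their own. However, it is possible to enrich the smoke column with the years column: a missing smoke value becomes 0 if years is 0 and 1 if years is greater than 0. Values that are still missing afterwards are filled from the cigs column in the same way.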

In [None]:
# Fill missing smoke flags from the related columns: a value of 0 means
# non-smoker (0), a positive value means smoker (1). years is applied first,
# cigs only reaches the rows that are still missing afterwards.
print(f"Number of NaNs in smoke: {df['smoke'].isna().sum()}")
for helper_column in ("years", "cigs"):
    # both assignments can share one mask because "== 0" and "> 0" never overlap
    unknown_smoker = df['smoke'].isna()
    df.loc[unknown_smoker & df[helper_column].eq(0), 'smoke'] = 0
    df.loc[unknown_smoker & df[helper_column].gt(0), 'smoke'] = 1
    if helper_column == "years":
        print(f"Number of NaNs in smoke after combination with years: {df['smoke'].isna().sum()}")
    else:
        print(f"Number of NaNs in smoke after combination with years and cigs: {df['smoke'].isna().sum()}")

Finding: reduces the number of missing values in smoke by 280 entries.

In [None]:
# a cholesterol value of 0 means it was not measured -> treat it as missing
df['chol'] = df['chol'].replace({0: np.nan})

## Explore how many NaNs and zeros are within one column for each attribute.

In [None]:
(df.loc[ : , df.columns != 'dataset'].isna()).join(df['dataset']).groupby("dataset").sum()

In [None]:
(df.loc[ : , df.columns != 'dataset'].eq(0)).join(df['dataset']).groupby("dataset").sum()

## Treatment of different scales in the datasets
### met

In [None]:
# plot to show the reason why we need to process this data:
sns.boxplot(x="met",y="dataset",data= df)
plt.show()

In [None]:
# Divide the switzerland values of met by ten (see the box plot above).
# Not idempotent: running this cell again divides the values once more.
is_switzerland = df["dataset"] == "switzerland"
df.loc[is_switzerland, "met"] = df.loc[is_switzerland, "met"].div(10)

In [None]:
# plot to show the reason why we need to process this data:
sns.boxplot(x="met",y="dataset",data= df)
plt.show()

### rldv5e

In [None]:
# plot to show the reason why we need to process this data: 
sns.boxplot(x="rldv5e",y="dataset",data= df)
plt.show()

In [None]:
# Divide the cleveland values of rldv5e by ten (see the box plot above).
# Not idempotent: running this cell again divides the values once more.
is_cleveland = df["dataset"] == "cleveland"
df.loc[is_cleveland, "rldv5e"] = df.loc[is_cleveland, "rldv5e"].div(10)

In [None]:
# plot to show the reason why we drop:
sns.boxplot(x="rldv5e",y="dataset",data= df)
plt.show()

In [None]:
# plt.figure(figsize=(15,15))
# # Compute the correlation matrix
# corr = df.corr()
# corr = corr.round(2)
# # Draw the heatmap with the mask and correct aspect ratio
# sns.heatmap(corr, cmap="coolwarm", center=0, square=True, linewidths=.5, vmin=-1, vmax=1, annot=True)
# plt.show()

In [None]:
# for dataset in datasets:
#     plt.figure(figsize=(15, 15))
#     # Compute the correlation matrix
#     corr = df[df['dataset'] == dataset].corr()
#     corr = corr.round(2)
#     # Draw the heatmap with the mask and correct aspect ratio
#     sns.heatmap(corr, cmap="coolwarm", center=0, square=True, linewidths=.5, vmin=-1, vmax=1, annot=True)
#     plt.show()


In [None]:
if drop_correlated_features:
    df.drop("met", inplace=True, axis=1)

# Drop columns

In [None]:
# Columns that carry no information a model could use.
irrelevant_columns = [
    "id",       # an id is not relevant for a model
    "ccf",      # the social security number does not influence whether somebody has a heart disease
    "pncaden",  # sum of painloc, painexer and relrest -> duplicates features that are already present
    # the date of a medical examination is irrelevant for the occurrence of a disease
    "ekgmo", "ekgday", "ekgyr",
    "cmo", "cday", "cyr",
    "name",     # constant
]
df.drop(columns=irrelevant_columns, inplace=True)

In [None]:
# Columns without a usable description.
unexplained_columns = [
    # irrelevant according to the uci
    "restckm", "exerckm", "thalsev", "thalpul",
    "earlobe",  # constant
    # it is not possible to find out what these features measure, so they could
    # not be supplied to a trained model -> drop
    # (https://archive.ics.uci.edu/ml/datasets/Heart+Disease)
    "lvx1", "lvx2", "lvx3", "lvx4", "lvf",
    "dummy",    # no description available -> judging by the name not relevant
]
df.drop(columns=unexplained_columns, inplace=True)

In [None]:
# percentage of missing values per column, rounded to two decimals
percentage_missing = df.isna().sum().div(len(df)).mul(100).round(2).to_dict()
# keep only the columns that exceed the configured threshold ...
missing_vlaues = {}
for column_name, missing_share in percentage_missing.items():
    if missing_share > minimumPercentageMissingToBeDropped:
        missing_vlaues[column_name] = missing_share
# ... and remove them from the data
df.drop(columns=list(missing_vlaues), inplace=True)

In [None]:
df['dataset'].value_counts()

## drop by rows because of unrealistic values

In [None]:
# a resting blood pressure of 0 is not a realistic measurement -> remove those rows
print(f"Shape before drop of entrys with a blood preasure of 0: {df.shape}")
zero_pressure_rows = df.loc[df['trestbps'] == 0].index
df.drop(index=zero_pressure_rows, inplace=True)
print(f"Shape after drop of entrys with a blood preasure of 0: {df.shape}")

In [None]:
# remove rows whose prop value is greater than 1
print(f"Shape before drop of entries with unrealisic prop values: {df.shape}")
unrealistic_prop_rows = df.loc[df['prop'] > 1].index
df.drop(index=unrealistic_prop_rows, inplace=True)
print(f"Shape after drop of entries with unrealisic prop values: {df.shape}")

# drop more columns because switzerland would be lost

In [None]:
# what would happen
df.dropna(axis=0, how='any').loc[:,"dataset"].value_counts()

In [None]:
# because of which features
(df.loc[ : , df.columns != 'dataset'].isna()).join(df['dataset']).groupby("dataset").sum()

In [None]:
# df.drop(["fbs", "rldv5e", "htn"], inplace=True, axis=1)

In [None]:
# what would happen now
df.dropna(axis=0, how='any').loc[:,"dataset"].value_counts()

## summary

In [None]:
if generate_pandas_profiling_reports:
    profile = ProfileReport(df, title='Pandas Profiling Report for selected features')
    profile.to_file("Pandas Profiling Report for selected features.html")

In [None]:
if drop_nan:
    # remove every row that contains at least one NaN value
    print(f"Shape before drop of NaN containing rows: {df.shape}")
    df.dropna(axis='index', how='any', inplace=True)
    print(f"Shape after drop of NaN containing rows: {df.shape}")

In [None]:
df['dataset'].value_counts()

# Visualize Data

In [None]:
if print_pair_plots:
    sns.pairplot(df, hue="num", palette="tab10")

In [None]:
if print_pair_plots:
    sns.pairplot(df, hue="dataset", palette="tab10")

In [None]:
from sklearn.preprocessing import LabelEncoder
# drop all rows where the label column is nan
# .copy() makes the filtered frame independent of the original one, so the
# assignments below do not write into a slice (SettingWithCopyWarning).
df = df[df['num'].notna()].copy()
if encode_labels:
    labelEncoder = LabelEncoder()
    # collapse every value >= 1 into the single positive class 1
    df.loc[df['num'] >= 1,"num"] = 1
    df['num'] = labelEncoder.fit_transform(df['num'])

In [None]:
if print_pair_plots:
    sns.pairplot(df, hue="num", palette="tab10")

# Train the different models

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from catboost import CatBoostClassifier
from  sklearn.naive_bayes import *

# Estimators to evaluate together with their grid-search parameters.
# The 'classification__' prefix addresses the pipeline step named
# 'classification' that wraps the estimator in the training loop.
estimators_and_hyperparameters=[
    # CatBoost rejects depth values above 16. With error_score='raise' a single
    # invalid depth aborts the whole grid search and the estimator is skipped,
    # so the depth grid is limited to the supported range.
    {"estimator": CatBoostClassifier(random_state=42, thread_count=-1, silent= True), "parameters": {'classification__depth':[None] + [*range(1,17)],
                                                                                                     'classification__n_estimators':range(10,1000, 100),
                                                                                                     'classification__learning_rate':[0.001,0.01,0.1,0.2,0.3],
                                                                                                     # 'classification__l2_leaf_reg':range(5,100, 5),
                                                                                                     # 'classification__border_count':range(5,200, 5),
                                                                                                     # 'classification__ctr_border_count':range(5,200, 5)
                                                                                                     }},
    # {"estimator": XGBClassifier(random_state=42, n_jobs=1), "parameters": {'classification__max_depth': [None] + [*range(1,200)],
    #                                                                        'classification__n_estimators': range(10,1000, 100),
    #                                                                         'classification__learning_rate':[0.001,0.01,0.1,0.2,0.3]}},
    # {"estimator": SVC(random_state=42, tol=0.01), "parameters": {'classification__C': [110,120,130,140,150],
    #                                                              'classification__gamma': [0.0001, 0.001, 0.01, 0.1],
    #                                                              'classification__degree': [3,4,5,6],
    #                                                              'classification__kernel':['linear', 'rbf', 'poly', 'sigmoid'] }}, # '
    # {"estimator": BernoulliNB(), "parameters": {'classification__alpha' : np.arange(0,20,0.001)}},
    # {"estimator": CategoricalNB(), "parameters": {'classification__alpha' : np.arange(0,20,0.001)}},
    # {"estimator": ComplementNB(), "parameters": {'classification__alpha' : np.arange(0,20,0.001),
    #                                              'classification__norm':[True,False]}},
    # {"estimator": GaussianNB(), "parameters": {}},
    # {"estimator": MultinomialNB(), "parameters": {'classification__alpha' : np.arange(0,20,0.001)}},
    # {"estimator": DecisionTreeClassifier(random_state=42), "parameters": {'classification__criterion':['gini','entropy', 'log_loss'],
    #                                                                       'classification__max_depth':[4,5,6,7,8,9,10,11,12,15,20,30,40,50,70,90,120,150],
    #                                                                       'classification__min_samples_split': range(2,20),
    #                                                                       'classification__min_samples_leaf': range(2,20)}},
    # {"estimator": KNeighborsClassifier(), "parameters": {'classification__n_neighbors': range(2, 100),
    #                                                      'classification__weights': ['uniform','distance'],
    #                                                      'classification__p': [1,2]}},
    {"estimator": RandomForestClassifier(random_state=42, n_jobs=-1), "parameters": {'classification__n_estimators':range(10,1000, 100),
                                                                                     'classification__max_depth':[None] + [*range(1,200)],
                                                                                     'classification__min_samples_split':range(2,20),
                                                                                     'classification__min_samples_leaf': range(2,20),}},
    # {"estimator": SGDClassifier(max_iter=1000000), "parameters": {'classification__loss':['log_loss', 'modified_huber', 'squared_hinge', 'perceptron', 'squared_error', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'],
    #                                               'classification__penalty':['l1','l2','elasticnet'],
    #                                               'classification__alpha' : np.arange(1,40,1)}}
]

In [None]:
from sklearn.preprocessing import *
# Scalers to evaluate. "parameters" holds extra grid-search entries for the
# scaler, addressed through the 'preprocessing' pipeline step and its
# 'scaler' transformer.
scalers = [
    {"scaler": MaxAbsScaler(), "parameters": {}},
    {"scaler": MinMaxScaler(), "parameters": {}},
    {"scaler": Normalizer(), "parameters": {}},
    {"scaler": PowerTransformer(), "parameters": {}},
    {"scaler": RobustScaler(), "parameters": {}},
    # "no scaling" baseline: a FunctionTransformer without a function passes
    # the data through unchanged and, unlike a lambda, can be pickled
    {"scaler": FunctionTransformer(), "parameters": {}},
    {"scaler": StandardScaler(), "parameters": {'preprocessing__scaler__with_mean': [ True, False],'preprocessing__scaler__with_std': [ True, False]}}
]

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
# features: every column except the target ("num") and the origin marker ("dataset")
is_feature_column = ~df.columns.isin(['num', 'dataset'])
X = df.loc[:, is_feature_column]
# target as a flat one-dimensional array
y = df.loc[:, df.columns == 'num'].to_numpy().ravel()

In [None]:
# assumption: both dictionaries have the same (nested) structure
def merge_dict(dict1, dict2):
    """Merge the values of ``dict2`` into ``dict1`` in place and return ``dict1``.

    For every key of ``dict1``:

    * nested dictionaries are merged recursively,
    * lists get the value from ``dict2`` appended as one new element,
    * any other value becomes the string ``"<old>;<new>"``.

    Args:
        dict1: Dictionary that accumulates the values; it is modified.
        dict2: Dictionary with the same structure as ``dict1``.

    Returns:
        ``dict1`` after the merge.

    Raises:
        KeyError: If a key of ``dict1`` is missing in ``dict2``.
    """
    for key, val in dict1.items():
        if isinstance(val, dict):
            merge_dict(val, dict2[key])
        elif isinstance(val, list):
            # "list += str" would extend the list with the single characters
            # of the string; append the new value as one element instead
            val.append(dict2[key])
        else:
            dict1[key] = str(val) + ';' + str(dict2[key])

    return dict1

In [None]:
import pickle
import os
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix


def classification_report_with_accuracy_score(y_true, y_pred):
    """Score one evaluation fold and accumulate its reports in ``temp.pickle``.

    Despite the name, the returned value is ``roc_auc_score(y_true, y_pred)``.
    As a side effect the classification report, the AUC value and the
    confusion matrix of this call are stored in ``temp.pickle``; if that file
    already exists, the new values are merged into the stored ones with
    ``merge_dict`` (reports, confusion matrix) or appended (AUC list).

    Args:
        y_true: True labels of the fold.
        y_pred: Predictions for the fold, as handed over by the scorer.

    Returns:
        The AUC value computed for this call.
    """
    confusion_matrix_dict = {
        cell_index: cell_count
        for cell_index, cell_count in np.ndenumerate(confusion_matrix(y_true, y_pred))
    }
    current_auc_score = roc_auc_score(y_true, y_pred)

    if not os.path.exists('temp.pickle'):
        # first call for this run: start a fresh report
        report = {
            'classification_report': classification_report(y_true, y_pred, output_dict=True),
            'auc': [current_auc_score],
            'confusion_matrix': confusion_matrix_dict,
        }
    else:
        # later calls: extend what the earlier calls stored
        with open("temp.pickle", "rb") as tempFile:
            report = pickle.load(tempFile)
        fold_report = classification_report(y_true, y_pred, output_dict=True)
        report['classification_report'] = merge_dict(report['classification_report'], fold_report)
        report["auc"].append(current_auc_score)
        report['confusion_matrix'] = merge_dict(report['confusion_matrix'], confusion_matrix_dict)

    with open('temp.pickle', 'wb') as tempFile:
        pickle.dump(report, tempFile, protocol=pickle.HIGHEST_PROTOCOL)

    # a scorer has to return a single number; the AUC value is used for that
    return current_auc_score

In [None]:
import time
from sklearn.metrics import make_scorer
from sklearn import decomposition
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

# Evaluate every scaler/estimator combination: an outer cross-validation
# around a grid search yields the AUC values, a final grid search on all data
# yields the best parameters. Every result is appended to output.csv.
pipelines = []

# The columns in oneHotEncodedFeatures are one-hot encoded only. Passing them
# to the scaler as well would hand each of them to the classifier twice (once
# scaled, once encoded), so they are excluded from the scaler branch.
scaled_features = [column for column in X.columns if column not in oneHotEncodedFeatures]

for scaler in scalers:
    print(f"Scaler: {scaler.get('scaler').__class__.__name__}")
    for estimator in estimators_and_hyperparameters:
        startTime = time.time()

        # A combination that failed part-way leaves the reports of its
        # finished folds behind in temp.pickle. Remove them so they are not
        # merged into the report of this combination.
        if os.path.exists('temp.pickle'):
            os.remove('temp.pickle')

        parameters = scaler.get("parameters") | estimator.get("parameters")
        preprocessor = ColumnTransformer(
            transformers=[
                # scale every feature that is not one-hot encoded
                ('scaler', scaler.get("scaler"), scaled_features),
                # one-hot encode the configured categorical features; a category
                # that is missing from a training fold is encoded as all zeros
                # instead of raising an error
                ('encoder', OneHotEncoder(handle_unknown='ignore'), oneHotEncodedFeatures)
            ])
        pipeline = Pipeline(steps=[ ('preprocessing', preprocessor), ('classification', estimator.get("estimator")) ])
        # create the grid search instance
        grid_search_estimator = GridSearchCV(pipeline, parameters, scoring='roc_auc', cv=10, error_score='raise', n_jobs=1, verbose= 0)
        try:
            with open("output.csv", "a") as file:
                # outer cross-validation; the scorer also stores the per-fold reports in temp.pickle
                accuracy_best = cross_val_score(grid_search_estimator, X, y, cv=10, scoring=make_scorer(classification_report_with_accuracy_score), error_score='raise', verbose = 2, n_jobs=1)
                # fit on all data to obtain the best parameters for the log
                grid_search_estimator.fit(X, y)
                print(f"AUC for {estimator.get('estimator').__class__.__name__} = {accuracy_best.mean() * 100.0}")
                display(grid_search_estimator.best_params_)
                executionTime = (time.time() - startTime)

                file.write("\n\n")
                file.write(f"scaler:, {scaler.get('scaler').__class__.__name__}\n")
                file.write(f"estimator:, {estimator.get('estimator').__class__.__name__} \n")
                file.write(f"generate_pandas_profiling_reports:, {generate_pandas_profiling_reports} \n")
                file.write(f"process_preprocessed_data_of_uci:, {process_preprocessed_data_of_uci} \n")
                file.write(f"print_pair_plots:, {print_pair_plots} \n")
                file.write(f"drop_correlated_features:, {drop_correlated_features} \n")
                file.write(f"drop_nan:, {drop_nan} \n")
                file.write(f"encode_labels:, {encode_labels} \n")
                file.write(f"minimumPercentageMissingToBeDropped:, {minimumPercentageMissingToBeDropped} \n")
                file.write(f"oneHotEncodedFeatures:, {str(oneHotEncodedFeatures).replace(', ','; ')} \n")
                file.write(f"parameters:, {str(parameters).replace(', ','; ')} \n")
                file.write('Execution time in seconds:, ' + str(executionTime) +'\n')
                file.write('AUC mean:, '+ str(accuracy_best.mean() * 100 ) +'\n')
                file.write(f"best_params:, {str(grid_search_estimator.best_params_).replace(', ','; ')} \n")
                # temp.pickle is written by this notebook's own scorer, so loading it is safe
                with open("temp.pickle", "rb") as tempFile:
                    report = pickle.load(tempFile)

                    file.write('AUC:,' + str(report['auc']).replace(', ', ';') +'\n')
                    file.write(pd.DataFrame(report['classification_report']).to_csv())
                    file.write(pd.DataFrame(np.array([[str(report['confusion_matrix'][(0, 0)]), str(report['confusion_matrix'][(0, 1)])], [str(report['confusion_matrix'][(1, 0)]), str(report['confusion_matrix'][(1, 1)])]])).to_csv())
                    file.write("\n\n")
                os.remove('temp.pickle')
        except Exception as e:
            # best effort: log the failure and continue with the next combination
            print(f'Skipping the combination of {scaler.get("scaler").__class__.__name__} and {estimator.get("estimator").__class__.__name__} because:')
            print(str(e))
            with open("output.csv", "a") as file:
                file.write("\n\n")
                file.write(f'Skipping the combination of {scaler.get("scaler").__class__.__name__} and {estimator.get("estimator").__class__.__name__} because:')
                file.write(str(e))
                file.write("\n\n")



    print("-----------------------------------------------------------------")

In [None]:
# Stop here unless the experiments on the UCI-preprocessed data were enabled
# in the configuration cell.
if not process_preprocessed_data_of_uci:
    raise SystemExit("So Feierabend Emma")

# Tests with the preprocessed data by the UCI

In [None]:
import pandas as pd
datasets = ["hungarian", "cleveland", "switzerland", "va"]
# Read every processed file first and concatenate once at the end instead of
# growing the frame with pd.concat inside the loop.
processed_frames = []
for dataset in datasets:
    dataset_df = pd.read_csv("./Data/processed."+ dataset +".data", header=None, sep=",")
    # remember where each row came from
    dataset_df['dataset'] = dataset
    processed_frames.append(dataset_df)
df_processed = pd.concat(processed_frames, ignore_index=True)
df_processed.columns = ['age','sex','cp','trestbps','chol','fbs','restecg','thalach','exang','oldpeak','slope','ca','thal','num', 'dataset']
# the processed files mark missing values with '?'
df_processed = df_processed.replace('?', float('nan'))


In [None]:
# convert the listed columns to numeric dtypes
# (presumably they were read as text because of the '?' placeholders - TODO confirm)
numeric_columns = ['trestbps','chol','fbs','restecg','thalach','exang','oldpeak','slope','ca','thal']
df_processed[numeric_columns] = df_processed[numeric_columns].apply(pd.to_numeric)

In [None]:
(df_processed.loc[ : , df_processed.columns != 'dataset'].isna()).join(df_processed['dataset']).groupby("dataset").sum()

In [None]:
df_processed.drop(["slope", "ca","thal"], inplace=True, axis=1)

In [None]:
print(f"Shape before drop of NaN containing rows: {df_processed.shape}")
df_processed.dropna(inplace=True, axis=0, how='any')
print(f"Shape after drop of NaN containing rows: {df_processed.shape}")

In [None]:
df_processed.shape

In [None]:
df_processed.head()

In [None]:
df_processed.describe()

In [None]:
if generate_pandas_profiling_reports:
    profile = ProfileReport(df_processed, title='Pandas Profiling Report for the features processed by the UCI')
    profile.to_file("Pandas Profiling Report for the features processed by the UCI.html")

In [None]:
from sklearn.preprocessing import LabelEncoder

labelEncoder = LabelEncoder()
df_processed.loc[df_processed['num'] >= 1,"num"] = 1
df_processed['num'] = labelEncoder.fit_transform(df_processed['num'])

In [None]:
X = df_processed.loc[:,(df_processed.columns!= 'num') & (df_processed.columns != 'dataset')]
y = df_processed['num']

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from statistics import mean
# `scalers` and `estimators_and_hyperparameters` are lists of dictionaries; the
# scaler and estimator objects are stored under the keys "scaler" and
# "estimator". Calling fit_transform on the dictionary or indexing it with [0]
# fails for every combination, so the objects are looked up by key.
# NOTE(review): the scaler is fitted on the complete data before the
# cross-validation, so the validation folds influence the scaling.
for scaler in scalers:
    scaler_instance = scaler["scaler"]
    print(f'Current Sclaer: {scaler_instance.__class__.__name__}')
    for estimator in estimators_and_hyperparameters:
        model = estimator["estimator"]
        skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
        try:
            X_trans = scaler_instance.fit_transform(X)
            scores = cross_val_score(model, X_trans, y, scoring='f1',cv=skf, n_jobs=-1)
            print(f'F1 score for {model.__class__.__name__}: {mean(scores)}')
        except Exception as e:
            # best effort: report the failure and continue with the next combination
            print(e)
            print(f'Skipping the combination of {scaler_instance.__class__.__name__} and {model.__class__.__name__}')
    print('-----------------------------------------------------------------')
# same evaluation on the unscaled features
print(f'Current Sclaer: NoScaler')
for estimator in estimators_and_hyperparameters:
    model = estimator["estimator"]
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    try:
        X_trans = X
        scores = cross_val_score(model, X_trans, y, scoring='f1',cv=skf, n_jobs=-1)
        print(f'F1 score for {model.__class__.__name__}: {mean(scores)}')
    except Exception as e:
        print(e)
        print(f'Skipping the combination of NoScaler and {model.__class__.__name__}')
print('-----------------------------------------------------------------')

In [None]:
# Tests with the data preprocessed by the UCI, including the reprocessed hungarian data

In [None]:
import pandas as pd
from io import StringIO
datasets = ["hungarian", "cleveland", "switzerland", "va"]
# Collect the frames and concatenate once at the end instead of growing the
# frame with pd.concat inside the loop.
processed_frames = []
for dataset in datasets:
    # hungarian comes from the reprocessed file below
    if dataset != "hungarian":
        dataset_df = pd.read_csv("./Data/processed."+ dataset +".data", header=None, sep=",")
        dataset_df['dataset'] = dataset
        processed_frames.append(dataset_df)
with open("Data/reprocessed.hungarian.data") as file:
    dataString = file.read()
    # the reprocessed file is space separated
    dataString = dataString.replace(" ",",")
    dataset_df = pd.read_csv(StringIO(dataString), header=None, sep=",")
    # label the rows explicitly: after the loop above, `dataset` still holds
    # the last list entry ("va"), which would mislabel the hungarian rows
    dataset_df['dataset'] = "hungarian"
processed_frames.append(dataset_df)
df_processed = pd.concat(processed_frames, ignore_index=True)
df_processed.columns = ['age','sex','cp','trestbps','chol','fbs','restecg','thalach','exang','oldpeak','slope','ca','thal','num', 'dataset']
# the processed files mark missing values with '?'
df_processed = df_processed.replace('?', float('nan'))

In [None]:
# convert the listed columns to numeric dtypes
# (presumably they were read as text because of the '?' placeholders - TODO confirm)
numeric_columns = ['trestbps','chol','fbs','restecg','thalach','exang','oldpeak','slope','ca','thal']
df_processed[numeric_columns] = df_processed[numeric_columns].apply(pd.to_numeric)

In [None]:
df_processed

In [None]:
if generate_pandas_profiling_reports:
    profile = ProfileReport(df_processed, title='Pandas Profiling Report for the features processed by the UCI + reprocessed hungarian')
    profile.to_file("Pandas Profiling Report for the features processed by the UCI + reprocessed hungarian.html")