In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
# Show all columns of the df when it is displayed: max_columns=None removes
# pandas' limit on the number of columns shown, so wide frames are not truncated.
pd.set_option('display.max_columns', None)

In [None]:
# list the datasets that should be used in the current run;
# the loading cell expands each name to the raw file path "./Data/<name>.data"
datasets = ["hungarian", "cleveland", "switzerland", "long-beach-va"]

In [None]:
# custom function to turn a raw dataset file into a csv formatted string
# every entry spans several lines and ends with the token "name", which is therefore used as the entry delimiter
def readRawData(filePath:str):
    """Read the file at ``filePath`` and return its content as one csv formatted string.

    All line breaks are flattened to spaces, a line break is re-inserted after
    every "name " token (one entry per line) and the remaining spaces are
    turned into commas.
    """
    with open(filePath) as rawFile:
        rawText = rawFile.read()
    # join everything into a single line first ...
    flattened = rawText.replace("\n", " ")
    # ... then start a new line after each entry terminator
    onePerLine = flattened.replace("name ", "name\n")
    # finally use commas as the field separator
    return onePerLine.replace(" ", ",")

In [None]:
# read the raw data of every selected dataset and combine it into the df
from io import StringIO

# Collect the per-dataset frames and concatenate them once at the end:
# calling pd.concat inside the loop copies all previously loaded rows again on every iteration.
dataset_frames = []
for dataset in datasets:
    dataset_df = pd.read_csv(StringIO(readRawData("./Data/"+ dataset +".data")), header=None, sep=",")
    # remember which source file each row came from
    dataset_df['dataset'] = dataset
    dataset_frames.append(dataset_df)
# pd.concat raises on an empty list, so keep the empty df for an empty dataset selection
df = pd.concat(dataset_frames, ignore_index=True) if dataset_frames else pd.DataFrame()

In [None]:
# Name the columns: the 76 attributes of a raw entry in file order, followed by
# the "dataset" source label that was appended while loading.
df.columns = [
    "id", "ccf", "age", "sex", "painloc", "painexer", "relrest", "pncaden",
    "cp", "trestbps", "htn", "chol", "smoke", "cigs", "years", "fbs",
    "dm", "famhist", "restecg", "ekgmo", "ekgday", "ekgyr", "dig", "prop",
    "nitr", "pro", "diuretic", "proto", "thaldur", "thaltime", "met", "thalach",
    "thalrest", "tpeakbps", "tpeakbpd", "dummy", "trestbpd", "exang", "xhypo", "oldpeak",
    "slope", "rldv5", "rldv5e", "ca", "restckm", "exerckm", "restef", "restwm",
    "exeref", "exerwm", "thal", "thalsev", "thalpul", "earlobe", "cmo", "cday",
    "cyr", "num", "lmt", "ladprox", "laddist", "diag", "cxmain", "ramus",
    "om1", "om2", "rcaprox", "rcadist", "lvx1", "lvx2", "lvx3", "lvx4",
    "lvf", "cathef", "junk", "name",
    "dataset",
]

In [None]:
# unfilled cells are encoded as -9 in the dataset; swap them for NaN so that pandas treats them as missing
df = df.replace(to_replace=-9, value=np.nan)

In [None]:
# summary statistics of the numeric columns after the -9 markers were replaced with NaN
df.describe()

In [None]:
# first rows of the combined raw df
df.head()

In [None]:
# from pandas_profiling import ProfileReport
# profile = ProfileReport(df, title='Pandas Profiling Report')
# profile.to_file("df.html")

The columns `smoke` and `years` both describe whether a respondent smokes: `smoke` is a binary flag, while `years` holds the number of years the person has smoked. Because of the high number of missing values, neither column is useful on its own. However, the `smoke` column can be enriched with the `years` column (and, in the same way, with the `cigs` column): a missing `smoke` value is set to 0 where the other column is 0 and to 1 where it is greater than 0.

In [None]:
# Fill missing `smoke` flags from the related columns `years` and `cigs`:
# where `smoke` is NaN, a 0 in the helper column sets smoke to 0 and a positive value sets it to 1.
# Rows whose helper value is NaN match neither condition and stay NaN.
# (The former `df.drop(columns=['smoke'])` statement was removed: its result was discarded,
# so it never changed df, and `smoke` is still needed by the statements below.)
print(f"Number of NaNs in smoke: {df['smoke'].isna().sum()}")
df.loc[(df['smoke'].isna()) & (df['years'] == 0),'smoke'] = 0
df.loc[(df['smoke'].isna()) & (df['years'] > 0),'smoke'] = 1
print(f"Number of NaNs in smoke after combination with years: {df['smoke'].isna().sum()}")
# second pass: use `cigs` for the rows that `years` could not resolve
df.loc[(df['smoke'].isna()) & (df['cigs'] == 0),'smoke'] = 0
df.loc[(df['smoke'].isna()) & (df['cigs'] > 0),'smoke'] = 1
print(f"Number of NaNs in smoke after combination with years and cigs: {df['smoke'].isna().sum()}")

Finding: combining `smoke` with `years` and `cigs` reduces the number of missing values in `smoke` by 280 entries.

# Explore how many NaNs each column (attribute) contains, per dataset.

In [None]:
# number of missing values in every attribute column, counted separately per source dataset
(
    df.drop(columns="dataset")
      .isna()
      .groupby(df["dataset"])
      .sum()
)

In [None]:
# number of cells equal to 0 in every attribute column, counted separately per source dataset
(
    df.drop(columns="dataset")
      .eq(0)
      .groupby(df["dataset"])
      .sum()
)

In [None]:
# columns that are removed from the df before modelling
columns_to_drop = [
    "id", "painloc", "painexer", "relrest", "ccf", "pncaden", "years", "cigs",
    "dm", "famhist", "ekgmo", "ekgday", "ekgyr", "dig", "prop", "nitr",
    "diuretic", "proto", "thaldur", "thaltime", "dummy", "slope", "rldv5", "ca",
    "restckm", "exerckm", "restef", "restwm", "exeref", "exerwm", "thal", "thalsev",
    "thalpul", "earlobe", "cmo", "cday", "cyr", "lmt", "ladprox", "laddist",
    "diag", "cxmain", "ramus", "om1", "om2", "rcaprox", "rcadist", "lvx1",
    "lvx2", "lvx3", "lvx4", "lvf", "cathef", "junk", "name",
]
# remove them from df in place (raises a KeyError if a listed column is not present)
df.drop(columns=columns_to_drop, inplace=True)
# show how many columns were removed
len(columns_to_drop)

In [None]:
# Drop three further columns from df in place.
additional_columns_to_drop = ["smoke", "met", "rldv5e"]
df.drop(columns=additional_columns_to_drop, inplace=True)
# Show how many columns this cell removed. The previous `len(columns_to_drop)` only
# re-displayed the size of the list from the preceding cell.
len(additional_columns_to_drop)

In [None]:
# number of missing values left in each remaining column
df.isna().sum()

In [None]:
# Remove every row that still contains at least one NaN and report the shape before and after.
print(f"Shape before drop of NaN containing rows: {df.shape}")
df.dropna(axis="index", how="any", inplace=True)
print(f"Shape after drop of NaN containing rows: {df.shape}")

In [None]:
# summary statistics of the numeric columns after the NaN-containing rows were removed
df.describe()

In [None]:
# display the df as it is after column selection and NaN removal
df

In [None]:
from sklearn.preprocessing import LabelEncoder

# Binarise the target: every `num` value >= 1 is collapsed to 1.
# The assignment is restricted to the 'num' column via .loc; the previous
# `df[df['num'] >= 1] = 1` wrote 1 into EVERY column of the matching rows and
# thereby destroyed all feature values (and the dataset label) of those rows.
labelEncoder = LabelEncoder()
df.loc[df['num'] >= 1, 'num'] = 1
# encode the remaining distinct target values as consecutive integer class labels
df['num'] = labelEncoder.fit_transform(df['num'])

In [None]:
# Compute the correlation matrix of the numeric columns only.
# The string column 'dataset' cannot be correlated; DataFrame.corr() raises on it in
# pandas >= 2.0 (older versions silently skipped it), so it is excluded explicitly.
corr = df.select_dtypes(include=["number", "bool"]).corr()

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Draw the heatmap on the axes created above with a square aspect ratio;
# the colour scale is centred on 0 and saturates at vmax=.3
sns.heatmap(corr, cmap="coolwarm", vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)

In [None]:
# feature matrix: every column except the target 'num' and the source label 'dataset'
X = df.drop(columns=['num', 'dataset'])

# target vector
y = df['num']

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from catboost import CatBoostClassifier
# explicit names instead of `import *`: exactly the naive Bayes classifiers that the
# (currently disabled) entries in the list below refer to
from sklearn.naive_bayes import BernoulliNB, CategoricalNB, ComplementNB, GaussianNB, MultinomialNB

# Candidate classifiers, each paired with its hyperparameter dict (all empty here).
# Every estimator that accepts a seed uses random_state=42 so that repeated runs give the same result.
estimators_and_hyperparameters=[
    (CatBoostClassifier(random_state=42, thread_count=-1, silent= True),{}),
    (XGBClassifier(random_state=42, n_jobs=-1),{}),
    (SVC(kernel='linear',random_state=42),{}),
    (SVC(kernel='poly',random_state=42),{}),
    (SVC(kernel='rbf',random_state=42),{}),
    (SVC(kernel='sigmoid',random_state=42),{}),
    #(SVC(kernel='precomputed',random_state=42),{}),
    # (BernoulliNB(),{}),
    #(CategoricalNB(),{}),
    # (ComplementNB(),{}),
    # (GaussianNB(),{}),
    # (MultinomialNB(),{}),
    (DecisionTreeClassifier(random_state=42),{}),
    (KNeighborsClassifier(n_jobs=-1),{}),
    (RandomForestClassifier(random_state=42, n_jobs=-1), {}),
    # seeded like the other estimators; SGD shuffles the training data, so it was not reproducible before
    (SGDClassifier(random_state=42),{})
]

In [None]:
# import only the transformers used below instead of `from sklearn.preprocessing import *`,
# so it stays visible where each name comes from and the notebook namespace is not flooded
from sklearn.preprocessing import (
    MaxAbsScaler,
    MinMaxScaler,
    Normalizer,
    PowerTransformer,
    QuantileTransformer,
    RobustScaler,
    StandardScaler,
)

# candidate feature transformations that are combined with the estimators
scalers = [
    MaxAbsScaler(),
    MinMaxScaler(),
    Normalizer(),
    PowerTransformer(),
    QuantileTransformer(output_distribution='uniform'),
    QuantileTransformer(output_distribution='normal'),
    RobustScaler(),
    StandardScaler(),
]

In [None]:
# Disabled benchmark: 10-fold stratified cross-validation (F1 score) of every scaler/estimator combination.
# NOTE(review): the scaler is fitted on the complete X before the folds are split, so the held-out
# folds influence the scaling; consider fitting the scaler per fold if this cell is re-enabled.
# from sklearn.model_selection import StratifiedKFold
# from sklearn.model_selection import cross_val_score
# from statistics import mean
# for scaler in scalers:
#     print(f'Current Scaler: {scaler.__class__.__name__}')
#     for estimator in estimators_and_hyperparameters:
#         skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
#         try:
#             X_trans = scaler.fit_transform(X)
#             scores = cross_val_score(estimator[0], X_trans, y, scoring='f1',cv=skf, n_jobs=-1)
#             print(f'F1 score for {estimator[0].__class__.__name__}: {mean(scores)}')
#         except Exception as e:
#             print(e)
#             print(f'Skipping the combination of {scaler.__class__.__name__} and {estimator[0].__class__.__name__}')
#     print('-----------------------------------------------------------------')

In [None]:
# Load the pre-processed files "./Data/processed.<name>.data" of all sources into df_processed.
# The file names get their own list: rebinding `datasets` here would make a later re-run of the
# raw-data loading cell look for "./Data/va.data".
processed_datasets = ["hungarian", "cleveland", "switzerland", "va"]
processed_frames = []
for dataset in processed_datasets:
    # missing values are written as '?' in these files; na_values lets read_csv parse them as NaN directly
    dataset_df = pd.read_csv("./Data/processed."+ dataset +".data", header=None, sep=",", na_values="?")
    # remember which source file each row came from
    dataset_df['dataset'] = dataset
    processed_frames.append(dataset_df)
# concatenate once instead of growing the frame inside the loop
df_processed = pd.concat(processed_frames, ignore_index=True)
df_processed.columns = ['age','sex','cp','trestbps','chol','fbs','restecg','thalach','exang','oldpeak','slope','ca','thal','num', 'dataset']


In [None]:
# convert the listed columns of df_processed to numeric dtypes, column by column
numeric_columns = ['trestbps','chol','fbs','restecg','thalach','exang','oldpeak','slope','ca','thal']
df_processed[numeric_columns] = df_processed[numeric_columns].apply(pd.to_numeric)

In [None]:
# number of missing values in every attribute column of df_processed, counted separately per source dataset
(
    df_processed.drop(columns="dataset")
                .isna()
                .groupby(df_processed["dataset"])
                .sum()
)

In [None]:
# remove the columns slope, ca and thal from df_processed in place
df_processed.drop(columns=["slope", "ca", "thal"], inplace=True)

In [None]:
# Remove every row of df_processed that still contains at least one NaN and report the shape before and after.
print(f"Shape before drop of NaN containing rows: {df_processed.shape}")
df_processed.dropna(axis="index", how="any", inplace=True)
print(f"Shape after drop of NaN containing rows: {df_processed.shape}")

In [None]:
# shape of the df built from the raw files (not df_processed)
# NOTE(review): presumably shown for comparison with the df_processed shape printed above - confirm
df.shape

In [None]:
# first rows of df_processed after cleaning
df_processed.head()

In [None]:
# summary statistics of the numeric columns of df_processed
df_processed.describe()

In [None]:
# Build a profiling report of df_processed and write it to "df_processed.html".
# NOTE(review): the same report for the raw df is commented out further up; this cell needs
# the pandas_profiling package to be installed - confirm it is available in the target environment.
from pandas_profiling import ProfileReport
profile = ProfileReport(df_processed, title='Pandas Profiling Report')
profile.to_file("df_processed.html")