In [81]:
# Variables for configuration
generate_pandas_profiling_reports = True 

In [82]:
import pandas as pd
from pandas_profiling import ProfileReport
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np


# set pandas to show all columns of the df when using the display function
pd.set_option('display.max_columns', None)

In [83]:
# list the datasets that should be used in the current run
datasets = ["hungarian", "cleveland", "switzerland", "long-beach-va"]

In [84]:
# custom function to read the dataset into a csv formated string
# the name is used as a delimiter here because it is the last feature and common among all entrys 
def readRawData(filePath:str):
    with open(filePath) as file:
        dataString = file.read()
        dataString = dataString.replace("\n"," ")
        dataString = dataString.replace("name ","name\n")
        dataString = dataString.replace(" ",",")
        return dataString

In [85]:
# read the data from the specified datasets into the df 
from io import StringIO
df = pd.DataFrame()
for dataset in datasets:
    dataset_df = pd.read_csv(StringIO(readRawData("./Data/"+ dataset +".data")), header=None, sep=",")
    dataset_df['dataset'] = dataset
    df = pd.concat([df,dataset_df ], ignore_index=True)

In [86]:
df.columns=["id", "ccf", "age", "sex", "painloc", "painexer" , "relrest" , "pncaden" , "cp", "trestbps", "htn", "chol", "smoke", "cigs", "years", "fbs", "dm", "famhist", "restecg", "ekgmo", "ekgday", "ekgyr", "dig", "prop", "nitr", "pro", "diuretic", "proto", "thaldur", "thaltime", "met", "thalach", "thalrest", "tpeakbps", "tpeakbpd", "dummy", "trestbpd", "exang", "xhypo", "oldpeak", "slope", "rldv5", "rldv5e", "ca", "restckm", "exerckm", "restef", "restwm", "exeref", "exerwm", "thal", "thalsev", "thalpul", "earlobe", "cmo", "cday", "cyr", "num", "lmt", "ladprox", "laddist", "diag", "cxmain", "ramus", "om1", "om2", "rcaprox", "rcadist", "lvx1", "lvx2", "lvx3", "lvx4", "lvf", "cathef", "junk", "name", "dataset"]

In [87]:
# the dataset encodes unfilled cells with -9 they are replaced with NaN for better compatibility with pd 
df = df.replace(-9, float('nan'))

In [88]:
df.describe()

Unnamed: 0,id,ccf,age,sex,painloc,painexer,relrest,pncaden,cp,trestbps,htn,chol,smoke,cigs,years,fbs,dm,famhist,restecg,ekgmo,ekgday,ekgyr,dig,prop,nitr,pro,diuretic,proto,thaldur,thaltime,met,thalach,thalrest,tpeakbps,tpeakbpd,dummy,trestbpd,exang,xhypo,oldpeak,slope,rldv5,rldv5e,ca,restckm,exerckm,restef,restwm,exeref,exerwm,thal,thalsev,thalpul,earlobe,cmo,cday,cyr,num,lmt,ladprox,laddist,diag,cxmain,ramus,om1,om2,rcaprox,rcadist,lvx1,lvx2,lvx3,lvx4,lvf,cathef,junk
count,899.0,899.0,899.0,899.0,617.0,617.0,613.0,0.0,899.0,840.0,865.0,869.0,230.0,479.0,467.0,809.0,95.0,477.0,897.0,846.0,845.0,846.0,831.0,833.0,834.0,836.0,817.0,787.0,843.0,446.0,794.0,844.0,843.0,836.0,836.0,840.0,840.0,844.0,841.0,837.0,591.0,474.0,757.0,291.0,0.0,1.0,28.0,30.0,2.0,5.0,422.0,130.0,44.0,1.0,888.0,890.0,890.0,899.0,624.0,663.0,653.0,341.0,664.0,332.0,628.0,327.0,654.0,629.0,880.0,880.0,880.0,880.0,883.0,311.0,119.0
mean,957.235818,0.0,53.480534,0.790879,0.920583,0.593193,0.672104,,3.253615,132.10119,0.476301,198.759494,0.517391,19.118998,18.796574,0.166873,0.957895,0.563941,0.603122,5.973995,15.493491,84.056738,0.034898,0.283313,0.266187,0.172249,0.112607,37.081321,8.655872,5.690359,16.483123,137.298578,75.487544,171.641148,87.293062,132.10119,83.52381,0.390995,0.026159,0.87049,1.766497,14.398734,54.914135,0.697595,,0.0,0.531071,1.033333,0.55,0.2,5.018957,1.284615,0.295455,0.0,6.122748,15.988764,83.839326,1.129032,1.323718,1.3273,1.248086,1.202346,1.296687,1.114458,1.176752,1.067278,1.342508,1.171701,1.020455,1.032955,1.132955,1.611364,1.178935,27.623119,5.869748
std,1204.015482,0.0,9.435894,0.406908,0.270607,0.491637,0.46983,,0.928499,19.151127,0.499727,111.834415,0.500787,18.296273,16.359145,0.373093,0.201895,0.496415,0.803669,3.486479,8.761939,1.640204,0.183631,0.870965,0.442228,0.377823,0.316306,50.144559,3.746617,3.994673,30.772801,25.965959,14.727961,25.734488,14.734586,19.151127,10.252563,0.488263,0.159704,1.080548,0.621339,5.702942,60.309425,1.052728,,,0.146195,1.0662,0.070711,0.447214,1.949388,0.958314,0.461522,,3.474114,8.860872,4.407533,1.25972,6.447542,0.469582,0.432233,0.402339,0.457142,0.318847,0.381762,0.250887,0.474912,0.377421,0.277384,0.415902,0.703837,1.722199,0.512572,31.675295,1.650914
min,1.0,0.0,28.0,0.0,0.0,0.0,0.0,,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,81.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,2.0,60.0,37.0,84.0,11.0,0.0,0.0,0.0,0.0,-2.6,0.0,2.0,2.0,0.0,,0.0,0.22,0.0,0.5,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.22,3.3
25%,116.0,0.0,47.0,1.0,1.0,0.0,0.0,,3.0,120.0,0.0,175.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,3.0,8.0,83.0,0.0,0.0,0.0,0.0,0.0,1.0,6.0,3.0,5.0,120.0,65.0,155.0,80.0,120.0,80.0,0.0,0.0,0.0,1.0,10.0,12.0,0.0,,0.0,0.4075,0.0,0.525,0.0,3.0,0.0,0.0,0.0,3.0,8.0,83.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.685,4.8
50%,266.0,0.0,54.0,1.0,1.0,1.0,1.0,,4.0,130.0,0.0,224.0,1.0,20.0,20.0,0.0,1.0,1.0,0.0,6.0,16.0,84.0,0.0,0.0,0.0,0.0,0.0,5.0,8.1,6.0,7.0,140.0,74.0,170.0,88.0,130.0,80.0,0.0,0.0,0.5,2.0,14.0,19.0,0.0,,0.0,0.57,1.0,0.55,0.0,6.0,1.0,0.0,0.0,6.0,16.0,84.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.82,5.6
75%,1207.5,0.0,60.0,1.0,1.0,1.0,1.0,,4.0,140.0,1.0,269.0,1.0,30.0,30.0,0.0,1.0,1.0,1.0,9.0,23.0,85.0,0.0,1.0,1.0,0.0,0.0,75.0,10.5,8.0,10.0,157.0,84.0,190.0,100.0,140.0,90.0,1.0,0.0,1.5,2.0,18.0,102.0,1.0,,0.0,0.625,2.0,0.575,0.0,7.0,2.0,1.0,0.0,9.0,23.75,85.0,2.0,1.0,2.0,1.0,1.0,2.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,63.0,6.9
max,5002.0,0.0,77.0,1.0,1.0,1.0,1.0,,4.0,200.0,1.0,603.0,1.0,99.0,60.0,1.0,1.0,1.0,2.0,12.0,31.0,87.0,1.0,22.0,1.0,1.0,1.0,200.0,24.0,20.0,200.0,202.0,139.0,240.0,134.0,200.0,120.0,1.0,1.0,6.2,3.0,36.0,270.0,9.0,,0.0,0.8,3.0,0.6,1.0,7.0,3.0,1.0,0.0,12.0,31.0,87.0,4.0,162.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,7.0,10.0,8.0,8.0,5.0,86.0,11.3


In [89]:
df.head()

Unnamed: 0,id,ccf,age,sex,painloc,painexer,relrest,pncaden,cp,trestbps,htn,chol,smoke,cigs,years,fbs,dm,famhist,restecg,ekgmo,ekgday,ekgyr,dig,prop,nitr,pro,diuretic,proto,thaldur,thaltime,met,thalach,thalrest,tpeakbps,tpeakbpd,dummy,trestbpd,exang,xhypo,oldpeak,slope,rldv5,rldv5e,ca,restckm,exerckm,restef,restwm,exeref,exerwm,thal,thalsev,thalpul,earlobe,cmo,cday,cyr,num,lmt,ladprox,laddist,diag,cxmain,ramus,om1,om2,rcaprox,rcadist,lvx1,lvx2,lvx3,lvx4,lvf,cathef,junk,name,dataset
0,1254.0,0.0,40.0,1.0,1.0,0.0,0.0,,2.0,140.0,0.0,289.0,,,,0.0,,,0.0,12.0,16.0,84.0,0.0,0.0,0.0,0.0,0.0,150.0,18.0,,7.0,172.0,86.0,200.0,110.0,140.0,86.0,0.0,0.0,0.0,,26.0,20.0,,,,,,,,,,,,12.0,20.0,84.0,0.0,,,,,,,,,,,1.0,1.0,1.0,1.0,1.0,,,name,hungarian
1,1255.0,0.0,49.0,0.0,1.0,0.0,0.0,,3.0,160.0,1.0,180.0,,,,0.0,,,0.0,11.0,16.0,84.0,0.0,0.0,0.0,0.0,0.0,,10.0,9.0,7.0,156.0,100.0,220.0,106.0,160.0,90.0,0.0,0.0,1.0,2.0,14.0,13.0,,,,,,,,,,,,11.0,20.0,84.0,1.0,,,2.0,,,,,,,,1.0,1.0,1.0,1.0,1.0,,,name,hungarian
2,1256.0,0.0,37.0,1.0,1.0,0.0,0.0,,2.0,130.0,0.0,283.0,,,,0.0,,,1.0,11.0,21.0,84.0,0.0,0.0,0.0,0.0,0.0,100.0,10.0,,5.0,98.0,58.0,180.0,100.0,130.0,80.0,0.0,0.0,0.0,,17.0,14.0,,,,,,,,,,,,11.0,26.0,84.0,0.0,,,,,,,,,,,1.0,1.0,1.0,1.0,1.0,,,name,hungarian
3,1257.0,0.0,48.0,0.0,1.0,1.0,1.0,,4.0,138.0,0.0,214.0,,,,0.0,,,0.0,9.0,21.0,84.0,0.0,0.0,0.0,0.0,0.0,50.0,5.0,4.0,4.0,108.0,54.0,210.0,106.0,138.0,86.0,1.0,0.0,1.5,2.0,19.0,22.0,,,,,,,,,,,,9.0,30.0,84.0,3.0,,2.0,,,2.0,,,,2.0,,1.0,1.0,1.0,1.0,1.0,,,name,hungarian
4,1258.0,0.0,54.0,1.0,1.0,0.0,1.0,,3.0,150.0,0.0,,,,,0.0,,,0.0,7.0,25.0,84.0,0.0,0.0,1.0,1.0,0.0,25.0,2.0,,3.0,122.0,74.0,130.0,100.0,150.0,90.0,0.0,1.0,0.0,,13.0,9.0,,,,,,,,,,,,7.0,30.0,84.0,0.0,,,,,1.0,,,,1.0,,1.0,1.0,1.0,1.0,1.0,,,name,hungarian


In [90]:
if generate_pandas_profiling_reports:
    profile = ProfileReport(df, title='Pandas Profiling Report for all features')
    profile.to_file("Pandas Profiling Report for all features.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

(using `df.profile_report(correlations={"cramers": {"calculate": False}})`
If this is problematic for your use case, please report this as an issue:
https://github.com/ydataai/pandas-profiling/issues
(include the error message: 'No data; `observed` has size 0.')
  ax1.set_xticklabels(
(using `df.profile_report(missing_diagrams={"Count": False}`)
If this is problematic for your use case, please report this as an issue:
https://github.com/ydataai/pandas-profiling/issues
(include the error message: 'The number of FixedLocator locations (7), usually from a call to set_ticks, does not match the number of ticklabels (77).')


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

The columns smoke and years both describe whether a respondent smokes or not. Smoke does this by being binary coded, while years describes the number of years a person has smoked. Due to the high number of missing values, the columns are useless on their own. However, it is possible to enrich the smoke column with the years column. 

In [91]:
print(f"Number of NaNs in smoke: {df['smoke'].isna().sum()}")
df.loc[(df['smoke'].isna()) & (df['years'] == 0),'smoke'] = 0
df.loc[(df['smoke'].isna()) & (df['years'] > 0),'smoke'] = 1
df.drop(columns=['smoke'])
print(f"Number of NaNs in smoke after combination with years: {df['smoke'].isna().sum()}")
df.loc[(df['smoke'].isna()) & (df['cigs'] == 0),'smoke'] = 0
df.loc[(df['smoke'].isna()) & (df['cigs'] > 0),'smoke'] = 1
print(f"Number of NaNs in smoke after combination with years and cigs: {df['smoke'].isna().sum()}")

Number of NaNs in smoke: 671
Number of NaNs in smoke after combination with years: 391
Number of NaNs in smoke after combination with years and cigs: 389


Finding: reduces the number of missing values in smoke by 280 entries. 

# Explore how many NaNs are within one coloumn for each attribute.

In [92]:
(df.loc[ : , df.columns != 'dataset'].isna()).join(df['dataset']).groupby("dataset").sum()

Unnamed: 0_level_0,id,ccf,age,sex,painloc,painexer,relrest,pncaden,cp,trestbps,htn,chol,smoke,cigs,years,fbs,dm,famhist,restecg,ekgmo,ekgday,ekgyr,dig,prop,nitr,pro,diuretic,proto,thaldur,thaltime,met,thalach,thalrest,tpeakbps,tpeakbpd,dummy,trestbpd,exang,xhypo,oldpeak,slope,rldv5,rldv5e,ca,restckm,exerckm,restef,restwm,exeref,exerwm,thal,thalsev,thalpul,earlobe,cmo,cday,cyr,num,lmt,ladprox,laddist,diag,cxmain,ramus,om1,om2,rcaprox,rcadist,lvx1,lvx2,lvx3,lvx4,lvf,cathef,junk,name
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1
cleveland,0,0,0,0,282,282,282,282,0,0,0,0,4,5,5,0,259,0,0,0,0,0,2,2,2,2,2,0,0,69,0,0,0,0,0,0,0,0,0,0,0,282,0,2,282,282,282,282,282,282,2,282,282,282,0,0,0,0,0,0,0,282,0,282,0,282,0,0,0,0,0,0,0,282,282,0
hungarian,1,1,1,1,1,1,1,295,1,2,2,24,283,294,295,9,274,294,2,1,1,1,2,3,2,2,2,10,3,191,3,2,2,2,2,2,2,2,3,1,191,2,1,291,295,295,295,292,295,293,267,268,278,295,1,1,1,1,276,237,247,277,236,286,272,290,245,270,1,1,1,1,1,267,295,1
long-beach-va,1,1,1,1,1,1,5,201,1,57,4,8,2,11,13,8,158,9,1,53,54,53,61,61,60,60,74,54,54,161,54,54,55,60,60,57,57,54,54,57,102,66,66,199,201,200,173,174,199,198,160,170,200,200,3,1,1,1,1,1,1,1,1,1,1,2,2,2,3,3,3,3,4,24,82,1
switzerland,0,0,0,0,0,0,0,123,0,2,30,0,100,112,121,75,115,121,1,1,1,1,5,2,3,1,6,50,1,34,50,1,1,3,3,2,2,1,3,6,17,77,77,118,123,123,123,123,123,123,50,51,97,123,9,9,9,0,0,0,0,0,0,0,0,0,0,0,17,17,17,17,13,17,123,0


In [93]:
(df.loc[ : , df.columns != 'dataset'].eq(0)).join(df['dataset']).groupby("dataset").sum()

Unnamed: 0_level_0,id,ccf,age,sex,painloc,painexer,relrest,pncaden,cp,trestbps,htn,chol,smoke,cigs,years,fbs,dm,famhist,restecg,ekgmo,ekgday,ekgyr,dig,prop,nitr,pro,diuretic,proto,thaldur,thaltime,met,thalach,thalrest,tpeakbps,tpeakbpd,dummy,trestbpd,exang,xhypo,oldpeak,slope,rldv5,rldv5e,ca,restckm,exerckm,restef,restwm,exeref,exerwm,thal,thalsev,thalpul,earlobe,cmo,cday,cyr,num,lmt,ladprox,laddist,diag,cxmain,ramus,om1,om2,rcaprox,rcadist,lvx1,lvx2,lvx3,lvx4,lvf,cathef,junk,name
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1
cleveland,0,282,0,91,0,0,0,0,0,0,108,0,115,115,115,240,0,107,138,0,0,0,271,186,211,252,248,0,0,43,0,0,0,0,0,0,0,190,276,91,0,0,0,166,0,0,0,0,0,0,0,0,0,0,0,0,0,157,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
hungarian,0,294,0,81,23,164,141,0,0,0,195,0,10,0,0,266,0,1,235,0,0,0,293,274,265,269,290,0,0,0,0,0,0,0,0,0,0,204,291,189,0,0,0,3,0,0,0,3,0,2,0,6,7,0,0,0,0,188,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
long-beach-va,0,200,0,6,15,65,33,0,0,1,90,49,96,38,38,125,4,100,80,0,0,0,122,88,61,105,94,0,0,2,0,0,0,0,0,1,1,52,139,40,1,0,0,2,0,1,0,10,0,2,0,9,1,1,0,0,0,51,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
switzerland,0,123,0,10,11,22,27,0,0,0,60,123,5,0,0,43,0,0,85,0,0,0,116,70,75,66,93,0,0,25,0,0,0,0,0,0,0,68,113,42,0,0,0,0,0,0,0,0,0,0,0,21,23,0,0,0,0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0


In [94]:
columns_to_drop = ["id",
"painloc",
"painexer",
"relrest",
"ccf",
"pncaden",
"years",
"cigs",
"dm",
"famhist",
"ekgmo",
"ekgday",
"ekgyr",
"dig",
"prop",
"nitr",
"diuretic",
"proto",
"thaldur",
"thaltime",
"dummy",
"slope",
"rldv5",
"ca",
"restckm",
"exerckm",
"restef",
"restwm",
"exeref",
"exerwm",
"thal",
"thalsev",
"thalpul",
"earlobe",
"cmo",
"cday",
"cyr",
"lmt",
"ladprox",
"laddist",
"diag",
"cxmain",
"ramus",
"om1",
"om2",
"rcaprox",
"rcadist",
"lvx1",
"lvx2",
"lvx3",
"lvx4",
"lvf",
"cathef",
"junk",
"name"]
df.drop(columns_to_drop, inplace=True, axis=1)
len(columns_to_drop)

55

In [95]:
if generate_pandas_profiling_reports:
    profile = ProfileReport(df, title='Pandas Profiling Report for selected features')
    profile.to_file("Pandas Profiling Report for selected features.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [96]:
df.drop(["smoke","met", "rldv5e"], inplace=True, axis=1)
len(columns_to_drop)

55

In [97]:
df.isna().sum()

age          2
sex          2
cp           2
trestbps    61
htn         36
chol        32
fbs         92
restecg      4
pro         65
thalach     57
thalrest    58
tpeakbps    65
tpeakbpd    65
trestbpd    61
exang       57
xhypo       60
oldpeak     64
num          2
dataset      0
dtype: int64

In [98]:
print(f"Shape before drop of NaN containing rows: {df.shape}")
df.dropna(inplace=True, axis=0, how='any')
print(f"Shape after drop of NaN containing rows: {df.shape}")

Shape before drop of NaN containing rows: (901, 19)
Shape after drop of NaN containing rows: (706, 19)


In [99]:
df.describe()

Unnamed: 0,age,sex,cp,trestbps,htn,chol,fbs,restecg,pro,thalach,thalrest,tpeakbps,tpeakbpd,trestbpd,exang,xhypo,oldpeak,num
count,706.0,706.0,706.0,706.0,706.0,706.0,706.0,706.0,706.0,706.0,706.0,706.0,706.0,706.0,706.0,706.0,706.0,706.0
mean,53.025496,0.764873,3.225212,132.5,0.491501,220.641643,0.148725,0.634561,0.148725,138.771955,75.576487,172.18272,86.910765,83.88102,0.398017,0.021246,0.877054,1.067989
std,9.461347,0.424379,0.938681,18.624185,0.500282,94.375922,0.35607,0.840227,0.35607,25.913886,14.68083,24.997345,14.887063,10.208049,0.489836,0.144307,1.072062,1.268683
min,28.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,60.0,37.0,84.0,11.0,0.0,0.0,0.0,-1.0,0.0
25%,46.0,1.0,2.25,120.0,0.0,198.0,0.0,0.0,0.0,120.0,65.0,158.0,80.0,80.0,0.0,0.0,0.0,0.0
50%,54.0,1.0,4.0,130.0,0.0,233.0,0.0,0.0,0.0,140.0,74.0,170.0,86.0,80.0,0.0,0.0,0.5,1.0
75%,60.0,1.0,4.0,140.0,1.0,272.0,0.0,1.0,0.0,159.75,84.0,190.0,100.0,90.0,1.0,0.0,1.5,2.0
max,77.0,1.0,4.0,200.0,1.0,603.0,1.0,2.0,1.0,202.0,134.0,240.0,134.0,120.0,1.0,1.0,6.2,4.0


In [100]:
df

Unnamed: 0,age,sex,cp,trestbps,htn,chol,fbs,restecg,pro,thalach,thalrest,tpeakbps,tpeakbpd,trestbpd,exang,xhypo,oldpeak,num,dataset
0,40.0,1.0,2.0,140.0,0.0,289.0,0.0,0.0,0.0,172.0,86.0,200.0,110.0,86.0,0.0,0.0,0.0,0.0,hungarian
1,49.0,0.0,3.0,160.0,1.0,180.0,0.0,0.0,0.0,156.0,100.0,220.0,106.0,90.0,0.0,0.0,1.0,1.0,hungarian
2,37.0,1.0,2.0,130.0,0.0,283.0,0.0,1.0,0.0,98.0,58.0,180.0,100.0,80.0,0.0,0.0,0.0,0.0,hungarian
3,48.0,0.0,4.0,138.0,0.0,214.0,0.0,0.0,0.0,108.0,54.0,210.0,106.0,86.0,1.0,0.0,1.5,3.0,hungarian
5,39.0,1.0,3.0,120.0,0.0,339.0,0.0,0.0,0.0,170.0,86.0,198.0,100.0,80.0,0.0,0.0,0.0,0.0,hungarian
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
893,62.0,1.0,4.0,158.0,1.0,170.0,0.0,1.0,0.0,138.0,86.0,202.0,98.0,90.0,1.0,0.0,0.0,1.0,long-beach-va
894,46.0,1.0,4.0,134.0,1.0,310.0,0.0,0.0,0.0,126.0,88.0,174.0,114.0,90.0,0.0,0.0,0.0,2.0,long-beach-va
895,54.0,0.0,4.0,127.0,0.0,333.0,1.0,1.0,0.0,154.0,83.0,158.0,84.0,78.0,0.0,0.0,0.0,1.0,long-beach-va
897,55.0,1.0,4.0,122.0,1.0,223.0,1.0,1.0,0.0,100.0,74.0,210.0,100.0,70.0,0.0,0.0,0.0,2.0,long-beach-va


In [101]:
from sklearn.preprocessing import LabelEncoder

labelEncoder = LabelEncoder()
df[df['num'] >= 1] = 1
df['num'] = labelEncoder.fit_transform(df['num'])

In [102]:
# Compute the correlation matrix
corr = df.corr()

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, cmap="coolwarm", vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})

<AxesSubplot:>

In [103]:
X = df.loc[:,(df.columns!= 'num') & (df.columns != 'dataset')]

y = df['num']

In [104]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from catboost import CatBoostClassifier
from  sklearn.naive_bayes import *
estimators_and_hyperparameters=[
    (CatBoostClassifier(random_state=42, thread_count=-1, silent= True),{}),
    (XGBClassifier(random_state=42, n_jobs=-1),{}),
    (SVC(kernel='linear',random_state=42),{}),
    (SVC(kernel='poly',random_state=42),{}),
    (SVC(kernel='rbf',random_state=42),{}),
    (SVC(kernel='sigmoid',random_state=42),{}),
    #(SVC(kernel='precomputed',random_state=42),{}),
    # (BernoulliNB(),{}),
    #(CategoricalNB(),{}),
    # (ComplementNB(),{}),
    # (GaussianNB(),{}),
    # (MultinomialNB(),{}),
    (DecisionTreeClassifier(random_state=42),{}),
    (KNeighborsClassifier(n_jobs=-1),{}),
    (RandomForestClassifier(random_state=42, n_jobs=-1), {}),
    (SGDClassifier(),{})
]

In [105]:
from sklearn.preprocessing import *
scalers = [
    MaxAbsScaler(),
    MinMaxScaler(),
    Normalizer(),
    PowerTransformer(),
    QuantileTransformer(output_distribution='uniform'),
    QuantileTransformer(output_distribution='normal'),
    RobustScaler(),
    StandardScaler(),
]

In [106]:
# from sklearn.model_selection import StratifiedKFold
# from sklearn.model_selection import cross_val_score
# from statistics import mean
# for scaler in scalers:
#     print(f'Current Sclaer: {scaler.__class__.__name__}')
#     for estimator in estimators_and_hyperparameters:
#         skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
#         try:
#             X_trans = scaler.fit_transform(X)
#             scores = cross_val_score(estimator[0], X_trans, y, scoring='f1',cv=skf, n_jobs=-1)
#             print(f'F1 score for {estimator[0].__class__.__name__}: {mean(scores)}')
#         except Exception as e:
#             print(e)
#             print(f'Skipping the combination of {scaler.__class__.__name__} and {estimator.__class__.__name__}')
#     print('-----------------------------------------------------------------')

# Tests with the preprocessed data by the UCI

In [107]:
import pandas as pd 
datasets = ["hungarian", "cleveland", "switzerland", "va"]
df_processed = pd.DataFrame()
for dataset in datasets:
    dataset_df = pd.read_csv("./Data/processed."+ dataset +".data", header=None, sep=",")
    dataset_df['dataset'] = dataset
    df_processed = pd.concat([df_processed,dataset_df ], ignore_index=True)
df_processed.columns = ['age','sex','cp','trestbps','chol','fbs','restecg','thalach','exang','oldpeak','slope','ca','thal','num', 'dataset']
df_processed = df_processed.replace('?', float('nan'))


In [108]:
df_processed[['trestbps','chol','fbs','restecg','thalach','exang','oldpeak','slope','ca','thal']] = df_processed[['trestbps','chol','fbs','restecg','thalach','exang','oldpeak','slope','ca','thal']].apply(pd.to_numeric)

In [109]:
(df_processed.loc[ : , df_processed.columns != 'dataset'].isna()).join(df_processed['dataset']).groupby("dataset").sum()

Unnamed: 0_level_0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
cleveland,0,0,0,0,0,0,0,0,0,0,0,4,2,0
hungarian,0,0,0,1,23,8,1,1,1,0,190,291,266,0
switzerland,0,0,0,2,0,75,1,1,1,6,17,118,52,0
va,0,0,0,56,7,7,0,53,53,56,102,198,166,0


In [110]:
df_processed.drop(["slope", "ca","thal"], inplace=True, axis=1)

In [111]:
print(f"Shape before drop of NaN containing rows: {df_processed.shape}")
df_processed.dropna(inplace=True, axis=0, how='any')
print(f"Shape after drop of NaN containing rows: {df_processed.shape}")

Shape before drop of NaN containing rows: (920, 12)
Shape after drop of NaN containing rows: (740, 12)


In [112]:
df.shape

(706, 19)

In [113]:
df_processed.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,num,dataset
0,28.0,1.0,2.0,130.0,132.0,0.0,2.0,185.0,0.0,0.0,0,hungarian
1,29.0,1.0,2.0,120.0,243.0,0.0,0.0,160.0,0.0,0.0,0,hungarian
3,30.0,0.0,1.0,170.0,237.0,0.0,1.0,170.0,0.0,0.0,0,hungarian
4,31.0,0.0,2.0,100.0,219.0,0.0,1.0,150.0,0.0,0.0,0,hungarian
5,32.0,0.0,2.0,105.0,198.0,0.0,0.0,165.0,0.0,0.0,0,hungarian


In [114]:
df_processed.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,num
count,740.0,740.0,740.0,740.0,740.0,740.0,740.0,740.0,740.0,740.0,740.0
mean,53.097297,0.764865,3.227027,132.754054,220.136486,0.15,0.635135,138.744595,0.4,0.894324,0.924324
std,9.408127,0.42437,0.939193,18.58125,93.614555,0.357313,0.840039,25.846082,0.490229,1.08716,1.128882
min,28.0,0.0,1.0,0.0,0.0,0.0,0.0,60.0,0.0,-1.0,0.0
25%,46.0,1.0,2.0,120.0,197.0,0.0,0.0,120.0,0.0,0.0,0.0
50%,54.0,1.0,4.0,130.0,231.0,0.0,0.0,140.0,0.0,0.5,1.0
75%,60.0,1.0,4.0,140.0,271.0,0.0,1.0,159.25,1.0,1.5,1.0
max,77.0,1.0,4.0,200.0,603.0,1.0,2.0,202.0,1.0,6.2,4.0


In [115]:
if generate_pandas_profiling_reports:
    profile = ProfileReport(df, title='Pandas Profiling Report for the features processed by the UCI')
    profile.to_file("Pandas Profiling Report for the features processed by the UCI.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [116]:
# Tests with the preprocessed data by the UCI includeing the reprocessed data

In [117]:
import pandas as pd 
datasets = ["hungarian", "cleveland", "switzerland", "va"]
df_processed = pd.DataFrame()
for dataset in datasets:
    if dataset != "hungarian":
        dataset_df = pd.read_csv("./Data/processed."+ dataset +".data", header=None, sep=",")
        dataset_df['dataset'] = dataset
        df_processed = pd.concat([df_processed,dataset_df ], ignore_index=True)
with open("Data/reprocessed.hungarian.data") as file:
    dataString = file.read()
    dataString = dataString.replace(" ",",")
    dataset_df = pd.read_csv(StringIO(dataString), header=None, sep=",")
    dataset_df['dataset'] = dataset
df_processed = pd.concat([df_processed,dataset_df ], ignore_index=True)
df_processed.columns = ['age','sex','cp','trestbps','chol','fbs','restecg','thalach','exang','oldpeak','slope','ca','thal','num', 'dataset']
df_processed = df_processed.replace('?', float('nan'))

In [118]:
df_processed

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num,dataset
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0.0,cleveland
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2.0,cleveland
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1.0,cleveland
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0.0,cleveland
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0.0,cleveland
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
916,36.0,1.0,2.0,120.0,166.0,0.0,0.0,180.0,0.0,0.0,-9.0,-9.0,-9.0,0.0,va
917,48.0,1.0,3.0,110.0,211.0,0.0,0.0,138.0,0.0,0.0,-9.0,-9.0,6.0,0.0,va
918,47.0,0.0,2.0,140.0,257.0,0.0,0.0,135.0,0.0,1.0,1.0,-9.0,-9.0,0.0,va
919,53.0,1.0,4.0,130.0,182.0,0.0,0.0,148.0,0.0,0.0,-9.0,-9.0,-9.0,0.0,va


In [119]:
if generate_pandas_profiling_reports:
    profile = ProfileReport(df, title='Pandas Profiling Report for the features processed by the UCI + reprocessed hungarian')
    profile.to_file("Pandas Profiling Report for the features processed by the UCI + reprocessed hungarian.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]