In [14]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
pd.set_option('display.max_columns', None)

In [15]:
datasets = ["hungarian", "cleveland", "switzerland", "long-beach-va"]

In [16]:
def readRawData(filePath:str):
    with open(filePath) as file:
        dataString = file.read()
        dataString = dataString.replace("\n"," ")
        dataString = dataString.replace("name ","name\n")
        dataString = dataString.replace(" ",",")
        return dataString

In [17]:
from io import StringIO
df = pd.DataFrame()
for dataset in datasets:
    dataset_df = pd.read_csv(StringIO(readRawData("./Data/"+ dataset +".data")), header=None, sep=",")
    dataset_df['dataset'] = dataset
    df = pd.concat([df,dataset_df ], ignore_index=True)

In [18]:
df.columns=["id", "ccf", "age", "sex", "painloc", "painexer" , "relrest" , "pncaden" , "cp", "trestbps", "htn", "chol", "smoke", "cigs", "years", "fbs", "dm", "famhist", "restecg", "ekgmo", "ekgday", "ekgyr", "dig", "prop", "nitr", "pro", "diuretic", "proto", "thaldur", "thaltime", "met", "thalach", "thalrest", "tpeakbps", "tpeakbpd", "dummy", "trestbpd", "exang", "xhypo", "oldpeak", "slope", "rldv5", "rldv5e", "ca", "restckm", "exerckm", "restef", "restwm", "exeref", "exerwm", "thal", "thalsev", "thalpul", "earlobe", "cmo", "cday", "cyr", "num", "lmt", "ladprox", "laddist", "diag", "cxmain", "ramus", "om1", "om2", "rcaprox", "rcadist", "lvx1", "lvx2", "lvx3", "lvx4", "lvf", "cathef", "junk", "name", "dataset"]

In [19]:
df = df.replace(-9, float('nan'))

In [20]:
df.describe()

Unnamed: 0,id,ccf,age,sex,painloc,painexer,relrest,pncaden,cp,trestbps,...,om2,rcaprox,rcadist,lvx1,lvx2,lvx3,lvx4,lvf,cathef,junk
count,899.0,899.0,899.0,899.0,617.0,617.0,613.0,0.0,899.0,840.0,...,327.0,654.0,629.0,880.0,880.0,880.0,880.0,883.0,311.0,119.0
mean,957.235818,0.0,53.480534,0.790879,0.920583,0.593193,0.672104,,3.253615,132.10119,...,1.067278,1.342508,1.171701,1.020455,1.032955,1.132955,1.611364,1.178935,27.623119,5.869748
std,1204.015482,0.0,9.435894,0.406908,0.270607,0.491637,0.46983,,0.928499,19.151127,...,0.250887,0.474912,0.377421,0.277384,0.415902,0.703837,1.722199,0.512572,31.675295,1.650914
min,1.0,0.0,28.0,0.0,0.0,0.0,0.0,,1.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.22,3.3
25%,116.0,0.0,47.0,1.0,1.0,0.0,0.0,,3.0,120.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.685,4.8
50%,266.0,0.0,54.0,1.0,1.0,1.0,1.0,,4.0,130.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.82,5.6
75%,1207.5,0.0,60.0,1.0,1.0,1.0,1.0,,4.0,140.0,...,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,63.0,6.9
max,5002.0,0.0,77.0,1.0,1.0,1.0,1.0,,4.0,200.0,...,2.0,2.0,2.0,7.0,10.0,8.0,8.0,5.0,86.0,11.3


In [21]:
df.head()

Unnamed: 0,id,ccf,age,sex,painloc,painexer,relrest,pncaden,cp,trestbps,...,rcadist,lvx1,lvx2,lvx3,lvx4,lvf,cathef,junk,name,dataset
0,1254.0,0.0,40.0,1.0,1.0,0.0,0.0,,2.0,140.0,...,,1.0,1.0,1.0,1.0,1.0,,,name,hungarian
1,1255.0,0.0,49.0,0.0,1.0,0.0,0.0,,3.0,160.0,...,,1.0,1.0,1.0,1.0,1.0,,,name,hungarian
2,1256.0,0.0,37.0,1.0,1.0,0.0,0.0,,2.0,130.0,...,,1.0,1.0,1.0,1.0,1.0,,,name,hungarian
3,1257.0,0.0,48.0,0.0,1.0,1.0,1.0,,4.0,138.0,...,,1.0,1.0,1.0,1.0,1.0,,,name,hungarian
4,1258.0,0.0,54.0,1.0,1.0,0.0,1.0,,3.0,150.0,...,,1.0,1.0,1.0,1.0,1.0,,,name,hungarian


In [22]:
# from pandas_profiling import ProfileReport
# profile = ProfileReport(df, title='Pandas Profiling Report')
# profile.to_file("df.html")

The columns smoke and years both describe whether a respondent smokes or not. Smoke does this by being binary coded, while years describes the number of years a person has smoked. Due to the high number of missing values, the columns are useless on their own. However, it is possible to enrich the smoke column with the years column. 

In [23]:
print(f"Number of NaNs in smoke: {df['smoke'].isna().sum()}")
df.loc[(df['smoke'].isna()) & (df['years'] == 0),'smoke'] = 0
df.loc[(df['smoke'].isna()) & (df['years'] > 0),'smoke'] = 1
df.drop(columns=['smoke'])
print(f"Number of NaNs in smoke after combination with years: {df['smoke'].isna().sum()}")
df.loc[(df['smoke'].isna()) & (df['cigs'] == 0),'smoke'] = 0
df.loc[(df['smoke'].isna()) & (df['cigs'] > 0),'smoke'] = 1
print(f"Number of NaNs in smoke after combination with years and cigs: {df['smoke'].isna().sum()}")

Number of NaNs in smoke :671
Number of NaNs in smoke after combination with years :391
Number of NaNs in smoke after combination with cigs :389


Finding: reduces the number of missing values in smoke by 280 entries. 

In [11]:
df.columns

Index(['id', 'ccf', 'age', 'sex', 'painloc', 'painexer', 'relrest', 'pncaden',
       'cp', 'trestbps', 'htn', 'chol', 'smoke', 'cigs', 'years', 'fbs', 'dm',
       'famhist', 'restecg', 'ekgmo', 'ekgday', 'ekgyr', 'dig', 'prop', 'nitr',
       'pro', 'diuretic', 'proto', 'thaldur', 'thaltime', 'met', 'thalach',
       'thalrest', 'tpeakbps', 'tpeakbpd', 'dummy', 'trestbpd', 'exang',
       'xhypo', 'oldpeak', 'slope', 'rldv5', 'rldv5e', 'ca', 'restckm',
       'exerckm', 'restef', 'restwm', 'exeref', 'exerwm', 'thal', 'thalsev',
       'thalpul', 'earlobe', 'cmo', 'cday', 'cyr', 'num', 'lmt', 'ladprox',
       'laddist', 'diag', 'cxmain', 'ramus', 'om1', 'om2', 'rcaprox',
       'rcadist', 'lvx1', 'lvx2', 'lvx3', 'lvx4', 'lvf', 'cathef', 'junk',
       'name', 'dataset'],
      dtype='object')

# Explore how many NaNs are within one coloumn for each attribute.

In [12]:
(df.loc[ : , df.columns != 'dataset'].isna()).join(df['dataset']).groupby("dataset").sum()

Unnamed: 0_level_0,id,ccf,age,sex,painloc,painexer,relrest,pncaden,cp,trestbps,...,rcaprox,rcadist,lvx1,lvx2,lvx3,lvx4,lvf,cathef,junk,name
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
cleveland,0,0,0,0,282,282,282,282,0,0,...,0,0,0,0,0,0,0,282,282,0
hungarian,1,1,1,1,1,1,1,295,1,2,...,245,270,1,1,1,1,1,267,295,1
long-beach-va,1,1,1,1,1,1,5,201,1,57,...,2,2,3,3,3,3,4,24,82,1
switzerland,0,0,0,0,0,0,0,123,0,2,...,0,0,17,17,17,17,13,17,123,0


In [13]:
(df.loc[ : , df.columns != 'dataset'].eq(0)).join(df['dataset']).groupby("dataset").sum()

Unnamed: 0_level_0,id,ccf,age,sex,painloc,painexer,relrest,pncaden,cp,trestbps,...,rcaprox,rcadist,lvx1,lvx2,lvx3,lvx4,lvf,cathef,junk,name
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
cleveland,0,282,0,91,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
hungarian,0,294,0,81,23,164,141,0,0,0,...,0,0,0,0,0,0,0,0,0,0
long-beach-va,0,200,0,6,15,65,33,0,0,1,...,0,0,0,0,0,0,1,0,0,0
switzerland,0,123,0,10,11,22,27,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [16]:
df.drop(["painloc", "painexer", "relrest", "pncaden", "trestbps", "htn", "chol", "smoke", "cigs", "years", "fbs",  "dm", "famhist", "ekgmo", "ekgday", "ekgyr", "dig", "prop", "nitr", "pro", "diuretic", "proto", "thaldur", "thaltime", "met", "slope", "rldv5", "rldv5e", "ca", "restckm", "exerckm", "restef", "restwm", "exeref", "exerwm", "thal", "thalsev", "thalpul", "earlobe", "lmt", "ladprox", "laddist", "diag", "cxmain", "ramus", "om1", "om2", "rcaprox", "rcadist", "lvx1", "lvx2", "lvx3", "lvx4", "lvf", "cathef", "junk","thal" ], inplace=True, axis=1)

In [17]:
df.isna().sum()

id           2
ccf          2
age          2
sex          2
cp           2
restecg      4
thalach     57
thalrest    58
tpeakbps    65
tpeakbpd    65
dummy       61
trestbpd    61
exang       57
xhypo       60
oldpeak     64
cmo         13
cday        11
cyr         11
num          2
name         2
dataset      0
dtype: int64

In [18]:
df.dropna(inplace=True, axis=0, how='any')

In [19]:
(df.loc[ : , df.columns != 'dataset'].isna()).join(df['dataset']).groupby("dataset").sum()

Unnamed: 0_level_0,id,ccf,age,sex,cp,restecg,thalach,thalrest,tpeakbps,tpeakbpd,dummy,trestbpd,exang,xhypo,oldpeak,cmo,cday,cyr,num,name
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
cleveland,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
hungarian,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
long-beach-va,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
switzerland,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [20]:
df.shape

(815, 21)

In [21]:
df.describe()

Unnamed: 0,id,ccf,age,sex,cp,restecg,thalach,thalrest,tpeakbps,tpeakbpd,dummy,trestbpd,exang,xhypo,oldpeak,cmo,cday,cyr,num
count,815.0,815.0,815.0,815.0,815.0,815.0,815.0,815.0,815.0,815.0,815.0,815.0,815.0,815.0,815.0,815.0,815.0,815.0,815.0
mean,972.478528,0.0,53.034356,0.774233,3.251534,0.592638,137.781595,75.493252,171.997546,87.336196,132.213497,83.633129,0.390184,0.02454,0.860859,6.169325,15.974233,83.933742,1.099387
std,1189.244863,0.0,9.386267,0.418343,0.928286,0.818067,25.835216,14.54915,25.55598,14.800968,18.973888,10.296305,0.488091,0.154813,1.084524,3.470591,8.918306,3.348773,1.257386
min,1.0,0.0,28.0,0.0,1.0,0.0,60.0,37.0,84.0,11.0,0.0,0.0,0.0,0.0,-2.6,1.0,1.0,1.0,0.0
25%,126.5,0.0,46.0,1.0,3.0,0.0,120.0,65.0,156.0,80.0,120.0,80.0,0.0,0.0,0.0,3.0,8.0,83.0,0.0
50%,288.0,0.0,54.0,1.0,4.0,0.0,140.0,74.0,170.0,88.0,130.0,80.0,0.0,0.0,0.5,6.0,16.0,84.0,1.0
75%,1211.5,0.0,60.0,1.0,4.0,1.0,157.5,84.0,190.0,100.0,140.0,90.0,1.0,0.0,1.5,9.0,24.0,85.0,2.0
max,5002.0,0.0,77.0,1.0,4.0,2.0,202.0,134.0,240.0,134.0,200.0,120.0,1.0,1.0,6.2,12.0,31.0,87.0,4.0


In [22]:
df.drop(['id', 'name'], inplace=True, axis=1)

In [23]:
df

Unnamed: 0,ccf,age,sex,cp,restecg,thalach,thalrest,tpeakbps,tpeakbpd,dummy,trestbpd,exang,xhypo,oldpeak,cmo,cday,cyr,num,dataset
0,0.0,40.0,1.0,2.0,0.0,172.0,86.0,200.0,110.0,140.0,86.0,0.0,0.0,0.0,12.0,20.0,84.0,0.0,hungarian
1,0.0,49.0,0.0,3.0,0.0,156.0,100.0,220.0,106.0,160.0,90.0,0.0,0.0,1.0,11.0,20.0,84.0,1.0,hungarian
2,0.0,37.0,1.0,2.0,1.0,98.0,58.0,180.0,100.0,130.0,80.0,0.0,0.0,0.0,11.0,26.0,84.0,0.0,hungarian
3,0.0,48.0,0.0,4.0,0.0,108.0,54.0,210.0,106.0,138.0,86.0,1.0,0.0,1.5,9.0,30.0,84.0,3.0,hungarian
4,0.0,54.0,1.0,3.0,0.0,122.0,74.0,130.0,100.0,150.0,90.0,0.0,1.0,0.0,7.0,30.0,84.0,0.0,hungarian
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
893,0.0,62.0,1.0,4.0,1.0,138.0,86.0,202.0,98.0,158.0,90.0,1.0,0.0,0.0,1.0,26.0,87.0,1.0,long-beach-va
894,0.0,46.0,1.0,4.0,0.0,126.0,88.0,174.0,114.0,134.0,90.0,0.0,0.0,0.0,7.0,28.0,83.0,2.0,long-beach-va
895,0.0,54.0,0.0,4.0,1.0,154.0,83.0,158.0,84.0,127.0,78.0,0.0,0.0,0.0,6.0,29.0,83.0,1.0,long-beach-va
897,0.0,55.0,1.0,4.0,1.0,100.0,74.0,210.0,100.0,122.0,70.0,0.0,0.0,0.0,4.0,17.0,86.0,2.0,long-beach-va


In [24]:
from sklearn.preprocessing import LabelEncoder

labelEncoder = LabelEncoder()
df[df['num'] >= 1] = 1
df['num'] = labelEncoder.fit_transform(df['num'])

In [25]:
# Compute the correlation matrix
corr = df.corr()

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, cmap="coolwarm", vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})

<AxesSubplot:>

In [26]:
X = df.loc[:,(df.columns!= 'num') & (df.columns != 'dataset')]

y = df['num']

In [27]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from catboost import CatBoostClassifier
from  sklearn.naive_bayes import *
estimators_and_hyperparameters=[
    (CatBoostClassifier(random_state=42, thread_count=-1, silent= True),{}),
    (XGBClassifier(random_state=42, n_jobs=-1),{}),
    (SVC(kernel='linear',random_state=42),{}),
    (SVC(kernel='poly',random_state=42),{}),
    (SVC(kernel='rbf',random_state=42),{}),
    (SVC(kernel='sigmoid',random_state=42),{}),
    #(SVC(kernel='precomputed',random_state=42),{}),
    # (BernoulliNB(),{}),
    #(CategoricalNB(),{}),
    # (ComplementNB(),{}),
    # (GaussianNB(),{}),
    # (MultinomialNB(),{}),
    (DecisionTreeClassifier(random_state=42),{}),
    (KNeighborsClassifier(n_jobs=-1),{}),
    (RandomForestClassifier(random_state=42, n_jobs=-1), {}),
    (SGDClassifier(),{})
]

ModuleNotFoundError: No module named 'xgboost'

In [None]:
from sklearn.preprocessing import *
scalers = [
    MaxAbsScaler(),
    MinMaxScaler(),
    Normalizer(),
    PowerTransformer(),
    QuantileTransformer(output_distribution='uniform'),
    QuantileTransformer(output_distribution='normal'),
    RobustScaler(),
    StandardScaler(),
]

In [None]:
# from sklearn.model_selection import StratifiedKFold
# from sklearn.model_selection import cross_val_score
# from statistics import mean
# for scaler in scalers:
#     print(f'Current Sclaer: {scaler.__class__.__name__}')
#     for estimator in estimators_and_hyperparameters:
#         skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
#         try:
#             X_trans = scaler.fit_transform(X)
#             scores = cross_val_score(estimator[0], X_trans, y, scoring='f1',cv=skf, n_jobs=-1)
#             print(f'F1 score for {estimator[0].__class__.__name__}: {mean(scores)}')
#         except Exception as e:
#             print(e)
#             print(f'Skipping the combination of {scaler.__class__.__name__} and {estimator.__class__.__name__}')
#     print('-----------------------------------------------------------------')

Current Sclaer: MaxAbsScaler
F1 score for CatBoostClassifier: 1.0
F1 score for XGBClassifier: 1.0
F1 score for SVC: 1.0
F1 score for SVC: 1.0
F1 score for SVC: 1.0
F1 score for SVC: 1.0
F1 score for DecisionTreeClassifier: 1.0
F1 score for KNeighborsClassifier: 1.0
F1 score for RandomForestClassifier: 1.0
F1 score for SGDClassifier: 1.0
-----------------------------------------------------------------
Current Sclaer: MinMaxScaler
F1 score for CatBoostClassifier: 1.0
F1 score for XGBClassifier: 1.0
F1 score for SVC: 1.0
F1 score for SVC: 1.0
F1 score for SVC: 1.0
F1 score for SVC: 1.0
F1 score for DecisionTreeClassifier: 1.0
F1 score for KNeighborsClassifier: 1.0
F1 score for RandomForestClassifier: 1.0
F1 score for SGDClassifier: 1.0
-----------------------------------------------------------------
Current Sclaer: Normalizer
F1 score for CatBoostClassifier: 1.0
F1 score for XGBClassifier: 1.0
F1 score for SVC: 1.0
F1 score for SVC: 1.0
F1 score for SVC: 1.0
F1 score for SVC: 1.0
F1 sco



F1 score for CatBoostClassifier: 1.0




F1 score for XGBClassifier: 1.0
F1 score for SVC: 1.0
F1 score for SVC: 1.0
F1 score for SVC: 1.0




F1 score for SVC: 1.0
F1 score for DecisionTreeClassifier: 1.0
F1 score for KNeighborsClassifier: 1.0




F1 score for RandomForestClassifier: 1.0
F1 score for SGDClassifier: 1.0
-----------------------------------------------------------------
Current Sclaer: QuantileTransformer




F1 score for CatBoostClassifier: 1.0




F1 score for XGBClassifier: 1.0
F1 score for SVC: 1.0
F1 score for SVC: 1.0




F1 score for SVC: 1.0
F1 score for SVC: 1.0
F1 score for DecisionTreeClassifier: 1.0




F1 score for KNeighborsClassifier: 1.0




F1 score for RandomForestClassifier: 1.0
F1 score for SGDClassifier: 0.998876404494382
-----------------------------------------------------------------
Current Sclaer: RobustScaler




F1 score for CatBoostClassifier: 1.0
F1 score for XGBClassifier: 1.0
F1 score for SVC: 1.0
F1 score for SVC: 1.0
F1 score for SVC: 1.0
F1 score for SVC: 1.0
F1 score for DecisionTreeClassifier: 1.0
F1 score for KNeighborsClassifier: 1.0
F1 score for RandomForestClassifier: 1.0
F1 score for SGDClassifier: 1.0
-----------------------------------------------------------------
Current Sclaer: StandardScaler
F1 score for CatBoostClassifier: 1.0
F1 score for XGBClassifier: 1.0
F1 score for SVC: 1.0
F1 score for SVC: 1.0
F1 score for SVC: 1.0
F1 score for SVC: 1.0
F1 score for DecisionTreeClassifier: 1.0
F1 score for KNeighborsClassifier: 1.0
F1 score for RandomForestClassifier: 1.0
F1 score for SGDClassifier: 1.0
-----------------------------------------------------------------


In [10]:
import pandas as pd 
datasets = ["hungarian", "cleveland", "switzerland", "va"]
df_processed = pd.DataFrame()
for dataset in datasets:
    dataset_df = pd.read_csv("./Data/processed."+ dataset +".data", header=None, sep=",")
    dataset_df['dataset'] = dataset
    df_processed = pd.concat([df_processed,dataset_df ], ignore_index=True)
df_processed.columns = ['age','sex','cp','trestbps','chol','fbs','restecg','thalach','exang','oldpeak','slope','ca','thal','num', 'dataset']
df_processed = df_processed.replace('?', float('nan'))


In [12]:
df_processed[['trestbps','chol','fbs','restecg','thalach','exang','oldpeak','slope','ca','thal']] = df_processed[['trestbps','chol','fbs','restecg','thalach','exang','oldpeak','slope','ca','thal']].apply(pd.to_numeric)

In [13]:
df_processed.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num,dataset
0,28.0,1.0,2.0,130.0,132.0,0.0,2.0,185.0,0.0,0.0,,,,0,hungarian
1,29.0,1.0,2.0,120.0,243.0,0.0,0.0,160.0,0.0,0.0,,,,0,hungarian
2,29.0,1.0,2.0,140.0,,0.0,0.0,170.0,0.0,0.0,,,,0,hungarian
3,30.0,0.0,1.0,170.0,237.0,0.0,1.0,170.0,0.0,0.0,,,6.0,0,hungarian
4,31.0,0.0,2.0,100.0,219.0,0.0,1.0,150.0,0.0,0.0,,,,0,hungarian


In [14]:
df_processed.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
count,920.0,920.0,920.0,861.0,890.0,830.0,918.0,865.0,865.0,858.0,611.0,309.0,434.0,920.0
mean,53.51087,0.78913,3.25,132.132404,199.130337,0.166265,0.604575,137.545665,0.389595,0.878788,1.770867,0.676375,5.087558,0.995652
std,9.424685,0.408148,0.930969,19.06607,110.78081,0.372543,0.805827,25.926276,0.487941,1.091226,0.619256,0.935653,1.919075,1.142693
min,28.0,0.0,1.0,0.0,0.0,0.0,0.0,60.0,0.0,-2.6,1.0,0.0,3.0,0.0
25%,47.0,1.0,3.0,120.0,175.0,0.0,0.0,120.0,0.0,0.0,1.0,0.0,3.0,0.0
50%,54.0,1.0,4.0,130.0,223.0,0.0,0.0,140.0,0.0,0.5,2.0,0.0,6.0,1.0
75%,60.0,1.0,4.0,140.0,268.0,0.0,1.0,157.0,1.0,1.5,2.0,1.0,7.0,2.0
max,77.0,1.0,4.0,200.0,603.0,1.0,2.0,202.0,1.0,6.2,3.0,3.0,7.0,4.0


In [16]:
from pandas_profiling import ProfileReport
profile = ProfileReport(df_processed, title='Pandas Profiling Report')
profile.to_file("df_processed.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]