In [208]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn import pipeline
from sklearn import compose
from sklearn import impute
from sklearn import preprocessing
import time
from sklearn import metrics

from sklearn.tree          import DecisionTreeClassifier
from sklearn.ensemble      import RandomForestClassifier
from sklearn.ensemble      import ExtraTreesClassifier
from sklearn.ensemble      import AdaBoostClassifier
from sklearn.ensemble      import GradientBoostingClassifier
from sklearn.experimental  import enable_hist_gradient_boosting 
from sklearn.ensemble      import HistGradientBoostingClassifier
from xgboost               import XGBClassifier
from lightgbm              import LGBMClassifier
from catboost              import CatBoostClassifier



In [209]:
df = pd.read_csv('heart.csv')
df.head()

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [210]:
df.columns

Index(['age', 'sex', 'cp', 'trtbps', 'chol', 'fbs', 'restecg', 'thalachh',
       'exng', 'oldpeak', 'slp', 'caa', 'thall', 'output'],
      dtype='object')

In [211]:
np.random.seed(0)
dfc = df.copy()
for i in dfc['output'].unique():
        ha = dfc[dfc['output'] == i]
        trtbs_mean = ha['trtbps'].mean()
        chol_mean = ha['chol'].std()
        thalachh_mean = ha['thalachh'].mean()
        
        for j in dfc[dfc['output'] == i].index:
            if np.random.randint(2) == 1:
                dfc['trtbps'].values[j] +=trtbs_mean/10
            else:
                dfc['trtbps'].values[j] -= trtbs_mean/10

            if np.random.randint(2) == 1:
                dfc['chol'].values[j] += chol_mean/10
            else:
                dfc['chol'].values[j] -= chol_mean/10

            if np.random.randint(2) == 1:
                dfc['thalachh'].values[j] += thalachh_mean/10
            else:
                dfc['thalachh'].values[j] += thalachh_mean/10


In [212]:
X,y = df.drop(['output'], axis=1), df['output']

In [213]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [214]:
extra_sample = dfc.sample(dfc.shape[0] // 5)
X_train = pd.concat([X_train, extra_sample.drop(['output'], axis=1 ) ])
y_train = pd.concat([y_train, extra_sample['output'] ])


In [215]:
num_vars = ['age','trtbps','chol','thalachh','oldpeak']
cat_vars = ['sex', 'cp', 'fbs', 'restecg','exng','slp', 'caa', 'thall']


num_4_treeModels = pipeline.Pipeline(steps=[
    ('imputer', impute.SimpleImputer(strategy='constant', fill_value=-9999)),])
cat_4_treeModels = pipeline.Pipeline(steps=[('ordinal', preprocessing.OrdinalEncoder())])

tree_prepro = compose.ColumnTransformer(transformers=[
    ('num',  num_4_treeModels,num_vars),
    ('cat',cat_4_treeModels, cat_vars),
], remainder='drop') 

In [216]:
tree_classifiers = {
    "Decision Tree": DecisionTreeClassifier(),
    "Extra Trees": ExtraTreesClassifier(),
    "Random Forest": RandomForestClassifier(random_state=0, max_depth=4, n_estimators=200),
    "AdaBoost": AdaBoostClassifier(),
    "Skl GBM": GradientBoostingClassifier(),
    "Skl HistGBM":HistGradientBoostingClassifier(),
    "XGBoost": XGBClassifier(),
    "LightGBM": LGBMClassifier(),
    "CatBoost": CatBoostClassifier()}

tree_classifiers = {name: pipeline.make_pipeline(tree_prepro, model) for name, model in tree_classifiers.items()}

In [217]:
results = pd.DataFrame({'Model': [], 'Accuracy': [], 'Bal Acc.': [], 'Time': []})
for model_name, model in tree_classifiers.items():
    start_time = time.time()        
    model.fit(X_train,y_train)
    pred =model.predict(X_test)

    total_time = time.time() - start_time

    results = results.append({"Model":    model_name,
                            "Accuracy": round(metrics.accuracy_score(y_test, pred)*100),
                            "Bal Acc.": round(metrics.balanced_accuracy_score(y_test, pred)*100),
                            "Time":     total_time},
                            ignore_index=True)
                                
results_ord = results.sort_values(by=['Accuracy'], ascending=False, ignore_index=True)
results_ord.index += 1 
results_ord.style.bar(subset=['Accuracy', 'Bal Acc.'], vmin=0, vmax=100, color='#5fba7d')
results_ord


  results = results.append({"Model":    model_name,
  results = results.append({"Model":    model_name,
  results = results.append({"Model":    model_name,
  results = results.append({"Model":    model_name,
  results = results.append({"Model":    model_name,
  results = results.append({"Model":    model_name,
  results = results.append({"Model":    model_name,




  results = results.append({"Model":    model_name,


Learning rate set to 0.006179
0:	learn: 0.6884129	total: 1.46ms	remaining: 1.45s
1:	learn: 0.6830981	total: 2.96ms	remaining: 1.48s
2:	learn: 0.6786731	total: 4.34ms	remaining: 1.44s
3:	learn: 0.6740424	total: 5.83ms	remaining: 1.45s
4:	learn: 0.6699627	total: 7.37ms	remaining: 1.47s
5:	learn: 0.6653987	total: 9.32ms	remaining: 1.54s
6:	learn: 0.6613366	total: 10.8ms	remaining: 1.53s
7:	learn: 0.6564942	total: 12.3ms	remaining: 1.53s
8:	learn: 0.6527229	total: 13.6ms	remaining: 1.5s
9:	learn: 0.6496602	total: 14.7ms	remaining: 1.45s
10:	learn: 0.6447346	total: 16.1ms	remaining: 1.45s
11:	learn: 0.6396274	total: 17.4ms	remaining: 1.44s
12:	learn: 0.6360557	total: 18.8ms	remaining: 1.43s
13:	learn: 0.6311571	total: 20.2ms	remaining: 1.42s
14:	learn: 0.6267598	total: 21.5ms	remaining: 1.41s
15:	learn: 0.6220371	total: 23.3ms	remaining: 1.43s
16:	learn: 0.6179450	total: 24.8ms	remaining: 1.43s
17:	learn: 0.6138973	total: 26.3ms	remaining: 1.43s
18:	learn: 0.6102029	total: 27.7ms	remaining:

  results = results.append({"Model":    model_name,


Unnamed: 0,Model,Accuracy,Bal Acc.,Time
1,Extra Trees,92.0,92.0,0.125196
2,Random Forest,92.0,92.0,0.279814
3,AdaBoost,90.0,90.0,0.086403
4,Skl GBM,89.0,89.0,0.091722
5,Skl HistGBM,87.0,87.0,0.360203
6,CatBoost,87.0,87.0,1.619319
7,XGBoost,84.0,84.0,0.12381
8,LightGBM,82.0,83.0,0.067818
9,Decision Tree,74.0,73.0,0.018915
