In [278]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn import pipeline
from sklearn import compose
from sklearn import impute
from sklearn import preprocessing
import time
from sklearn import metrics

from sklearn.tree          import DecisionTreeClassifier
from sklearn.ensemble      import RandomForestClassifier
from sklearn.ensemble      import ExtraTreesClassifier
from sklearn.ensemble      import AdaBoostClassifier
from sklearn.ensemble      import GradientBoostingClassifier
from sklearn.experimental  import enable_hist_gradient_boosting 
from sklearn.ensemble      import HistGradientBoostingClassifier
from xgboost               import XGBClassifier
from lightgbm              import LGBMClassifier
from catboost              import CatBoostClassifier



In [279]:
df = pd.read_csv('heart.csv')
df.head()

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [280]:
df.columns

Index(['age', 'sex', 'cp', 'trtbps', 'chol', 'fbs', 'restecg', 'thalachh',
       'exng', 'oldpeak', 'slp', 'caa', 'thall', 'output'],
      dtype='object')

In [281]:
np.random.seed(0)
dfc = df.copy()
for i in dfc['output'].unique():
        ha = dfc[dfc['output'] == i]
        trtbs_mean = ha['trtbps'].mean()
        chol_mean = ha['chol'].std()
        thalachh_mean = ha['thalachh'].mean()
        
        for j in dfc[dfc['output'] == i].index:
            if np.random.randint(2) == 1:
                dfc['trtbps'].values[j] +=trtbs_mean/10
            else:
                dfc['trtbps'].values[j] -= trtbs_mean/10

            if np.random.randint(2) == 1:
                dfc['chol'].values[j] += chol_mean/10
            else:
                dfc['chol'].values[j] -= chol_mean/10

            if np.random.randint(2) == 1:
                dfc['thalachh'].values[j] += thalachh_mean/10
            else:
                dfc['thalachh'].values[j] += thalachh_mean/10


In [282]:
X,y = df.drop(['output'], axis=1), df['output']

In [283]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [284]:
extra_sample = dfc.sample(dfc.shape[0] // 5)
X_train = pd.concat([X_train, extra_sample.drop(['output'], axis=1 ) ])
y_train = pd.concat([y_train, extra_sample['output'] ])


In [285]:
num_vars = ['age','trtbps','chol','thalachh','oldpeak']
cat_vars = ['sex', 'cp', 'fbs', 'restecg','exng','slp', 'caa', 'thall']


num_4_treeModels = pipeline.Pipeline(steps=[
    ('imputer', impute.SimpleImputer(strategy='constant', fill_value=-9999)),])
cat_4_treeModels = pipeline.Pipeline(steps=[('ordinal', preprocessing.OrdinalEncoder())])

tree_prepro = compose.ColumnTransformer(transformers=[
    ('num',  num_4_treeModels,num_vars),
    ('cat',cat_4_treeModels, cat_vars),
], remainder='drop') 

In [286]:
tree_classifiers = {
    "Decision Tree": DecisionTreeClassifier(),
    "Extra Trees": ExtraTreesClassifier(),
    "Random Forest": RandomForestClassifier(random_state=0, max_depth=4, n_estimators=200),
    "AdaBoost": AdaBoostClassifier(),
    "Skl GBM": GradientBoostingClassifier(),
    "Skl HistGBM":HistGradientBoostingClassifier(),
    "XGBoost": XGBClassifier(),
    "LightGBM": LGBMClassifier(),
    "CatBoost": CatBoostClassifier()}

tree_classifiers = {name: pipeline.make_pipeline(tree_prepro, model) for name, model in tree_classifiers.items()}

In [287]:
results = pd.DataFrame({'Model': [], 'Accuracy': [], 'Bal Acc.': [], 'Time': []})
for model_name, model in tree_classifiers.items():
    start_time = time.time()        
    model.fit(X_train,y_train)
    pred =model.predict(X_test)

    total_time = time.time() - start_time

    results = results.append({"Model":    model_name,
                            "Accuracy": round(metrics.accuracy_score(y_test, pred)*100, 1),
                            "Bal Acc.": round(metrics.balanced_accuracy_score(y_test, pred)*100, 1),
                            "Time":     total_time},
                            ignore_index=True)
                                
results_ord = results.sort_values(by=['Accuracy'], ascending=False, ignore_index=True)
results_ord.index += 1 
results_ord.style.bar(subset=['Accuracy', 'Bal Acc.'], vmin=0, vmax=100, color='#5fba7d')
results_ord


  results = results.append({"Model":    model_name,
  results = results.append({"Model":    model_name,
  results = results.append({"Model":    model_name,
  results = results.append({"Model":    model_name,
  results = results.append({"Model":    model_name,
  results = results.append({"Model":    model_name,
  results = results.append({"Model":    model_name,




  results = results.append({"Model":    model_name,


Learning rate set to 0.006179
0:	learn: 0.6884129	total: 1.59ms	remaining: 1.59s
1:	learn: 0.6830981	total: 3.05ms	remaining: 1.52s
2:	learn: 0.6786731	total: 4.47ms	remaining: 1.49s
3:	learn: 0.6740424	total: 5.87ms	remaining: 1.46s
4:	learn: 0.6699627	total: 7.3ms	remaining: 1.45s
5:	learn: 0.6653987	total: 8.62ms	remaining: 1.43s
6:	learn: 0.6613366	total: 9.99ms	remaining: 1.42s
7:	learn: 0.6564942	total: 11.8ms	remaining: 1.46s
8:	learn: 0.6527229	total: 13.7ms	remaining: 1.5s
9:	learn: 0.6496602	total: 15.1ms	remaining: 1.49s
10:	learn: 0.6447346	total: 16.5ms	remaining: 1.48s
11:	learn: 0.6396274	total: 18ms	remaining: 1.48s
12:	learn: 0.6360557	total: 19.5ms	remaining: 1.48s
13:	learn: 0.6311571	total: 21ms	remaining: 1.48s
14:	learn: 0.6267598	total: 22.3ms	remaining: 1.46s
15:	learn: 0.6220371	total: 23.8ms	remaining: 1.46s
16:	learn: 0.6179450	total: 25.3ms	remaining: 1.46s
17:	learn: 0.6138973	total: 27.2ms	remaining: 1.48s
18:	learn: 0.6102029	total: 28.8ms	remaining: 1.48

  results = results.append({"Model":    model_name,


Unnamed: 0,Model,Accuracy,Bal Acc.,Time
1,Extra Trees,91.8,91.9,0.236206
2,Random Forest,91.8,91.9,0.328121
3,AdaBoost,90.2,90.4,0.12862
4,Skl GBM,88.5,88.9,0.112056
5,Skl HistGBM,86.9,87.1,0.369619
6,CatBoost,86.9,87.1,1.643692
7,XGBoost,83.6,84.2,0.119342
8,LightGBM,82.0,82.7,0.06792
9,Decision Tree,73.8,73.0,0.028924


In [288]:
grid={'n_estimators': [10, 100, 300, 500],
 'criterion':['gini', 'entropy'],
 'max_depth': [2,8,16,32,50],
  }
mod=ExtraTreesClassifier()
mod_cv=GridSearchCV(mod,grid,cv=10)
mod_cv.fit(X_train,y_train)

print("tuned hpyerparameters :(best parameters) ",mod_cv.best_params_)

tuned hpyerparameters :(best parameters)  {'criterion': 'gini', 'max_depth': 16, 'n_estimators': 100}


In [300]:
mod2 = ExtraTreesClassifier(n_estimators=300, criterion='entropy', max_depth=16)
mod2.fit(X_train,y_train)
predic = mod2.predict(X_test)
print("score:", metrics.accuracy_score(y_test, predic))

score: 0.9180327868852459


In [301]:
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_test, pred ))

              precision    recall  f1-score   support

           0       0.83      0.89      0.86        27
           1       0.91      0.85      0.88        34

    accuracy                           0.87        61
   macro avg       0.87      0.87      0.87        61
weighted avg       0.87      0.87      0.87        61



In [302]:
print(confusion_matrix(y_test, pred))

[[24  3]
 [ 5 29]]
