In [26]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn import pipeline
from sklearn import compose
from sklearn import impute
from sklearn import preprocessing
import time
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

from sklearn.tree          import DecisionTreeClassifier
from sklearn.ensemble      import RandomForestClassifier
from sklearn.ensemble      import ExtraTreesClassifier
from sklearn.ensemble      import AdaBoostClassifier
from sklearn.ensemble      import GradientBoostingClassifier
from sklearn.ensemble      import HistGradientBoostingClassifier
from xgboost               import XGBClassifier
from lightgbm              import LGBMClassifier
from catboost              import CatBoostClassifier



In [27]:
df = pd.read_csv('heart.csv')
df.head()

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [28]:
df.columns

Index(['age', 'sex', 'cp', 'trtbps', 'chol', 'fbs', 'restecg', 'thalachh',
       'exng', 'oldpeak', 'slp', 'caa', 'thall', 'output'],
      dtype='object')

In [29]:
X,y = df.drop(['output'], axis=1), df['output']

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [31]:
num_vars = ['age','trtbps','chol','thalachh','oldpeak']
cat_vars = ['sex', 'cp', 'fbs', 'restecg','exng','slp', 'caa', 'thall']


num_4_treeModels = pipeline.Pipeline(steps=[
    ('imputer', impute.SimpleImputer(strategy='constant', fill_value=-9999)),])
cat_4_treeModels = pipeline.Pipeline(steps=[('ordinal', preprocessing.OrdinalEncoder())])

tree_prepro = compose.ColumnTransformer(transformers=[
    ('num',  num_4_treeModels,num_vars),
    ('cat',cat_4_treeModels, cat_vars),
], remainder='drop') 

In [32]:
tree_classifiers = {
    "Decision Tree": DecisionTreeClassifier(),
    "Extra Trees": ExtraTreesClassifier(),
    "Random Forest": RandomForestClassifier(random_state=0, max_depth=4, n_estimators=200),
    "AdaBoost": AdaBoostClassifier(),
    "Skl GBM": GradientBoostingClassifier(),
    "Skl HistGBM":HistGradientBoostingClassifier(),
    "XGBoost": XGBClassifier(),
    "LightGBM": LGBMClassifier(),
    "CatBoost": CatBoostClassifier()}

tree_classifiers = {name: pipeline.make_pipeline(tree_prepro, model) for name, model in tree_classifiers.items()}

In [33]:
results = pd.DataFrame({'Model': [], 'Accuracy': [], 'Bal Acc.': [], 'Time': []})
for model_name, model in tree_classifiers.items():
    start_time = time.time()        
    model.fit(X_train,y_train)
    pred =model.predict(X_test)

    total_time = time.time() - start_time

    results = results.append({"Model":    model_name,
                            "Accuracy": round(metrics.accuracy_score(y_test, pred)*100),
                            "Bal Acc.": round(metrics.balanced_accuracy_score(y_test, pred)*100),
                            "Time":     total_time},
                            ignore_index=True)
                                
results_ord = results.sort_values(by=['Accuracy'], ascending=False, ignore_index=True)
results_ord.index += 1 
results_ord.style.bar(subset=['Accuracy', 'Bal Acc.'], vmin=0, vmax=100, color='#5fba7d')
results_ord


  results = results.append({"Model":    model_name,
  results = results.append({"Model":    model_name,
  results = results.append({"Model":    model_name,
  results = results.append({"Model":    model_name,
  results = results.append({"Model":    model_name,
  results = results.append({"Model":    model_name,
  results = results.append({"Model":    model_name,




  results = results.append({"Model":    model_name,


Learning rate set to 0.005621
0:	learn: 0.6887887	total: 1.97ms	remaining: 1.97s
1:	learn: 0.6839242	total: 3.31ms	remaining: 1.65s
2:	learn: 0.6798594	total: 4.63ms	remaining: 1.54s
3:	learn: 0.6751661	total: 6.03ms	remaining: 1.5s
4:	learn: 0.6721501	total: 7.26ms	remaining: 1.45s
5:	learn: 0.6687665	total: 8.59ms	remaining: 1.42s
6:	learn: 0.6646959	total: 9.96ms	remaining: 1.41s
7:	learn: 0.6602593	total: 11.3ms	remaining: 1.41s
8:	learn: 0.6557655	total: 12.5ms	remaining: 1.38s
9:	learn: 0.6530049	total: 13.7ms	remaining: 1.35s
10:	learn: 0.6483069	total: 15.5ms	remaining: 1.39s
11:	learn: 0.6439382	total: 16.9ms	remaining: 1.39s
12:	learn: 0.6399754	total: 18.4ms	remaining: 1.4s
13:	learn: 0.6358013	total: 20.1ms	remaining: 1.42s
14:	learn: 0.6328240	total: 21.4ms	remaining: 1.4s
15:	learn: 0.6296357	total: 22.8ms	remaining: 1.4s
16:	learn: 0.6263390	total: 24.1ms	remaining: 1.39s
17:	learn: 0.6227491	total: 25.3ms	remaining: 1.38s
18:	learn: 0.6205867	total: 26.1ms	remaining: 1.

  results = results.append({"Model":    model_name,


Unnamed: 0,Model,Accuracy,Bal Acc.,Time
1,AdaBoost,90.0,90.0,0.099738
2,Extra Trees,87.0,86.0,0.129687
3,CatBoost,85.0,85.0,1.545986
4,Random Forest,84.0,83.0,0.307873
5,Skl HistGBM,84.0,83.0,0.313162
6,LightGBM,84.0,83.0,0.058843
7,Skl GBM,80.0,80.0,0.086776
8,XGBoost,79.0,79.0,0.122673
9,Decision Tree,75.0,76.0,0.018989


In [34]:
grid={'n_estimators': [10, 100, 300, 500, 1000],
 'criterion':['gini', 'entropy'],
  }
mod=ExtraTreesClassifier()
mod_cv=GridSearchCV(mod,grid,cv=10)
mod_cv.fit(X_train,y_train)

print("tuned hpyerparameters :(best parameters) ",mod_cv.best_params_)

tuned hpyerparameters :(best parameters)  {'criterion': 'entropy', 'n_estimators': 1000}


In [35]:
mod2 = ExtraTreesClassifier(n_estimators=1000, criterion='entropy')
mod2.fit(X_train,y_train)
print("score",mod2.score(X_test,y_test))

score 0.8852459016393442
