In [18]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn import pipeline
from sklearn import compose
from sklearn import impute
from sklearn import preprocessing
import time
from sklearn import metrics

from sklearn.tree          import DecisionTreeClassifier
from sklearn.ensemble      import RandomForestClassifier
from sklearn.ensemble      import ExtraTreesClassifier
from sklearn.ensemble      import AdaBoostClassifier
from sklearn.ensemble      import GradientBoostingClassifier
from sklearn.experimental  import enable_hist_gradient_boosting 
from sklearn.ensemble      import HistGradientBoostingClassifier
from xgboost               import XGBClassifier
from lightgbm              import LGBMClassifier
from catboost              import CatBoostClassifier



In [19]:
df = pd.read_csv('heart.csv')
df.head()

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [20]:
df.columns

Index(['age', 'sex', 'cp', 'trtbps', 'chol', 'fbs', 'restecg', 'thalachh',
       'exng', 'oldpeak', 'slp', 'caa', 'thall', 'output'],
      dtype='object')

In [21]:
np.random.seed(0)
dfc = df.copy()
for i in dfc['sex'].unique():
        ha = dfc[dfc['sex'] == i]
        trtbs_mean = ha['trtbps'].mean()
        chol_mean = ha['chol'].mean()
        thalachh_mean = ha['thalachh'].mean()
        
        for j in dfc[dfc['output'] == i].index:
            if np.random.randint(2) == 1:
                dfc['trtbps'].values[j] +=trtbs_mean/10
            else:
                dfc['trtbps'].values[j] -= trtbs_mean/10

            if np.random.randint(2) == 1:
                dfc['chol'].values[j] += chol_mean/10
            else:
                dfc['chol'].values[j] -= chol_mean/10

            if np.random.randint(2) == 1:
                dfc['thalachh'].values[j] += thalachh_mean/10
            else:
                dfc['thalachh'].values[j] += thalachh_mean/10


In [22]:
X,y = df.drop(['output'], axis=1), df['output']

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [24]:
extra_sample = dfc.sample(dfc.shape[0] // 5)
X_train = pd.concat([X_train, extra_sample.drop(['output'], axis=1 ) ])
y_train = pd.concat([y_train, extra_sample['output'] ])


In [25]:
num_vars = ['age','trtbps','chol','thalachh','oldpeak']
cat_vars = ['sex', 'cp', 'fbs', 'restecg','exng','slp', 'caa', 'thall']


num_4_treeModels = pipeline.Pipeline(steps=[
    ('imputer', impute.SimpleImputer(strategy='constant', fill_value=-9999)),])
cat_4_treeModels = pipeline.Pipeline(steps=[('ordinal', preprocessing.OrdinalEncoder())])

tree_prepro = compose.ColumnTransformer(transformers=[
    ('num',  num_4_treeModels,num_vars),
    ('cat',cat_4_treeModels, cat_vars),
], remainder='drop') 

In [26]:
tree_classifiers = {
    "Decision Tree": DecisionTreeClassifier(),
    "Extra Trees": ExtraTreesClassifier(),
    "Random Forest": RandomForestClassifier(random_state=0, max_depth=4, n_estimators=200),
    "AdaBoost": AdaBoostClassifier(),
    "Skl GBM": GradientBoostingClassifier(),
    "Skl HistGBM":HistGradientBoostingClassifier(),
    "XGBoost": XGBClassifier(),
    "LightGBM": LGBMClassifier(),
    "CatBoost": CatBoostClassifier()}

tree_classifiers = {name: pipeline.make_pipeline(tree_prepro, model) for name, model in tree_classifiers.items()}

In [27]:
results = pd.DataFrame({'Model': [], 'Accuracy': [], 'Bal Acc.': [], 'Time': []})
for model_name, model in tree_classifiers.items():
    start_time = time.time()        
    model.fit(X_train,y_train)
    pred =model.predict(X_test)

    total_time = time.time() - start_time

    results = results.append({"Model":    model_name,
                            "Accuracy": round(metrics.accuracy_score(y_test, pred)*100, 1),
                            "Bal Acc.": round(metrics.balanced_accuracy_score(y_test, pred)*100, 1),
                            "Time":     total_time},
                            ignore_index=True)
                                
results_ord = results.sort_values(by=['Accuracy'], ascending=False, ignore_index=True)
results_ord.index += 1 
results_ord.style.bar(subset=['Accuracy', 'Bal Acc.'], vmin=0, vmax=100, color='#5fba7d')
results_ord


  results = results.append({"Model":    model_name,
  results = results.append({"Model":    model_name,
  results = results.append({"Model":    model_name,
  results = results.append({"Model":    model_name,
  results = results.append({"Model":    model_name,
  results = results.append({"Model":    model_name,
  results = results.append({"Model":    model_name,




  results = results.append({"Model":    model_name,


Learning rate set to 0.006179
0:	learn: 0.6884129	total: 1.63ms	remaining: 1.63s
1:	learn: 0.6830981	total: 3.06ms	remaining: 1.53s
2:	learn: 0.6786584	total: 6.05ms	remaining: 2.01s
3:	learn: 0.6740577	total: 7.64ms	remaining: 1.9s
4:	learn: 0.6699769	total: 9.62ms	remaining: 1.91s
5:	learn: 0.6654487	total: 13.3ms	remaining: 2.2s
6:	learn: 0.6613858	total: 15.7ms	remaining: 2.23s
7:	learn: 0.6565420	total: 18.2ms	remaining: 2.26s
8:	learn: 0.6528390	total: 19.6ms	remaining: 2.16s
9:	learn: 0.6497761	total: 22.8ms	remaining: 2.25s
10:	learn: 0.6454295	total: 24.1ms	remaining: 2.16s
11:	learn: 0.6403078	total: 25.5ms	remaining: 2.1s
12:	learn: 0.6367321	total: 26.9ms	remaining: 2.04s
13:	learn: 0.6318227	total: 28.3ms	remaining: 1.99s
14:	learn: 0.6273436	total: 29.7ms	remaining: 1.95s
15:	learn: 0.6225680	total: 31.2ms	remaining: 1.92s
16:	learn: 0.6184664	total: 32.5ms	remaining: 1.88s
17:	learn: 0.6148949	total: 33.8ms	remaining: 1.84s
18:	learn: 0.6111905	total: 35.3ms	remaining: 1

  results = results.append({"Model":    model_name,


Unnamed: 0,Model,Accuracy,Bal Acc.,Time
1,Random Forest,91.8,91.9,0.437462
2,Extra Trees,90.2,90.4,0.210701
3,AdaBoost,88.5,88.6,0.125696
4,Skl HistGBM,88.5,88.6,0.42933
5,LightGBM,88.5,88.6,0.079928
6,CatBoost,86.9,87.1,2.010375
7,Skl GBM,85.2,86.0,0.115657
8,XGBoost,85.2,85.6,0.130163
9,Decision Tree,77.0,76.7,0.027924


In [29]:
grid = { 
    'n_estimators': [200, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']
}
mod=RandomForestClassifier()
mod_cv=GridSearchCV(mod,grid,cv=10)
mod_cv.fit(X_train,y_train)

print("tuned hpyerparameters :(best parameters) ",mod_cv.best_params_)

tuned hpyerparameters :(best parameters)  {'criterion': 'entropy', 'max_depth': 7, 'max_features': 'log2', 'n_estimators': 200}


In [30]:
mod2 = RandomForestClassifier(n_estimators=200, criterion='entropy', max_depth=7, max_features='log2')
mod2.fit(X_train,y_train)
predic = mod2.predict(X_test)
print("score:", metrics.accuracy_score(y_test, predic))

score: 0.8852459016393442


In [31]:
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_test, pred ))

              precision    recall  f1-score   support

           0       0.83      0.89      0.86        27
           1       0.91      0.85      0.88        34

    accuracy                           0.87        61
   macro avg       0.87      0.87      0.87        61
weighted avg       0.87      0.87      0.87        61



In [None]:
print(confusion_matrix(y_test, pred))

[[24  3]
 [ 5 29]]
