In [52]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn import pipeline
from sklearn import compose
from sklearn import impute
from sklearn import preprocessing
import time
from sklearn import metrics

from sklearn.tree          import DecisionTreeClassifier
from sklearn.ensemble      import RandomForestClassifier
from sklearn.ensemble      import ExtraTreesClassifier
from sklearn.ensemble      import AdaBoostClassifier
from sklearn.ensemble      import GradientBoostingClassifier
from sklearn.ensemble      import HistGradientBoostingClassifier
from xgboost               import XGBClassifier
from lightgbm              import LGBMClassifier
from catboost              import CatBoostClassifier

In [53]:
df = pd.read_csv("heart_attack\heart.csv")
saturate =  pd.read_csv("heart_attack\o2Saturation.csv")
df.head()

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [54]:
df['max_attack'] = 220
df['precis_of_beats'] = (df['age']+df['thalachh'])/df['max_attack']
df['chol_in_gram'] = df['chol']/1000
df = df.drop(['max_attack', 'chol'], axis=1)
df.head()

Unnamed: 0,age,sex,cp,trtbps,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output,precis_of_beats,chol_in_gram
0,63,1,3,145,1,0,150,0,2.3,0,0,1,1,0.968182,0.233
1,37,1,2,130,0,1,187,0,3.5,0,0,2,1,1.018182,0.25
2,41,0,1,130,0,0,172,0,1.4,2,0,2,1,0.968182,0.204
3,56,1,1,120,0,1,178,0,0.8,2,0,2,1,1.063636,0.236
4,57,0,0,120,0,1,163,1,0.6,2,0,2,1,1.0,0.354


In [55]:
np.random.seed(0)
dfc = df.copy()
for i in dfc['output'].unique():
        ha = dfc[dfc['output'] == i]
        trtbs_mean = ha['trtbps'].mean()
        chol_mean = ha['chol_in_gram'].mean()
        thalachh_mean = ha['thalachh'].mean()
        
        for j in dfc[dfc['output'] == i].index:
            if np.random.randint(2) == 1:
                dfc['trtbps'].values[j] +=trtbs_mean/10
            else:
                dfc['trtbps'].values[j] -= trtbs_mean/10

            if np.random.randint(2) == 1:
                dfc['chol_in_gram'].values[j] += chol_mean/10
            else:
                dfc['chol_in_gram'].values[j] -= chol_mean/10

            if np.random.randint(2) == 1:
                dfc['thalachh'].values[j] += thalachh_mean/10
            else:
                dfc['thalachh'].values[j] += thalachh_mean/10

In [56]:
X,y = df.drop(['output'], axis=1), df['output']

In [57]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [58]:
extra_sample = dfc.sample(dfc.shape[0] // 5)
X_train = pd.concat([X_train, extra_sample.drop(['output'], axis=1 ) ])
y_train = pd.concat([y_train, extra_sample['output'] ])


In [59]:
num_vars = ['age','trtbps','precis_of_beats','thalachh','chol_in_gram','oldpeak']
cat_vars = ['sex', 'cp', 'fbs', 'restecg','exng','slp', 'caa', 'thall']


num_4_treeModels = pipeline.Pipeline(steps=[
    ('imputer', impute.SimpleImputer(strategy='constant', fill_value=-9999)),])
cat_4_treeModels = pipeline.Pipeline(steps=[('ordinal', preprocessing.OrdinalEncoder())])

tree_prepro = compose.ColumnTransformer(transformers=[
    ('num',  num_4_treeModels,num_vars),
    ('cat',cat_4_treeModels, cat_vars),
], remainder='drop') 

In [60]:
tree_classifiers = {
    "Decision Tree": DecisionTreeClassifier(),
    "Extra Trees": ExtraTreesClassifier(),
    "Random Forest": RandomForestClassifier(),
    "AdaBoost": AdaBoostClassifier(),
    "Skl GBM": GradientBoostingClassifier(),
    "Skl HistGBM":HistGradientBoostingClassifier(),
    "XGBoost": XGBClassifier(),
    "LightGBM": LGBMClassifier(),
    "CatBoost": CatBoostClassifier()}

tree_classifiers = {name: pipeline.make_pipeline(tree_prepro, model) for name, model in tree_classifiers.items()}

In [61]:
results = pd.DataFrame({'Model': [], 'Accuracy': [], 'Bal Acc.': [], 'Time': []})
for model_name, model in tree_classifiers.items():
    start_time = time.time()        
    model.fit(X_train,y_train)
    pred =model.predict(X_test)

    total_time = time.time() - start_time

    results = results.append({"Model":    model_name,
                            "Accuracy": round(metrics.accuracy_score(y_test, pred)*100, 1),
                            "Bal Acc.": round(metrics.balanced_accuracy_score(y_test, pred)*100, 1),
                            "Time":     total_time},
                            ignore_index=True)
                                
results_ord = results.sort_values(by=['Accuracy'], ascending=False, ignore_index=True)
results_ord.index += 1 
results_ord.style.bar(subset=['Accuracy', 'Bal Acc.'], vmin=0, vmax=100, color='#5fba7d')
results_ord


  results = results.append({"Model":    model_name,
  results = results.append({"Model":    model_name,
  results = results.append({"Model":    model_name,
  results = results.append({"Model":    model_name,
  results = results.append({"Model":    model_name,
  results = results.append({"Model":    model_name,
  results = results.append({"Model":    model_name,




  results = results.append({"Model":    model_name,


Learning rate set to 0.006179
0:	learn: 0.6874619	total: 1.73ms	remaining: 1.73s
1:	learn: 0.6820646	total: 3.31ms	remaining: 1.65s
2:	learn: 0.6763409	total: 5.18ms	remaining: 1.72s
3:	learn: 0.6716067	total: 6.93ms	remaining: 1.73s
4:	learn: 0.6678194	total: 8.5ms	remaining: 1.69s
5:	learn: 0.6632111	total: 10.3ms	remaining: 1.7s
6:	learn: 0.6587882	total: 12.1ms	remaining: 1.72s
7:	learn: 0.6546350	total: 13.9ms	remaining: 1.73s
8:	learn: 0.6507990	total: 15.4ms	remaining: 1.69s
9:	learn: 0.6460214	total: 16.7ms	remaining: 1.66s
10:	learn: 0.6433895	total: 18.7ms	remaining: 1.68s
11:	learn: 0.6393606	total: 20.1ms	remaining: 1.66s
12:	learn: 0.6358227	total: 21.7ms	remaining: 1.65s
13:	learn: 0.6319157	total: 23.6ms	remaining: 1.66s
14:	learn: 0.6271429	total: 25.1ms	remaining: 1.65s
15:	learn: 0.6241274	total: 27.2ms	remaining: 1.67s
16:	learn: 0.6198893	total: 28.8ms	remaining: 1.67s
17:	learn: 0.6159025	total: 30.4ms	remaining: 1.66s
18:	learn: 0.6119042	total: 32.3ms	remaining: 

  results = results.append({"Model":    model_name,


Unnamed: 0,Model,Accuracy,Bal Acc.,Time
1,Random Forest,90.2,90.4,0.383944
2,Extra Trees,86.9,86.7,0.213458
3,AdaBoost,86.9,87.1,0.106733
4,Skl HistGBM,86.9,87.1,0.409996
5,LightGBM,86.9,87.1,0.078789
6,CatBoost,86.9,87.1,1.963752
7,Skl GBM,83.6,83.8,0.112664
8,XGBoost,83.6,84.2,0.12666
9,Decision Tree,75.4,74.9,0.024931


In [66]:
grid = { 
    'n_estimators': [200, 300,500, 1000],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']
}
mod=RandomForestClassifier()
mod_cv=GridSearchCV(mod,grid,cv=10)
mod_cv.fit(X_train,y_train)

print("tuned hpyerparameters :(best parameters) ",mod_cv.best_params_)

In [None]:
mod2 = RandomForestClassifier(n_estimators=200, criterion='entropy', max_depth=7, max_features='log2')
mod2.fit(X_train,y_train)
predic = mod2.predict(X_test)
print("score:", metrics.accuracy_score(y_test, predic))

score: 0.8688524590163934


In [64]:
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_test, pred ))

              precision    recall  f1-score   support

           0       0.83      0.89      0.86        27
           1       0.91      0.85      0.88        34

    accuracy                           0.87        61
   macro avg       0.87      0.87      0.87        61
weighted avg       0.87      0.87      0.87        61



In [65]:
print(confusion_matrix(y_test, pred))

[[24  3]
 [ 5 29]]
