In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from collections import Counter
from tensorflow import keras
from keras.callbacks import EarlyStopping

np.set_printoptions(precision=3, suppress=True)

In [None]:
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix, roc_auc_score

In [None]:
dataset = pd.read_csv("heart_disease_health_indicators_BRFSS2015.csv")
print(dataset.columns)

Index(['HeartDiseaseorAttack', 'HighBP', 'HighChol', 'CholCheck', 'BMI',
       'Smoker', 'Stroke', 'Diabetes', 'PhysActivity', 'Fruits', 'Veggies',
       'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth',
       'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education',
       'Income'],
      dtype='object')


In [None]:
print('Original dataset shape %s' % Counter(dataset['HeartDiseaseorAttack']))

sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(dataset.drop('HeartDiseaseorAttack', axis=1), dataset['HeartDiseaseorAttack'])

print('Resampled dataset shape %s' % Counter(y_res))

Original dataset shape Counter({0.0: 229787, 1.0: 23893})
Resampled dataset shape Counter({0.0: 229787, 1.0: 229787})


In [None]:
feature_drop=[ 'AnyHealthcare', 'NoDocbcCost', 'GenHlth',
       'MentHlth']

X = X_res.drop(feature_drop, axis=1).values
y = y_res.values

In [None]:
X_train, X_temp, y_train, y_temp = train_test_split(X, y, train_size=0.8, random_state=42)

X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [None]:

model = BaggingClassifier(estimator=DecisionTreeClassifier(),
                           n_estimators=100,
                           random_state=42)
model.fit(X_train, y_train)
print('Training completed')


Training completed


In [None]:
y_pred = model.predict(X_test)
print('Test accuracy %s' % accuracy_score(y_test, y_pred))


Test accuracy 0.9397928543452718


In [None]:
def print_score(clf, X_train, X_test, y_train, y_test, train=True):

    if train:
        '''
        training performance
        '''
        res = model.predict(X_train)
        print("Train Result:\n")
        print("accuracy score: {0:.4f}\n".format(accuracy_score(y_train,
                                                                res)))
        print("Classification Report: \n {}\n".format(classification_report(y_train,
                                                                            res)))
        print("Confusion Matrix: \n {}\n".format(confusion_matrix(y_train,
                                                                  res)))
        print("ROC AUC: {0:.4f}\n".format(roc_auc_score(y_train,
                                                      res)))

        res = cross_val_score(model, X_train, y_train, cv=10, scoring='accuracy')
        print("Average Accuracy: \t {0:.4f}".format(np.mean(res)))
        print("Accuracy SD: \t\t {0:.4f}".format(np.std(res)))

    elif train==False:
        '''
        test performance
        '''
        res_test = model.predict(X_test)
        print("Test Result:\n")
        print("accuracy score: {0:.4f}\n".format(accuracy_score(y_test,
                                                                res_test)))
        print("Classification Report: \n {}\n".format(classification_report(y_test,
                                                                            res_test)))
        print("Confusion Matrix: \n {}\n".format(confusion_matrix(y_test,
                                                                  res_test)))
        print("ROC AUC: {0:.4f}\n".format(roc_auc_score(y_test,
                                                      res_test)))

In [None]:
print_score(model, X_train, X_test, y_train, y_test, train=True)
print_score(model, X_train, X_test, y_train, y_test, train=False)

Train Result:

accuracy score: 0.9936

Classification Report: 
               precision    recall  f1-score   support

         0.0       0.99      1.00      0.99    183754
         1.0       1.00      0.99      0.99    183905

    accuracy                           0.99    367659
   macro avg       0.99      0.99      0.99    367659
weighted avg       0.99      0.99      0.99    367659


Confusion Matrix: 
 [[183055    699]
 [  1652 182253]]

ROC AUC: 0.9936

Average Accuracy: 	 0.9380
Accuracy SD: 		 0.0006
Test Result:

accuracy score: 0.9398

Classification Report: 
               precision    recall  f1-score   support

         0.0       0.92      0.96      0.94     22931
         1.0       0.96      0.92      0.94     23027

    accuracy                           0.94     45958
   macro avg       0.94      0.94      0.94     45958
weighted avg       0.94      0.94      0.94     45958


Confusion Matrix: 
 [[22083   848]
 [ 1919 21108]]

ROC AUC: 0.9398

