In [25]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import VotingClassifier

from tensorflow import keras

from imblearn.over_sampling import SMOTE


import xgboost as xgb

import joblib



# Read the BRFSS 2015 heart-disease indicators CSV from the working directory
# and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="heart_disease_health_indicators_BRFSS2015.csv")
df.head(n=5)

Unnamed: 0,HeartDiseaseorAttack,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,Diabetes,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0


In [8]:
# Load the Random Forest model from the file
# SECURITY: joblib.load unpickles the file, which can execute arbitrary code;
# only load model files that come from a trusted source.
rf_model = joblib.load('./Models/random_forest_resampled_model.pkl')

In [10]:
# Load the XGBoost model from the file
# An empty XGBClassifier wrapper is created first; load_model then restores
# the saved model from disk into that wrapper.
xgb_model = xgb.XGBClassifier()
xgb_model.load_model('./Models/xgboost_model.model')

In [13]:
# Summarise column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 253680 entries, 0 to 253679
Data columns (total 22 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   HeartDiseaseorAttack  253680 non-null  float64
 1   HighBP                253680 non-null  float64
 2   HighChol              253680 non-null  float64
 3   CholCheck             253680 non-null  float64
 4   BMI                   253680 non-null  float64
 5   Smoker                253680 non-null  float64
 6   Stroke                253680 non-null  float64
 7   Diabetes              253680 non-null  float64
 8   PhysActivity          253680 non-null  float64
 9   Fruits                253680 non-null  float64
 10  Veggies               253680 non-null  float64
 11  HvyAlcoholConsump     253680 non-null  float64
 12  AnyHealthcare         253680 non-null  float64
 13  NoDocbcCost           253680 non-null  float64
 14  GenHlth               253680 non-null  float64
 15  

In [14]:
# Remove the columns that are not used as model inputs in this notebook.
# errors='ignore' keeps this cell idempotent: `df` is reassigned here, so
# without it a second run of the cell raises KeyError because the columns
# are already gone.
df = df.drop(
    columns=['GenHlth', 'MentHlth', 'PhysHlth', 'Income', 'Education'],
    errors='ignore',
)
df.columns

Index(['HeartDiseaseorAttack', 'HighBP', 'HighChol', 'CholCheck', 'BMI',
       'Smoker', 'Stroke', 'Diabetes', 'PhysActivity', 'Fruits', 'Veggies',
       'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'DiffWalk', 'Sex',
       'Age'],
      dtype='object')

In [15]:

# Re-inspect the frame's columns and dtypes after the column drop.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 253680 entries, 0 to 253679
Data columns (total 17 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   HeartDiseaseorAttack  253680 non-null  float64
 1   HighBP                253680 non-null  float64
 2   HighChol              253680 non-null  float64
 3   CholCheck             253680 non-null  float64
 4   BMI                   253680 non-null  float64
 5   Smoker                253680 non-null  float64
 6   Stroke                253680 non-null  float64
 7   Diabetes              253680 non-null  float64
 8   PhysActivity          253680 non-null  float64
 9   Fruits                253680 non-null  float64
 10  Veggies               253680 non-null  float64
 11  HvyAlcoholConsump     253680 non-null  float64
 12  AnyHealthcare         253680 non-null  float64
 13  NoDocbcCost           253680 non-null  float64
 14  DiffWalk              253680 non-null  float64
 15  

In [16]:
# Count the rows in each class of the target column.
df['HeartDiseaseorAttack'].value_counts()
# shows imbalance in the data: class 1.0 is a small minority of the rows

HeartDiseaseorAttack
0.0    229787
1.0     23893
Name: count, dtype: int64

In [17]:
# Separate the frame into the feature matrix X (every column except the
# target) and the label vector y, both as NumPy arrays.
X = df.drop(columns=['HeartDiseaseorAttack']).values
y = df['HeartDiseaseorAttack'].values

In [18]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

### Creating and training the voting classifier


In [21]:
# Build a soft-voting ensemble from the random forest and XGBoost estimators.
# With voting='soft' the predicted class is the argmax of the combined
# class probabilities of the member estimators.
voting_clf = VotingClassifier(
    voting='soft',
    estimators=[
        ('rf', rf_model),
        ('xgb', xgb_model),
    ],
)

In [22]:
# Train the voting classifier
# NOTE(review): VotingClassifier.fit trains *clones* of rf_model and
# xgb_model on X_train; the fitted state loaded from disk is not reused,
# only the estimators' hyperparameters are — confirm this is intended.
voting_clf.fit(X_train, y_train)

### Making Predictions on the Test data

In [23]:
# Predict class labels for the held-out test set with the fitted ensemble.
y_pred_voting = voting_clf.predict(X_test)

In [24]:
# Report test-set performance of the ensemble: confusion matrix first, then
# the per-class precision/recall/F1 table, then overall accuracy.
print(
    confusion_matrix(y_test, y_pred_voting),
    classification_report(y_test, y_pred_voting),
    sep="\n",
)
print("Accuracy:", accuracy_score(y_test, y_pred_voting))

[[45356   612]
 [ 4274   494]]
              precision    recall  f1-score   support

         0.0       0.91      0.99      0.95     45968
         1.0       0.45      0.10      0.17      4768

    accuracy                           0.90     50736
   macro avg       0.68      0.55      0.56     50736
weighted avg       0.87      0.90      0.88     50736

Accuracy: 0.9036975717439294


In [26]:
# creating a SMOTE object
# random_state is fixed so the generated synthetic samples are reproducible.
# NOTE(review): plain SMOTE interpolates between neighbouring samples, so
# 0/1 indicator columns may receive fractional synthetic values — consider
# whether SMOTENC is a better fit for these features; TODO confirm.
smote = SMOTE(random_state=42)


In [27]:
# applying SMOTE to the training data
# Only the training split is oversampled; X_test / y_test are left untouched
# and are used as-is for evaluation further down.
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

#### Training voting classifier on resampled data

In [28]:
# Build a fresh soft-voting ensemble (rebinding `voting_clf`) from the same
# random forest and XGBoost estimators, to be trained on the resampled data.
voting_clf = VotingClassifier(
    voting='soft',
    estimators=[
        ('rf', rf_model),
        ('xgb', xgb_model),
    ],
)

In [29]:
# Train the voting classifier
# This fit uses the SMOTE-resampled training arrays instead of the original
# X_train / y_train.
voting_clf.fit(X_train_resampled, y_train_resampled)

##### Predictions and Evaluation 

In [30]:
# Make predictions using the resampled model
# The test set itself is the original, non-resampled X_test.
y_pred_resampled = voting_clf.predict(X_test)

In [31]:

# Report test-set performance of the ensemble trained on resampled data:
# confusion matrix, per-class precision/recall/F1 table, overall accuracy.
print(
    confusion_matrix(y_test, y_pred_resampled),
    classification_report(y_test, y_pred_resampled),
    sep="\n",
)
print("Accuracy:", accuracy_score(y_test, y_pred_resampled))

[[43294  2674]
 [ 3570  1198]]
              precision    recall  f1-score   support

         0.0       0.92      0.94      0.93     45968
         1.0       0.31      0.25      0.28      4768

    accuracy                           0.88     50736
   macro avg       0.62      0.60      0.61     50736
weighted avg       0.87      0.88      0.87     50736

Accuracy: 0.8769315673289183


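Even after applying SMOTE, the overall accuracy does not improve (0.904 → 0.877). However, for the positive class (1.0) recall rises from 0.10 to 0.25 and the F1-score from 0.17 to 0.28, while precision falls from 0.45 to 0.31, so accuracy alone is a misleading metric on this imbalanced dataset.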