In [3]:
import os

import joblib
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split


# Load the heart-disease indicators CSV (relative to the notebook's working
# directory) and preview the first rows.
csv_path = "heart_disease_health_indicators_BRFSS2015.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,HeartDiseaseorAttack,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,Diabetes,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0


In [4]:
# Summarize the loaded frame: row count, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 253680 entries, 0 to 253679
Data columns (total 22 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   HeartDiseaseorAttack  253680 non-null  float64
 1   HighBP                253680 non-null  float64
 2   HighChol              253680 non-null  float64
 3   CholCheck             253680 non-null  float64
 4   BMI                   253680 non-null  float64
 5   Smoker                253680 non-null  float64
 6   Stroke                253680 non-null  float64
 7   Diabetes              253680 non-null  float64
 8   PhysActivity          253680 non-null  float64
 9   Fruits                253680 non-null  float64
 10  Veggies               253680 non-null  float64
 11  HvyAlcoholConsump     253680 non-null  float64
 12  AnyHealthcare         253680 non-null  float64
 13  NoDocbcCost           253680 non-null  float64
 14  GenHlth               253680 non-null  float64
 15  

In [5]:
# Remove the columns that will not be used as model inputs, then list what remains.
# Note: `df` is reassigned here, so re-running this cell after it has already
# run raises a KeyError (the columns are already gone).
excluded_columns = ['GenHlth', 'MentHlth', 'PhysHlth', 'Income', 'Education']
df = df.drop(columns=excluded_columns)
df.columns

Index(['HeartDiseaseorAttack', 'HighBP', 'HighChol', 'CholCheck', 'BMI',
       'Smoker', 'Stroke', 'Diabetes', 'PhysActivity', 'Fruits', 'Veggies',
       'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'DiffWalk', 'Sex',
       'Age'],
      dtype='object')

In [6]:

# Re-inspect the frame after the column drop to confirm the remaining columns and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 253680 entries, 0 to 253679
Data columns (total 17 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   HeartDiseaseorAttack  253680 non-null  float64
 1   HighBP                253680 non-null  float64
 2   HighChol              253680 non-null  float64
 3   CholCheck             253680 non-null  float64
 4   BMI                   253680 non-null  float64
 5   Smoker                253680 non-null  float64
 6   Stroke                253680 non-null  float64
 7   Diabetes              253680 non-null  float64
 8   PhysActivity          253680 non-null  float64
 9   Fruits                253680 non-null  float64
 10  Veggies               253680 non-null  float64
 11  HvyAlcoholConsump     253680 non-null  float64
 12  AnyHealthcare         253680 non-null  float64
 13  NoDocbcCost           253680 non-null  float64
 14  DiffWalk              253680 non-null  float64
 15  

In [7]:
df['HeartDiseaseorAttack'].value_counts()
# shows the imbalance in the data: far fewer 1.0 (positive) rows than 0.0 rows

HeartDiseaseorAttack
0.0    229787
1.0     23893
Name: count, dtype: int64

In [8]:
# Split the frame into a feature matrix X (every column except the target)
# and a label vector y (the target column), both as NumPy arrays.
target_col = 'HeartDiseaseorAttack'
X = df.drop(columns=[target_col]).to_numpy()
y = df[target_col].to_numpy()

In [9]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
# NOTE(review): the split is not stratified on y even though the classes are
# imbalanced -- consider whether stratify=y is wanted.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Creating and Training the Random Forest Model

In [10]:
# Baseline classifier: a 100-tree random forest with a fixed seed for repeatability.
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)

In [11]:
# Fit the baseline forest on the original (not resampled) training split.
rf_model.fit(X_train, y_train)

### Making Predictions on the Test data

In [12]:
# Hard class predictions for the held-out test rows.
y_pred = rf_model.predict(X_test)

In [13]:
# Model evaluation on the held-out test set: confusion matrix, per-class
# precision/recall/F1, and overall accuracy.
baseline_cm = confusion_matrix(y_test, y_pred)
baseline_report = classification_report(y_test, y_pred)
baseline_accuracy = accuracy_score(y_test, y_pred)

print(baseline_cm)
print(baseline_report)
print("Accuracy:", baseline_accuracy)

[[44453  1515]
 [ 4046   722]]
              precision    recall  f1-score   support

         0.0       0.92      0.97      0.94     45968
         1.0       0.32      0.15      0.21      4768

    accuracy                           0.89     50736
   macro avg       0.62      0.56      0.57     50736
weighted avg       0.86      0.89      0.87     50736

Accuracy: 0.8903934090192368


        --- Still low for predicting heart disease (recall of only 0.15 for class 1.0), so I will try adjusting the threshold to see if I can get a better result ---

In [14]:
# Predicted class probabilities for the test rows; keep only the second
# column, i.e. the probability of the 1.0 (heart disease) class.
class_probs = rf_model.predict_proba(X_test)
y_pred_probs = class_probs[:, 1]


#### Adjusting the threshold

In [15]:
# Label a row positive (1) when its predicted probability exceeds the
# threshold, otherwise negative (0).
# NOTE(review): the threshold is chosen and evaluated on the same test set --
# consider tuning it on a separate validation split.
threshold = 0.3
y_pred_adjusted = np.where(y_pred_probs > threshold, 1, 0)


In [16]:
# Evaluate the threshold-adjusted predictions against the test labels.
adjusted_cm = confusion_matrix(y_test, y_pred_adjusted)
adjusted_report = classification_report(y_test, y_pred_adjusted)

print(adjusted_cm)
print(adjusted_report)

[[42026  3942]
 [ 3253  1515]]
              precision    recall  f1-score   support

         0.0       0.93      0.91      0.92     45968
         1.0       0.28      0.32      0.30      4768

    accuracy                           0.86     50736
   macro avg       0.60      0.62      0.61     50736
weighted avg       0.87      0.86      0.86     50736



    #### This is better but still not great. I will try SMOTE to see if I can get a better result.

In [17]:
# Create a SMOTE oversampler with a fixed seed so the resampling is repeatable.
smote = SMOTE(random_state=42)


In [18]:
# Apply SMOTE to the training split only; the test split is left untouched
# so the evaluation still uses the original data.
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

#### Training Random Forest on resampled data

In [19]:
# Second random forest, configured like the baseline (100 trees, same seed),
# to be trained on the SMOTE-resampled data.
rf_model_resampled = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)

In [20]:
# Train the second forest on the SMOTE-resampled training data.
rf_model_resampled.fit(X_train_resampled, y_train_resampled)

In [21]:
# Save the trained model to a file.
# joblib.dump does not create missing parent directories, so make sure the
# output directory exists first; otherwise a fresh checkout without ./Models
# fails here with FileNotFoundError. exist_ok=True keeps re-runs harmless.
os.makedirs('./Models', exist_ok=True)
joblib.dump(rf_model_resampled, './Models/random_forest_resampled_model.pkl')

['./Models/random_forest_resampled_model.pkl']

##### Predictions and Evaluation 

In [22]:
# Make predictions on the original (not resampled) test set using the model
# trained on the resampled data.
y_pred_resampled = rf_model_resampled.predict(X_test)

In [23]:

# Evaluate the SMOTE-trained model on the test set: confusion matrix,
# per-class precision/recall/F1, and overall accuracy.
resampled_cm = confusion_matrix(y_test, y_pred_resampled)
resampled_report = classification_report(y_test, y_pred_resampled)
resampled_accuracy = accuracy_score(y_test, y_pred_resampled)

print(resampled_cm)
print(resampled_report)
print("Accuracy:", resampled_accuracy)

[[42610  3358]
 [ 3534  1234]]
              precision    recall  f1-score   support

         0.0       0.92      0.93      0.93     45968
         1.0       0.27      0.26      0.26      4768

    accuracy                           0.86     50736
   macro avg       0.60      0.59      0.59     50736
weighted avg       0.86      0.86      0.86     50736

Accuracy: 0.8641595711132135


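     Even after applying SMOTE, the results did not improve: accuracy is 0.86 (versus 0.89 for the baseline model), and recall for the heart-disease class is 0.26, which is lower than the 0.32 obtained with the 0.3 threshold and no SMOTE.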