In [1]:
# Import the modules
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, balanced_accuracy_score
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.metrics import classification_report_imbalanced
from imblearn.over_sampling import RandomOverSampler
import joblib
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides deprecation notices raised by the libraries used below.
warnings.filterwarnings('ignore')

# Loading and Preprocessing Data

In [2]:
# Load the raw survey data from the CSV file and preview the first five rows
df = pd.read_csv(filepath_or_buffer='heart_2020_cleaned.csv')
df.head(n=5)

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.6,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No


In [3]:
# Check data types: list the dtype of each column to see which ones are
# non-numeric (object) and therefore need encoding before modelling
df.dtypes

HeartDisease         object
BMI                 float64
Smoking              object
AlcoholDrinking      object
Stroke               object
PhysicalHealth      float64
MentalHealth        float64
DiffWalking          object
Sex                  object
AgeCategory          object
Race                 object
Diabetic             object
PhysicalActivity     object
GenHealth            object
SleepTime           float64
Asthma               object
KidneyDisease        object
SkinCancer           object
dtype: object

In [4]:
# Indicator columns removed after one-hot encoding: for each Yes/No feature
# only the "_Yes" indicator is kept, and every Diabetic level other than
# "Yes" is removed so that a single Diabetic_Yes column remains.
redundant_columns = [
    'HeartDisease_No',
    'Smoking_No',
    'AlcoholDrinking_No',
    'Stroke_No',
    'DiffWalking_No',
    'Diabetic_No',
    'Diabetic_No, borderline diabetes',
    'Diabetic_Yes (during pregnancy)',
    'PhysicalActivity_No',
    'Asthma_No',
    'KidneyDisease_No',
    'SkinCancer_No',
]

# One-hot encode the categorical columns, then discard the redundant indicators
dummies_df = pd.get_dummies(df).drop(columns=redundant_columns)
dummies_df.head()

Unnamed: 0,BMI,PhysicalHealth,MentalHealth,SleepTime,HeartDisease_Yes,Smoking_Yes,AlcoholDrinking_Yes,Stroke_Yes,DiffWalking_Yes,Sex_Female,...,Diabetic_Yes,PhysicalActivity_Yes,GenHealth_Excellent,GenHealth_Fair,GenHealth_Good,GenHealth_Poor,GenHealth_Very good,Asthma_Yes,KidneyDisease_Yes,SkinCancer_Yes
0,16.6,3.0,30.0,5.0,0,1,0,0,0,1,...,1,1,0,0,0,0,1,1,0,1
1,20.34,0.0,0.0,7.0,0,0,0,1,0,1,...,0,1,0,0,0,0,1,0,0,0
2,26.58,20.0,30.0,8.0,0,1,0,0,0,0,...,1,1,0,1,0,0,0,1,0,0
3,24.21,0.0,0.0,6.0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,1
4,23.71,28.0,0.0,8.0,0,0,0,0,1,1,...,0,1,0,0,0,0,1,0,0,0


In [5]:
# Feature matrix: every encoded column except the target indicator.
# drop() returns a new frame, so dummies_df itself is left untouched.
X = dummies_df.drop(columns="HeartDisease_Yes")
X.head()

Unnamed: 0,BMI,PhysicalHealth,MentalHealth,SleepTime,Smoking_Yes,AlcoholDrinking_Yes,Stroke_Yes,DiffWalking_Yes,Sex_Female,Sex_Male,...,Diabetic_Yes,PhysicalActivity_Yes,GenHealth_Excellent,GenHealth_Fair,GenHealth_Good,GenHealth_Poor,GenHealth_Very good,Asthma_Yes,KidneyDisease_Yes,SkinCancer_Yes
0,16.6,3.0,30.0,5.0,1,0,0,0,1,0,...,1,1,0,0,0,0,1,1,0,1
1,20.34,0.0,0.0,7.0,0,0,1,0,1,0,...,0,1,0,0,0,0,1,0,0,0
2,26.58,20.0,30.0,8.0,1,0,0,0,0,1,...,1,1,0,1,0,0,0,1,0,0
3,24.21,0.0,0.0,6.0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,1
4,23.71,28.0,0.0,8.0,0,0,0,1,1,0,...,0,1,0,0,0,0,1,0,0,0


In [6]:
# Define target vector as a 1-D NumPy array of the HeartDisease_Yes indicator.
# Series.to_numpy() is used instead of Series.ravel(), which is deprecated in
# recent pandas releases; both return the column's values as an ndarray.
y = dummies_df["HeartDisease_Yes"].to_numpy()

In [7]:
# Partition the data into training and testing subsets.
# Stratifying on y keeps the class proportions the same in both subsets;
# the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    stratify=y,
)

In [8]:
# Learn the scaling parameters from the training features only
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the same fitted transformation to the training and testing features
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)

# Random Forest Model

In [9]:
# Build a 300-tree random forest (seeded for repeatability) and train it on
# the scaled training features; fit() hands back the fitted estimator
rf_model = RandomForestClassifier(n_estimators=300, random_state=42).fit(
    X_train_scaled, y_train
)

# Predict the class of each sample in the scaled test set
predictions = rf_model.predict(X_test_scaled)

In [10]:
# Overall fraction of test samples classified correctly
acc_score = accuracy_score(y_test, predictions)

# Confusion matrix, wrapped in a labelled frame (rows = actual, columns = predicted)
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    data=cm,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)

# Show the matrix, the accuracy and the per-class precision/recall report
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,71414,1692
Actual 1,6049,794


Accuracy Score : 0.9031757745562796
Classification Report
              precision    recall  f1-score   support

           0       0.92      0.98      0.95     73106
           1       0.32      0.12      0.17      6843

    accuracy                           0.90     79949
   macro avg       0.62      0.55      0.56     79949
weighted avg       0.87      0.90      0.88     79949



In [11]:
# Pull the per-feature importance scores from the fitted random forest
importances = rf_model.feature_importances_

# Pair each score with its column name and list them from highest to lowest
sorted(zip(importances, X.columns), reverse=True)

[(0.3496642554286117, 'BMI'),
 (0.1121023620680717, 'SleepTime'),
 (0.07952210220865989, 'PhysicalHealth'),
 (0.0684491199133936, 'MentalHealth'),
 (0.02817841210858876, 'Stroke_Yes'),
 (0.02607677134662993, 'PhysicalActivity_Yes'),
 (0.025168116609156684, 'DiffWalking_Yes'),
 (0.02325386933542871, 'Diabetic_Yes'),
 (0.021921987992176768, 'Asthma_Yes'),
 (0.02083129863618352, 'Smoking_Yes'),
 (0.017074111985287365, 'SkinCancer_Yes'),
 (0.01697994812179069, 'AgeCategory_80 or older'),
 (0.016408872110133157, 'KidneyDisease_Yes'),
 (0.014231983095507718, 'GenHealth_Poor'),
 (0.013404442571370823, 'GenHealth_Fair'),
 (0.012656658163254607, 'AgeCategory_75-79'),
 (0.012554118779267926, 'Race_White'),
 (0.012526484341151351, 'AgeCategory_70-74'),
 (0.010510990479144916, 'AgeCategory_65-69'),
 (0.009918788927652853, 'AlcoholDrinking_Yes'),
 (0.00949069289447484, 'AgeCategory_60-64'),
 (0.008778605880807762, 'GenHealth_Good'),
 (0.00860817829719681, 'Sex_Male'),
 (0.008541335345813708, 'Sex_F

In [12]:
# File names for the persisted artifacts
rf_filename = "rf_model.joblib"
rf_scaler_filename = "rf_scaler.joblib"

# The random forest was trained on standardized features, so anyone loading
# the model later must apply the same fitted scaler to new inputs first.
# Persist the scaler next to the model so the two can be restored together.
joblib.dump(X_scaler, rf_scaler_filename)

# save model
joblib.dump(rf_model, rf_filename)

['rf_model.joblib']

# Balanced Random Forest Model

In [13]:
# Build a 300-tree balanced random forest (seeded for repeatability) and train
# it on the unscaled training features
brf_model = BalancedRandomForestClassifier(n_estimators=300, random_state=42)
brf_model.fit(X=X_train, y=y_train)

# Predict the class of each sample in the unscaled test set
brf_predictions = brf_model.predict(X_test)

In [14]:
# Balanced accuracy of the balanced random forest on the test set
brf_accuracy_score = balanced_accuracy_score(y_test, brf_predictions)

# Confusion matrix, wrapped in a labelled frame (rows = actual, columns = predicted)
cm = confusion_matrix(y_test, brf_predictions)
cm_df = pd.DataFrame(
    data=cm,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)

In [15]:
# Display results
# brf_accuracy_score is produced by balanced_accuracy_score, so it is labelled
# as balanced accuracy; the plain accuracy is the "accuracy" row of the
# classification report printed below.
print("Confusion Matrix")
display(cm_df)
print(f"Balanced Accuracy Score : {brf_accuracy_score}")
print("Classification Report")
print(classification_report(y_test, brf_predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,51513,21593
Actual 1,1405,5438


Accuracy Score : 0.7496575310688034
Classification Report
              precision    recall  f1-score   support

           0       0.97      0.70      0.82     73106
           1       0.20      0.79      0.32      6843

    accuracy                           0.71     79949
   macro avg       0.59      0.75      0.57     79949
weighted avg       0.91      0.71      0.78     79949



In [16]:
# Pull the per-feature importance scores from the fitted balanced random forest
importances = brf_model.feature_importances_

# Pair each score with its column name and list them from highest to lowest
sorted(zip(importances, X.columns), reverse=True)

[(0.23951574716186386, 'BMI'),
 (0.08729214607048648, 'SleepTime'),
 (0.06754713073279783, 'PhysicalHealth'),
 (0.05560818605033178, 'MentalHealth'),
 (0.041671674435726676, 'Diabetic_Yes'),
 (0.041560363763645226, 'DiffWalking_Yes'),
 (0.030362881264197537, 'AgeCategory_80 or older'),
 (0.02859825669470943, 'GenHealth_Excellent'),
 (0.027770996571649702, 'Stroke_Yes'),
 (0.025368296872318535, 'Smoking_Yes'),
 (0.023232227801995756, 'PhysicalActivity_Yes'),
 (0.020969176672916937, 'AgeCategory_75-79'),
 (0.020322518454810142, 'GenHealth_Fair'),
 (0.020022479694021874, 'GenHealth_Very good'),
 (0.019553603642610818, 'AgeCategory_70-74'),
 (0.01861798394237326, 'Asthma_Yes'),
 (0.016605062203696258, 'SkinCancer_Yes'),
 (0.01447396595036415, 'KidneyDisease_Yes'),
 (0.014263162618084357, 'AgeCategory_65-69'),
 (0.01347649809882638, 'GenHealth_Poor'),
 (0.013463173747706603, 'GenHealth_Good'),
 (0.01327258848876755, 'Sex_Female'),
 (0.013031096019136096, 'Sex_Male'),
 (0.012781345739052279,

In [17]:
# Destination file for the fitted balanced random forest
brf_filename = "brf_model.joblib"

# Serialize the model to disk with joblib
joblib.dump(value=brf_model, filename=brf_filename)

['brf_model.joblib']

# Resample and use Balanced Random Forest Model

In [18]:
# Seeded random oversampler
ros = RandomOverSampler(random_state=42)

# Resample the (unscaled) training split only; the test split is left as-is
X_res, y_res = ros.fit_resample(
    X_train,
    y_train,
)

In [19]:
# Build a 300-tree balanced random forest (seeded, 5 parallel jobs) and train
# it on the oversampled training data
res_brf_model = BalancedRandomForestClassifier(
    n_estimators=300,
    random_state=42,
    n_jobs=5,
)
res_brf_model.fit(X=X_res, y=y_res)

# Predict the class of each sample in the original, un-resampled test set
res_brf_predictions = res_brf_model.predict(X_test)

In [20]:
# Balanced accuracy of the resampled balanced random forest on the test set
res_brf_accuracy_score = balanced_accuracy_score(y_test, res_brf_predictions)

# Confusion matrix, wrapped in a labelled frame (rows = actual, columns = predicted)
cm = confusion_matrix(y_test, res_brf_predictions)
cm_df = pd.DataFrame(
    data=cm,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)

In [21]:
# Display results
# res_brf_accuracy_score is produced by balanced_accuracy_score, so it is
# labelled as balanced accuracy; the plain accuracy is the "accuracy" row of
# the classification report printed below.
print("Confusion Matrix")
display(cm_df)
print(f"Balanced Accuracy Score : {res_brf_accuracy_score}")
print("Classification Report")
print(classification_report(y_test, res_brf_predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,69420,3686
Actual 1,5326,1517


Accuracy Score : 0.5856332283420439
Classification Report
              precision    recall  f1-score   support

           0       0.93      0.95      0.94     73106
           1       0.29      0.22      0.25      6843

    accuracy                           0.89     79949
   macro avg       0.61      0.59      0.60     79949
weighted avg       0.87      0.89      0.88     79949



In [22]:
# Pull the per-feature importance scores from the resampled balanced random forest
importances = res_brf_model.feature_importances_

# Pair each score with its column name and list them from highest to lowest
sorted(zip(importances, X.columns), reverse=True)

[(0.2748036066973997, 'BMI'),
 (0.08925302494451429, 'SleepTime'),
 (0.06796019784303571, 'PhysicalHealth'),
 (0.05744885318250986, 'MentalHealth'),
 (0.040948875243876845, 'DiffWalking_Yes'),
 (0.040142337911176613, 'Diabetic_Yes'),
 (0.02976597104436228, 'AgeCategory_80 or older'),
 (0.0281630528248814, 'GenHealth_Excellent'),
 (0.026922727354183103, 'Stroke_Yes'),
 (0.022229037742775744, 'Smoking_Yes'),
 (0.02111908312144576, 'PhysicalActivity_Yes'),
 (0.019638793081056573, 'AgeCategory_75-79'),
 (0.018872976099151114, 'GenHealth_Very good'),
 (0.01851033896906336, 'GenHealth_Fair'),
 (0.017880324088006088, 'AgeCategory_70-74'),
 (0.0168079939364921, 'Asthma_Yes'),
 (0.014231665480985693, 'SkinCancer_Yes'),
 (0.013032734323353175, 'GenHealth_Poor'),
 (0.01284888358652288, 'KidneyDisease_Yes'),
 (0.012628428262664177, 'AgeCategory_65-69'),
 (0.011752521648392916, 'Sex_Male'),
 (0.01156952391250512, 'Sex_Female'),
 (0.011555767883668452, 'GenHealth_Good'),
 (0.011075350171622027, 'Rac

In [23]:
# Destination file for the balanced random forest trained on resampled data
res_brf_filename = "res_brf_model.joblib"

# Serialize the model to disk with joblib
joblib.dump(value=res_brf_model, filename=res_brf_filename)

['res_brf_model.joblib']