In [1]:
# Import the modules
from google.colab import drive
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, balanced_accuracy_score
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.metrics import classification_report_imbalanced
from imblearn.over_sampling import RandomOverSampler
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Create mount to Google Drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


# Loading and Preprocessing Data

In [3]:
# Read in CSV
df = pd.read_csv('gdrive/MyDrive/Colab Notebooks/Resources/heart_2020_cleaned.csv')
df.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.6,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No


In [4]:
# Check data types
df.dtypes

HeartDisease         object
BMI                 float64
Smoking              object
AlcoholDrinking      object
Stroke               object
PhysicalHealth      float64
MentalHealth        float64
DiffWalking          object
Sex                  object
AgeCategory          object
Race                 object
Diabetic             object
PhysicalActivity     object
GenHealth            object
SleepTime           float64
Asthma               object
KidneyDisease        object
SkinCancer           object
dtype: object

In [5]:
# Create dummy variables
dummies_df = pd.get_dummies(df)

# Drop columns to limit repetitive features
dummies_df = dummies_df.drop(columns=['HeartDisease_No', 'Smoking_No', 'AlcoholDrinking_No', 'Stroke_No',
                                      'DiffWalking_No', 'Diabetic_No', 'Diabetic_No, borderline diabetes', 'Diabetic_Yes (during pregnancy)',
                                     'PhysicalActivity_No', 'Asthma_No', 'KidneyDisease_No', 'SkinCancer_No'])
dummies_df.head()

Unnamed: 0,BMI,PhysicalHealth,MentalHealth,SleepTime,HeartDisease_Yes,Smoking_Yes,AlcoholDrinking_Yes,Stroke_Yes,DiffWalking_Yes,Sex_Female,...,Diabetic_Yes,PhysicalActivity_Yes,GenHealth_Excellent,GenHealth_Fair,GenHealth_Good,GenHealth_Poor,GenHealth_Very good,Asthma_Yes,KidneyDisease_Yes,SkinCancer_Yes
0,16.6,3.0,30.0,5.0,0,1,0,0,0,1,...,1,1,0,0,0,0,1,1,0,1
1,20.34,0.0,0.0,7.0,0,0,0,1,0,1,...,0,1,0,0,0,0,1,0,0,0
2,26.58,20.0,30.0,8.0,0,1,0,0,0,0,...,1,1,0,1,0,0,0,1,0,0
3,24.21,0.0,0.0,6.0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,1
4,23.71,28.0,0.0,8.0,0,0,0,0,1,1,...,0,1,0,0,0,0,1,0,0,0


In [6]:
# Define features set
X = dummies_df.copy()
X.drop("HeartDisease_Yes", axis=1, inplace=True)
X.head()

Unnamed: 0,BMI,PhysicalHealth,MentalHealth,SleepTime,Smoking_Yes,AlcoholDrinking_Yes,Stroke_Yes,DiffWalking_Yes,Sex_Female,Sex_Male,...,Diabetic_Yes,PhysicalActivity_Yes,GenHealth_Excellent,GenHealth_Fair,GenHealth_Good,GenHealth_Poor,GenHealth_Very good,Asthma_Yes,KidneyDisease_Yes,SkinCancer_Yes
0,16.6,3.0,30.0,5.0,1,0,0,0,1,0,...,1,1,0,0,0,0,1,1,0,1
1,20.34,0.0,0.0,7.0,0,0,1,0,1,0,...,0,1,0,0,0,0,1,0,0,0
2,26.58,20.0,30.0,8.0,1,0,0,0,0,1,...,1,1,0,1,0,0,0,1,0,0
3,24.21,0.0,0.0,6.0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,1
4,23.71,28.0,0.0,8.0,0,0,0,1,1,0,...,0,1,0,0,0,0,1,0,0,0


In [7]:
# Define target vector 
y = dummies_df["HeartDisease_Yes"].ravel()

In [8]:
# Split into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [9]:
# Fit to StandardScaler
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Scale data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Random Forest Model

In [10]:
# Create a random forest classifier 
rf_model = RandomForestClassifier(n_estimators=500, random_state=42)

# Fit the model
rf_model = rf_model.fit(X_train_scaled, y_train)

# Make prediction using the test data
predictions = rf_model.predict(X_test_scaled)

In [11]:
# Calculate the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculate the accuracy score
acc_score = accuracy_score(y_test, predictions)

# Display results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,71387,1719
Actual 1,6037,806


Accuracy Score : 0.9029881549487798
Classification Report
              precision    recall  f1-score   support

           0       0.92      0.98      0.95     73106
           1       0.32      0.12      0.17      6843

    accuracy                           0.90     79949
   macro avg       0.62      0.55      0.56     79949
weighted avg       0.87      0.90      0.88     79949



In [12]:
# Calculate feature importance
importances = rf_model.feature_importances_

# Sort and display feature importance 
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.3500249962276639, 'BMI'),
 (0.11204644113304141, 'SleepTime'),
 (0.07938923975197652, 'PhysicalHealth'),
 (0.06839396599314732, 'MentalHealth'),
 (0.02792667235784461, 'Stroke_Yes'),
 (0.026285025986056833, 'PhysicalActivity_Yes'),
 (0.025360016542168917, 'DiffWalking_Yes'),
 (0.02355242824317988, 'Diabetic_Yes'),
 (0.021929960373365892, 'Asthma_Yes'),
 (0.02078735617245873, 'Smoking_Yes'),
 (0.01708725344577446, 'SkinCancer_Yes'),
 (0.01703104791141813, 'AgeCategory_80 or older'),
 (0.01635449385128816, 'KidneyDisease_Yes'),
 (0.014349526770013839, 'GenHealth_Poor'),
 (0.013364087109309109, 'GenHealth_Fair'),
 (0.012583373164495218, 'AgeCategory_75-79'),
 (0.012484036342080337, 'Race_White'),
 (0.01240447615827438, 'AgeCategory_70-74'),
 (0.01045380960664876, 'AgeCategory_65-69'),
 (0.009926109768895396, 'AlcoholDrinking_Yes'),
 (0.009460008458390252, 'AgeCategory_60-64'),
 (0.008789712164867838, 'GenHealth_Good'),
 (0.008634393704234118, 'Sex_Female'),
 (0.008611124531346435, 'Se

# Balanced Random Forest Model

In [13]:
# Create a balanced random forest classifier
brf_model = BalancedRandomForestClassifier(n_estimators=500, random_state=42)

#Fit the model
brf_model.fit(X_train, y_train)

# Calculated the balanced accuracy score
brf_predictions = brf_model.predict(X_test)

In [14]:
# Calculate the confusion matrix
cm = confusion_matrix(y_test, brf_predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculate the accuracy score
brf_accuracy_score = balanced_accuracy_score(y_test, brf_predictions)

In [15]:
# Display results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {brf_accuracy_score}")
print("Classification Report")
print(classification_report(y_test, brf_predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,51514,21592
Actual 1,1396,5447


Accuracy Score : 0.7503219767657323
Classification Report
              precision    recall  f1-score   support

           0       0.97      0.70      0.82     73106
           1       0.20      0.80      0.32      6843

    accuracy                           0.71     79949
   macro avg       0.59      0.75      0.57     79949
weighted avg       0.91      0.71      0.78     79949



In [16]:
# Calculate feature importance
importances = brf_model.feature_importances_

# Sort and display feature importance 
sorted(zip(brf_model.feature_importances_, X.columns), reverse=True)

[(0.2398732328517074, 'BMI'),
 (0.08744227961382767, 'SleepTime'),
 (0.06742553742390066, 'PhysicalHealth'),
 (0.055589038550394404, 'MentalHealth'),
 (0.04183128398750288, 'DiffWalking_Yes'),
 (0.04077270381520001, 'Diabetic_Yes'),
 (0.030129142776066327, 'AgeCategory_80 or older'),
 (0.028366315564143523, 'GenHealth_Excellent'),
 (0.027648130932290675, 'Stroke_Yes'),
 (0.025278671783474923, 'Smoking_Yes'),
 (0.02314538170503914, 'PhysicalActivity_Yes'),
 (0.020946756065032344, 'AgeCategory_75-79'),
 (0.02086091017303481, 'GenHealth_Fair'),
 (0.019906881400819195, 'GenHealth_Very good'),
 (0.019479518466913197, 'AgeCategory_70-74'),
 (0.018522963303604232, 'Asthma_Yes'),
 (0.016457502242186017, 'SkinCancer_Yes'),
 (0.014396965710675207, 'KidneyDisease_Yes'),
 (0.014182535322929547, 'AgeCategory_65-69'),
 (0.014119879065735906, 'GenHealth_Poor'),
 (0.013687993051944352, 'GenHealth_Good'),
 (0.013296033964499744, 'Sex_Female'),
 (0.013070362543935602, 'Sex_Male'),
 (0.012833842472976996

# Resample and use Balanced Random Forest Model

In [10]:
# Instantiate the random oversampler model
ros = RandomOverSampler(random_state=42)

# Fit the original training data to the ROS model
X_res, y_res = ros.fit_resample(X_train, y_train)

In [14]:
# Create a balanced random forest classifier
res_brf_model = BalancedRandomForestClassifier(n_estimators=500, random_state=42, n_jobs = 5, verbose=3)

#Fit the model
res_brf_model.fit(X_res, y_res)

# Calculated the balanced accuracy score
res_brf_predictions = res_brf_model.predict(X_test)

[Parallel(n_jobs=5)]: Using backend ThreadingBackend with 5 concurrent workers.


building tree 1 of 500
building tree 2 of 500
building tree 5 of 500
building tree 3 of 500
building tree 4 of 500
building tree 6 of 500
building tree 7 of 500
building tree 8 of 500
building tree 10 of 500
building tree 9 of 500
building tree 11 of 500
building tree 12 of 500
building tree 13 of 500
building tree 14 of 500
building tree 15 of 500
building tree 16 of 500
building tree 17 of 500
building tree 18 of 500
building tree 19 of 500
building tree 20 of 500
building tree 21 of 500
building tree 22 of 500
building tree 23 of 500
building tree 24 of 500
building tree 25 of 500


[Parallel(n_jobs=5)]: Done  22 tasks      | elapsed:   24.0s


building tree 26 of 500
building tree 27 of 500
building tree 28 of 500
building tree 29 of 500
building tree 30 of 500
building tree 31 of 500
building tree 32 of 500
building tree 33 of 500building tree 34 of 500

building tree 35 of 500
building tree 36 of 500
building tree 37 of 500
building tree 38 of 500
building tree 39 of 500
building tree 40 of 500
building tree 41 of 500
building tree 43 of 500
building tree 42 of 500
building tree 44 of 500
building tree 45 of 500
building tree 46 of 500
building tree 47 of 500
building tree 49 of 500
building tree 48 of 500
building tree 50 of 500
building tree 51 of 500
building tree 52 of 500
building tree 53 of 500
building tree 54 of 500
building tree 55 of 500
building tree 56 of 500
building tree 57 of 500
building tree 59 of 500
building tree 58 of 500
building tree 60 of 500
building tree 61 of 500
building tree 62 of 500
building tree 63 of 500
building tree 64 of 500
building tree 65 of 500
building tree 66 of 500
building tree 67

[Parallel(n_jobs=5)]: Done 118 tasks      | elapsed:  1.9min


building tree 122 of 500
building tree 123 of 500
building tree 124 of 500
building tree 125 of 500
building tree 126 of 500
building tree 127 of 500
building tree 128 of 500
building tree 129 of 500building tree 130 of 500

building tree 131 of 500
building tree 132 of 500
building tree 133 of 500
building tree 134 of 500
building tree 135 of 500
building tree 136 of 500
building tree 138 of 500
building tree 137 of 500
building tree 139 of 500
building tree 140 of 500
building tree 141 of 500
building tree 142 of 500
building tree 143 of 500
building tree 144 of 500
building tree 145 of 500
building tree 146 of 500
building tree 147 of 500
building tree 148 of 500
building tree 149 of 500
building tree 150 of 500
building tree 151 of 500
building tree 152 of 500
building tree 153 of 500building tree 154 of 500

building tree 155 of 500
building tree 156 of 500
building tree 157 of 500
building tree 158 of 500
building tree 160 of 500
building tree 159 of 500
building tree 161 of 500


[Parallel(n_jobs=5)]: Done 278 tasks      | elapsed:  4.3min


building tree 282 of 500
building tree 283 of 500
building tree 284 of 500
building tree 285 of 500
building tree 286 of 500
building tree 287 of 500
building tree 288 of 500
building tree 289 of 500
building tree 290 of 500
building tree 291 of 500
building tree 292 of 500
building tree 293 of 500
building tree 294 of 500
building tree 295 of 500
building tree 296 of 500
building tree 297 of 500
building tree 298 of 500
building tree 299 of 500
building tree 300 of 500
building tree 301 of 500
building tree 302 of 500
building tree 303 of 500
building tree 304 of 500
building tree 305 of 500
building tree 306 of 500
building tree 307 of 500
building tree 308 of 500
building tree 309 of 500
building tree 310 of 500
building tree 311 of 500
building tree 312 of 500
building tree 313 of 500
building tree 314 of 500
building tree 315 of 500
building tree 316 of 500
building tree 317 of 500
building tree 318 of 500
building tree 320 of 500building tree 319 of 500

building tree 321 of 500


[Parallel(n_jobs=5)]: Done 500 out of 500 | elapsed:  7.5min finished
[Parallel(n_jobs=5)]: Using backend ThreadingBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  22 tasks      | elapsed:    0.5s
[Parallel(n_jobs=5)]: Done 118 tasks      | elapsed:    2.8s
[Parallel(n_jobs=5)]: Done 278 tasks      | elapsed:    6.5s
[Parallel(n_jobs=5)]: Done 500 out of 500 | elapsed:   10.5s finished


In [15]:
# Calculate the confusion matrix
cm = confusion_matrix(y_test, res_brf_predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculate the accuracy score
res_brf_accuracy_score = balanced_accuracy_score(y_test, res_brf_predictions)

In [16]:
# Display results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {res_brf_accuracy_score}")
print("Classification Report")
print(classification_report(y_test, res_brf_predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,69436,3670
Actual 1,5332,1511


Accuracy Score : 0.5853042542758963
Classification Report
              precision    recall  f1-score   support

           0       0.93      0.95      0.94     73106
           1       0.29      0.22      0.25      6843

    accuracy                           0.89     79949
   macro avg       0.61      0.59      0.60     79949
weighted avg       0.87      0.89      0.88     79949



In [20]:
display(sum(y_res)/len(y_res))

0.5

In [22]:
sum(y_train)/len(y_train)

0.08559659114598533