In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
# Load the dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("heart_2020_cleaned.csv")
# A bare expression on the last line renders the (truncated) frame as cell output.
df

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.60,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319790,Yes,27.41,Yes,No,No,7.0,0.0,Yes,Male,60-64,Hispanic,Yes,No,Fair,6.0,Yes,No,No
319791,No,29.84,Yes,No,No,0.0,0.0,No,Male,35-39,Hispanic,No,Yes,Very good,5.0,Yes,No,No
319792,No,24.24,No,No,No,0.0,0.0,No,Female,45-49,Hispanic,No,Yes,Good,6.0,No,No,No
319793,No,32.81,No,No,No,0.0,0.0,No,Female,25-29,Hispanic,No,No,Good,12.0,No,No,No


In [3]:
# Inspect the dtype of every column to see which ones are text (object) and
# therefore need encoding before they can be fed to a model.
df.dtypes

HeartDisease         object
BMI                 float64
Smoking              object
AlcoholDrinking      object
Stroke               object
PhysicalHealth      float64
MentalHealth        float64
DiffWalking          object
Sex                  object
AgeCategory          object
Race                 object
Diabetic             object
PhysicalActivity     object
GenHealth            object
SleepTime           float64
Asthma               object
KidneyDisease        object
SkinCancer           object
dtype: object

In [4]:
# One-hot encode every object (string) column; numeric columns pass through
# unchanged. The indicator dtype is pinned to uint8 because pandas 2.0 changed
# the default from uint8 to bool; pinning keeps the 0/1 encoding (and the
# integer class labels derived from it) identical across pandas versions.
dummies_df = pd.get_dummies(df, dtype="uint8")

In [5]:
# Keep a single indicator for each Yes/No column by dropping its "_No" twin.
# The two minor Diabetic levels are dropped as well, which leaves Diabetic_Yes
# as the only diabetes indicator.
# NOTE: dummies_df is reassigned from itself, so re-running this cell without
# first re-running the get_dummies cell raises KeyError (columns already gone).
dummies_df = dummies_df.drop(
    columns=[
        "HeartDisease_No",
        "Smoking_No",
        "AlcoholDrinking_No",
        "Stroke_No",
        "DiffWalking_No",
        "Diabetic_No",
        "Diabetic_No, borderline diabetes",
        "Diabetic_Yes (during pregnancy)",
        "PhysicalActivity_No",
        "Asthma_No",
        "KidneyDisease_No",
        "SkinCancer_No",
    ]
)
dummies_df

Unnamed: 0,BMI,PhysicalHealth,MentalHealth,SleepTime,HeartDisease_Yes,Smoking_Yes,AlcoholDrinking_Yes,Stroke_Yes,DiffWalking_Yes,Sex_Female,...,Diabetic_Yes,PhysicalActivity_Yes,GenHealth_Excellent,GenHealth_Fair,GenHealth_Good,GenHealth_Poor,GenHealth_Very good,Asthma_Yes,KidneyDisease_Yes,SkinCancer_Yes
0,16.60,3.0,30.0,5.0,0,1,0,0,0,1,...,1,1,0,0,0,0,1,1,0,1
1,20.34,0.0,0.0,7.0,0,0,0,1,0,1,...,0,1,0,0,0,0,1,0,0,0
2,26.58,20.0,30.0,8.0,0,1,0,0,0,0,...,1,1,0,1,0,0,0,1,0,0
3,24.21,0.0,0.0,6.0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,1
4,23.71,28.0,0.0,8.0,0,0,0,0,1,1,...,0,1,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319790,27.41,7.0,0.0,6.0,1,1,0,0,1,0,...,1,0,0,1,0,0,0,1,0,0
319791,29.84,0.0,0.0,5.0,0,1,0,0,0,0,...,0,1,0,0,0,0,1,1,0,0
319792,24.24,0.0,0.0,6.0,0,0,0,0,0,1,...,0,1,0,0,1,0,0,0,0,0
319793,32.81,0.0,0.0,12.0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0


In [6]:
# Feature matrix: every encoded column except the target indicator.
# DataFrame.drop returns a new frame, so dummies_df itself is left untouched.
X = dummies_df.drop(columns="HeartDisease_Yes")
X.head()

Unnamed: 0,BMI,PhysicalHealth,MentalHealth,SleepTime,Smoking_Yes,AlcoholDrinking_Yes,Stroke_Yes,DiffWalking_Yes,Sex_Female,Sex_Male,...,Diabetic_Yes,PhysicalActivity_Yes,GenHealth_Excellent,GenHealth_Fair,GenHealth_Good,GenHealth_Poor,GenHealth_Very good,Asthma_Yes,KidneyDisease_Yes,SkinCancer_Yes
0,16.6,3.0,30.0,5.0,1,0,0,0,1,0,...,1,1,0,0,0,0,1,1,0,1
1,20.34,0.0,0.0,7.0,0,0,1,0,1,0,...,0,1,0,0,0,0,1,0,0,0
2,26.58,20.0,30.0,8.0,1,0,0,0,0,1,...,1,1,0,1,0,0,0,1,0,0
3,24.21,0.0,0.0,6.0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,1
4,23.71,28.0,0.0,8.0,0,0,0,1,1,0,...,0,1,0,0,0,0,1,0,0,0


In [7]:
# Target vector as a 1-D NumPy array: 1 where HeartDisease was "Yes", else 0.
# Series.to_numpy() is used instead of Series.ravel(), which pandas 2.2
# deprecated (a Series is already one-dimensional); the resulting array is the same.
y = dummies_df["HeartDisease_Yes"].to_numpy()
y

array([0, 0, 0, ..., 0, 0, 0], dtype=uint8)

In [8]:
# Hold out part of the data for evaluation; the fixed seed makes the split
# reproducible. No test_size is given, so scikit-learn's default applies.
# NOTE(review): the split is not stratified although the target is heavily
# imbalanced (see the class supports in the classification report) - consider
# passing stratify=y; left unchanged here so the recorded results stay valid.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=14,
)

In [9]:
# Standardise the features. The scaler is fitted on the training split only and
# then reused for the test split, so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# fit() returns the estimator itself, so X_scaler is simply the fitted scaler.
X_scaler = scaler

In [10]:
# Random forest with 150 trees. random_state fixes the per-tree seeds, so the
# fitted model is the same regardless of how many workers build the trees;
# n_jobs=-1 uses all available cores to speed up fit/predict on this large dataset.
rf_model = RandomForestClassifier(n_estimators=150, random_state=42, n_jobs=-1)

In [11]:
# Train the forest on the scaled training split. fit() trains in place and
# returns the same estimator, so no reassignment is needed; the trailing
# semicolon keeps the cell from echoing the estimator repr.
rf_model.fit(X_train_scaled, y_train);

In [12]:
# Predict class labels for the held-out split, using the test features that were
# scaled with the scaler fitted on the training data.
predictions = rf_model.predict(X_test_scaled)

In [13]:
# Confusion matrix on the test split: rows are the true classes, columns the
# predicted classes. Wrapped in a labelled DataFrame for readable display.
cm = confusion_matrix(y_true=y_test, y_pred=predictions)
cm_df = pd.DataFrame(
    cm,
    index=[
        "Actual 0",
        "Actual 1",
    ],
    columns=[
        "Predicted 0",
        "Predicted 1",
    ],
)
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,71458,1662
Actual 1,5999,830


In [14]:
# Fraction of test samples whose predicted label matches the true label.
# With a heavily imbalanced target this is dominated by the majority class, so
# read it together with the per-class precision/recall.
acc_score = accuracy_score(y_true=y_test, y_pred=predictions)

In [15]:
# Evaluation summary for the test split: confusion matrix, overall accuracy,
# then per-class precision / recall / F1.
print("Confusion Matrix")
display(cm_df)  # rich table rendering; `display` is provided by IPython
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_true=y_test, y_pred=predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,71458,1662
Actual 1,5999,830


Accuracy Score : 0.9041764124629451
Classification Report
              precision    recall  f1-score   support

           0       0.92      0.98      0.95     73120
           1       0.33      0.12      0.18      6829

    accuracy                           0.90     79949
   macro avg       0.63      0.55      0.56     79949
weighted avg       0.87      0.90      0.88     79949



In [16]:
# Per-feature importance scores of the fitted forest, in the same order as the
# columns of X.
importances = rf_model.feature_importances_

In [17]:
# Pair each importance score with its column name and list the pairs from most
# to least important (reuses the `importances` array captured in the cell above).
sorted(
    zip(importances, X.columns),
    reverse=True,
)

[(0.34932263598660573, 'BMI'),
 (0.11128692993021412, 'SleepTime'),
 (0.07956517285793889, 'PhysicalHealth'),
 (0.06876943100106, 'MentalHealth'),
 (0.028834863459870602, 'Stroke_Yes'),
 (0.02595505431160364, 'PhysicalActivity_Yes'),
 (0.024717927289688028, 'DiffWalking_Yes'),
 (0.022928144550659245, 'Diabetic_Yes'),
 (0.022174538075900946, 'Asthma_Yes'),
 (0.02048302232968573, 'Smoking_Yes'),
 (0.017496114304512988, 'AgeCategory_80 or older'),
 (0.016756709710852934, 'SkinCancer_Yes'),
 (0.016089032745666546, 'KidneyDisease_Yes'),
 (0.015044108543686756, 'GenHealth_Poor'),
 (0.012896520402142298, 'GenHealth_Fair'),
 (0.012702627232041469, 'Race_White'),
 (0.01248611101382768, 'AgeCategory_75-79'),
 (0.01237185180377366, 'AgeCategory_70-74'),
 (0.010684971303750783, 'AgeCategory_65-69'),
 (0.010183739999961828, 'AlcoholDrinking_Yes'),
 (0.00957771598344391, 'AgeCategory_60-64'),
 (0.008877086156468502, 'GenHealth_Good'),
 (0.008782761836144296, 'Sex_Female'),
 (0.008443501778898221, 'S