In [5]:
# Initial imports
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
%matplotlib inline

# Needed for decision tree visualization
import pydotplus
from IPython.display import Image
from sklearn.utils import resample

# Loading Data

In [6]:
#load full dataset
df_heart = pd.read_csv('heart_2020_cleaned.csv')
df_heart.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.6,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No


In [7]:
df_heart.describe()

Unnamed: 0,BMI,PhysicalHealth,MentalHealth,SleepTime
count,319795.0,319795.0,319795.0,319795.0
mean,28.325399,3.37171,3.898366,7.097075
std,6.3561,7.95085,7.955235,1.436007
min,12.02,0.0,0.0,1.0
25%,24.03,0.0,0.0,6.0
50%,27.34,0.0,0.0,7.0
75%,31.42,2.0,3.0,8.0
max,94.85,30.0,30.0,24.0


# Convert Features
### Age categories to numerical (0-12)
### Race categories to numerical (0-5)
### Diabetic categories to numerical (0-3)
### GenHealth from categories to numerical (0-4)

In [8]:
#change category data to numerical 
df_heart.loc[df_heart['AgeCategory'].str.contains('18-24'), 'AgeCategories'] = 0
df_heart.loc[df_heart['AgeCategory'].str.contains('25-29'), 'AgeCategories'] = 1
df_heart.loc[df_heart['AgeCategory'].str.contains('30-34'), 'AgeCategories'] = 2
df_heart.loc[df_heart['AgeCategory'].str.contains('35-39'), 'AgeCategories'] = 3
df_heart.loc[df_heart['AgeCategory'].str.contains('40-44'), 'AgeCategories'] = 4
df_heart.loc[df_heart['AgeCategory'].str.contains('45-49'), 'AgeCategories'] = 5
df_heart.loc[df_heart['AgeCategory'].str.contains('50-54'), 'AgeCategories'] = 6
df_heart.loc[df_heart['AgeCategory'].str.contains('55-59'), 'AgeCategories'] = 7
df_heart.loc[df_heart['AgeCategory'].str.contains('60-64'), 'AgeCategories'] = 8
df_heart.loc[df_heart['AgeCategory'].str.contains('65-69'), 'AgeCategories'] = 9
df_heart.loc[df_heart['AgeCategory'].str.contains('70-74'), 'AgeCategories'] = 10
df_heart.loc[df_heart['AgeCategory'].str.contains('75-79'), 'AgeCategories'] = 11
df_heart.loc[df_heart['AgeCategory'].str.contains('80 or older'), 'AgeCategories'] = 12

df_heart.loc[df_heart['Race'].str.contains('American Indian/Alaskan Native'), 'RaceCategories'] = 0
df_heart.loc[df_heart['Race'].str.contains('Asian'), 'RaceCategories'] = 1
df_heart.loc[df_heart['Race'].str.contains('Black'), 'RaceCategories'] = 2
df_heart.loc[df_heart['Race'].str.contains('Hispanic'), 'RaceCategories'] = 3
df_heart.loc[df_heart['Race'].str.contains('Other'), 'RaceCategories'] = 4
df_heart.loc[df_heart['Race'].str.contains('White'), 'RaceCategories'] = 5

df_heart.loc[df_heart['Diabetic'].str.contains('No'), 'DiabeticCategories'] = 0
df_heart.loc[df_heart['Diabetic'].str.contains('No, borderline diabetes'), 'DiabeticCategories'] = 1
df_heart.loc[df_heart['Diabetic'].str.contains('Yes (during pregnancy)'), 'DiabeticCategories'] = 2
df_heart.loc[df_heart['Diabetic'].str.contains('Yes'), 'DiabeticCategories'] = 3

df_heart.loc[df_heart['GenHealth'].str.contains('Excellent'), 'GenHealthCategories'] = 0
df_heart.loc[df_heart['GenHealth'].str.contains('Very good'), 'GenHealthCategories'] = 1
df_heart.loc[df_heart['GenHealth'].str.contains('Good'), 'GenHealthCategories'] = 2
df_heart.loc[df_heart['GenHealth'].str.contains('Fair'), 'GenHealthCategories'] = 3
df_heart.loc[df_heart['GenHealth'].str.contains('Poor'), 'GenHealthCategories'] = 4

df_heart.head()
df_heart.columns




  df_heart.loc[df_heart['Diabetic'].str.contains('Yes (during pregnancy)'), 'DiabeticCategories'] = 2


Index(['HeartDisease', 'BMI', 'Smoking', 'AlcoholDrinking', 'Stroke',
       'PhysicalHealth', 'MentalHealth', 'DiffWalking', 'Sex', 'AgeCategory',
       'Race', 'Diabetic', 'PhysicalActivity', 'GenHealth', 'SleepTime',
       'Asthma', 'KidneyDisease', 'SkinCancer', 'AgeCategories',
       'RaceCategories', 'DiabeticCategories', 'GenHealthCategories'],
      dtype='object')

In [9]:
#info for data types 
df_heart.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 319795 entries, 0 to 319794
Data columns (total 22 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   HeartDisease         319795 non-null  object 
 1   BMI                  319795 non-null  float64
 2   Smoking              319795 non-null  object 
 3   AlcoholDrinking      319795 non-null  object 
 4   Stroke               319795 non-null  object 
 5   PhysicalHealth       319795 non-null  float64
 6   MentalHealth         319795 non-null  float64
 7   DiffWalking          319795 non-null  object 
 8   Sex                  319795 non-null  object 
 9   AgeCategory          319795 non-null  object 
 10  Race                 319795 non-null  object 
 11  Diabetic             319795 non-null  object 
 12  PhysicalActivity     319795 non-null  object 
 13  GenHealth            319795 non-null  object 
 14  SleepTime            319795 non-null  float64
 15  Asthma           

In [10]:
#drop old category columns 
df_one = df_heart.copy()
df_one.drop(["AgeCategory","Race","Diabetic","GenHealth"], axis=1, inplace=True)
df_one.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,PhysicalActivity,SleepTime,Asthma,KidneyDisease,SkinCancer,AgeCategories,RaceCategories,DiabeticCategories,GenHealthCategories
0,No,16.6,Yes,No,No,3.0,30.0,No,Female,Yes,5.0,Yes,No,Yes,7.0,5.0,3.0,1.0
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,Yes,7.0,No,No,No,12.0,5.0,0.0,1.0
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,Yes,8.0,Yes,No,No,9.0,5.0,3.0,3.0
3,No,24.21,No,No,No,0.0,0.0,No,Female,No,6.0,No,No,Yes,11.0,5.0,0.0,2.0
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,Yes,8.0,No,No,No,4.0,5.0,0.0,1.0


# Transform data (yes and no) to dummy variables

In [11]:
#transform to dummy categories and view columns
df_dummies = pd.get_dummies(df_one)
df_dummies.columns

Index(['BMI', 'PhysicalHealth', 'MentalHealth', 'SleepTime', 'AgeCategories',
       'RaceCategories', 'DiabeticCategories', 'GenHealthCategories',
       'HeartDisease_No', 'HeartDisease_Yes', 'Smoking_No', 'Smoking_Yes',
       'AlcoholDrinking_No', 'AlcoholDrinking_Yes', 'Stroke_No', 'Stroke_Yes',
       'DiffWalking_No', 'DiffWalking_Yes', 'Sex_Female', 'Sex_Male',
       'PhysicalActivity_No', 'PhysicalActivity_Yes', 'Asthma_No',
       'Asthma_Yes', 'KidneyDisease_No', 'KidneyDisease_Yes', 'SkinCancer_No',
       'SkinCancer_Yes'],
      dtype='object')

In [12]:
#grab only one of the dummy columns (Yes) do not use (NO)
df_two = df_dummies[['HeartDisease_Yes','BMI', 'PhysicalHealth', 'MentalHealth', 'SleepTime', 'Smoking_Yes', 'AlcoholDrinking_Yes','Stroke_Yes', 'DiffWalking_Yes','Sex_Male','PhysicalActivity_Yes','Asthma_Yes' , 'KidneyDisease_Yes',"AgeCategories","RaceCategories","DiabeticCategories","GenHealthCategories"]].copy()
df_two.columns
df_two.head()

Unnamed: 0,HeartDisease_Yes,BMI,PhysicalHealth,MentalHealth,SleepTime,Smoking_Yes,AlcoholDrinking_Yes,Stroke_Yes,DiffWalking_Yes,Sex_Male,PhysicalActivity_Yes,Asthma_Yes,KidneyDisease_Yes,AgeCategories,RaceCategories,DiabeticCategories,GenHealthCategories
0,0,16.6,3.0,30.0,5.0,1,0,0,0,0,1,1,0,7.0,5.0,3.0,1.0
1,0,20.34,0.0,0.0,7.0,0,0,1,0,0,1,0,0,12.0,5.0,0.0,1.0
2,0,26.58,20.0,30.0,8.0,1,0,0,0,1,1,1,0,9.0,5.0,3.0,3.0
3,0,24.21,0.0,0.0,6.0,0,0,0,0,0,0,0,0,11.0,5.0,0.0,2.0
4,0,23.71,28.0,0.0,8.0,0,0,0,1,0,1,0,0,4.0,5.0,0.0,1.0


In [13]:
#Data types
df_two.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 319795 entries, 0 to 319794
Data columns (total 17 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   HeartDisease_Yes      319795 non-null  uint8  
 1   BMI                   319795 non-null  float64
 2   PhysicalHealth        319795 non-null  float64
 3   MentalHealth          319795 non-null  float64
 4   SleepTime             319795 non-null  float64
 5   Smoking_Yes           319795 non-null  uint8  
 6   AlcoholDrinking_Yes   319795 non-null  uint8  
 7   Stroke_Yes            319795 non-null  uint8  
 8   DiffWalking_Yes       319795 non-null  uint8  
 9   Sex_Male              319795 non-null  uint8  
 10  PhysicalActivity_Yes  319795 non-null  uint8  
 11  Asthma_Yes            319795 non-null  uint8  
 12  KidneyDisease_Yes     319795 non-null  uint8  
 13  AgeCategories         319795 non-null  float64
 14  RaceCategories        319795 non-null  float64
 15  

# Stats on all columns

In [14]:
'''Stats on all columns'''
df_two.describe()

Unnamed: 0,HeartDisease_Yes,BMI,PhysicalHealth,MentalHealth,SleepTime,Smoking_Yes,AlcoholDrinking_Yes,Stroke_Yes,DiffWalking_Yes,Sex_Male,PhysicalActivity_Yes,Asthma_Yes,KidneyDisease_Yes,AgeCategories,RaceCategories,DiabeticCategories,GenHealthCategories
count,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0
mean,0.085595,28.325399,3.37171,3.898366,7.097075,0.412477,0.068097,0.03774,0.13887,0.475273,0.775362,0.134061,0.036833,6.514536,4.396742,0.427974,1.404972
std,0.279766,6.3561,7.95085,7.955235,1.436007,0.492281,0.251912,0.190567,0.345812,0.499389,0.417344,0.340718,0.188352,3.564759,1.212208,1.028764,1.042918
min,0.0,12.02,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,24.03,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,4.0,5.0,0.0,1.0
50%,0.0,27.34,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,7.0,5.0,0.0,1.0
75%,0.0,31.42,2.0,3.0,8.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,9.0,5.0,0.0,2.0
max,1.0,94.85,30.0,30.0,24.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,12.0,5.0,3.0,4.0


# Preprocessing

In [15]:
# Define features set
X = df_two.copy()
X.drop("HeartDisease_Yes", axis=1, inplace=True)
X.head()

Unnamed: 0,BMI,PhysicalHealth,MentalHealth,SleepTime,Smoking_Yes,AlcoholDrinking_Yes,Stroke_Yes,DiffWalking_Yes,Sex_Male,PhysicalActivity_Yes,Asthma_Yes,KidneyDisease_Yes,AgeCategories,RaceCategories,DiabeticCategories,GenHealthCategories
0,16.6,3.0,30.0,5.0,1,0,0,0,0,1,1,0,7.0,5.0,3.0,1.0
1,20.34,0.0,0.0,7.0,0,0,1,0,0,1,0,0,12.0,5.0,0.0,1.0
2,26.58,20.0,30.0,8.0,1,0,0,0,1,1,1,0,9.0,5.0,3.0,3.0
3,24.21,0.0,0.0,6.0,0,0,0,0,0,0,0,0,11.0,5.0,0.0,2.0
4,23.71,28.0,0.0,8.0,0,0,0,1,0,1,0,0,4.0,5.0,0.0,1.0


In [16]:
# Define target vector
y = df_two["HeartDisease_Yes"].ravel()
y

array([0, 0, 0, ..., 0, 0, 0], dtype=uint8)

In [17]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=14)

# Creating StandardScaler instance
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

#scale data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Random Forest Model

In [18]:
# Create a random forest classifier 
rf_model = RandomForestClassifier(n_estimators=150, random_state=42)

#fit the model
rf_model = rf_model.fit(X_train_scaled, y_train)

#make predictions using the scaled test data
predictions = rf_model.predict(X_test_scaled)

In [19]:
#calculate confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,71551,1569
Actual 1,6010,819


In [93]:
acc_score = accuracy_score(y_test, predictions)

In [94]:
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,71551,1569
Actual 1,6010,819


Accuracy Score : 0.9052020663172773
Classification Report
              precision    recall  f1-score   support

           0       0.92      0.98      0.95     73120
           1       0.34      0.12      0.18      6829

    accuracy                           0.91     79949
   macro avg       0.63      0.55      0.56     79949
weighted avg       0.87      0.91      0.88     79949



In [20]:
# Calculate feature importance
importances = rf_model.feature_importances_

# Sort and display feature importance 
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.3656149544561324, 'BMI'),
 (0.1071145053533656, 'SleepTime'),
 (0.10389289046139177, 'AgeCategories'),
 (0.07590716406753314, 'PhysicalHealth'),
 (0.06782017762465399, 'MentalHealth'),
 (0.060033224979172094, 'GenHealthCategories'),
 (0.03671812954261984, 'RaceCategories'),
 (0.02556850834350531, 'DiabeticCategories'),
 (0.025227310146670155, 'Stroke_Yes'),
 (0.025053692376693356, 'PhysicalActivity_Yes'),
 (0.022762972330551034, 'DiffWalking_Yes'),
 (0.02061955051535811, 'Asthma_Yes'),
 (0.020088300690720925, 'Sex_Male'),
 (0.019278700463941097, 'Smoking_Yes'),
 (0.014404967752168187, 'KidneyDisease_Yes'),
 (0.009894950895522937, 'AlcoholDrinking_Yes')]