In [25]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, classification_report

In [26]:
# Read heart_2022_cleaned.csv into a DataFrame. The path is relative, so it is
# resolved against the notebook's current working directory.
heart_2022_df = pd.read_csv(filepath_or_buffer='heart_2022_cleaned.csv')

In [27]:
# Show the loaded frame. As the cell's last expression it is rendered as the
# cell output; the recorded output below is truncated to the first and last
# rows with an ellipsis row in between.
heart_2022_df

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,27.99,Yes,No,No,4.0,0.0,No,Female,65-69,White,No,Yes,Very good,9.0,No,No,No
1,No,30.13,Yes,No,No,0.0,0.0,No,Male,70-74,White,Yes,Yes,Very good,6.0,No,No,No
2,No,31.66,Yes,Yes,No,0.0,0.0,Yes,Male,75-79,White,No,No,Very good,8.0,No,No,No
3,No,31.32,No,No,No,5.0,0.0,Yes,Female,80 or older,White,No,Yes,Fair,9.0,No,No,Yes
4,No,33.07,No,No,No,3.0,15.0,No,Female,80 or older,White,No,Yes,Good,5.0,No,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
246017,No,32.28,No,Yes,No,0.0,0.0,No,Male,60-64,White,No,Yes,Very good,6.0,No,No,No
246018,No,24.34,No,No,No,0.0,7.0,No,Female,25-29,Black,No,Yes,Fair,7.0,No,No,No
246019,No,29.86,No,Yes,Yes,0.0,15.0,No,Male,65-69,Other,Yes,Yes,Good,7.0,No,No,No
246020,No,28.66,No,No,No,2.0,2.0,No,Female,50-54,Black,No,Yes,Excellent,7.0,No,No,No


In [28]:
# Separate the target from the predictors: y is the 'HeartDisease' column,
# X is every remaining column of the frame.
y = heart_2022_df['HeartDisease']
X = heart_2022_df.drop(columns=['HeartDisease'])

In [29]:
# Encoder used to turn the target labels into integer codes. Keeping the
# fitted object around allows mapping codes back to the original labels.
label_encoder = LabelEncoder()

In [30]:
# Learn the set of target labels and convert y to integer codes in one call.
# The classification report further down shows the resulting codes 0 and 1;
# presumably 'No' -> 0 and 'Yes' -> 1 (confirm via label_encoder.classes_).
y_encoded = label_encoder.fit_transform(y)

In [31]:
# One-hot encode the object-dtype columns of X and pass every other column
# through unchanged, producing a single dense feature matrix.
categorical_cols = X.select_dtypes(include=['object']).columns
preprocessor = ColumnTransformer(
    transformers=[
        # drop='first' omits the first category's indicator column for each
        # encoded feature, so k categories become k - 1 columns.
        ('cat', OneHotEncoder(drop='first'), categorical_cols)
    ],
    remainder='passthrough',
    # sparse_threshold=0 makes the ColumnTransformer always return a dense
    # array, whatever the individual transformers emit. This replaces the
    # former OneHotEncoder(sparse=False): that keyword was renamed to
    # sparse_output in scikit-learn 1.2 and removed in 1.4, where the old
    # spelling raises TypeError. Densifying here works on old and new
    # releases alike and yields the same matrix.
    sparse_threshold=0,
)
# Fits the encoder on all of X and returns the transformed feature matrix.
X_encoded = preprocessor.fit_transform(X)



In [35]:
# Hold out 20% of the rows for evaluation. The fixed random_state keeps the
# shuffle, and therefore the split, identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y_encoded,
    test_size=0.2,
    random_state=42,
)

In [36]:
# Random forest with the library's default hyperparameters; only the seed is
# pinned so that repeated runs are reproducible.
random_forest_model = RandomForestClassifier(
    random_state=42,
)

In [37]:
# Train the forest on the training portion of the split.
random_forest_model.fit(X=X_train, y=y_train)

In [38]:
# Predict class codes for the held-out rows.
y_pred = random_forest_model.predict(X=X_test)

In [39]:
# Share of held-out rows whose predicted code equals the true code.
# NOTE(review): the recorded 0.94 is close to the majority-class share in the
# report below (46573 of 49205 rows), so read it alongside per-class recall.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.94


In [40]:
# Per-class precision, recall and F1 on the held-out rows, printed under a
# plain-text heading.
print("Classification Report:")
print(classification_report(y_true=y_test, y_pred=y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.99      0.97     46573
           1       0.21      0.03      0.05      2632

    accuracy                           0.94     49205
   macro avg       0.58      0.51      0.51     49205
weighted avg       0.91      0.94      0.92     49205

