## Import the Library

In [1]:
import pandas as pd
import numpy as np

## Load and Inspect the Data

### Reading the Dataset

In [2]:
data = pd.read_csv("data/heart_attack_prediction_indonesia.csv")
data.head()

Unnamed: 0,age,gender,region,income_level,hypertension,diabetes,cholesterol_level,obesity,waist_circumference,family_history,...,blood_pressure_diastolic,fasting_blood_sugar,cholesterol_hdl,cholesterol_ldl,triglycerides,EKG_results,previous_heart_disease,medication_usage,participated_in_free_screening,heart_attack
0,60,Male,Rural,Middle,0,1,211,0,83,0,...,62,173,48,121,101,Normal,0,0,0,0
1,53,Female,Urban,Low,0,0,208,0,106,1,...,76,70,58,83,138,Normal,1,0,1,0
2,62,Female,Urban,Low,0,0,231,1,112,1,...,74,118,69,130,171,Abnormal,0,1,0,1
3,73,Male,Urban,Low,1,0,202,0,82,1,...,65,98,52,85,146,Normal,0,1,1,0
4,52,Male,Urban,Middle,1,0,232,0,89,0,...,75,104,59,127,139,Normal,1,0,1,1


### Check for Missing Values

In [3]:
data.isna().sum()

age                                   0
gender                                0
region                                0
income_level                          0
hypertension                          0
diabetes                              0
cholesterol_level                     0
obesity                               0
waist_circumference                   0
family_history                        0
smoking_status                        0
alcohol_consumption               94848
physical_activity                     0
dietary_habits                        0
air_pollution_exposure                0
stress_level                          0
sleep_hours                           0
blood_pressure_systolic               0
blood_pressure_diastolic              0
fasting_blood_sugar                   0
cholesterol_hdl                       0
cholesterol_ldl                       0
triglycerides                         0
EKG_results                           0
previous_heart_disease                0


### Fix the Missing Values

In [4]:
data = pd.read_csv("data/heart_attack_prediction_indonesia.csv", keep_default_na=False)
data.isna().sum()

age                               0
gender                            0
region                            0
income_level                      0
hypertension                      0
diabetes                          0
cholesterol_level                 0
obesity                           0
waist_circumference               0
family_history                    0
smoking_status                    0
alcohol_consumption               0
physical_activity                 0
dietary_habits                    0
air_pollution_exposure            0
stress_level                      0
sleep_hours                       0
blood_pressure_systolic           0
blood_pressure_diastolic          0
fasting_blood_sugar               0
cholesterol_hdl                   0
cholesterol_ldl                   0
triglycerides                     0
EKG_results                       0
previous_heart_disease            0
medication_usage                  0
participated_in_free_screening    0
heart_attack                

## Preprocessing

### Check the Data Types

In [5]:
data.dtypes

age                                 int64
gender                             object
region                             object
income_level                       object
hypertension                        int64
diabetes                            int64
cholesterol_level                   int64
obesity                             int64
waist_circumference                 int64
family_history                      int64
smoking_status                     object
alcohol_consumption                object
physical_activity                  object
dietary_habits                     object
air_pollution_exposure             object
stress_level                       object
sleep_hours                       float64
blood_pressure_systolic             int64
blood_pressure_diastolic            int64
fasting_blood_sugar                 int64
cholesterol_hdl                     int64
cholesterol_ldl                     int64
triglycerides                       int64
EKG_results                       

In [6]:
data.select_dtypes(include="object").shape[1]

10

### Shuffle and Reduce the Data

In [7]:
data_shuffled = data.sample(frac=1, random_state=42).reset_index(drop=True)

data_reduced = data_shuffled.head(10000)

print(data_reduced.shape)

(10000, 28)


### Split the Data into Features and Target (X & y)

In [8]:
X = data_reduced.drop('heart_attack', axis = 1)
X.head()

Unnamed: 0,age,gender,region,income_level,hypertension,diabetes,cholesterol_level,obesity,waist_circumference,family_history,...,blood_pressure_systolic,blood_pressure_diastolic,fasting_blood_sugar,cholesterol_hdl,cholesterol_ldl,triglycerides,EKG_results,previous_heart_disease,medication_usage,participated_in_free_screening
0,52,Female,Urban,Middle,0,1,217,0,84,0,...,154,67,70,58,125,57,Abnormal,0,1,0
1,69,Male,Urban,Low,1,0,233,0,114,0,...,129,68,90,52,128,144,Normal,1,0,1
2,74,Male,Urban,Middle,0,0,176,0,57,0,...,152,90,137,31,133,150,Normal,0,0,1
3,48,Female,Urban,Middle,1,0,143,0,96,1,...,101,79,81,57,127,159,Normal,0,0,0
4,38,Female,Urban,Low,1,0,176,0,89,1,...,142,79,136,42,160,214,Normal,1,1,1


In [9]:
y = data_reduced['heart_attack']
y.head()

0    1
1    1
2    0
3    0
4    0
Name: heart_attack, dtype: int64

### Data Conversion

In [10]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ['gender', 'region', 'income_level', 'smoking_status', 'alcohol_consumption', 'physical_activity', 'dietary_habits',
                        'air_pollution_exposure', 'stress_level', 'EKG_results']

transformer = ColumnTransformer(
            [('one_hot', OneHotEncoder(), categorical_features)],
            remainder = "passthrough")

X_transformed = transformer.fit_transform(X)

In [12]:
X.head()

Unnamed: 0,age,gender,region,income_level,hypertension,diabetes,cholesterol_level,obesity,waist_circumference,family_history,...,blood_pressure_systolic,blood_pressure_diastolic,fasting_blood_sugar,cholesterol_hdl,cholesterol_ldl,triglycerides,EKG_results,previous_heart_disease,medication_usage,participated_in_free_screening
0,52,Female,Urban,Middle,0,1,217,0,84,0,...,154,67,70,58,125,57,Abnormal,0,1,0
1,69,Male,Urban,Low,1,0,233,0,114,0,...,129,68,90,52,128,144,Normal,1,0,1
2,74,Male,Urban,Middle,0,0,176,0,57,0,...,152,90,137,31,133,150,Normal,0,0,1
3,48,Female,Urban,Middle,1,0,143,0,96,1,...,101,79,81,57,127,159,Normal,0,0,0
4,38,Female,Urban,Low,1,0,176,0,89,1,...,142,79,136,42,160,214,Normal,1,1,1


In [13]:
pd.DataFrame(X_transformed)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,33,34,35,36,37,38,39,40,41,42
0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,...,5.408982,154.0,67.0,70.0,58.0,125.0,57.0,0.0,1.0,0.0
1,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,8.899686,129.0,68.0,90.0,52.0,128.0,144.0,1.0,0.0,1.0
2,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,6.502508,152.0,90.0,137.0,31.0,133.0,150.0,0.0,0.0,1.0
3,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,...,7.391399,101.0,79.0,81.0,57.0,127.0,159.0,0.0,0.0,0.0
4,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,9.000000,142.0,79.0,136.0,42.0,160.0,214.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,6.086351,129.0,74.0,160.0,50.0,111.0,100.0,0.0,1.0,0.0
9996,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,8.364264,141.0,70.0,70.0,59.0,146.0,190.0,1.0,1.0,1.0
9997,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,4.534615,126.0,83.0,94.0,54.0,161.0,101.0,0.0,0.0,1.0
9998,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,6.610461,133.0,84.0,153.0,60.0,98.0,148.0,0.0,0.0,1.0


In [14]:
dummies = pd.get_dummies(
    data_reduced[['gender', 'region', 'income_level', 'smoking_status', 'alcohol_consumption', 'physical_activity', 'dietary_habits', 'air_pollution_exposure',
          'stress_level', 'EKG_results']])
dummies

Unnamed: 0,gender_Female,gender_Male,region_Rural,region_Urban,income_level_High,income_level_Low,income_level_Middle,smoking_status_Current,smoking_status_Never,smoking_status_Past,...,dietary_habits_Healthy,dietary_habits_Unhealthy,air_pollution_exposure_High,air_pollution_exposure_Low,air_pollution_exposure_Moderate,stress_level_High,stress_level_Low,stress_level_Moderate,EKG_results_Abnormal,EKG_results_Normal
0,True,False,False,True,False,False,True,False,True,False,...,False,True,False,True,False,False,True,False,True,False
1,False,True,False,True,False,True,False,False,True,False,...,True,False,False,False,True,False,False,True,False,True
2,False,True,False,True,False,False,True,False,False,True,...,False,True,True,False,False,False,True,False,False,True
3,True,False,False,True,False,False,True,False,True,False,...,False,True,False,False,True,False,False,True,False,True
4,True,False,False,True,False,True,False,False,True,False,...,True,False,False,False,True,False,False,True,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,False,True,True,False,False,True,False,True,False,False,...,False,True,False,False,True,False,True,False,False,True
9996,True,False,False,True,False,True,False,False,True,False,...,False,True,False,False,True,False,False,True,False,True
9997,False,True,True,False,False,False,True,True,False,False,...,False,True,False,True,False,False,False,True,False,True
9998,False,True,True,False,False,False,True,False,True,False,...,False,True,True,False,False,False,True,False,False,True


## Splitting Data Into Training and Testing

In [15]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.2, random_state=42)

## Fit into Model

In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

model_rf = RandomForestClassifier()
model_rf.fit(X_train, y_train)
y_pred_rf = model_rf.predict(X_test)

model_svc = SVC()
model_svc.fit(X_train, y_train)
y_pred_svc = model_svc.predict(X_test)

model_gb = GradientBoostingClassifier()
model_gb.fit(X_train, y_train)
y_pred_gb = model_gb.predict(X_test)

model_xgb = XGBClassifier()
model_xgb.fit(X_train, y_train)
y_pred_xgb = model_xgb.predict(X_test)


In [18]:
from sklearn.model_selection import cross_val_score

models = {
    "Random Forest": RandomForestClassifier(),
    "SVM": SVC(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "XGBoost": XGBClassifier(eval_metric='logloss')
}

for name, model in models.items():
    scores = cross_val_score(model, X_transformed, y, cv=5, scoring='accuracy')
    print(f"\n{name} Cross-Validation Accuracy:")
    print(f"Scores: {scores}")
    print(f"Mean Accuracy: {scores.mean():.4f} | Std: {scores.std():.4f}")


Random Forest Cross-Validation Accuracy:
Scores: [0.7375 0.729  0.7195 0.7255 0.7215]
Mean Accuracy: 0.7266 | Std: 0.0064

SVM Cross-Validation Accuracy:
Scores: [0.591  0.591  0.591  0.591  0.5915]
Mean Accuracy: 0.5911 | Std: 0.0002

Gradient Boosting Cross-Validation Accuracy:
Scores: [0.741  0.73   0.7415 0.7375 0.733 ]
Mean Accuracy: 0.7366 | Std: 0.0045

XGBoost Cross-Validation Accuracy:
Scores: [0.704  0.7185 0.7015 0.706  0.7155]
Mean Accuracy: 0.7091 | Std: 0.0067
