# Import Libraries

In [1]:
import pandas as pd
import numpy as np
import warnings

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.inspection import permutation_importance


# Reading Dataset

In [2]:
"""
data = pd.read_csv("data/heart_attack_prediction_indonesia.csv", keep_default_na=False)
data.head()

data_shuffled = data.sample(frac=1, random_state=42).reset_index(drop=True)
data_reduced = data_shuffled.head(10000)
print(data_reduced.shape)

data_reduced.to_csv('data/data_reduced.csv', index=False)
""";

In [3]:
# Load the reduced dataset. keep_default_na=False stops pandas from converting
# strings such as "None" or "NA" into NaN, so they stay as ordinary category
# values.
data = pd.read_csv("data/data_reduced.csv", keep_default_na=False)
# Bare last expression: the notebook renders the frame as the cell output.
data

Unnamed: 0,age,gender,region,income_level,hypertension,diabetes,cholesterol_level,obesity,waist_circumference,family_history,...,blood_pressure_diastolic,fasting_blood_sugar,cholesterol_hdl,cholesterol_ldl,triglycerides,EKG_results,previous_heart_disease,medication_usage,participated_in_free_screening,heart_attack
0,52,Female,Urban,Middle,0,1,217,0,84,0,...,67,70,58,125,57,Abnormal,0,1,0,1
1,69,Male,Urban,Low,1,0,233,0,114,0,...,68,90,52,128,144,Normal,1,0,1,1
2,74,Male,Urban,Middle,0,0,176,0,57,0,...,90,137,31,133,150,Normal,0,0,1,0
3,48,Female,Urban,Middle,1,0,143,0,96,1,...,79,81,57,127,159,Normal,0,0,0,0
4,38,Female,Urban,Low,1,0,176,0,89,1,...,79,136,42,160,214,Normal,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,43,Male,Rural,Low,1,0,255,1,110,0,...,74,160,50,111,100,Normal,0,1,0,1
9996,53,Female,Urban,Low,0,0,115,0,90,1,...,70,70,59,146,190,Normal,1,1,1,0
9997,55,Male,Rural,Middle,0,1,150,0,95,1,...,83,94,54,161,101,Normal,0,0,1,0
9998,59,Male,Rural,Middle,0,0,222,0,80,0,...,84,153,60,98,148,Normal,0,0,1,0


# Define Features and Target

In [4]:
# Input columns fed to every model. The order is significant: it is the column
# order of the feature matrix built from this list.
features = [
    'age',
    'gender',
    'region',
    'income_level',
    'hypertension',
    'diabetes',
    'cholesterol_level',
    'obesity',
    'waist_circumference',
    'family_history',
    'smoking_status',
    'alcohol_consumption',
    'physical_activity',
    'dietary_habits',
    'air_pollution_exposure',
    'stress_level',
    'sleep_hours',
    'blood_pressure_systolic',
    'blood_pressure_diastolic',
    'fasting_blood_sugar',
    'cholesterol_hdl',
    'cholesterol_ldl',
    'triglycerides',
    'EKG_results',
    'medication_usage',
    'participated_in_free_screening',
    'previous_heart_disease',
]

# Binary label column to predict.
target = 'heart_attack'

print(f"Number of Features: {len(features)}")

Number of Features: 27


# Preprocessing Categorical and Numerical Features

In [5]:
# Columns that are one-hot encoded before modelling.
categorical_features = [
    'gender',
    'region',
    'income_level',
    'smoking_status',
    'alcohol_consumption',
    'physical_activity',
    'dietary_habits',
    'air_pollution_exposure',
    'stress_level',
    'EKG_results',
]
# Every remaining entry of `features`, in its original order, is treated as numeric.
numeric_features = list(filter(lambda col: col not in categorical_features, features))

# handle_unknown='ignore': a category not seen during fit is encoded as all
# zeros instead of raising at transform time.
categorical_transformer = Pipeline([('onehot', OneHotEncoder(handle_unknown='ignore'))])
numeric_transformer = Pipeline([('scaler', StandardScaler())])

# One-hot encode the categoricals and pass every other column through unchanged.
preprocessor = ColumnTransformer(
    [('cat', categorical_transformer, categorical_features)],
    remainder='passthrough',
)

# Same one-hot encoding, but the numeric columns are additionally standardised.
svc_preprocessor = ColumnTransformer([
    ('cat', categorical_transformer, categorical_features),
    ('num', numeric_transformer, numeric_features),
])

# Splitting Data Into 5 Folds

In [6]:
# 5-fold cross-validation splitter; rows are shuffled once with a fixed seed so
# the fold assignment is reproducible across runs.
# NOTE(review): plain KFold does not balance the target classes between folds;
# consider StratifiedKFold if the heart_attack classes are imbalanced - confirm.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Define Models

In [7]:
# (key, preprocessing step, classifier) for each pipeline to compare.
# The three tree-based entries reference the very same `preprocessor` object,
# so fitting any one of those pipelines refits that shared instance; the SVC
# gets `svc_preprocessor`, which also scales the numeric columns.
_model_specs = [
    ('XGB', preprocessor, XGBClassifier(random_state=42)),
    ('RFC', preprocessor, RandomForestClassifier(random_state=42)),
    ('GBC', preprocessor, GradientBoostingClassifier(random_state=42)),
    ('SVC', svc_preprocessor, SVC(random_state=42)),
]

models = {
    key: Pipeline(steps=[('preprocessor', prep), ('classifier', clf)])
    for key, prep, clf in _model_specs
}

# Per-model score lists; one value per metric is appended for every CV fold.
_metric_names = ('accuracy', 'precision', 'recall', 'f1')
results = {key: {metric: [] for metric in _metric_names} for key in models}

# Training

In [8]:
# Install the filter BEFORE fitting: a filter registered after the loop cannot
# suppress UserWarnings emitted while the models train and predict.
warnings.filterwarnings("ignore", category=UserWarning)

# Start from empty score lists so re-running this cell does not append a second
# set of fold scores to the ones already collected.
for metric_lists in results.values():
    for fold_values in metric_lists.values():
        fold_values.clear()

# Slice the feature matrix and label vector once instead of on every fold.
X_all = data[features]
y_all = data[target]

for train_index, test_index in kf.split(X_all):
    X_train, X_test = X_all.iloc[train_index], X_all.iloc[test_index]
    y_train, y_test = y_all.iloc[train_index], y_all.iloc[test_index]

    for name, model in models.items():
        # Pipelines are refit in place, so after the loop each entry of
        # `models` holds the fit from the final fold's training split.
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        results[name]['accuracy'].append(accuracy_score(y_test, y_pred))
        results[name]['precision'].append(precision_score(y_test, y_pred))
        results[name]['recall'].append(recall_score(y_test, y_pred))
        results[name]['f1'].append(f1_score(y_test, y_pred))

# Evaluate Models

In [9]:
# (printed label, key in the per-model score dict); labels are padded so the
# percentages line up in the output.
_report_rows = (
    ("Accuracy:  ", 'accuracy'),
    ("Precision: ", 'precision'),
    ("Recall:    ", 'recall'),
    ("F1 Score:  ", 'f1'),
)

# Print the mean of each metric across folds, expressed as a percentage.
for name, fold_scores in results.items():
    print(f"\nModel: {name}")
    for label, metric in _report_rows:
        print(f"{label}{np.mean(fold_scores[metric])*100:.4f}%")


Model: XGB
Accuracy:  71.3400%
Precision: 65.8847%
Recall:    61.9236%
F1 Score:  63.8406%

Model: RFC
Accuracy:  72.5100%
Precision: 69.6552%
Recall:    58.0710%
F1 Score:  63.3135%

Model: GBC
Accuracy:  73.5900%
Precision: 69.6278%
Recall:    62.7344%
F1 Score:  66.0001%

Model: SVC
Accuracy:  72.6400%
Precision: 68.6519%
Recall:    60.8029%
F1 Score:  64.4879%


# Check Feature Importance

In [10]:
def get_feature_importance(model, model_name, X, y):
    """Return feature importances for a fitted pipeline, largest magnitude first.

    Parameters
    ----------
    model : fitted pipeline exposing ``named_steps['preprocessor']`` and
        ``named_steps['classifier']``.
    model_name : str
        'XGB', 'RFC' or 'GBC' read the classifier's ``feature_importances_``;
        'SVC' uses permutation importance. Any other name yields ``None``.
    X, y : feature table and labels. Only used for the 'SVC' branch, where
        ``X`` must expose ``.columns``.

    Returns
    -------
    dict or None
        ``{feature_name: importance}`` ordered by descending absolute
        importance, or ``None`` for an unrecognised ``model_name``.

    Raises
    ------
    ValueError
        If the number of names and importances differ.
    """
    if model_name in ('XGB', 'RFC', 'GBC'):
        # Tree importances are defined over the TRANSFORMED columns, so they
        # pair with the preprocessor's (one-hot expanded) output names.
        feature_names = list(model.named_steps['preprocessor'].get_feature_names_out())
        importances = model.named_steps['classifier'].feature_importances_
    elif model_name == 'SVC':
        # Permutation importance is run on the whole pipeline, so it permutes
        # the RAW columns of X and returns one value per input column. Those
        # values must be labelled with X's own column names, not with the
        # preprocessor's expanded names.
        perm_importance = permutation_importance(model, X, y, n_repeats=3, random_state=42)
        importances = perm_importance.importances_mean
        feature_names = list(X.columns)
    else:
        return None

    # zip() would silently truncate on a mismatch and mislabel the values.
    if len(feature_names) != len(importances):
        raise ValueError(
            f"Got {len(feature_names)} feature names but {len(importances)} importances for {model_name}"
        )

    feature_importance = dict(zip(feature_names, importances))

    return dict(sorted(feature_importance.items(), key=lambda item: abs(item[1]), reverse=True))

In [None]:
# Full feature matrix and labels; handed to get_feature_importance for every model.
X, y = data[features], data[target]

for name, model in models.items():
    print(f"\nFeature Importance for {name}:")
    importance = get_feature_importance(model, name, X, y)
    # Nothing is listed when the helper returns None or an empty mapping.
    for feature, value in (importance or {}).items():
        print(f"{feature}: {value}")
    print()


Feature Importance for XGB:
remainder__previous_heart_disease: 0.14246341586112976
remainder__hypertension: 0.1413353532552719
remainder__diabetes: 0.07951324433088303
cat__smoking_status_Current: 0.0750470981001854
remainder__obesity: 0.06057510897517204
remainder__age: 0.02210092917084694
cat__alcohol_consumption_High: 0.020117957144975662
remainder__cholesterol_level: 0.019334951415657997
remainder__fasting_blood_sugar: 0.018457522615790367
cat__EKG_results_Abnormal: 0.01784081943333149
cat__physical_activity_High: 0.01620306633412838
cat__smoking_status_Past: 0.01608036458492279
cat__physical_activity_Low: 0.016031891107559204
cat__region_Rural: 0.015810443088412285
remainder__cholesterol_hdl: 0.015507325530052185
remainder__waist_circumference: 0.015462658368051052
remainder__triglycerides: 0.015316380187869072
cat__alcohol_consumption_None: 0.015276569873094559
remainder__cholesterol_ldl: 0.015145831741392612
cat__dietary_habits_Healthy: 0.015134856104850769
cat__stress_level_Lo