# Import Library

In [1]:
import pandas as pd
import numpy as np
import warnings

from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.inspection import permutation_importance

# Reading Dataset

In [2]:
"""
data = pd.read_csv("data/heart_attack_prediction_indonesia.csv", keep_default_na=False)
data.head()

data_shuffled = data.sample(frac=1, random_state=42).reset_index(drop=True)
data_reduced = data_shuffled.head(10000)
print(data_reduced.shape)

data_reduced.to_csv('data/data_reduced.csv', index=False)
""";

In [3]:
data = pd.read_csv("data/data_reduced.csv", keep_default_na=False)
data

Unnamed: 0,age,gender,region,income_level,hypertension,diabetes,cholesterol_level,obesity,waist_circumference,family_history,...,blood_pressure_diastolic,fasting_blood_sugar,cholesterol_hdl,cholesterol_ldl,triglycerides,EKG_results,previous_heart_disease,medication_usage,participated_in_free_screening,heart_attack
0,52,Female,Urban,Middle,0,1,217,0,84,0,...,67,70,58,125,57,Abnormal,0,1,0,1
1,69,Male,Urban,Low,1,0,233,0,114,0,...,68,90,52,128,144,Normal,1,0,1,1
2,74,Male,Urban,Middle,0,0,176,0,57,0,...,90,137,31,133,150,Normal,0,0,1,0
3,48,Female,Urban,Middle,1,0,143,0,96,1,...,79,81,57,127,159,Normal,0,0,0,0
4,38,Female,Urban,Low,1,0,176,0,89,1,...,79,136,42,160,214,Normal,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,43,Male,Rural,Low,1,0,255,1,110,0,...,74,160,50,111,100,Normal,0,1,0,1
9996,53,Female,Urban,Low,0,0,115,0,90,1,...,70,70,59,146,190,Normal,1,1,1,0
9997,55,Male,Rural,Middle,0,1,150,0,95,1,...,83,94,54,161,101,Normal,0,0,1,0
9998,59,Male,Rural,Middle,0,0,222,0,80,0,...,84,153,60,98,148,Normal,0,0,1,0


# Manual Mapping for Ordinal Encoding

In [4]:
ordinal_mappings = {
    'income_level': {'Low': 0, 'Middle': 1, 'High': 2},
    'alcohol_consumption': {'None': 0, 'Moderate': 1, 'High': 2},
    'physical_activity': {'Low': 0, 'Moderate': 1, 'High': 2},
    'dietary_habits': {'Unhealthy': 0, 'Healthy': 1},
    'air_pollution_exposure': {'Low': 0, 'Moderate': 1, 'High': 2},
    'stress_level': {'Low': 0, 'Moderate': 1, 'High': 2},
    'smoking_status': {'Never': 0, 'Past': 1, 'Current': 2},
    'gender': {'Female': 0, 'Male': 1},
    'region': {'Rural': 0, 'Urban': 1},
    'EKG_results': {'Normal': 0, 'Abnormal': 1}
}

In [5]:
pd.set_option('future.no_silent_downcasting', True)
data_ordinal = data.copy()
data_ordinal.replace(ordinal_mappings, inplace=True)

# Define Features and Target

In [6]:
features = ['age', 'gender', 'region', 'income_level', 'hypertension', 'diabetes',
    'cholesterol_level', 'obesity', 'waist_circumference', 'family_history',
    'smoking_status', 'alcohol_consumption', 'physical_activity', 'dietary_habits',
    'air_pollution_exposure', 'stress_level', 'sleep_hours',
    'blood_pressure_systolic', 'blood_pressure_diastolic', 'fasting_blood_sugar',
    'cholesterol_hdl', 'cholesterol_ldl', 'triglycerides', 'EKG_results',
    'medication_usage', 'participated_in_free_screening', 'previous_heart_disease']
target = 'heart_attack'

print(f"Number of Features: {len(features)}")

Number of Features: 27


# Preprocessing Non-Numerical Features

In [7]:
categorical_features = ['gender', 'region', 'income_level', 'smoking_status', 'alcohol_consumption', 'physical_activity', 'dietary_habits',
                        'air_pollution_exposure', 'stress_level', 'EKG_results']
numeric_features = [col for col in features if col not in categorical_features]

onehot_transformer = Pipeline(steps=[ 
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

ordinal_transformer = Pipeline(steps=[
    ('ordinal', OrdinalEncoder(categories=[['Low', 'Middle', 'High'], ['None', 'Moderate', 'High'], 
                                            ['Low', 'Moderate', 'High'], ['Unhealthy', 'Healthy'], 
                                            ['Low', 'Moderate', 'High'], ['Low', 'Moderate', 'High'], 
                                            ['Never', 'Past', 'Current'], ['Female', 'Male'], 
                                            ['Rural', 'Urban'], ['Normal', 'Abnormal']], 
                               handle_unknown='use_encoded_value', unknown_value=-1))
])

numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

preprocessor_onehot = ColumnTransformer(
    transformers=[
        ('cat', onehot_transformer, categorical_features),
        ('num', numeric_transformer, numeric_features)  # Apply scaling for numeric features
    ],
    remainder='passthrough'
)

preprocessor_ordinal = ColumnTransformer(
    transformers=[
        ('cat', ordinal_transformer, categorical_features),
        ('num', numeric_transformer, numeric_features)  # Apply scaling for numeric features
    ],
    remainder='passthrough'
)

# Splitting Data Into 5 Folds

In [8]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Define Model

In [9]:
models = {
    'XGB': XGBClassifier(random_state=42),
    'RFC': RandomForestClassifier(random_state=42),
    'GBC': GradientBoostingClassifier(random_state=42),
    'SVC': SVC(random_state=42)
}

results = {model: {'accuracy_onehot': [], 'precision_onehot': [], 'recall_onehot': [], 'f1_onehot': [],
                   'accuracy_ordinal': [], 'precision_ordinal': [], 'recall_ordinal': [], 'f1_ordinal': []}
           for model in models}

# Training

In [31]:
for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    for name, model in models.items():
        model_pipeline_onehot = Pipeline(steps=[
            ('preprocessor', preprocessor_onehot),
            ('classifier', model)
        ])
        model_pipeline_onehot.fit(X_train, y_train)
        y_pred_onehot = model_pipeline_onehot.predict(X_test)

        model_pipeline_ordinal = Pipeline(steps=[
            ('preprocessor', preprocessor_ordinal),
            ('classifier', model)
        ])
        model_pipeline_ordinal.fit(X_train, y_train)
        y_pred_ordinal = model_pipeline_ordinal.predict(X_test)

        # Simpan skor
        results[name]['accuracy_onehot'].append(accuracy_score(y_test, y_pred_onehot))
        results[name]['precision_onehot'].append(precision_score(y_test, y_pred_onehot))
        results[name]['recall_onehot'].append(recall_score(y_test, y_pred_onehot))
        results[name]['f1_onehot'].append(f1_score(y_test, y_pred_onehot))

        results[name]['accuracy_ordinal'].append(accuracy_score(y_test, y_pred_ordinal))
        results[name]['precision_ordinal'].append(precision_score(y_test, y_pred_ordinal))
        results[name]['recall_ordinal'].append(recall_score(y_test, y_pred_ordinal))
        results[name]['f1_ordinal'].append(f1_score(y_test, y_pred_ordinal))

# Get Feature Importance

In [32]:
def get_feature_importance(model, model_name, X, y):
    try:
        feature_names = model.named_steps['preprocessor'].get_feature_names_out()
    except:
        feature_names = X.columns.tolist()

    classifier = model.named_steps['classifier']

    if model_name == 'SVC':
        if hasattr(classifier, 'kernel') and classifier.kernel == 'linear':
            importances = classifier.coef_[0]
            feature_importance = dict(zip(feature_names, importances))
        else:
            result = permutation_importance(model, X, y, n_repeats=10, random_state=42, n_jobs=-1)
            feature_importance = dict(zip(feature_names, result.importances_mean))
    elif model_name in ['XGB', 'RFC', 'GBC']:
        importances = classifier.feature_importances_
        feature_importance = dict(zip(feature_names, importances))
    else:
        return None

    return dict(sorted(feature_importance.items(), key=lambda item: abs(item[1]), reverse=True))

# Evaluate Model

In [33]:
for name, scores in results.items():
    print(f"\nModel: {name}")
    print(f"OneHot Encoding:")
    print(f"Accuracy:  {np.mean(scores['accuracy_onehot'])*100:.4f}%")
    print(f"Precision: {np.mean(scores['precision_onehot'])*100:.4f}%")
    print(f"Recall:    {np.mean(scores['recall_onehot'])*100:.4f}%")
    print(f"F1 Score:  {np.mean(scores['f1_onehot'])*100:.4f}%")

    print(f"\nOrdinal Encoding:")
    print(f"Accuracy:  {np.mean(scores['accuracy_ordinal'])*100:.4f}%")
    print(f"Precision: {np.mean(scores['precision_ordinal'])*100:.4f}%")
    print(f"Recall:    {np.mean(scores['recall_ordinal'])*100:.4f}%")
    print(f"F1 Score:  {np.mean(scores['f1_ordinal'])*100:.4f}%")

    print(f"\nFeature Importance for {name} (Ordinal Encoding):")
    final_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor_ordinal),
        ('classifier', models[name])
    ])
    final_pipeline.fit(X, y)
    importance = get_feature_importance(final_pipeline, name, X, y)
    if importance:
        for feature, value in importance.items():
            print(f"{feature}: {value:.10f}")


Model: XGB
OneHot Encoding:
Accuracy:  71.1417%
Precision: 65.5733%
Recall:    61.4374%
F1 Score:  63.4354%

Ordinal Encoding:
Accuracy:  69.7250%
Precision: 64.0533%
Recall:    58.5291%
F1 Score:  61.1560%

Feature Importance for XGB (Ordinal Encoding):
num__hypertension: 0.2499072552
num__previous_heart_disease: 0.2162176967
num__diabetes: 0.1010818183
num__obesity: 0.0910568163
num__age: 0.0291501693
num__fasting_blood_sugar: 0.0247416422
num__cholesterol_level: 0.0240666680
cat__physical_activity: 0.0202209894
num__cholesterol_hdl: 0.0199990086
num__cholesterol_ldl: 0.0199563541
num__triglycerides: 0.0196641404
cat__income_level: 0.0195765849
num__blood_pressure_diastolic: 0.0194303691
num__waist_circumference: 0.0194263048
num__sleep_hours: 0.0193641987
cat__alcohol_consumption: 0.0190321561
num__blood_pressure_systolic: 0.0188078806
num__medication_usage: 0.0176913477
cat__EKG_results: 0.0173380096
num__participated_in_free_screening: 0.0170121882
num__family_history: 0.01625840