# Import Library

In [1]:
import pandas as pd
import numpy as np
import warnings

from sklearn.utils import resample
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.inspection import permutation_importance

# Reading Dataset

In [2]:
# 1. Baca file CSV
df = pd.read_csv("data/heart_attack_prediction_indonesia.csv")

# 2. Pisahkan berdasarkan kelas target (0 dan 1)
df_0 = df[df['heart_attack'] == 0]
df_1 = df[df['heart_attack'] == 1]

# 3. Tentukan jumlah minimum dari kedua kelas
n_samples = min(len(df_0), len(df_1))

# 4. Undersample agar seimbang
df_0_balanced = resample(df_0, replace=False, n_samples=n_samples, random_state=42)
df_1_balanced = resample(df_1, replace=False, n_samples=n_samples, random_state=42)

# 5. Gabungkan dan acak (shuffle)
df_balanced = pd.concat([df_0_balanced, df_1_balanced])
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# 6. (Opsional) Batasi jumlah total data, misalnya hanya ambil 10.000 data
df_balanced = df_balanced.sample(n=10000, random_state=42).reset_index(drop=True)

# 7. Cek distribusi akhir
print("Distribusi heart_attack setelah penyeimbangan dan pemotongan:")
print(df_balanced['heart_attack'].value_counts())

# 8. (Opsional) Simpan ke file baru
df_balanced.to_csv("data/heart_attack_balanced_10000.csv", index=False)

# 9. Lihat 5 baris pertama
df_balanced.head()

Distribusi heart_attack setelah penyeimbangan dan pemotongan:
heart_attack
0    5006
1    4994
Name: count, dtype: int64


Unnamed: 0,age,gender,region,income_level,hypertension,diabetes,cholesterol_level,obesity,waist_circumference,family_history,...,blood_pressure_diastolic,fasting_blood_sugar,cholesterol_hdl,cholesterol_ldl,triglycerides,EKG_results,previous_heart_disease,medication_usage,participated_in_free_screening,heart_attack
0,61,Male,Urban,Middle,0,0,190,1,106,0,...,79,90,45,128,162,Normal,0,0,1,0
1,53,Male,Urban,Low,0,0,215,0,98,0,...,73,89,53,146,227,Normal,0,0,1,0
2,45,Male,Rural,Middle,1,0,242,0,80,0,...,78,70,52,82,96,Normal,0,0,1,0
3,46,Male,Urban,Middle,1,0,244,0,90,0,...,63,114,65,110,156,Normal,0,1,0,1
4,45,Female,Rural,Middle,1,1,284,1,120,0,...,93,112,46,102,128,Normal,0,1,1,1


In [3]:
data = pd.read_csv("data/heart_attack_balanced_10000.csv", keep_default_na=False)
data

Unnamed: 0,age,gender,region,income_level,hypertension,diabetes,cholesterol_level,obesity,waist_circumference,family_history,...,blood_pressure_diastolic,fasting_blood_sugar,cholesterol_hdl,cholesterol_ldl,triglycerides,EKG_results,previous_heart_disease,medication_usage,participated_in_free_screening,heart_attack
0,61,Male,Urban,Middle,0,0,190,1,106,0,...,79,90,45,128,162,Normal,0,0,1,0
1,53,Male,Urban,Low,0,0,215,0,98,0,...,73,89,53,146,227,Normal,0,0,1,0
2,45,Male,Rural,Middle,1,0,242,0,80,0,...,78,70,52,82,96,Normal,0,0,1,0
3,46,Male,Urban,Middle,1,0,244,0,90,0,...,63,114,65,110,156,Normal,0,1,0,1
4,45,Female,Rural,Middle,1,1,284,1,120,0,...,93,112,46,102,128,Normal,0,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,46,Male,Rural,Middle,0,0,272,0,96,0,...,77,168,57,127,181,Normal,1,0,1,1
9996,51,Male,Urban,High,1,0,185,1,91,0,...,76,76,47,104,85,Normal,0,1,1,1
9997,33,Female,Rural,Middle,1,0,201,0,91,1,...,51,116,40,116,215,Normal,0,1,0,0
9998,60,Female,Rural,Middle,0,0,175,0,86,0,...,86,134,75,126,127,Normal,0,1,1,0


# Manual Mapping for Ordinal Encoding

In [4]:
ordinal_mappings = {
    'income_level': {'Low': 0, 'Middle': 1, 'High': 2},
    'alcohol_consumption': {'None': 0, 'Moderate': 1, 'High': 2},
    'physical_activity': {'Low': 0, 'Moderate': 1, 'High': 2},
    'dietary_habits': {'Unhealthy': 0, 'Healthy': 1},
    'air_pollution_exposure': {'Low': 0, 'Moderate': 1, 'High': 2},
    'stress_level': {'Low': 0, 'Moderate': 1, 'High': 2},
    'smoking_status': {'Never': 0, 'Past': 1, 'Current': 2},
    'gender': {'Female': 0, 'Male': 1},
    'region': {'Rural': 0, 'Urban': 1},
    'EKG_results': {'Normal': 0, 'Abnormal': 1}
}

In [5]:
pd.set_option('future.no_silent_downcasting', True)
data_ordinal = data.copy()
data_ordinal.replace(ordinal_mappings, inplace=True)

# Define Features and Target

In [6]:
features = ['age', 'gender', 'region', 'income_level', 'hypertension', 'diabetes',
    'cholesterol_level', 'obesity', 'waist_circumference', 'family_history',
    'smoking_status', 'alcohol_consumption', 'physical_activity', 'dietary_habits',
    'air_pollution_exposure', 'stress_level', 'sleep_hours',
    'blood_pressure_systolic', 'blood_pressure_diastolic', 'fasting_blood_sugar',
    'cholesterol_hdl', 'cholesterol_ldl', 'triglycerides', 'EKG_results',
    'medication_usage', 'participated_in_free_screening', 'previous_heart_disease']
target = 'heart_attack'

print(f"Number of Features: {len(features)}")

Number of Features: 27


# Preprocessing Non-Numerical Features

In [7]:
categorical_features = ['gender', 'region', 'income_level', 'smoking_status', 'alcohol_consumption', 'physical_activity', 'dietary_habits',
                        'air_pollution_exposure', 'stress_level', 'EKG_results']
numeric_features = [col for col in features if col not in categorical_features]

onehot_transformer = Pipeline(steps=[ 
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

ordinal_transformer = Pipeline(steps=[
    ('ordinal', OrdinalEncoder(categories=[['Low', 'Middle', 'High'], ['None', 'Moderate', 'High'], 
                                            ['Low', 'Moderate', 'High'], ['Unhealthy', 'Healthy'], 
                                            ['Low', 'Moderate', 'High'], ['Low', 'Moderate', 'High'], 
                                            ['Never', 'Past', 'Current'], ['Female', 'Male'], 
                                            ['Rural', 'Urban'], ['Normal', 'Abnormal']], 
                               handle_unknown='use_encoded_value', unknown_value=-1))
])

numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

preprocessor_onehot = ColumnTransformer(
    transformers=[
        ('cat', onehot_transformer, categorical_features),
        ('num', numeric_transformer, numeric_features)  # Apply scaling for numeric features
    ],
    remainder='passthrough'
)

preprocessor_ordinal = ColumnTransformer(
    transformers=[
        ('cat', ordinal_transformer, categorical_features),
        ('num', numeric_transformer, numeric_features)  # Apply scaling for numeric features
    ],
    remainder='passthrough'
)

# Data setelah one hot

In [9]:
# Transformasi One-Hot Encoding
# Gunakan df_balanced untuk One-Hot Encoding
X = df_balanced[features]
y = df_balanced[target]

# Transformasi One-Hot Encoding

X_onehot = preprocessor_onehot.fit_transform(X)

# Dapatkan nama fitur hasil one-hot
feature_names_onehot = []
for name, transformer, cols in preprocessor_onehot.transformers_:
    if name == 'cat':
        feature_names_onehot.extend(transformer.named_steps['onehot'].get_feature_names_out(cols))
    elif name == 'num':
        feature_names_onehot.extend(cols)

# Konversi ke DataFrame
df_onehot = pd.DataFrame(X_onehot.toarray() if hasattr(X_onehot, "toarray") else X_onehot, columns=feature_names_onehot)

# Tampilkan 5 baris pertama
print("\nContoh Data setelah One-Hot Encoding:")
df_onehot.head()



Contoh Data setelah One-Hot Encoding:


Unnamed: 0,gender_Female,gender_Male,region_Rural,region_Urban,income_level_High,income_level_Low,income_level_Middle,smoking_status_Current,smoking_status_Never,smoking_status_Past,...,sleep_hours,blood_pressure_systolic,blood_pressure_diastolic,fasting_blood_sugar,cholesterol_hdl,cholesterol_ldl,triglycerides,medication_usage,participated_in_free_screening,previous_heart_disease
0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.487391,-0.161475,-0.04792,-0.749232,-0.453479,-0.046044,0.258601,-1.0,0.816326,-0.53387
1,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,1.037346,-0.028983,-0.643573,-0.784771,0.344215,0.463195,1.577125,-1.0,0.816326,-0.53387
2,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,-0.126506,-0.890182,-0.147196,-1.460011,0.244503,-1.34743,-1.080208,-1.0,0.816326,-0.53387
3,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,...,-0.515766,0.302248,-1.636327,0.103703,1.540757,-0.555282,0.136891,1.0,-1.225,-0.53387
4,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.834146,1.29594,1.341935,0.032625,-0.353767,-0.78161,-0.431088,1.0,0.816326,-0.53387


# Data setelah ordinal

In [10]:
# Pastikan data_ordinal didefinisikan dulu
data_ordinal = df_balanced.copy()
data_ordinal.replace(ordinal_mappings, inplace=True)

X_ordinal = data_ordinal[features]
y_ordinal = data_ordinal[target]

# Transformasi Ordinal Encoding
X_ordinal_encoded = preprocessor_ordinal.fit_transform(X_ordinal)

# Gabung nama kolom numerik dan ordinal (dalam urutan sesuai ColumnTransformer)
feature_names_ordinal = numeric_features + categorical_features

# Konversi ke DataFrame
df_ordinal = pd.DataFrame(X_ordinal_encoded, columns=feature_names_ordinal)

# Tampilkan 5 baris pertama
print("\nContoh Data setelah Ordinal Encoding:")
df_ordinal.head()



Contoh Data setelah Ordinal Encoding:


Unnamed: 0,age,hypertension,diabetes,cholesterol_level,obesity,waist_circumference,family_history,sleep_hours,blood_pressure_systolic,blood_pressure_diastolic,...,gender,region,income_level,smoking_status,alcohol_consumption,physical_activity,dietary_habits,air_pollution_exposure,stress_level,EKG_results
0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,0.487391,-0.161475,-0.04792,-0.749232,-0.453479,-0.046044,0.258601,-1.0,0.816326,-0.53387
1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,1.037346,-0.028983,-0.643573,-0.784771,0.344215,0.463195,1.577125,-1.0,0.816326,-0.53387
2,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-0.126506,-0.890182,-0.147196,-1.460011,0.244503,-1.34743,-1.080208,-1.0,0.816326,-0.53387
3,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-0.515766,0.302248,-1.636327,0.103703,1.540757,-0.555282,0.136891,1.0,-1.225,-0.53387
4,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,0.834146,1.29594,1.341935,0.032625,-0.353767,-0.78161,-0.431088,1.0,0.816326,-0.53387


# Splitting

## Split into X and y

In [9]:
X = data[features]
y = data[target]

## Splitting Data Into 5 Folds

In [10]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Define Model

In [11]:
models = {
    'XGB': XGBClassifier(random_state=42),
    'RFC': RandomForestClassifier(random_state=42),
    'GBC': GradientBoostingClassifier(random_state=42),
    'SVC': SVC(random_state=42)
}

results = {model: {'accuracy_onehot': [], 'precision_onehot': [], 'recall_onehot': [], 'f1_onehot': [],
                   'accuracy_ordinal': [], 'precision_ordinal': [], 'recall_ordinal': [], 'f1_ordinal': []}
           for model in models}

# Training

In [12]:
for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    for name, model in models.items():
        model_pipeline_onehot = Pipeline(steps=[
            ('preprocessor', preprocessor_onehot),
            ('classifier', model)
        ])
        model_pipeline_onehot.fit(X_train, y_train)
        y_pred_onehot = model_pipeline_onehot.predict(X_test)

        model_pipeline_ordinal = Pipeline(steps=[
            ('preprocessor', preprocessor_ordinal),
            ('classifier', model)
        ])
        model_pipeline_ordinal.fit(X_train, y_train)
        y_pred_ordinal = model_pipeline_ordinal.predict(X_test)

        # Simpan skor
        results[name]['accuracy_onehot'].append(accuracy_score(y_test, y_pred_onehot))
        results[name]['precision_onehot'].append(precision_score(y_test, y_pred_onehot))
        results[name]['recall_onehot'].append(recall_score(y_test, y_pred_onehot))
        results[name]['f1_onehot'].append(f1_score(y_test, y_pred_onehot))

        results[name]['accuracy_ordinal'].append(accuracy_score(y_test, y_pred_ordinal))
        results[name]['precision_ordinal'].append(precision_score(y_test, y_pred_ordinal))
        results[name]['recall_ordinal'].append(recall_score(y_test, y_pred_ordinal))
        results[name]['f1_ordinal'].append(f1_score(y_test, y_pred_ordinal))

# Get Feature Importance

In [13]:
def get_feature_importance(model, model_name, X, y):
    try:
        preprocessor = model.named_steps['preprocessor']
        feature_names = []

        if isinstance(preprocessor, ColumnTransformer):
            for name, transformer, cols in preprocessor.transformers_:
                # Handle pipeline (e.g., Pipeline(steps=[('onehot', OneHotEncoder())]))
                if isinstance(transformer, Pipeline):
                    transformer = transformer.named_steps.get('onehot', transformer)
                
                if hasattr(transformer, 'get_feature_names_out'):
                    try:
                        feature_names.extend(transformer.get_feature_names_out(cols))
                    except:
                        # In case cols is indices, not names
                        feature_names.extend([f"{name}_{i}" for i in range(len(cols))])
                else:
                    feature_names.extend(cols)
        else:
            feature_names = X.columns.tolist()
    except Exception as e:
        # Fallback
        feature_names = X.columns.tolist()

    classifier = model.named_steps['classifier']

    # Handle SVM
    if model_name == 'SVC':
        if hasattr(classifier, 'kernel') and classifier.kernel == 'linear':
            importances = classifier.coef_[0]
            feature_importance = dict(zip(feature_names, importances))
        else:
            result = permutation_importance(model, X, y, n_repeats=10, random_state=42, n_jobs=-1)
            feature_importance = dict(zip(feature_names, result.importances_mean))

    # Handle tree-based models
    elif model_name in ['XGB', 'RFC', 'GBC']:
        importances = classifier.feature_importances_
        feature_importance = dict(zip(feature_names, importances))

    else:
        return None

    # Return sorted by absolute importance
    return dict(sorted(feature_importance.items(), key=lambda item: abs(item[1]), reverse=True))

# Evaluate Model

In [14]:
for name, scores in results.items():
    print(f"\nModel: {name}")
    print(f"OneHot Encoding:")
    print(f"Accuracy:  {np.mean(scores['accuracy_onehot'])*100:.4f}%")
    print(f"Precision: {np.mean(scores['precision_onehot'])*100:.4f}%")
    print(f"Recall:    {np.mean(scores['recall_onehot'])*100:.4f}%")
    print(f"F1 Score:  {np.mean(scores['f1_onehot'])*100:.4f}%")

    print(f"\nOrdinal Encoding:")
    print(f"Accuracy:  {np.mean(scores['accuracy_ordinal'])*100:.4f}%")
    print(f"Precision: {np.mean(scores['precision_ordinal'])*100:.4f}%")
    print(f"Recall:    {np.mean(scores['recall_ordinal'])*100:.4f}%")
    print(f"F1 Score:  {np.mean(scores['f1_ordinal'])*100:.4f}%")

    print(f"\nFeature Importance for {name} (OneHot Encoding):")
    final_pipeline_onehot = Pipeline(steps=[
        ('preprocessor', preprocessor_onehot),
        ('classifier', models[name])
    ])
    final_pipeline_onehot.fit(X, y)
    importance_onehot = get_feature_importance(final_pipeline_onehot, name, X, y)
    
    if importance_onehot:
        for feature, value in importance_onehot.items():
            print(f"{feature}: {value:.10f}")

    print(f"\nFeature Importance for {name} (Ordinal Encoding):")
    final_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor_ordinal),
        ('classifier', models[name])
    ])
    final_pipeline.fit(X, y)
    importance = get_feature_importance(final_pipeline, name, X, y)
    if importance:
        for feature, value in importance.items():
            print(f"{feature}: {value:.10f}")


Model: XGB
OneHot Encoding:
Accuracy:  70.4000%
Precision: 69.9178%
Recall:    71.5679%
F1 Score:  70.7118%

Ordinal Encoding:
Accuracy:  68.9100%
Precision: 68.7999%
Recall:    69.1253%
F1 Score:  68.9453%

Feature Importance for XGB (OneHot Encoding):
previous_heart_disease: 0.1675754488
hypertension: 0.1431076527
diabetes: 0.0792480558
smoking_status_Current: 0.0679456890
obesity: 0.0605427511
age: 0.0209072847
cholesterol_level: 0.0184644591
fasting_blood_sugar: 0.0177984964
stress_level_Low: 0.0159116816
alcohol_consumption_High: 0.0157368798
smoking_status_Never: 0.0156275406
alcohol_consumption_Moderate: 0.0156114725
income_level_Low: 0.0153024327
air_pollution_exposure_Low: 0.0150589198
triglycerides: 0.0146861821
sleep_hours: 0.0146354456
waist_circumference: 0.0146217439
stress_level_Moderate: 0.0145281302
physical_activity_Moderate: 0.0143572725
physical_activity_High: 0.0143414568
blood_pressure_systolic: 0.0141177010
cholesterol_ldl: 0.0140646966
stress_level_High: 0.0140