In [20]:
import pandas as pd
import numpy as np
import os, joblib

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.utils import resample

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from xgboost import XGBClassifier

In [21]:
# 1. LOAD BOTH HEART DATASETS
# ============================================================
heart1 = pd.read_csv("../dataset/heart_disease/heart.csv")            # 918 x 12
heart2 = pd.read_csv("../dataset/heart_disease/heart_dataset.csv")    # 1888 x 14

print("heart1:", heart1.shape)
print("heart2:", heart2.shape)
print(heart1.head(2))
print(heart2.head(2))

heart1: (918, 12)
heart2: (1888, 14)
   Age Sex ChestPainType  RestingBP  Cholesterol  FastingBS RestingECG  MaxHR  \
0   40   M           ATA        140          289          0     Normal    172   
1   49   F           NAP        160          180          0     Normal    156   

  ExerciseAngina  Oldpeak ST_Slope  HeartDisease  
0              N      0.0       Up             0  
1              N      1.0     Flat             1  
   age  sex  cp  trestbps  chol  fbs  restecg  thalachh  exang  oldpeak  \
0   63    1   3       145   233    1        0       150      0      2.3   
1   37    1   2       130   250    0        1       187      0      3.5   

   slope  ca  thal  target  
0      0   0     1       1  
1      0   0     2       1  


In [22]:
# 2. CLEAN & ALIGN COLUMN NAMES
# ============================================================
heart1 = heart1.rename(columns={
    "HeartDisease": "target"
})

heart2 = heart2.rename(columns={
    "age": "Age",
    "sex": "Sex",
    "trestbps": "RestingBP",
    "chol": "Cholesterol",
    "thalachh": "MaxHR",
    "exang": "ExerciseAngina",
    "oldpeak": "Oldpeak",
    "slope": "ST_Slope",
    "target": "target"
})

if heart1["ExerciseAngina"].dtype == object:
    heart1["ExerciseAngina"] = heart1["ExerciseAngina"].map({"Y": 1, "N": 0})

heart1["target"] = heart1["target"].astype(int)
heart2["target"] = heart2["target"].astype(int)

heart2["ExerciseAngina"] = heart2["ExerciseAngina"].astype(int)

In [23]:
# 3. CREATE FINAL 8 FEATURES
# ============================================================
FEATURES = [
    "Age",
    "RestingBP",
    "Cholesterol",
    "MaxHR",
    "Oldpeak",
    "ChestPainType",
    "ST_Slope",
    "Thal"
]

heart1["Thal"] = "unknown"     # missing in dataset 1
heart1["ChestPainType"] = heart1["ChestPainType"].astype(str)
heart1["ST_Slope"] = heart1["ST_Slope"].astype(str)

heart2["ChestPainType"] = heart2["cp"].astype(str)
heart2["ST_Slope"] = heart2["ST_Slope"].astype(str)
heart2["Thal"] = heart2["thal"].astype(str)

df1 = heart1[FEATURES + ["target"]].copy()
df2 = heart2[FEATURES + ["target"]].copy()

combined = pd.concat([df1, df2], ignore_index=True)
print("Combined:", combined.shape)

Combined: (2806, 9)


In [24]:
# 4. CLEAN DATA TYPES
# ============================================================
numeric_cols = ["Age", "RestingBP", "Cholesterol", "MaxHR", "Oldpeak"]
cat_cols = ["ChestPainType", "ST_Slope", "Thal"]

for col in numeric_cols:
    combined[col] = pd.to_numeric(combined[col], errors="coerce")
    combined[col] = combined[col].fillna(combined[col].median())

for col in cat_cols:
    combined[col] = combined[col].astype(str)
    combined[col] = combined[col].fillna("unknown")

In [25]:
# 5. BALANCE DATASET (UPSAMPLE MINORITY 1:1)
# ============================================================
majority = combined[combined["target"] == 0]
minority = combined[combined["target"] == 1]

minority_up = resample(minority, replace=True,
                       n_samples=len(majority),
                       random_state=42)

balanced = pd.concat([majority, minority_up]).sample(frac=1, random_state=42)

print("\nBalanced:")
print(balanced["target"].value_counts())

X = balanced[FEATURES]
y = balanced["target"]


Balanced:
target
1    1321
0    1321
Name: count, dtype: int64


In [26]:
# 6. PREPROCESSOR (SCALING + ONE-HOT)
# ============================================================
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ]
)

In [27]:
# 7. DEFINE STACKING MODEL
# ============================================================
estimators = [
    ("rf", RandomForestClassifier(n_estimators=300, random_state=42)),
    ("xgb", XGBClassifier(
        n_estimators=300,
        max_depth=5,
        learning_rate=0.05,
        subsample=0.9,
        colsample_bytree=0.9,
        eval_metric="logloss",
        random_state=42
    ))
]

stack = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(max_iter=1000),
    cv=5,
    stack_method="predict_proba"
)

model = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("stack", stack)
])

In [28]:
from sklearn.model_selection import StratifiedKFold, cross_val_score

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

cv_accuracy = cross_val_score(model, X, y, cv=skf, scoring='accuracy')
cv_auc = cross_val_score(model, X, y, cv=skf, scoring='roc_auc')
cv_f1 = cross_val_score(model, X, y, cv=skf, scoring='f1')

print("CV Accuracy:", cv_accuracy.mean())
print("CV ROC-AUC:", cv_auc.mean())
print("CV F1:", cv_f1.mean())

CV Accuracy: 0.9443632926619694
CV ROC-AUC: 0.9869896759187068
CV F1: 0.9453384709444645


In [29]:
# 8. TRAIN-TEST SPLIT
# ============================================================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [30]:
# 9. TRAIN + EVALUATE
# ============================================================
model.fit(X_train, y_train)
pred = model.predict(X_test)

print("\n=== FINAL STACKING MODEL RESULTS ===")
print("Accuracy:", accuracy_score(y_test, pred))
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))


=== FINAL STACKING MODEL RESULTS ===
Accuracy: 0.943289224952741
[[243  22]
 [  8 256]]
              precision    recall  f1-score   support

           0       0.97      0.92      0.94       265
           1       0.92      0.97      0.94       264

    accuracy                           0.94       529
   macro avg       0.94      0.94      0.94       529
weighted avg       0.94      0.94      0.94       529



In [None]:
# 10. SAVE MODEL
# ============================================================
artifacts = {
    "model": model,
    "feature": FEATURES
}

os.makedir("../saved_mdl", exist_ok=True)
joblib.dump(artifacts, "../saved_mdl/heart_model.pkl")

print("\nSaved → ../saved_mdl/heart_model.pkl")


Saved → ../saved_models/heart_model.pkl
