In [4]:
import os

import pandas as pd
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

from xgboost import XGBClassifier

from sklearn.metrics import (
    accuracy_score, roc_auc_score,
    precision_score, recall_score,
    f1_score, matthews_corrcoef
)

# -------------------------------
# 1. Load Dataset
# -------------------------------
# Treat "?" as a missing value if present. Doing this at parse time (rather
# than replacing after the fact) lets pandas infer numeric dtypes for columns
# that would otherwise be read as strings because of the placeholder.
df = pd.read_csv("heart_disease.csv", na_values=["?"])

# Drop ID column
df = df.drop(columns=["id"])

# -------------------------------
# 2. Target Engineering
# -------------------------------
# Binarize the target: 0 stays 0, every other value becomes 1.
# Vectorized equivalent of applying `0 if x == 0 else 1` row by row.
df["num"] = (df["num"] != 0).astype(int)

# Features are every remaining column except the target.
X = df.drop(columns="num")
y = df["num"]

# -------------------------------
# 3. Column Groups
# -------------------------------
# Columns routed to the numeric (impute + scale) pipeline.
num_features = [
    "age",
    "trestbps",
    "chol",
    "thalch",
    "oldpeak",
]

# Columns routed to the categorical (impute + one-hot) pipeline.
cat_features = [
    "sex",
    "dataset",
    "cp",
    "fbs",
    "restecg",
    "exang",
    "slope",
    "ca",
    "thal",
]

# -------------------------------
# 4. Preprocessing Pipelines
# -------------------------------
# Numeric columns: fill gaps with the column median, then standardize.
numeric_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode. Categories unseen during fit are encoded as all zeros at predict
# time instead of raising.
categorical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

# sparse_threshold=0 forces the combined output to always be a dense array.
# With the default (0.3) the output type depends on the overall density of
# the stacked result, and GaussianNB (trained further down on this
# transformer's output) rejects sparse matrices.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_pipeline, num_features),
        ("cat", categorical_pipeline, cat_features)
    ],
    sparse_threshold=0
)

# -------------------------------
# 5. Models
# -------------------------------
# Candidate classifiers keyed by the name used for log lines and artifact
# file names. Insertion order is the training order.
models = dict(
    Logistic_Regression=LogisticRegression(max_iter=1000),
    Decision_Tree=DecisionTreeClassifier(max_depth=6, random_state=42),
    KNN=KNeighborsClassifier(n_neighbors=7),
    Naive_Bayes=GaussianNB(),
    Random_Forest=RandomForestClassifier(n_estimators=200, random_state=42),
    XGBoost=XGBClassifier(
        n_estimators=200,
        max_depth=4,
        learning_rate=0.1,
        eval_metric="logloss",
        random_state=42,
    ),
)

# -------------------------------
# 6. Train-Test Split
# -------------------------------
# Hold out 20% of the rows for evaluation; stratifying on y keeps the class
# ratio the same in both partitions, and the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# -------------------------------
# 7. Train, Evaluate, Save
# -------------------------------
# Make sure the output directory exists before anything is written into it;
# neither joblib.dump nor DataFrame.to_csv creates missing parent directories.
os.makedirs("model", exist_ok=True)

# GaussianNB requires dense input. Forcing dense output on the shared
# preprocessor guarantees that regardless of how sparse the one-hot block
# turns out to be; every other model in `models` accepts dense arrays too.
preprocessor.set_params(sparse_threshold=0)

results = []

for name, model in models.items():
    print(f"\nTraining {name}")

    # One code path for every model: preprocessing is fitted on the training
    # split only, then the classifier is fitted on the transformed features.
    pipeline = Pipeline([
        ("preprocessor", preprocessor),
        ("model", model)
    ])

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    # Probability of the positive class (column 1), used for ROC AUC.
    y_proba = pipeline.predict_proba(X_test)[:, 1]

    # NOTE: Naive_Bayes has always been persisted as a
    # (preprocessor, model) tuple rather than a Pipeline; that layout is
    # kept so any existing loader of these files keeps working.
    if name == "Naive_Bayes":
        artifact = (preprocessor, model)
    else:
        artifact = pipeline
    joblib.dump(artifact, f"model/{name}.pkl")

    results.append({
        "Model": name,
        "Accuracy": accuracy_score(y_test, y_pred),
        "AUC": roc_auc_score(y_test, y_proba),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1": f1_score(y_test, y_pred),
        "MCC": matthews_corrcoef(y_test, y_pred)
    })

# -------------------------------
# 8. Save Metrics
# -------------------------------
# One row per model, in training order.
results_df = pd.DataFrame(results)
results_df.to_csv("model/model_metrics.csv", index=False)
print(results_df)



Training Logistic_Regression

Training Decision_Tree

Training KNN

Training Naive_Bayes

Training Random_Forest

Training XGBoost
                 Model  Accuracy       AUC  Precision    Recall        F1  \
0  Logistic_Regression  0.842391  0.925634   0.834862  0.892157  0.862559   
1        Decision_Tree  0.815217  0.842181   0.814815  0.862745  0.838095   
2                  KNN  0.847826  0.904890   0.842593  0.892157  0.866667   
3          Naive_Bayes  0.847826  0.907221   0.855769  0.872549  0.864078   
4        Random_Forest  0.858696  0.931014   0.858491  0.892157  0.875000   
5              XGBoost  0.853261  0.913917   0.850467  0.892157  0.870813   

        MCC  
0  0.680376  
1  0.624696  
2  0.691317  
3  0.691443  
4  0.713336  
5  0.702303  
