# Heart Disease ML Models


In [None]:
import pandas as pd
import numpy as np
import pickle

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score, matthews_corrcoef

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Load the raw dataset from the current working directory.
df = pd.read_csv("heart.csv")

# Separate the label column from the remaining feature columns.
y = df["HeartDisease"]
X = df.drop(columns="HeartDisease")

# Columns that get standardized.
numeric_features = [
    "Age",
    "RestingBP",
    "Cholesterol",
    "MaxHR",
    "Oldpeak",
]
# Columns that get one-hot encoded.
categorical_features = [
    "Sex",
    "ChestPainType",
    "RestingECG",
    "ExerciseAngina",
    "ST_Slope",
]

# Shared preprocessing step: scale the numeric columns and one-hot encode
# the categorical ones; categories not seen during fit are ignored at
# transform time instead of raising.
preprocessor = ColumnTransformer([
    ("num", StandardScaler(), numeric_features),
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
])

# Helper function
def train_and_save(model, name, *, test_size=0.2, random_state=42):
    """Score ``model`` on a held-out split, then fit it on all data and pickle it.

    The module-level ``preprocessor`` and ``model`` are chained into a
    ``Pipeline``. The pipeline is first fitted on a stratified training
    split of the module-level ``X``/``y`` and scored on the remaining rows,
    so the returned metrics are not computed on the rows the model was
    trained on. It is then refitted on every row and written to
    ``"{name}.pkl"`` in the current working directory.

    Args:
        model: Unfitted classifier exposing ``fit``, ``predict`` and
            ``predict_proba``.
        name: Stem of the pickle file to write (``".pkl"`` is appended).
        test_size: Share of rows held out for scoring; forwarded to
            ``train_test_split``.
        random_state: Seed for the split, so every model is scored on the
            same held-out rows.

    Returns:
        A list ``[accuracy, roc_auc, precision, recall, f1, mcc]`` computed
        on the held-out rows.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    pipe = Pipeline(steps=[
        ("preprocessing", preprocessor),
        ("model", model),
    ])

    # Fit on the training split only, so the scores below are out-of-sample.
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    # Column 1 of predict_proba is used as the score for ROC AUC.
    y_prob = pipe.predict_proba(X_test)[:, 1]

    metrics = [
        accuracy_score(y_test, y_pred),
        roc_auc_score(y_test, y_prob),
        precision_score(y_test, y_pred),
        recall_score(y_test, y_pred),
        f1_score(y_test, y_pred),
        matthews_corrcoef(y_test, y_pred),
    ]

    # Refit on the full dataset before saving, so the persisted pipeline is
    # trained on every available row.
    pipe.fit(X, y)
    # The context manager closes the file handle even if pickling fails.
    with open(f"{name}.pkl", "wb") as fh:
        pickle.dump(pipe, fh)

    return metrics

# Candidate classifiers, keyed by the stem of the pickle file each one is
# saved under.
models = {
    "lr_model": LogisticRegression(max_iter=500),
    "dt_model": DecisionTreeClassifier(max_depth=5),
    "knn_model": KNeighborsClassifier(n_neighbors=7),
    "nb_model": GaussianNB(),
    "rf_model": RandomForestClassifier(n_estimators=200),
    "xgb_model": XGBClassifier(eval_metric="logloss"),
}

# Train and persist every candidate, collecting the metric list that
# train_and_save returns for each one.
results = {
    model_name: train_and_save(estimator, model_name)
    for model_name, estimator in models.items()
}

# One column per model, one row per metric in the order train_and_save
# returns them; the default integer index is written to the CSV as well.
comparison = pd.DataFrame(results)
comparison.to_csv("model_comparison.csv")
print("All models trained and saved!")
