# Heart Disease ML Models


In [None]:
# Full ML models code
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score, matthews_corrcoef
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import pickle

# Load the dataset and show which columns it provides.
df = pd.read_csv('heart.csv')
print("DataFrame columns:", df.columns)

# Separate the target column from the predictor columns.
y = df['HeartDisease']
X = df.drop(columns='HeartDisease')

# One-hot encode every object-dtype predictor; drop_first=True omits the
# first level of each encoded column.
categorical_cols = X.select_dtypes(include='object').columns
X = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

# Hold out 20% of the rows for testing; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the scaler on the training rows only, then apply that same fitted
# transform to both partitions.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

def evaluate(model, X_test, y_test):
    """Score a fitted classifier against held-out data.

    Args:
        model: Fitted estimator exposing ``predict`` and ``predict_proba``.
        X_test: Features to score.
        y_test: True labels corresponding to the rows of ``X_test``.

    Returns:
        list: ``[accuracy, ROC AUC, precision, recall, F1, Matthews
        correlation coefficient]``, in that order.
    """
    predicted_labels = model.predict(X_test)
    # ROC AUC is computed from the second column of predict_proba rather
    # than from the hard label predictions.
    positive_scores = model.predict_proba(X_test)[:, 1]

    accuracy = accuracy_score(y_test, predicted_labels)
    roc_auc = roc_auc_score(y_test, positive_scores)
    precision = precision_score(y_test, predicted_labels)
    recall = recall_score(y_test, predicted_labels)
    f1 = f1_score(y_test, predicted_labels)
    mcc = matthews_corrcoef(y_test, predicted_labels)

    return [accuracy, roc_auc, precision, recall, f1, mcc]

# Evaluation results keyed by display name. Each value is the metric list
# returned by evaluate(): accuracy, ROC AUC, precision, recall, F1, MCC.
models = {}

# Estimators under comparison. The tree-based scikit-learn models are seeded
# so that repeated runs produce the same metrics and the same pickles,
# matching the seeded train/test split.
lr = LogisticRegression(max_iter=200)
dt = DecisionTreeClassifier(max_depth=5, random_state=42)
knn = KNeighborsClassifier(n_neighbors=7)
nb = GaussianNB()
rf = RandomForestClassifier(n_estimators=200, random_state=42)
xgb = XGBClassifier(eval_metric='logloss')

# (display name, estimator, training features, test features, pickle path)
# NOTE: lr and knn are fitted on the standardized features. Anything that
# loads lr_model.pkl or knn_model.pkl must apply the same scaling to its
# inputs; the fitted scaler itself is not saved by this section.
experiments = [
    ('Logistic Regression', lr, X_train_scaled, X_test_scaled, 'lr_model.pkl'),
    ('Decision Tree', dt, X_train, X_test, 'dt_model.pkl'),
    ('KNN', knn, X_train_scaled, X_test_scaled, 'knn_model.pkl'),
    ('Naive Bayes', nb, X_train, X_test, 'nb_model.pkl'),
    ('Random Forest', rf, X_train, X_test, 'rf_model.pkl'),
    ('XGBoost', xgb, X_train, X_test, 'xgb_model.pkl'),
]

for model_name, estimator, train_features, test_features, pickle_path in experiments:
    estimator.fit(train_features, y_train)
    models[model_name] = evaluate(estimator, test_features, y_test)
    # The context manager flushes and closes the file even if dump() raises.
    with open(pickle_path, 'wb') as model_file:
        pickle.dump(estimator, model_file)

# One row per model, one unnamed column per metric (in evaluate() order).
pd.DataFrame(models).T.to_csv('model_comparison.csv')