In [3]:
import pandas as pd

url = "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data"

cols = [
    "age","sex","cp","trestbps","chol","fbs","restecg",
    "thalach","exang","oldpeak","slope","ca","thal","target"
]

df = pd.read_csv(url, names=cols)

# Convert target to binary (0 = No Disease, 1 = Disease)
df["target"] = df["target"].apply(lambda x: 1 if x > 0 else 0)

# Replace ? with median
df = df.replace("?", pd.NA)
df = df.apply(pd.to_numeric, errors="coerce")
df = df.fillna(df.median())

df.to_csv("heart.csv", index=False)

print("heart.csv created successfully")
df.head()


heart.csv created successfully


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,1
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


In [4]:
import pandas as pd
import numpy as np
import os
import pickle

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, matthews_corrcoef

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Load dataset
df = pd.read_csv("heart.csv")

X = df.drop("target", axis=1)
y = df["target"]

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

os.makedirs("models", exist_ok=True)

results = []

def train_and_save(model, name, filename):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:,1]

    acc = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_prob)
    pre = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    mcc = matthews_corrcoef(y_test, y_pred)

    results.append([name, acc, auc, pre, rec, f1, mcc])

    pickle.dump(model, open("models/"+filename, "wb"))

# Train all models
train_and_save(LogisticRegression(), "Logistic Regression", "logistic_model.pkl")
train_and_save(DecisionTreeClassifier(), "Decision Tree", "decision_tree.pkl")
train_and_save(KNeighborsClassifier(), "KNN", "knn.pkl")
train_and_save(GaussianNB(), "Naive Bayes", "naive_bayes.pkl")
train_and_save(RandomForestClassifier(), "Random Forest", "random_forest.pkl")
train_and_save(XGBClassifier(use_label_encoder=False, eval_metric='logloss'), "XGBoost", "xgboost.pkl")

# Show comparison table
results_df = pd.DataFrame(results, columns=["Model","Accuracy","AUC","Precision","Recall","F1","MCC"])
print(results_df)


                 Model  Accuracy       AUC  Precision   Recall        F1  \
0  Logistic Regression  0.885246  0.921336   0.878788  0.90625  0.892308   
1        Decision Tree  0.737705  0.740302   0.785714  0.68750  0.733333   
2                  KNN  0.918033  0.943966   0.935484  0.90625  0.920635   
3          Naive Bayes  0.836066  0.917026   0.892857  0.78125  0.833333   
4        Random Forest  0.868852  0.949353   0.900000  0.84375  0.870968   
5              XGBoost  0.868852  0.911638   0.875000  0.87500  0.875000   

        MCC  
0  0.769980  
1  0.481643  
2  0.836384  
3  0.679267  
4  0.739505  
5  0.737069  


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
