In [14]:
import pandas as pd
import numpy as np


In [15]:
df = pd.read_csv("heart.csv")
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


In [16]:
df.shape

(1025, 14)

In [17]:
df.columns


Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target'],
      dtype='object')

In [18]:
X = df.drop("target", axis=1)
y = df["target"]


In [19]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [20]:
!pip install xgboost



In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [22]:
from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    precision_score,
    recall_score,
    f1_score,
    matthews_corrcoef
)

In [23]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(),
    "kNN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(),
    "XGBoost": XGBClassifier(eval_metric='logloss')
}

In [24]:
results = []

for name, model in models.items():
    model.fit(X_train, y_train)

    preds = model.predict(X_test)
    probs = model.predict_proba(X_test)[:, 1]

    acc = accuracy_score(y_test, preds)
    auc = roc_auc_score(y_test, probs)
    prec = precision_score(y_test, preds)
    rec = recall_score(y_test, preds)
    f1 = f1_score(y_test, preds)
    mcc = matthews_corrcoef(y_test, preds)

    results.append([name, acc, auc, prec, rec, f1, mcc])

In [25]:
columns = ["Model", "Accuracy", "AUC", "Precision", "Recall", "F1", "MCC"]

results_df = pd.DataFrame(results, columns=columns)
results_df


Unnamed: 0,Model,Accuracy,AUC,Precision,Recall,F1,MCC
0,Logistic Regression,0.795122,0.877023,0.756303,0.873786,0.810811,0.597255
1,Decision Tree,0.985366,0.985437,1.0,0.970874,0.985222,0.971151
2,kNN,0.731707,0.860461,0.730769,0.737864,0.7343,0.463398
3,Naive Bayes,0.8,0.87055,0.754098,0.893204,0.817778,0.610224
4,Random Forest,0.985366,1.0,1.0,0.970874,0.985222,0.971151
5,XGBoost,0.985366,0.989435,1.0,0.970874,0.985222,0.971151


In [26]:
import joblib
import os

os.makedirs("models", exist_ok=True)

for name, model in models.items():
    joblib.dump(model, f"models/{name}.pkl")


## Observations on Model Performance

| ML Model Name | Observation about model performance |
|--------------|--------------------------------------|
| Logistic Regression | Provided a strong baseline with balanced precision and recall but struggled to capture complex non-linear patterns. |
| Decision Tree | Achieved very high accuracy and MCC. Good at learning non-linear boundaries but may overfit the training data. |
| kNN | Showed lower performance compared to other models and is sensitive to distance metrics and feature interactions. |
| Naive Bayes | Performed reasonably well despite independence assumptions but slightly behind ensemble methods. |
| Random Forest (Ensemble) | One of the top performers; combining multiple trees improved robustness and reduced overfitting. |
| XGBoost (Ensemble) | Delivered excellent results with high accuracy and AUC due to sequential error correction. |
