In [12]:
# ML Assignment 2 â€“ Training Notebook (BITS Lab)




In [13]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

from xgboost import XGBClassifier

from sklearn.metrics import (
    accuracy_score, roc_auc_score, precision_score,
    recall_score, f1_score, matthews_corrcoef
)

import joblib
import os


In [14]:
import kagglehub
path = kagglehub.dataset_download("johnsmith88/heart-disease-dataset")

Using Colab cache for faster access to the 'heart-disease-dataset' dataset.


In [15]:
# Load dataset
df = pd.read_csv(os.path.join(path, "heart.csv"))

print("Dataset Shape:", df.shape)
df.head()

Dataset Shape: (1025, 14)


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


In [16]:
X = df.drop("target", axis=1)
y = df["target"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Train size:", X_train.shape)
print("Test size:", X_test.shape)


Train size: (820, 13)
Test size: (205, 13)


In [17]:
models = {
    "logistic": LogisticRegression(max_iter=1000),
    "decision_tree": DecisionTreeClassifier(random_state=42),
    "knn": KNeighborsClassifier(n_neighbors=5),
    "naive_bayes": GaussianNB(),
    "random_forest": RandomForestClassifier(n_estimators=200, random_state=42),
    "xgboost": XGBClassifier(
        use_label_encoder=False,
        eval_metric="logloss",
        random_state=42
    )
}


In [18]:
os.makedirs("model/saved_models", exist_ok=True)

metrics = []

for name, model in models.items():
    print(f"Training model: {name}")

    pipe = Pipeline([
        ("scaler", StandardScaler()),
        ("model", model)
    ])

    pipe.fit(X_train, y_train)

    # Save trained model
    model_path = f"model/saved_models/{name}.pkl"
    joblib.dump(pipe, model_path)
    print(f"Saved: {model_path}")

    # Predictions
    y_pred = pipe.predict(X_test)
    y_prob = pipe.predict_proba(X_test)[:, 1]

    # Metrics
    acc = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_prob)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    mcc = matthews_corrcoef(y_test, y_pred)

    metrics.append([name, acc, auc, prec, rec, f1, mcc])

    print(f"{name} -> Acc:{acc:.3f}, AUC:{auc:.3f}, F1:{f1:.3f}\n")

# Create metrics DataFrame
metrics_df = pd.DataFrame(
    metrics,
    columns=["Model", "Accuracy", "AUC", "Precision", "Recall", "F1", "MCC"]
)

# Save metrics CSV
metrics_df.to_csv("metrics_results.csv", index=False)

metrics_df


Training model: logistic
Saved: model/saved_models/logistic.pkl
logistic -> Acc:0.810, AUC:0.930, F1:0.831

Training model: decision_tree
Saved: model/saved_models/decision_tree.pkl
decision_tree -> Acc:0.985, AUC:0.986, F1:0.986

Training model: knn
Saved: model/saved_models/knn.pkl
knn -> Acc:0.863, AUC:0.963, F1:0.865

Training model: naive_bayes
Saved: model/saved_models/naive_bayes.pkl
naive_bayes -> Acc:0.829, AUC:0.904, F1:0.840

Training model: random_forest
Saved: model/saved_models/random_forest.pkl
random_forest -> Acc:1.000, AUC:1.000, F1:1.000

Training model: xgboost
Saved: model/saved_models/xgboost.pkl
xgboost -> Acc:1.000, AUC:1.000, F1:1.000



Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Unnamed: 0,Model,Accuracy,AUC,Precision,Recall,F1,MCC
0,logistic,0.809756,0.92981,0.761905,0.914286,0.831169,0.630908
1,decision_tree,0.985366,0.985714,1.0,0.971429,0.985507,0.971151
2,knn,0.863415,0.962905,0.873786,0.857143,0.865385,0.726935
3,naive_bayes,0.829268,0.904286,0.807018,0.87619,0.840183,0.660163
4,random_forest,1.0,1.0,1.0,1.0,1.0,1.0
5,xgboost,1.0,1.0,1.0,1.0,1.0,1.0


In [19]:
print("Saved model files:")
os.listdir("model/saved_models")

print("\nGenerated CSV:")
os.listdir(".")


Saved model files:

Generated CSV:


['.config', 'metrics_results.csv', 'drive', 'model', 'sample_data']