In [2]:
!pip install kagglehub xgboost scikit-learn pandas numpy

# -----------------------
# Imports
# -----------------------
import os
import pickle
import shutil
import pandas as pd
import kagglehub

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, matthews_corrcoef, roc_auc_score,
    confusion_matrix, classification_report
)

from google.colab import files

# -----------------------
# Download Dataset
# -----------------------
dataset_path = kagglehub.dataset_download(
    "johnsmith88/heart-disease-dataset"
)
df = pd.read_csv(os.path.join(dataset_path, "heart.csv"))

print("Dataset shape:", df.shape)

TARGET_COL = "target"
X = df.drop(TARGET_COL, axis=1)
y = df[TARGET_COL]

# -----------------------
# Train / Test Split
# -----------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

# -----------------------
# Save TRAIN & TEST CSV
# -----------------------
os.makedirs("data", exist_ok=True)

train_df = X_train.copy()
train_df[TARGET_COL] = y_train
train_df.to_csv("data/train.csv", index=False)

test_df = X_test.copy()
test_df[TARGET_COL] = y_test
test_df.to_csv("data/test.csv", index=False)

print("âœ… train.csv and test.csv saved")

# -----------------------
# Preprocessing
# -----------------------
categorical_cols = X.select_dtypes(include=["object"]).columns.tolist()
numerical_cols = X.select_dtypes(exclude=["object"]).columns.tolist()

preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numerical_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols)
    ]
)

# -----------------------
# Models
# -----------------------
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "Naive Bayes": GaussianNB(),
    "Random Forest (Ensemble)": RandomForestClassifier(
        n_estimators=200, random_state=42
    ),
    "XGBoost (Ensemble)": XGBClassifier(
        eval_metric="logloss", random_state=42
    )
}

os.makedirs("model", exist_ok=True)
evaluation_results = {}

# -----------------------
# Train, Evaluate & Save
# -----------------------
for name, clf in models.items():
    print(f"Training {name}...")

    pipeline = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("model", clf)
    ])

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    y_prob = (
        pipeline.predict_proba(X_test)[:, 1]
        if hasattr(pipeline, "predict_proba")
        else None
    )

    evaluation_results[name] = {
        "metrics": {
            "accuracy": accuracy_score(y_test, y_pred),
            "precision": precision_score(y_test, y_pred),
            "recall": recall_score(y_test, y_pred),
            "f1": f1_score(y_test, y_pred),
            "mcc": matthews_corrcoef(y_test, y_pred),
            "auc": roc_auc_score(y_test, y_prob) if y_prob is not None else None
        },
        "confusion_matrix": confusion_matrix(y_test, y_pred),
        "classification_report": classification_report(y_test, y_pred)
    }

    file_name = name.lower().replace(" ", "_").replace("(", "").replace(")", "")
    with open(f"model/{file_name}.pkl", "wb") as f:
        pickle.dump(pipeline, f)

print("âœ… All model .pkl files saved")

# -----------------------
# Save evaluation_results.pkl
# -----------------------
with open("model/evaluation_results.pkl", "wb") as f:
    pickle.dump(evaluation_results, f)

print("âœ… evaluation_results.pkl saved")

# -----------------------
# CREATE MODEL COMPARISON TABLE
# -----------------------

comparison_rows = []

for model_name, results in evaluation_results.items():
    m = results["metrics"]
    comparison_rows.append({
        "ML Model Name": model_name,
        "Accuracy": round(m["accuracy"], 3),
        "AUC": round(m["auc"], 3) if m["auc"] is not None else "N/A",
        "Precision": round(m["precision"], 3),
        "Recall": round(m["recall"], 3),
        "F1": round(m["f1"], 3),
        "MCC": round(m["mcc"], 3)
    })

comparison_df = pd.DataFrame(comparison_rows)

print("\nðŸ“Š MODEL COMPARISON TABLE")
print(comparison_df)
comparison_df.to_csv("model/model_comparison.csv", index=False)
print("\nâœ… model_comparison.csv saved")


# -----------------------
# ZIP EVERYTHING
# -----------------------
zip_name = "ml_assignment_assets"
shutil.make_archive(zip_name, "zip", ".")

# -----------------------
# Download ZIP
# -----------------------
files.download(f"{zip_name}.zip")

print("\nðŸ“¦ ml_assignment_assets.zip downloaded successfully!")

Using Colab cache for faster access to the 'heart-disease-dataset' dataset.
Dataset shape: (1025, 14)
âœ… train.csv and test.csv saved
Training Logistic Regression...
Training Decision Tree...
Training KNN...
Training Naive Bayes...
Training Random Forest (Ensemble)...
Training XGBoost (Ensemble)...
âœ… All model .pkl files saved
âœ… evaluation_results.pkl saved

ðŸ“Š MODEL COMPARISON TABLE
              ML Model Name  Accuracy    AUC  Precision  Recall     F1    MCC
0       Logistic Regression     0.810  0.930      0.762   0.914  0.831  0.631
1             Decision Tree     0.985  0.986      1.000   0.971  0.986  0.971
2                       KNN     0.863  0.963      0.874   0.857  0.865  0.727
3               Naive Bayes     0.829  0.904      0.807   0.876  0.840  0.660
4  Random Forest (Ensemble)     1.000  1.000      1.000   1.000  1.000  1.000
5        XGBoost (Ensemble)     1.000  1.000      1.000   1.000  1.000  1.000

âœ… model_comparison.csv saved


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


ðŸ“¦ ml_assignment_assets.zip downloaded successfully!
