In [1]:
# Heart Disease Project
# Notebook 03: Model Training & Evaluation
import pandas as pd
import numpy as np
import joblib

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    confusion_matrix
)

In [2]:
# Load Processed Data
# Read the cleaned training table, separate the label column from the
# feature columns, and carve out a stratified hold-out split.
from sklearn.model_selection import train_test_split

data = pd.read_csv("../data/processed/cleaned_train_data.csv")

# "target" is the label column; every other column is used as a feature.
y = data["target"]
X = data.drop(columns=["target"])

# 20% of the rows are held out; stratify=y keeps the class proportions of
# "target" the same in both parts, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [3]:
# Define Models
# Registry of candidate classifiers, keyed by the display name used in the
# results table. Insertion order determines the order in which they are trained.
models = {}
models["Logistic Regression"] = LogisticRegression(max_iter=1000)
models["Random Forest"] = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
models["Gradient Boosting"] = GradientBoostingClassifier(random_state=42)


In [4]:
# Train & Evaluate Models
# Fit every candidate on the training split, score it on the hold-out split,
# and remember the estimator with the highest recall.
#
# NOTE(review): the winner is chosen on the same hold-out split that the
# metrics are reported on, so the scores shown for the selected model are
# optimistically biased. Consider selecting via cross-validation on the
# training split and reserving the hold-out split for the final report.
results = {}
best_model = None
best_model_name = None
# Start below any attainable recall so a model is always selected, even when
# every candidate scores a recall of 0.0. A sentinel of 0 combined with the
# strict ">" below would leave best_model as None in that case and make the
# later best_model.predict(...) call fail.
best_recall = float("-inf")

for name, model in models.items():
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    # Second column of predict_proba corresponds to model.classes_[1];
    # assumes a 0/1 "target" so that this is the positive class - TODO confirm.
    y_prob = model.predict_proba(X_test)[:, 1]

    metrics = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1 Score": f1_score(y_test, y_pred),
        "ROC AUC": roc_auc_score(y_test, y_prob)
    }

    results[name] = metrics

    # Strict comparison: on a tie the earlier model in `models` is kept.
    if metrics["Recall"] > best_recall:
        best_recall = metrics["Recall"]
        best_model = model
        best_model_name = name


In [5]:
# Display Results
# One row per model, one column per metric (outer dict keys become the index).
results_df = pd.DataFrame.from_dict(results, orient="index")
results_df


Unnamed: 0,Accuracy,Precision,Recall,F1 Score,ROC AUC
Logistic Regression,0.884817,0.87619,0.910891,0.893204,0.936304
Random Forest,0.931937,0.923077,0.950495,0.936585,0.955006
Gradient Boosting,0.921466,0.921569,0.930693,0.926108,0.955996


In [6]:
# Confusion Matrix of Best Model
# Rows are the true labels and columns the predicted labels, both taken from
# the hold-out split.
cm = confusion_matrix(
    y_true=y_test,
    y_pred=best_model.predict(X_test),
)
cm


array([[82,  8],
       [ 5, 96]])

In [1]:
# FINAL MODEL TRAINING & SAVE 
import pandas as pd
import joblib
import os

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import recall_score

# Load processed data
data = pd.read_csv("../data/processed/cleaned_train_data.csv")

# "target" is the label column; every other column is used as a feature.
X = data.drop("target", axis=1)
y = data["target"]

# Train-test split (stratified on the label, fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# Define models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42)
}

best_model = None
best_model_name = None
# Start below any attainable recall so a model is always selected, even when
# every candidate scores a recall of 0.0. With a sentinel of 0 and the strict
# ">" below, best_model would stay None and None would be pickled to disk
# while the success message was still printed.
best_recall = float("-inf")

# Train & select best model
# NOTE(review): selection uses the hold-out split itself, so the recall
# printed for the winner is optimistically biased; consider cross-validation
# on the training split for model selection.
for name, model in models.items():
    model.fit(X_train, y_train)
    recall = recall_score(y_test, model.predict(X_test))

    print(f"{name} Recall: {recall:.4f}")

    # Strict comparison: on a tie the earlier model in `models` is kept.
    if recall > best_recall:
        best_recall = recall
        best_model = model
        best_model_name = name

# Fail loudly rather than writing a useless artifact (only reachable when
# `models` is empty).
if best_model is None:
    raise RuntimeError("No model was trained; nothing to save.")

# Ensure models directory exists
os.makedirs("../models", exist_ok=True)

# Save best model
model_path = "../models/best_model.pkl"
joblib.dump(best_model, model_path)

print(f"\n✅ Best model saved successfully at: {model_path}")


Logistic Regression Recall: 0.9109
Random Forest Recall: 0.9505
Gradient Boosting Recall: 0.9307

✅ Best model saved successfully at: ../models/best_model.pkl
