In [1]:
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from joblib import load as load_model
import os
from src.utils.evaluate_utils import evaluate_model, load_trained_models
from src.utils.training_utils import prepare_data
from src.utils.data_utils import drop_id

# Quick setup
from src.utils.notebook_setup import setup_notebook_environment
dbs, logger = await setup_notebook_environment()

logger.info("=== STARTING MODEL EVALUATION ===")


# LOAD DATA (same as 04)
gold_data_from_db = await dbs.get_gold_data()
gold_data_df = pd.DataFrame(gold_data_from_db)
gold_data_df = drop_id(gold_data_df)

# RECREATE THE SAME SPLIT (important!)
X = gold_data_df.drop('target', axis=1)
y = gold_data_df['target']
X_train, X_test, y_train, y_test = prepare_data(X, y, test_size=0.2, random_state=42)

logger.info(f"Loaded test set: {X_test.shape}")


2025-07-25 02:09:05,168 - api.services.database_service - INFO - Connected to MongoDB database: healthcare
2025-07-25 02:09:05,291 - src.utils.notebook_setup - INFO - Database connected: True
2025-07-25 02:09:05,292 - src.utils.notebook_setup - INFO - Database collections: ['heart_disease_gold', 'heart_disease_silver', 'heart_disease_bronze']
2025-07-25 02:09:05,293 - src.utils.notebook_setup - INFO - Database collections count: 3
2025-07-25 02:09:05,295 - src.utils.notebook_setup - INFO - === STARTING MODEL EVALUATION ===
2025-07-25 02:09:06,005 - src.utils.training_utils - INFO - Training set: (13984, 17), Test set: (3496, 17)
2025-07-25 02:09:06,007 - src.utils.notebook_setup - INFO - Loaded test set: (3496, 17)


In [2]:
# load the trained models
trained_models = load_trained_models()
logger.info(f"Loaded {len(trained_models)} trained models")

2025-07-25 02:09:06,018 - src.utils.evaluate_utils - INFO - Loaded model: LogisticRegression
2025-07-25 02:09:06,019 - src.utils.notebook_setup - INFO - Loaded 1 trained models


In [3]:
# EVALUATE ALL TRAINED MODELS

all_results = []

for model_name, model in trained_models.items():
    logger.info(f"Evaluating {model_name}...")
    result = evaluate_model(model, X_test, y_test, model_name)
    all_results.append(result)

# Create results DataFrame
all_results_df = pd.DataFrame([
    {k: v for k, v in result.items() if k != 'model'} 
    for result in all_results
])

logger.info("\n=== MODEL COMPARISON ===")
print(all_results_df.round(4))

# %%
# 5. FIND THE BEST MODEL
best_model_idx = all_results_df['roc_auc'].idxmax()
best_model_name = all_results_df.iloc[best_model_idx]['model_name']
best_model = all_results[best_model_idx]['model']

logger.info(f"\n=== BEST MODEL: {best_model_name} ===")
logger.info(f"ROC-AUC: {all_results_df.iloc[best_model_idx]['roc_auc']:.4f}")

2025-07-25 02:09:06,032 - src.utils.notebook_setup - INFO - Evaluating LogisticRegression...
2025-07-25 02:09:06,063 - src.utils.evaluate_utils - INFO - 
LogisticRegression Evaluation Results:
2025-07-25 02:09:06,064 - src.utils.evaluate_utils - INFO - Accuracy: 0.8081
2025-07-25 02:09:06,066 - src.utils.evaluate_utils - INFO - Precision: 0.8109
2025-07-25 02:09:06,067 - src.utils.evaluate_utils - INFO - Recall: 0.8516
2025-07-25 02:09:06,069 - src.utils.evaluate_utils - INFO - F1-Score: 0.8308
2025-07-25 02:09:06,162 - src.utils.evaluate_utils - INFO - ROC-AUC: 0.8853
2025-07-25 02:09:06,164 - src.utils.notebook_setup - INFO - 
=== MODEL COMPARISON ===
2025-07-25 02:09:06,171 - src.utils.notebook_setup - INFO - 
=== BEST MODEL: LogisticRegression ===
2025-07-25 02:09:06,173 - src.utils.notebook_setup - INFO - ROC-AUC: 0.8853


           model_name  accuracy  precision  recall  f1_score  roc_auc
0  LogisticRegression    0.8081     0.8109  0.8516    0.8308   0.8853


In [4]:
# FIND BEST ONE
best_model_idx = all_results_df['roc_auc'].idxmax()
best_model_name = all_results_df.iloc[best_model_idx]['model_name']
best_model = all_results[best_model_idx]['model']

logger.info(f"\n=== BEST MODEL: {best_model_name} ===")
logger.info(f"ROC-AUC: {all_results_df.iloc[best_model_idx]['roc_auc']:.4f}")

2025-07-25 02:09:06,183 - src.utils.notebook_setup - INFO - 
=== BEST MODEL: LogisticRegression ===
2025-07-25 02:09:06,185 - src.utils.notebook_setup - INFO - ROC-AUC: 0.8853


In [5]:
# SAVE BEST MODEL TO DISK!
from joblib import dump as dump_model

# Save the winner as the final production model
dump_model(best_model, "../models/heart_disease_model_FINAL.joblib")
logger.info(f"Saved final model: {best_model_name}")

2025-07-25 02:09:06,197 - src.utils.notebook_setup - INFO - Saved final model: LogisticRegression
