In [1]:
import sys
import pandas as pd
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from pprint import pprint

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from src.utils.data_utils import drop_id
from src.utils.training_utils import (
    prepare_data,
    train_model,
    hyperparameter_tuning,
    save_model,
    remove_old_models,
    get_models,
)
from src.utils.notebook_setup import setup_notebook_environment

# Quick setup
dbs, logger = await setup_notebook_environment()

# Now ready to work
logger.info("=== STARTING MODEL TRAINING ===")

# Checking Gold Layer
gold_data_from_db = await dbs.get_gold_data()
gold_data_df = pd.DataFrame(gold_data_from_db)
gold_data_df = drop_id(gold_data_df)

gold_data_df.head(10)

2025-07-25 11:13:29,372 - api.services.database_service - INFO - Connected to MongoDB database: healthcare
2025-07-25 11:13:29,493 - src.utils.notebook_setup - INFO - Database connected: True
2025-07-25 11:13:29,494 - src.utils.notebook_setup - INFO - Database collections: ['heart_disease_gold', 'heart_disease_silver', 'heart_disease_bronze']
2025-07-25 11:13:29,495 - src.utils.notebook_setup - INFO - Database collections count: 3
2025-07-25 11:13:29,498 - src.utils.notebook_setup - INFO - === STARTING MODEL TRAINING ===


Unnamed: 0,age,sex,trestbps,chol,fbs,thalch,exang,oldpeak,cp_asymptomatic,cp_atypical angina,cp_non-anginal,restecg_lv hypertrophy,restecg_normal,slope_flat,slope_not_tested,slope_upsloping,target
0,0.714286,1.0,0.541667,0.285714,1.0,0.633803,0.0,0.556818,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0
1,0.795918,1.0,0.666667,0.388031,0.0,0.338028,1.0,0.465909,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1
2,0.795918,1.0,0.333333,0.277992,0.0,0.485915,1.0,0.590909,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1
3,0.183673,1.0,0.416667,0.318533,0.0,0.894366,0.0,0.693182,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0
4,0.265306,0.0,0.416667,0.22973,0.0,0.788732,0.0,0.454545,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0
5,0.571429,1.0,0.333333,0.291506,0.0,0.830986,0.0,0.386364,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0
6,0.693878,0.0,0.5,0.353282,0.0,0.704225,0.0,0.704545,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1
7,0.591837,0.0,0.333333,0.519305,0.0,0.725352,1.0,0.363636,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0
8,0.714286,1.0,0.416667,0.326255,0.0,0.612676,0.0,0.454545,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1
9,0.510204,1.0,0.5,0.227799,1.0,0.669014,1.0,0.647727,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1


In [2]:
# SEPARATE features and target
X = gold_data_df.drop('target', axis=1)  # Features only (18 columns)
y = gold_data_df['target']               # Target only (0s and 1s)

# Then do train/test split
X_train, X_test, y_train, y_test = prepare_data(
    X, y, test_size=0.2, random_state=42
)

logger.info(f"X_train shape: {X_train.shape}")
logger.info(f"X_test shape: {X_test.shape}")
logger.info(f"y_train shape: {y_train.shape}")
logger.info(f"y_test shape: {y_test.shape}")

2025-07-25 11:13:30,340 - src.utils.training_utils - INFO - Training set: (15456, 16), Test set: (3864, 16)
2025-07-25 11:13:30,343 - src.utils.notebook_setup - INFO - X_train shape: (15456, 16)
2025-07-25 11:13:30,345 - src.utils.notebook_setup - INFO - X_test shape: (3864, 16)
2025-07-25 11:13:30,348 - src.utils.notebook_setup - INFO - y_train shape: (15456,)
2025-07-25 11:13:30,351 - src.utils.notebook_setup - INFO - y_test shape: (3864,)


In [3]:
# Define models
models  = get_models(y_train)

pprint(models, indent=4) # Use 4 spaces for inden
pprint(models, width=40) # Set max line width

logger.info(pprint(models))

# Shows ALL available parameters

for model in models:
    logger.info(pprint(f" {model} Parameters: {models[model].get_params()}") ) 
# logger.info(pprint(f" Random Forest Parameters: {models['RandomForest'].get_params()}"))
# logger.info(pprint(f" XGBoost Parameters: {models['XGBoost'].get_params()}"))
# logger.info(pprint(f" Decision Tree Parameters: {models['DecisionTree'].get_params()}"))

2025-07-25 11:13:30,371 - src.utils.notebook_setup - INFO - None
2025-07-25 11:13:30,375 - src.utils.notebook_setup - INFO - None


{'LogisticRegression': LogisticRegression(max_iter=1000)}
{'LogisticRegression': LogisticRegression(max_iter=1000)}
{'LogisticRegression': LogisticRegression(max_iter=1000)}
(" LogisticRegression Parameters: {'C': 1.0, 'class_weight': None, 'dual': "
 "False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, "
 "'max_iter': 1000, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', "
 "'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, "
 "'warm_start': False}")


In [4]:
# train the models
param_grids = {
    'LogisticRegression': {
        'C': [0.1, 1, 10, 100],
        'penalty': ['l1', 'l2'],
        'solver': ['liblinear', 'saga']
    },
    'RandomForest': {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },
    'XGBoost': {
        'n_estimators': [100, 200, 300],
        'max_depth': [3, 6, 9],
        'learning_rate': [0.01, 0.1, 0.2],
        'subsample': [0.8, 0.9, 1.0]
    },
    'SVM': {
        'C': [0.1, 1, 10],
        'kernel': ['rbf', 'linear'],
        'gamma': ['scale', 'auto']
    }
}
remove_old_models()
# Train models with correct parameter grids
for name, model in models.items():
    logger.info(f"Training {name}...")
    
    # Train basic model first
    model = train_model(model, X_train, y_train, name)
    
    # Get the correct parameter grid for this model
    if name in param_grids:
        param_grid = param_grids[name]
        logger.info(f"Tuning hyperparameters for {name}...")
        model = hyperparameter_tuning(model, X_train, y_train, param_grid=param_grid, cv=5, n_jobs=-1)
    else:
        logger.info(f"No hyperparameter tuning for {name}")
    
    # Save the model
    save_model(model, name) # TODO: maybe save the model after evaluation

2025-07-25 11:13:30,400 - src.utils.training_utils - INFO - Removing old model: heart_disease_classifier.joblib
2025-07-25 11:13:30,404 - src.utils.notebook_setup - INFO - Training LogisticRegression...
2025-07-25 11:13:30,652 - src.utils.training_utils - INFO - Trained LogisticRegression
2025-07-25 11:13:30,654 - src.utils.notebook_setup - INFO - Tuning hyperparameters for LogisticRegression...
2025-07-25 11:13:41,846 - src.utils.training_utils - INFO - Hyperparameter tuning completed for LogisticRegression
