In [None]:
import sys
from pprint import pformat
from pprint import pprint

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

from src.utils.data_utils import drop_id
from src.utils.notebook_setup import setup_notebook_environment
from src.utils.training_utils import (
    prepare_data,
    train_model,
    hyperparameter_tuning,
    save_model,
    remove_old_models,
    get_models,
)

# Quick setup
dbs, logger = await setup_notebook_environment()

# Now ready to work
logger.info("=== STARTING MODEL TRAINING ===")

# Checking Gold Layer
gold_data_from_db = await dbs.get_gold_data()
gold_data_df = pd.DataFrame(gold_data_from_db)
gold_data_df = drop_id(gold_data_df)

gold_data_df.head(10)

In [None]:
# Split the gold frame into the label column and everything else.
y = gold_data_df['target']
X = gold_data_df.drop(columns='target')

# Delegate the train/test partition to the project helper.
# NOTE(review): test_size=0.2 / random_state=42 are forwarded as-is; whether
# the split is stratified depends on prepare_data -- confirm.
X_train, X_test, y_train, y_test = prepare_data(X, y, test_size=0.2, random_state=42)

# Record the resulting shapes so a bad split is visible in the log.
logger.info(f"X_train shape: {X_train.shape}")
logger.info(f"X_test shape: {X_test.shape}")
logger.info(f"y_train shape: {y_train.shape}")
logger.info(f"y_test shape: {y_test.shape}")

In [None]:
# Define models
# The candidate estimators are built by the shared helper. The earlier inline
# definition is kept below for reference only (presumably what get_models
# mirrors -- confirm in src.utils.training_utils):
# models = {
#     'LogisticRegression'  : LogisticRegression(max_iter=1000),
#     'RandomForest'        : RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced'),
#     'XGBoost'              : XGBClassifier(objective='binary:logistic', n_estimators=100, learning_rate=0.1, random_state=42, scale_pos_weight=(len(y_train) - sum(y_train)) / sum(y_train)),
#     'DecisionTree'        : DecisionTreeClassifier(max_depth=5, random_state=42, class_weight='balanced'),
# }

models = get_models(y_train)

# Show the models once in the cell output (4-space indent, 40-char max width).
pprint(models, indent=4, width=40)

# pprint() writes to stdout and returns None, so logger.info(pprint(...)) would
# record the literal "None". pformat() returns the formatted text instead, so
# the log receives the real content.
logger.info(f"Models:\n{pformat(models)}")

# Log ALL available parameters of the logistic regression estimator.
logger.info(f" Logistic Regression Parameters: {pformat(models['LogisticRegression'].get_params())}")
# logger.info(f" Random Forest Parameters: {pformat(models['RandomForest'].get_params())}")
# logger.info(f" XGBoost Parameters: {pformat(models['XGBoost'].get_params())}")
# logger.info(f" Decision Tree Parameters: {pformat(models['DecisionTree'].get_params())}")

In [None]:
# Search spaces for hyperparameter tuning, keyed by model name. The training
# loop below looks grids up by the keys of `models`, so a grid whose key has no
# matching model is simply never used.
param_grids = {
    'LogisticRegression': {
        'C': [0.1, 1, 10, 100],
        'penalty': ['l1', 'l2'],
        'solver': ['liblinear', 'saga'],
    },
    'RandomForest': {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
    },
    'XGBoost': {
        'n_estimators': [100, 200, 300],
        'max_depth': [3, 6, 9],
        'learning_rate': [0.01, 0.1, 0.2],
        'subsample': [0.8, 0.9, 1.0],
    },
    'SVM': {
        'C': [0.1, 1, 10],
        'kernel': ['rbf', 'linear'],
        'gamma': ['scale', 'auto'],
    },
}

# NOTE(review): this runs before any new model is trained; presumably it
# clears previously saved artifacts -- confirm in src.utils.training_utils.
remove_old_models()

# Per model: baseline fit -> optional grid search -> persist.
for name, model in models.items():
    logger.info(f"Training {name}...")

    # Baseline fit with the estimator's current settings.
    model = train_model(model, X_train, y_train, name)

    if name not in param_grids:
        # No search space registered for this model: keep the baseline fit.
        logger.info(f"No hyperparameter tuning for {name}")
    else:
        param_grid = param_grids[name]
        logger.info(f"Tuning hyperparameters for {name}...")
        # The tuning helper's return value replaces the baseline model.
        model = hyperparameter_tuning(
            model,
            X_train,
            y_train,
            param_grid=param_grid,
            cv=5,
            n_jobs=-1,
        )

    # Persist whichever model came out of the steps above.
    save_model(model, name)  # TODO: maybe save the model after evaluation