# In [2]:  (Jupyter notebook cell prompt; the code below is the cell's content)
# src/models/smci_models.py

import numpy as np
from sklearn.linear_model import Ridge, LogisticRegression
from sklearn.ensemble import GradientBoostingRegressor, RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from src.evaluation.smci_evaluation import (
    regression_metrics,
    classification_metrics,
    evaluate_strategy,
)


def _select_by_val_rmse(candidates, make_model, X_train, y_train, X_val, y_val):
    """Return the candidate whose model has the lowest validation RMSE.

    Each candidate is turned into an unfitted estimator by ``make_model``,
    fit on the training split and scored on the validation split using
    ``regression_metrics(y_val, y_val_pred)["RMSE"]``. The comparison is a
    strict ``<``, so on ties the earliest candidate wins.

    Raises:
        ValueError: if no candidate yields a validation RMSE below infinity
            (for example when every score is NaN).
    """
    best_candidate = None
    best_val_rmse = np.inf

    for candidate in candidates:
        model = make_model(candidate)
        model.fit(X_train, y_train)
        val_rmse = regression_metrics(y_val, model.predict(X_val))["RMSE"]
        if val_rmse < best_val_rmse:
            best_val_rmse = val_rmse
            best_candidate = candidate

    if best_candidate is None:
        raise ValueError(
            "No candidate produced a finite validation RMSE; "
            "cannot select hyperparameters."
        )
    return best_candidate


def run_regression_models(X_train, y_train, X_val, y_val, X_test, y_test):
    """Fit baselines, Ridge, GB; return metrics, strategy stats, and predictions.

    Models evaluated on the test split:

    * ``naive_zero``  - predicts 0 for every test row.
    * ``const_mean``  - predicts the mean of ``y_train`` for every test row.
    * ``ridge``       - StandardScaler + Ridge; ``alpha`` is chosen by
      validation RMSE, then the pipeline is refit on train + validation.
    * ``gb``          - GradientBoostingRegressor; parameters are chosen from
      a small fixed grid by validation RMSE, then refit on train + validation.

    Returns:
        dict with keys:
            ``metrics``          - model name -> ``regression_metrics`` output
            ``strategies``       - model name -> ``evaluate_strategy`` output
            ``predictions``      - model name -> test-set predictions
            ``ridge_model``      - the Ridge pipeline refit on train + val
            ``gb_model``         - the GB model refit on train + val
            ``best_ridge_alpha`` - selected Ridge ``alpha``
            ``best_gb_params``   - selected GB parameter dict

    Raises:
        ValueError: if hyperparameter selection finds no candidate with a
            finite validation RMSE.
    """
    results = {}
    strategies = {}
    preds = {}

    def _record(name, y_pred):
        # Score one model's test predictions and store everything under `name`.
        results[name] = regression_metrics(y_test, y_pred)
        strategies[name] = evaluate_strategy(y_test, y_pred)
        preds[name] = y_pred

    # Naive zero
    _record("naive_zero", np.zeros_like(y_test))

    # Constant-mean. dtype=float keeps the training mean intact even when
    # y_test has an integer dtype (full_like would otherwise truncate it).
    const_mean = y_train.mean()
    _record("const_mean", np.full_like(y_test, const_mean, dtype=float))

    # Train + validation data used for the final refits.
    # NOTE: np.vstack / np.concatenate return plain ndarrays, so the refit
    # models are trained without DataFrame column names even if DataFrames
    # were passed in.
    X_trainval = np.vstack([X_train, X_val])
    y_trainval = np.concatenate([y_train, y_val])

    # Ridge with val-based alpha tuning
    def _make_ridge(alpha):
        return Pipeline([
            ("scaler", StandardScaler()),
            ("ridge", Ridge(alpha=alpha)),
        ])

    best_alpha = _select_by_val_rmse(
        np.logspace(-4, 2, 13), _make_ridge, X_train, y_train, X_val, y_val
    )
    ridge_best = _make_ridge(best_alpha)
    ridge_best.fit(X_trainval, y_trainval)
    _record("ridge", ridge_best.predict(X_test))

    # Gradient Boosting with small manual grid
    param_grid = [
        {"n_estimators": 200, "max_depth": 2, "learning_rate": 0.05},
        {"n_estimators": 300, "max_depth": 2, "learning_rate": 0.05},
        {"n_estimators": 300, "max_depth": 3, "learning_rate": 0.05},
        {"n_estimators": 300, "max_depth": 3, "learning_rate": 0.03},
    ]

    def _make_gb(params):
        # Fixed seed so tuning and the final refit are reproducible.
        return GradientBoostingRegressor(random_state=0, **params)

    best_params = _select_by_val_rmse(
        param_grid, _make_gb, X_train, y_train, X_val, y_val
    )
    gb_best = _make_gb(best_params)
    gb_best.fit(X_trainval, y_trainval)
    _record("gb", gb_best.predict(X_test))

    return {
        "metrics": results,
        "strategies": strategies,
        "predictions": preds,
        "ridge_model": ridge_best,
        "gb_model": gb_best,
        "best_ridge_alpha": best_alpha,
        "best_gb_params": best_params,
    }


def run_classification_models(X_train, y_class_train, X_val, y_class_val, X_test, y_class_test):
    """Fit majority, Logistic Regression, and RF; return metrics, strategy stats, predictions.

    Models evaluated on the test split:

    * ``majority`` - always predicts the most frequent label in
      ``y_class_train`` (ties resolve to the smallest label).
    * ``logistic`` - StandardScaler + LogisticRegression fit on the training
      split.
    * ``rf``       - RandomForestClassifier; parameters are chosen from a
      small fixed grid by validation accuracy. The returned forest is the one
      fit on the training split only; it is not refit on train + validation.

    Returns:
        dict with keys:
            ``metrics``        - model name -> ``classification_metrics`` output
            ``strategies``     - model name -> ``evaluate_strategy`` output
                                 (called with the class labels and predictions)
            ``predictions``    - model name -> test-set predicted labels
            ``rf_model``       - the selected fitted random forest
            ``rf_params``      - the selected RF parameter dict
            ``majority_class`` - the label used by the majority baseline
    """
    class_results = {}
    strategy_results = {}
    preds = {}

    def _record(name, y_pred):
        # Score one model's test predictions and store everything under `name`.
        class_results[name] = classification_metrics(y_class_test, y_pred)
        strategy_results[name] = evaluate_strategy(y_class_test, y_pred)
        preds[name] = y_pred

    # Majority baseline: take the most frequent training label. Rounding the
    # label mean is only valid for 0/1 coding; counting labels works for any
    # coding. np.unique returns sorted labels and argmax takes the first
    # maximum, so a tie picks the smallest label (0 for a balanced 0/1 set).
    labels, counts = np.unique(y_class_train, return_counts=True)
    majority_class = labels[np.argmax(counts)]
    if isinstance(majority_class, np.generic):
        majority_class = majority_class.item()
    # Report integral bool/float labels (True, 1.0) as a plain int.
    if isinstance(majority_class, (bool, float)) and float(majority_class).is_integer():
        majority_class = int(majority_class)

    _record("majority", np.full_like(y_class_test, majority_class))

    # Logistic Regression
    log_clf = Pipeline([
        ("scaler", StandardScaler()),
        ("logreg", LogisticRegression(max_iter=1000)),
    ])
    log_clf.fit(X_train, y_class_train)
    _record("logistic", log_clf.predict(X_test))

    # Random Forest with small grid
    rf_param_grid = [
        {"n_estimators": 200, "max_depth": 3},
        {"n_estimators": 300, "max_depth": 3},
        {"n_estimators": 300, "max_depth": 4},
    ]

    best_rf = None
    best_rf_params = None
    best_val_acc = -np.inf

    for params in rf_param_grid:
        # Fixed seed for reproducibility; n_jobs=-1 uses all available cores.
        rf = RandomForestClassifier(random_state=0, n_jobs=-1, **params)
        rf.fit(X_train, y_class_train)
        val_acc = classification_metrics(y_class_val, rf.predict(X_val))["accuracy"]
        # Strict '>' keeps the earliest grid entry when accuracies tie.
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            best_rf = rf
            best_rf_params = params

    _record("rf", best_rf.predict(X_test))

    return {
        "metrics": class_results,
        "strategies": strategy_results,
        "predictions": preds,
        "rf_model": best_rf,
        "rf_params": best_rf_params,
        "majority_class": majority_class,
    }


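# Notebook cell output: ModuleNotFoundError: No module named 'src'
# (the `from src.evaluation.smci_evaluation import ...` statement above could
# not resolve the `src` package when this cell was executed)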