# Часть 3: Тюнинг моделей

Этот ноутбук содержит:
- Тюнинг Random Forest (RandomizedSearchCV)
- Тюнинг XGBoost (RandomizedSearchCV)
- Тюнинг LightGBM (RandomizedSearchCV)
- Сравнение результатов тюнинга




## Загрузка данных и функций

Этот ноутбук требует:
1. Подготовленные данные из `01_data_analysis.ipynb`
2. Функции генерации признаков из `02_model_comparison.ipynb`



Для тюнинга оставляем Random Forest, XGBoost и LightGBM. **TODO:** решить, нужен ли тюнинг LSTM — в этом ноутбуке он не выполняется.

In [5]:
import sys
import time
from pathlib import Path

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

# ML
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

import joblib


In [7]:
sys.path.append(str(Path().resolve().parent))

from src.data.data_manager import DataManager
from src.data.feature_builder import build_targets_multi_horizon, time_split_Xy


# Reuse objects that already live in the kernel namespace: each bare name
# below is only looked up, and the first missing one raises NameError.
# NOTE(review): nothing in the visible cells above defines these names, so on
# a fresh kernel the except branch always runs. If only some of the names are
# missing, all four are rebuilt.
try:
    dm  # already created (original note: in a previous notebook)
    X, y  # already loaded
    targets  # already built
except NameError:
    # Build the feature view and the per-horizon targets from scratch.
    dm = DataManager()
    X, y = dm.get_ml_features_view()
    targets = build_targets_multi_horizon(y, horizons=(1, 24, 168))

print(f"Данные готовы для тюнинга: X.shape={X.shape}")

 View 'ml_features' загружена из кеша: X.shape=(34896, 34)
Данные готовы для тюнинга: X.shape=(34896, 34)


In [9]:
def build_splits_multi_horizon(
    X: pd.DataFrame,
    targets: dict,
    horizons=(1, 24, 168),
    train_ratio=0.7,
    val_ratio=0.15
):
    """Build train/validation/test splits for every requested horizon.

    For each horizon ``h`` the rows of ``X`` and of ``targets[h]["y"]`` are
    selected with ``targets[h]["mask"]`` (via ``.loc``) and the pair is handed
    to ``time_split_Xy`` together with the two ratios.

    Args:
        X: Feature frame shared by all horizons.
        targets: Mapping ``h -> {"mask": ..., "y": ...}``.
        horizons: Horizons to process; each must be a key of ``targets``.
        train_ratio: Forwarded to ``time_split_Xy``.
        val_ratio: Forwarded to ``time_split_Xy``.

    Returns:
        dict: ``{h: (X_train, y_train, X_val, y_val, X_test, y_test)}``.
    """
    per_horizon = {}

    for horizon in horizons:
        entry = targets[horizon]
        keep = entry["mask"]
        features = X.loc[keep]
        labels = entry["y"].loc[keep]

        X_tr, y_tr, X_va, y_va, X_te, y_te = time_split_Xy(
            features, labels, train_ratio=train_ratio, val_ratio=val_ratio
        )
        per_horizon[horizon] = (X_tr, y_tr, X_va, y_va, X_te, y_te)

    return per_horizon


# Build the per-horizon splits with the default horizons and ratios.
splits = build_splits_multi_horizon(X=X, targets=targets)

In [11]:
# Make sure the output directory exists (no error if it is already there),
# then persist the splits with joblib. Both paths are relative to the
# current working directory.
Path("models/metadata").mkdir(parents=True, exist_ok=True)
joblib.dump(splits, "models/metadata/splits.joblib")

['models/metadata/splits.joblib']

In [13]:
def smape(y_true, y_pred, eps=1e-8):
    """Return the symmetric mean absolute percentage error, in percent.

    Computes ``100 * mean(2 * |pred - true| / (|true| + |pred| + eps))``.
    ``eps`` keeps the denominator non-zero when both values are zero.

    Args:
        y_true: Array-like of observed values.
        y_pred: Array-like of predicted values.
        eps: Small constant added to the denominator.

    Returns:
        float: sMAPE as a percentage.
    """
    actual = np.asarray(y_true, dtype=float)
    forecast = np.asarray(y_pred, dtype=float)

    abs_error = np.abs(forecast - actual)
    scale = np.abs(actual) + np.abs(forecast) + eps
    ratio = 2.0 * abs_error / scale

    return float(100.0 * np.mean(ratio))

In [15]:
def tune_rf_multi_horizon_v3(
    splits: dict,
    horizons=(1, 24, 168),
    n_splits=5,
    n_iter=30,
    scoring="neg_mean_absolute_error",
    random_state=42,
    rf_n_jobs=-1,
    search_n_jobs=1,
    verbose=1,
    store_best_estimator=False
):
    """Tune a RandomForestRegressor for each horizon with RandomizedSearchCV.

    For every horizon the search runs on the training split only, using
    ``TimeSeriesSplit`` folds. The refitted best estimator is then scored on
    the validation split (MAE, RMSE, sMAPE). Only the length of the test
    split is recorded.

    Args:
        splits: Mapping ``h -> (X_train, y_train, X_val, y_val, X_test, y_test)``.
        horizons: Horizons to tune; each must be a key of ``splits``.
        n_splits: Number of ``TimeSeriesSplit`` folds.
        n_iter: Number of parameter settings sampled by the search.
        scoring: Scoring string passed to ``RandomizedSearchCV``.
        random_state: Seed for both the forest and the search.
        rf_n_jobs: ``n_jobs`` of the forest itself.
        search_n_jobs: ``n_jobs`` of the search.
        verbose: Verbosity of the search.
        store_best_estimator: If True, keep the fitted best model in the
            result under ``"best_estimator"``.

    Returns:
        dict: ``{h: {...}}`` with best params, CV score, validation metrics,
        timings and split sizes.

    Raises:
        KeyError: If a requested horizon is missing from ``splits``.
    """
    results = {}

    param_dist = {
        "n_estimators": [200, 400, 600],
        "max_depth": [None, 10, 20, 30],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4],
        "max_features": ["sqrt", 0.5, 0.8],
        "bootstrap": [True],
    }

    for h in tqdm(horizons, desc="RF tuning by horizon"):
        # Same explicit guard as the XGBoost/LightGBM tuners, so a missing
        # horizon reports which keys are actually available.
        if h not in splits:
            raise KeyError(f"splits не содержит горизонт {h}. Доступные: {list(splits.keys())}")

        # The test target is not needed here; only len(X_test) is reported.
        X_train, y_train, X_val, y_val, X_test, _ = splits[h]

        tscv = TimeSeriesSplit(n_splits=n_splits)

        rf = RandomForestRegressor(
            random_state=random_state,
            n_jobs=rf_n_jobs
        )

        search = RandomizedSearchCV(
            estimator=rf,
            param_distributions=param_dist,
            n_iter=n_iter,
            scoring=scoring,
            cv=tscv,
            random_state=random_state,
            n_jobs=search_n_jobs,
            verbose=verbose,
            refit=True,
            return_train_score=False
        )

        t0 = time.time()
        search.fit(X_train, y_train)
        tuning_time_sec = time.time() - t0

        # refit=True: best_estimator_ is already refitted on all of X_train.
        best_model = search.best_estimator_

        t0 = time.time()
        y_pred_val = best_model.predict(X_val)
        inference_time_sec = time.time() - t0

        mae = mean_absolute_error(y_val, y_pred_val)
        rmse = np.sqrt(mean_squared_error(y_val, y_pred_val))
        # smape coerces its inputs with np.asarray, so y_val may be a Series
        # or a plain array.
        smape_pct = smape(y_val, y_pred_val)

        results[h] = {
            "best_params": search.best_params_,
            "cv_best_score": search.best_score_,
            "val_MAE": mae,
            "val_RMSE": rmse,
            "val_sMAPE_pct": smape_pct,
            "tuning_time_sec": tuning_time_sec,
            "inference_time_sec": inference_time_sec,
            "n_train": len(X_train),
            "n_val": len(X_val),
            "n_test": len(X_test),
        }

        if store_best_estimator:
            results[h]["best_estimator"] = best_model

        print(
            f"[RF tuned] h={h}: "
            f"val_MAE={mae:.2f}, val_RMSE={rmse:.2f}, val_sMAPE={smape_pct:.2f}% | "
            f"tune={tuning_time_sec:.1f}s, infer={inference_time_sec:.3f}s"
        )
        print(f"  best_params: {search.best_params_}")

        # Drop local references before the next horizon starts.
        del search, best_model

    return results


# Run the Random Forest search with the default settings for all horizons.
rf_tuned = tune_rf_multi_horizon_v3(splits)

RF tuning by horizon:   0%|          | 0/3 [00:00<?, ?it/s]

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[RF tuned] h=1: val_MAE=108.78, val_RMSE=142.33, val_sMAPE=11.92% | tune=969.9s, infer=0.127s
  best_params: {'n_estimators': 600, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 30, 'bootstrap': True}
Fitting 5 folds for each of 30 candidates, totalling 150 fits
[RF tuned] h=24: val_MAE=112.23, val_RMSE=146.13, val_sMAPE=12.25% | tune=957.9s, infer=0.080s
  best_params: {'n_estimators': 400, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': None, 'bootstrap': True}
Fitting 5 folds for each of 30 candidates, totalling 150 fits
[RF tuned] h=168: val_MAE=112.24, val_RMSE=146.37, val_sMAPE=12.30% | tune=929.3s, infer=0.049s
  best_params: {'n_estimators': 200, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 20, 'bootstrap': True}


In [16]:
def tune_xgb_multi_horizon_v3(
    splits: dict,
    horizons=(1, 24, 168),
    n_splits=5,
    n_iter=30,
    scoring="neg_mean_absolute_error",
    random_state=42,
    xgb_n_jobs=1,
    search_n_jobs=1,
    verbose=1,
    store_best_estimator=False
):
    """Tune an XGBRegressor for each horizon with RandomizedSearchCV.

    The search is fitted on the training split with ``TimeSeriesSplit`` folds;
    the refitted best model is then evaluated on the validation split.

    Args:
        splits: Mapping ``h -> (X_train, y_train, X_val, y_val, X_test, y_test)``.
        horizons: Horizons to tune.
        n_splits: Number of ``TimeSeriesSplit`` folds.
        n_iter: Number of sampled parameter settings.
        scoring: Scoring string passed to the search.
        random_state: Seed for the model and the search.
        xgb_n_jobs: ``n_jobs`` of the XGBoost model.
        search_n_jobs: ``n_jobs`` of the search.
        verbose: Verbosity of the search.
        store_best_estimator: If True, keep the fitted best model under
            ``"best_estimator"``.

    Returns:
        dict: Per-horizon summary (best params, CV score, validation metrics,
        timings, split sizes).

    Raises:
        KeyError: If a requested horizon is missing from ``splits``.
    """
    search_space = {
        "n_estimators": [300, 500, 800, 1200],
        "learning_rate": [0.01, 0.03, 0.05, 0.1],
        "max_depth": [3, 4, 5, 6, 8],
        "min_child_weight": [1, 3, 5, 7],
        "subsample": [0.6, 0.8, 1.0],
        "colsample_bytree": [0.6, 0.8, 1.0],
        "gamma": [0, 0.1, 0.3, 1.0],
        "reg_alpha": [0, 1e-3, 1e-2, 0.1],
        "reg_lambda": [1.0, 2.0, 5.0, 10.0],
    }
    summary = {}

    for h in tqdm(horizons, desc="XGB tuning by horizon"):
        if h not in splits:
            raise KeyError(f"splits не содержит горизонт {h}. Доступные: {list(splits.keys())}")

        X_train, y_train, X_val, y_val, X_test, y_test = splits[h]

        # A fresh model and fold generator for every horizon.
        tuner = RandomizedSearchCV(
            estimator=XGBRegressor(
                objective="reg:squarederror",
                tree_method="hist",
                random_state=random_state,
                n_jobs=xgb_n_jobs,
                verbosity=0
            ),
            param_distributions=search_space,
            n_iter=n_iter,
            scoring=scoring,
            cv=TimeSeriesSplit(n_splits=n_splits),
            random_state=random_state,
            n_jobs=search_n_jobs,
            verbose=verbose,
            refit=True,
            return_train_score=False
        )

        tune_started = time.time()
        tuner.fit(X_train, y_train)
        tuning_time_sec = time.time() - tune_started

        winner = tuner.best_estimator_

        predict_started = time.time()
        val_pred = winner.predict(X_val)
        inference_time_sec = time.time() - predict_started

        mae = mean_absolute_error(y_val, val_pred)
        rmse = np.sqrt(mean_squared_error(y_val, val_pred))
        smape_pct = smape(y_val.values, val_pred)

        record = {
            "best_params": tuner.best_params_,
            "cv_best_score": tuner.best_score_,
            "val_MAE": mae,
            "val_RMSE": rmse,
            "val_sMAPE_pct": smape_pct,
            "tuning_time_sec": tuning_time_sec,
            "inference_time_sec": inference_time_sec,
            "n_train": len(X_train),
            "n_val": len(X_val),
            "n_test": len(X_test),
        }
        if store_best_estimator:
            record["best_estimator"] = winner
        summary[h] = record

        print(
            f"[XGB tuned] h={h}: "
            f"val_MAE={mae:.2f}, val_RMSE={rmse:.2f}, val_sMAPE={smape_pct:.2f}% | "
            f"tune={tuning_time_sec:.1f}s, infer={inference_time_sec:.3f}s"
        )
        print(f"  best_params: {tuner.best_params_}")

        # Drop local references before the next horizon starts.
        del winner, tuner

    return summary


# Run the XGBoost search with the default settings for all horizons.
xgb_tuned = tune_xgb_multi_horizon_v3(splits=splits)

XGB tuning by horizon:   0%|          | 0/3 [00:00<?, ?it/s]

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[XGB tuned] h=1: val_MAE=108.86, val_RMSE=142.69, val_sMAPE=11.93% | tune=362.3s, infer=0.035s
  best_params: {'subsample': 0.6, 'reg_lambda': 1.0, 'reg_alpha': 0.001, 'n_estimators': 500, 'min_child_weight': 5, 'max_depth': 6, 'learning_rate': 0.01, 'gamma': 1.0, 'colsample_bytree': 0.8}
Fitting 5 folds for each of 30 candidates, totalling 150 fits
[XGB tuned] h=24: val_MAE=114.73, val_RMSE=149.05, val_sMAPE=12.54% | tune=418.4s, infer=0.019s
  best_params: {'subsample': 0.6, 'reg_lambda': 2.0, 'reg_alpha': 0.1, 'n_estimators': 500, 'min_child_weight': 7, 'max_depth': 3, 'learning_rate': 0.01, 'gamma': 0.1, 'colsample_bytree': 0.6}
Fitting 5 folds for each of 30 candidates, totalling 150 fits
[XGB tuned] h=168: val_MAE=110.92, val_RMSE=145.05, val_sMAPE=12.11% | tune=430.3s, infer=0.035s
  best_params: {'subsample': 0.6, 'reg_lambda': 1.0, 'reg_alpha': 0.001, 'n_estimators': 500, 'min_child_weight': 5, 'max_depth': 6, 'lear

In [17]:
def tune_lgbm_multi_horizon_v3(
    splits: dict,
    horizons=(1, 24, 168),
    n_splits=5,
    n_iter=30,
    scoring="neg_mean_absolute_error",
    random_state=42,
    n_jobs_cv=1,
    verbose=1,
    store_best_estimator=False,
    lgbm_n_jobs=1
):
    """Tune an LGBMRegressor for each horizon with RandomizedSearchCV.

    The search is fitted on the training split with ``TimeSeriesSplit`` folds;
    the refitted best model is then evaluated on the validation split.

    Args:
        splits: Mapping ``h -> (X_train, y_train, X_val, y_val, X_test, y_test)``.
        horizons: Horizons to tune.
        n_splits: Number of ``TimeSeriesSplit`` folds.
        n_iter: Number of sampled parameter settings.
        scoring: Scoring string passed to the search.
        random_state: Seed for the model and the search.
        n_jobs_cv: ``n_jobs`` of the search.
        verbose: Verbosity of the search (the model itself is created with
            ``verbose=-1``).
        store_best_estimator: If True, keep the fitted best model under
            ``"best_estimator"``, as the RF/XGBoost tuners do.
        lgbm_n_jobs: ``n_jobs`` of the LightGBM model. The default of 1
            matches the previously hard-coded value.

    Returns:
        dict: Per-horizon summary (best params, CV score, validation metrics,
        timings, split sizes).

    Raises:
        KeyError: If a requested horizon is missing from ``splits``.
    """
    results = {}

    # The search space does not depend on the horizon, so build it once.
    param_dist = {
        "n_estimators": [400, 800, 1200, 2000],
        "learning_rate": [0.01, 0.03, 0.05, 0.1],
        "num_leaves": [31, 63, 127, 255],
        "max_depth": [-1, 6, 8, 10, 12],
        "min_child_samples": [10, 20, 40, 80],
        "subsample": [0.6, 0.8, 1.0],
        "colsample_bytree": [0.6, 0.8, 1.0],
        "reg_alpha": [0.0, 1e-3, 1e-2, 0.1],
        "reg_lambda": [0.0, 0.1, 1.0, 5.0, 10.0],
        "bagging_freq": [0, 1],
    }

    for h in tqdm(horizons, desc="LGBM tuning by horizon"):
        if h not in splits:
            raise KeyError(f"splits не содержит горизонт {h}. Доступные: {list(splits.keys())}")

        # The test target is not needed here; only len(X_test) is reported.
        X_train, y_train, X_val, y_val, X_test, _ = splits[h]

        tscv = TimeSeriesSplit(n_splits=n_splits)

        base_model = LGBMRegressor(
            objective="regression",
            random_state=random_state,
            n_jobs=lgbm_n_jobs,
            verbose=-1
        )

        search = RandomizedSearchCV(
            estimator=base_model,
            param_distributions=param_dist,
            n_iter=n_iter,
            scoring=scoring,
            cv=tscv,
            random_state=random_state,
            n_jobs=n_jobs_cv,
            verbose=verbose,
            refit=True
        )

        start_tune = time.time()
        search.fit(X_train, y_train)
        tuning_time_sec = time.time() - start_tune

        # refit=True: best_estimator_ is already refitted on all of X_train.
        best_model = search.best_estimator_

        start_pred = time.time()
        y_pred_val = best_model.predict(X_val)
        inference_time_sec = time.time() - start_pred

        mae = mean_absolute_error(y_val, y_pred_val)
        rmse = np.sqrt(mean_squared_error(y_val, y_pred_val))
        # smape coerces its inputs with np.asarray, so y_val may be a Series
        # or a plain array.
        smape_pct = smape(y_val, y_pred_val)

        results[h] = {
            "best_params": search.best_params_,
            "cv_best_score": search.best_score_,
            "val_MAE": mae,
            "val_RMSE": rmse,
            "val_sMAPE_pct": smape_pct,
            "tuning_time_sec": tuning_time_sec,
            "inference_time_sec": inference_time_sec,
            "n_train": len(X_train),
            "n_val": len(X_val),
            "n_test": len(X_test),
        }

        if store_best_estimator:
            results[h]["best_estimator"] = best_model

        print(
            f"[LGBM tuned] h={h}: "
            f"MAE={mae:.2f}, RMSE={rmse:.2f}, sMAPE={smape_pct:.2f}% | "
            f"tune_time={tuning_time_sec:.1f}s, "
            f"val_pred_time={inference_time_sec:.3f}s"
        )

        # Drop local references before the next horizon starts.
        del best_model, search

    return results


# Run the LightGBM search with the default settings for all horizons.
lgbm_tuned = tune_lgbm_multi_horizon_v3(splits=splits)

LGBM tuning by horizon:   0%|          | 0/3 [00:00<?, ?it/s]

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[LGBM tuned] h=1: MAE=108.28, RMSE=142.44, sMAPE=11.86% | tune_time=443.2s, val_pred_time=0.261s
Fitting 5 folds for each of 30 candidates, totalling 150 fits
[LGBM tuned] h=24: MAE=111.96, RMSE=145.89, sMAPE=12.24% | tune_time=440.8s, val_pred_time=0.275s
Fitting 5 folds for each of 30 candidates, totalling 150 fits
[LGBM tuned] h=168: MAE=110.87, RMSE=145.35, sMAPE=12.11% | tune_time=438.9s, val_pred_time=0.154s


In [18]:
def collect_tuning_results(
    rf_results: dict,
    xgb_results: dict,
    lgbm_results: dict
) -> pd.DataFrame:
    """Flatten the per-model tuning summaries into one comparison table.

    Each ``{horizon: summary}`` dict contributes one row per horizon. Keys
    missing from a summary become missing values in the table.

    Args:
        rf_results: Random Forest summaries keyed by horizon.
        xgb_results: XGBoost summaries keyed by horizon.
        lgbm_results: LightGBM summaries keyed by horizon.
            Any of the three may be empty or ``None``.

    Returns:
        pd.DataFrame: Rows sorted by ``horizon`` then ``model`` with a fresh
        0..n-1 index. With no results at all, an empty frame that still has
        the expected columns is returned.
    """
    metric_keys = [
        "val_MAE",
        "val_RMSE",
        "val_sMAPE_pct",
        "tuning_time_sec",
        "inference_time_sec",
        "n_train",
        "n_val",
    ]
    columns = ["model", "horizon"] + metric_keys

    rows = [
        {"model": model_name, "horizon": h, **{key: res.get(key) for key in metric_keys}}
        for model_name, results in (
            ("RandomForest", rf_results),
            ("XGBoost", xgb_results),
            ("LightGBM", lgbm_results),
        )
        for h, res in (results or {}).items()
    ]

    # Passing the columns explicitly keeps the sort keys present even when
    # there are no rows; otherwise sort_values raises KeyError on "horizon".
    df = pd.DataFrame(rows, columns=columns)

    return df.sort_values(["horizon", "model"]).reset_index(drop=True)

In [20]:
# Merge the three tuning summaries into one table and display it.
comparison_df = collect_tuning_results(rf_tuned, xgb_tuned, lgbm_tuned)

comparison_df

Unnamed: 0,model,horizon,val_MAE,val_RMSE,val_sMAPE_pct,tuning_time_sec,inference_time_sec,n_train,n_val
0,LightGBM,1,108.276612,142.437545,11.863293,443.196134,0.261393,24426,5234
1,RandomForest,1,108.781393,142.332314,11.918301,969.870981,0.127348,24426,5234
2,XGBoost,1,108.858475,142.694043,11.931903,362.28424,0.03503,24426,5234
3,LightGBM,24,111.96411,145.894578,12.237998,440.831654,0.275194,24410,5230
4,RandomForest,24,112.228788,146.132868,12.25227,957.890449,0.079984,24410,5230
5,XGBoost,24,114.729294,149.050347,12.535573,418.449609,0.018852,24410,5230
6,LightGBM,168,110.874462,145.353144,12.113894,438.853887,0.153938,24309,5209
7,RandomForest,168,112.242838,146.374374,12.304519,929.290376,0.048533,24309,5209
8,XGBoost,168,110.919487,145.054179,12.112122,430.301112,0.034753,24309,5209


In [42]:
# Persist the XGBoost tuning summary with joblib; the directory is created
# first if it does not exist yet.
# NOTE(review): no visible cell saves rf_tuned — confirm whether the Random
# Forest results should be persisted as well.
Path("models/metadata").mkdir(parents=True, exist_ok=True)
joblib.dump(xgb_tuned, "models/metadata/xgb_tuned_results.joblib")

['models/metadata/xgb_tuned_results.joblib']

In [40]:
# Persist the LightGBM tuning summary with joblib; the directory is created
# first if it does not exist yet.
# NOTE(review): this writes under artifacts/metadata, while the splits and
# the XGBoost results are written under models/metadata — confirm which
# directory downstream code reads from.
Path("artifacts/metadata").mkdir(parents=True, exist_ok=True)
joblib.dump(lgbm_tuned, "artifacts/metadata/lgbm_tuned_results.joblib")

['artifacts/metadata/lgbm_tuned_results.joblib']