# Рекурсивный прогноз с учетом погодных данных

В этом экспериментальном ноутбуке используется объединенный датасет `energy_weather_merged`,
который содержит энергопотребление (`Usage_kWh`) и погодные признаки (`wind_speed`, `T`, `f`).

Мы повторяем стратегию формирования признаков и рекурсивного прогноза из `experiment_recursive.ipynb`,
но дополняем модель погодными признаками.



In [2]:
# Импорты и загрузка energy_weather_merged
import sys
from pathlib import Path

# Добавляем корень проекта в путь (для ноутбуков в experiments/)
project_root = Path().resolve().parent.parent
sys.path.append(str(project_root))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 100)

# Location of the merged dataset (produced by energy_weather_merge.ipynb),
# given relative to the notebook's working directory.
merged_path = Path("..", "..", "data", "views", "energy_weather_merged.parquet")
print("Путь к energy_weather_merged:", merged_path)

df_merged = pd.read_parquet(merged_path)

# The rest of the notebook relies on a DatetimeIndex; convert if parquet
# did not hand one back.
if not isinstance(df_merged.index, pd.DatetimeIndex):
    df_merged.index = pd.to_datetime(df_merged.index)

# Chronological order, detached copy, and a stable index name.
df_merged = df_merged.sort_index().copy().rename_axis("DateTime")

print("Форма df_merged:", df_merged.shape)
df_merged.head()


Путь к energy_weather_merged: ../../data/views/energy_weather_merged.parquet
Форма df_merged: (35064, 4)


Unnamed: 0_level_0,Usage_kWh,wind_speed,T,f
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-01-01 00:00:00,570.685479,2.0,2.6,88.0
2017-01-01 01:00:00,604.642705,2.0,2.6,88.0
2017-01-01 02:00:00,518.732113,2.0,2.6,88.0
2017-01-01 03:00:00,608.188829,4.0,2.6,82.0
2017-01-01 04:00:00,714.140572,4.0,2.6,82.0


In [3]:
# Функция построения признаков для recursive one-step с погодными фичами
import numpy as np
import pandas as pd
from typing import Tuple


def build_features_recursive_step1_weather(
    df: pd.DataFrame,
    target_col: str = "Usage_kWh",
    freq: str = "h",
    lags: Tuple[int, ...] = (1, 2, 3, 4, 5, 6, 12, 24, 48, 72, 168),
    add_month_mean: bool = True,
    add_year_mean: bool = True,
    weather_cols: Tuple[str, ...] = ("wind_speed", "T", "f"),
) -> Tuple[pd.DataFrame, pd.Series]:
    """Build the one-step-ahead design matrix, including weather columns.

    Row ``t`` of ``X`` holds calendar flags of ``t``, optional running
    month/year means of the target, target lags ``y(t - lag)`` and the weather
    values observed at ``t``. The returned target is ``y(t + 1)``.

    ``df`` itself is never modified: a copy restricted to the target and the
    weather columns that exist in ``df`` is sorted and re-sampled to ``freq``.

    Returns:
        ``(X, y_next)`` restricted to rows where every feature and the target
        are non-missing.
    """
    # Regular time grid with only the columns we need.
    available = [name for name in weather_cols if name in df.columns]
    frame = df[[target_col] + available].copy()
    frame = frame.sort_index().asfreq(freq)

    y = frame[target_col].astype(np.float32)
    ts = frame.index

    # Calendar descriptors of timestamp t.
    X = pd.DataFrame(index=ts)
    X["hour"] = ts.hour.astype(np.int8)
    X["day_of_week"] = ts.dayofweek.astype(np.int8)
    X["is_weekend"] = (ts.dayofweek >= 5).astype(np.int8)
    X["peak"] = ((ts.hour >= 6) & (ts.hour <= 13)).astype(np.int8)
    X["peak_month"] = (ts.month.isin([1, 2, 3, 9, 10, 11, 12])).astype(np.int8)

    # Running mean inside each (year, month) group, pushed one row down so the
    # value stored at t never contains y(t).
    if add_month_mean:
        by_month = pd.DataFrame(
            {"y": y, "year": ts.year, "month": ts.month}, index=ts
        ).groupby(["year", "month"])["y"]
        running_month = by_month.expanding().mean().reset_index(level=[0, 1], drop=True)
        X["month_mean_hist"] = running_month.shift(1).astype(np.float32)

    # Same construction per calendar year.
    if add_year_mean:
        by_year = pd.DataFrame({"y": y, "year": ts.year}, index=ts).groupby("year")["y"]
        running_year = by_year.expanding().mean().reset_index(level=0, drop=True)
        X["year_mean_hist"] = running_year.shift(1).astype(np.float32)

    # Strictly past target values.
    for k in lags:
        X[f"lag_{k}"] = y.shift(k)

    # Weather as observed at t (no lagging).
    for name in weather_cols:
        if name in frame.columns:
            X[name] = frame[name].astype(np.float32)

    # One-step-ahead target.
    y_next = y.shift(-1)

    # Keep only fully observed rows.
    keep = ~X.isna().any(axis=1) & y_next.notna()
    X = X.loc[keep]
    y_next = y_next.loc[keep].astype(np.float32)

    # Down-cast any remaining float64 feature to float32.
    wide = X.select_dtypes(include=["float64"]).columns
    if len(wide) > 0:
        X = X.astype({name: np.float32 for name in wide})

    return X, y_next


In [4]:
# Вспомогательные функции: sMAPE, time_split_index и рекурсивный прогноз с погодой
import numpy as np
import pandas as pd
from typing import Tuple, Dict
from sklearn.metrics import mean_absolute_error, mean_squared_error
from lightgbm import LGBMRegressor


def smape(y_true: np.ndarray, y_pred: np.ndarray, eps: float = 1e-8) -> float:
    """Return the symmetric MAPE of ``y_pred`` against ``y_true`` in percent.

    ``eps`` is added to the denominator ``|y_true| + |y_pred|`` so that a pair
    of zeros does not divide by zero.
    """
    actual = np.asarray(y_true, dtype=float)
    forecast = np.asarray(y_pred, dtype=float)
    scale = (np.abs(actual) + np.abs(forecast) + eps)
    ratios = 2.0 * np.abs(forecast - actual) / scale
    return float(100.0 * np.mean(ratios))


def time_split_index(
    index: pd.DatetimeIndex,
    train_ratio: float = 0.7,
    val_ratio: float = 0.15,
) -> Tuple[pd.DatetimeIndex, pd.DatetimeIndex]:
    """Cut ``index`` positionally into a leading train part and a validation part.

    The first ``int(n * train_ratio)`` entries form the train index; the
    entries up to position ``int(n * (train_ratio + val_ratio))`` form the
    validation index. Whatever follows is not returned.
    """
    total = len(index)
    val_from = int(total * train_ratio)
    val_until = int(total * (train_ratio + val_ratio))
    return index[:val_from], index[val_from:val_until]


def _predict_recursive_h_steps_weather(
    model,
    full_df: pd.DataFrame,
    target_col: str,
    start_time: pd.Timestamp,
    h: int,
    lags: Tuple[int, ...],
    freq: str = "h",
    add_month_mean: bool = True,
    add_year_mean: bool = True,
    weather_cols: Tuple[str, ...] = ("wind_speed", "T", "f"),
) -> float:
    """Roll a one-step model forward ``h`` times and return the forecast for ``start_time + h``.

    At every step a single feature row is assembled for the current timestamp
    (calendar flags, running month/year means, target lags, weather), the model
    predicts the next timestamp, and that prediction is written into a private
    copy of the target series so later steps can use it as a lag.

    Weather values are read from ``full_df`` at the current timestamp, i.e.
    they are treated as known in advance.

    Args:
        model: Fitted estimator exposing ``predict`` on a one-row DataFrame.
        full_df: Frame indexed by time holding ``target_col`` and weather columns.
        target_col: Name of the target column.
        start_time: Timestamp of the first feature row.
        h: Number of one-step predictions to chain.
        lags: Target lags (in units of ``freq``) used as features.
        freq: Step unit passed to ``pd.to_timedelta``.
        add_month_mean: Include the ``month_mean_hist`` feature.
        add_year_mean: Include the ``year_mean_hist`` feature.
        weather_cols: Weather columns to use; names absent from ``full_df``
            are ignored.

    Returns:
        The value stored at ``start_time + h`` after ``h`` steps, or NaN as soon
        as any feature of a step is missing.
    """
    y_hist = full_df[target_col].astype(np.float32).copy()
    one_step = pd.to_timedelta(1, unit=freq)
    # Columns missing from the frame are skipped entirely, but a present
    # column must always produce a feature (possibly NaN) so the row keeps
    # the same width for every timestamp.
    known_weather = [col for col in weather_cols if col in full_df.columns]

    current_t = start_time

    for _ in range(h):
        idx = current_t
        feat = {}

        # Calendar features of the current timestamp.
        feat["hour"] = np.int8(idx.hour)
        feat["day_of_week"] = np.int8(idx.dayofweek)
        feat["is_weekend"] = np.int8(1 if idx.dayofweek >= 5 else 0)
        feat["peak"] = np.int8(1 if (idx.hour >= 6 and idx.hour <= 13) else 0)
        feat["peak_month"] = np.int8(1 if idx.month in [1, 2, 3, 9, 10, 11, 12] else 0)

        # History strictly before the current timestamp.
        prev_t = current_t - one_step
        y_past = y_hist.loc[:prev_t].dropna()

        # The training features are a per-group expanding mean shifted by one
        # row, so the value at t is the running mean of the month/year that
        # contains t-1. Group by prev_t (not by t) to reproduce that, which
        # also keeps the feature defined on the first hour of a month or year.
        if add_month_mean:
            in_month = (y_past.index.year == prev_t.year) & (y_past.index.month == prev_t.month)
            same_month = y_past[in_month]
            feat["month_mean_hist"] = np.float32(same_month.mean()) if len(same_month) else np.float32(np.nan)

        if add_year_mean:
            same_year = y_past[y_past.index.year == prev_t.year]
            feat["year_mean_hist"] = np.float32(same_year.mean()) if len(same_year) else np.float32(np.nan)

        # Target lags; timestamps after start_time hold earlier predictions.
        for lag in lags:
            lag_time = current_t - pd.to_timedelta(lag, unit=freq)
            feat[f"lag_{lag}"] = np.float32(y_hist.loc[lag_time]) if lag_time in y_hist.index else np.float32(np.nan)

        # Weather at the current timestamp; NaN when the timestamp is absent
        # so the step is rejected below instead of feeding a narrower row.
        has_row = idx in full_df.index
        for col in known_weather:
            feat[col] = np.float32(full_df.loc[idx, col]) if has_row else np.float32(np.nan)

        if any(pd.isna(v) for v in feat.values()):
            return np.nan

        X_t = pd.DataFrame([feat])
        y_next_pred = float(model.predict(X_t)[0])

        # Store the forecast so it can serve as a lag for later steps.
        next_t = current_t + one_step
        y_hist.loc[next_t] = np.float32(y_next_pred)
        current_t = next_t

    return float(y_hist.loc[current_t])


In [5]:
# Оценка стратегии recursive one-step с LGBM и погодными фичами

from typing import Tuple, Dict
from lightgbm import LGBMRegressor


def eval_recursive_one_step_strategy_val_only_weather(
    df: pd.DataFrame,
    target_col: str = "Usage_kWh",
    horizons: Tuple[int, ...] = (1, 24, 168),
    freq: str = "h",
    lags: Tuple[int, ...] = (1, 2, 3, 4, 5, 6, 12, 24, 48, 72, 168),
    train_ratio: float = 0.7,
    val_ratio: float = 0.15,
    model_params: Dict = None,
    n_val_points: int = 500,
    random_state: int = 42,
    weather_cols: Tuple[str, ...] = ("wind_speed", "T", "f"),
) -> pd.DataFrame:
    """Score the recursive one-step LightGBM forecaster (with weather) on validation.

    One LGBMRegressor is fitted on the train part of the one-step feature
    matrix. For each horizon ``h`` it is rolled forward ``h`` steps from the
    last ``n_val_points`` validation timestamps, and the forecast for ``t + h``
    is compared with the observed value. Only targets that fall inside the
    validation span are used.

    Args:
        df: Frame indexed by time with ``target_col`` and weather columns.
        target_col: Target column name.
        horizons: Forecast horizons in units of ``freq``.
        freq: Sampling frequency of the series.
        lags: Target lags used as features.
        train_ratio: Share of feature rows used for training.
        val_ratio: Share of feature rows used for validation.
        model_params: Keyword arguments for ``LGBMRegressor``; a built-in
            configuration is used when ``None``.
        n_val_points: Maximum number of trailing validation timestamps used as
            forecast origins.
        random_state: Seed passed to the model.
        weather_cols: Weather columns to include when present in ``df``.

    Returns:
        One row per horizon (sorted by horizon) with MAE, RMSE, sMAPE and
        bookkeeping columns. A horizon without any usable evaluation point
        reports NaN metrics and ``n_eval_points == 0``.
    """
    if model_params is None:
        model_params = dict(
            n_estimators=800,
            learning_rate=0.05,
            num_leaves=63,
            max_depth=-1,
            subsample=0.8,
            colsample_bytree=0.8,
        )

    # 1) One-step features and target.
    X, y_next = build_features_recursive_step1_weather(
        df=df,
        target_col=target_col,
        freq=freq,
        lags=lags,
        add_month_mean=True,
        add_year_mean=True,
        weather_cols=weather_cols,
    )

    # 2) Chronological split on the feature index.
    train_idx, val_idx = time_split_index(X.index, train_ratio=train_ratio, val_ratio=val_ratio)

    X_train = X.loc[train_idx]
    y_train = y_next.loc[train_idx]

    # Full regular-grid series (target + weather) used by the recursion.
    full_series = df[[target_col] + [c for c in weather_cols if c in df.columns]].copy()
    full_series = full_series.sort_index().asfreq(freq)

    val_start = val_idx.min()
    val_end = val_idx.max()

    candidate_starts = val_idx
    if len(candidate_starts) > n_val_points:
        candidate_starts = candidate_starts[-n_val_points:]

    model = LGBMRegressor(
        objective="regression",
        random_state=random_state,
        n_jobs=1,
        verbose=-1,
        **model_params,
    )
    model.fit(X_train, y_train)

    rows = []

    for h in horizons:
        y_true_list = []
        y_pred_list = []

        for t in candidate_starts:
            t_h = t + pd.to_timedelta(h, unit=freq)

            if t_h < val_start or t_h > val_end:
                continue

            # A gap in the observed series cannot be scored; skipping it here
            # also avoids handing NaN to the metric functions.
            true_val = float(full_series.loc[t_h, target_col])
            if np.isnan(true_val):
                continue

            pred = _predict_recursive_h_steps_weather(
                model=model,
                full_df=full_series,
                target_col=target_col,
                start_time=t,
                h=h,
                lags=lags,
                freq=freq,
                add_month_mean=True,
                add_year_mean=True,
                weather_cols=weather_cols,
            )

            if np.isnan(pred):
                continue

            y_true_list.append(true_val)
            y_pred_list.append(pred)

        y_true_arr = np.array(y_true_list, dtype=float)
        y_pred_arr = np.array(y_pred_list, dtype=float)

        if len(y_true_arr) > 0:
            mae = mean_absolute_error(y_true_arr, y_pred_arr)
            rmse = np.sqrt(mean_squared_error(y_true_arr, y_pred_arr))
            smape_pct = smape(y_true_arr, y_pred_arr)
        else:
            # The sklearn metrics reject empty input; report the horizon as
            # not evaluable instead of aborting the whole run.
            mae = rmse = smape_pct = float("nan")

        rows.append({
            "strategy": "recursive_one_step_weather",
            "model": "LGBM",
            "horizon": h,
            "val_MAE": mae,
            "val_RMSE": rmse,
            "val_sMAPE_pct": smape_pct,
            "n_eval_points": len(y_true_arr),
            "n_features": X.shape[1],
            "lags": list(lags),
        })

        print(
            f"[LGBM Recursive+Weather] h={h}: "
            f"val_MAE={mae:.2f}, val_RMSE={rmse:.2f}, val_sMAPE={smape_pct:.2f}% | n={len(y_true_arr)}"
        )

    return pd.DataFrame(rows).sort_values("horizon").reset_index(drop=True)


In [6]:
# Запуск эксперимента с LGBM и погодными признаками

# Evaluation grid: forecast horizons (hours ahead) and target lags.
horizons = (1, 24, 168)
lags = (1, 2, 3, 4, 5, 6, 12, 24, 48, 72, 168)

# Fit LightGBM on the train share and score the recursive forecasts on the
# last 500 validation timestamps.
results_recursive_weather = eval_recursive_one_step_strategy_val_only_weather(
    df_merged,
    "Usage_kWh",
    horizons,
    lags=lags,
    train_ratio=0.7,
    val_ratio=0.15,
    n_val_points=500,
)

results_recursive_weather


[LGBM Recursive+Weather] h=1: val_MAE=102.47, val_RMSE=135.72, val_sMAPE=12.05% | n=499
[LGBM Recursive+Weather] h=24: val_MAE=107.61, val_RMSE=142.43, val_sMAPE=12.61% | n=476
[LGBM Recursive+Weather] h=168: val_MAE=114.35, val_RMSE=152.82, val_sMAPE=13.23% | n=332


Unnamed: 0,strategy,model,horizon,val_MAE,val_RMSE,val_sMAPE_pct,n_eval_points,n_features,lags
0,recursive_one_step_weather,LGBM,1,102.472502,135.720494,12.04961,499,21,"[1, 2, 3, 4, 5, 6, 12, 24, 48, 72, 168]"
1,recursive_one_step_weather,LGBM,24,107.610883,142.43237,12.607691,476,21,"[1, 2, 3, 4, 5, 6, 12, 24, 48, 72, 168]"
2,recursive_one_step_weather,LGBM,168,114.350263,152.82444,13.231946,332,21,"[1, 2, 3, 4, 5, 6, 12, 24, 48, 72, 168]"


In [7]:
# Оценка стратегии recursive one-step с RandomForest и погодными фичами

from sklearn.ensemble import RandomForestRegressor
from typing import Tuple, Dict


def eval_recursive_one_step_strategy_val_only_weather_rf(
    df: pd.DataFrame,
    target_col: str = "Usage_kWh",
    horizons: Tuple[int, ...] = (1, 24, 168),
    freq: str = "h",
    lags: Tuple[int, ...] = (1, 2, 3, 4, 5, 6, 12, 24, 48, 72, 168),
    train_ratio: float = 0.7,
    val_ratio: float = 0.15,
    rf_params: Dict = None,
    n_val_points: int = 500,
    random_state: int = 42,
    weather_cols: Tuple[str, ...] = ("wind_speed", "T", "f"),
) -> pd.DataFrame:
    """Score the recursive one-step RandomForest forecaster (with weather) on validation.

    One RandomForestRegressor is fitted on the train part of the one-step
    feature matrix. For each horizon ``h`` it is rolled forward ``h`` steps
    from the last ``n_val_points`` validation timestamps, and the forecast for
    ``t + h`` is compared with the observed value. Only targets inside the
    validation span are used.

    Args:
        df: Frame indexed by time with ``target_col`` and weather columns.
        target_col: Target column name.
        horizons: Forecast horizons in units of ``freq``.
        freq: Sampling frequency of the series.
        lags: Target lags used as features.
        train_ratio: Share of feature rows used for training.
        val_ratio: Share of feature rows used for validation.
        rf_params: Keyword arguments for ``RandomForestRegressor``; a built-in
            configuration is used when ``None``.
        n_val_points: Maximum number of trailing validation timestamps used as
            forecast origins.
        random_state: Seed passed to the model.
        weather_cols: Weather columns to include when present in ``df``.

    Returns:
        One row per horizon (sorted by horizon) with MAE, RMSE, sMAPE, the
        parameters used and bookkeeping columns. A horizon without any usable
        evaluation point reports NaN metrics and ``n_eval_points == 0``.
    """
    if rf_params is None:
        rf_params = dict(
            n_estimators=500,
            max_depth=None,
            min_samples_split=2,
            min_samples_leaf=1,
            max_features="sqrt",
            bootstrap=True,
            n_jobs=-1,
        )

    # 1) One-step features and target.
    X, y_next = build_features_recursive_step1_weather(
        df=df,
        target_col=target_col,
        freq=freq,
        lags=lags,
        add_month_mean=True,
        add_year_mean=True,
        weather_cols=weather_cols,
    )

    # 2) Chronological split on the feature index.
    train_idx, val_idx = time_split_index(X.index, train_ratio=train_ratio, val_ratio=val_ratio)

    X_train = X.loc[train_idx]
    y_train = y_next.loc[train_idx]

    # Full regular-grid series (target + weather) used by the recursion.
    full_series = df[[target_col] + [c for c in weather_cols if c in df.columns]].copy()
    full_series = full_series.sort_index().asfreq(freq)

    val_start = val_idx.min()
    val_end = val_idx.max()

    candidate_starts = val_idx
    if len(candidate_starts) > n_val_points:
        candidate_starts = candidate_starts[-n_val_points:]

    model = RandomForestRegressor(
        random_state=random_state,
        **rf_params,
    )
    model.fit(X_train, y_train)

    rows = []

    for h in horizons:
        y_true_list = []
        y_pred_list = []

        for t in candidate_starts:
            t_h = t + pd.to_timedelta(h, unit=freq)

            if t_h < val_start or t_h > val_end:
                continue

            # A gap in the observed series cannot be scored; skipping it here
            # also avoids handing NaN to the metric functions.
            true_val = float(full_series.loc[t_h, target_col])
            if np.isnan(true_val):
                continue

            pred = _predict_recursive_h_steps_weather(
                model=model,
                full_df=full_series,
                target_col=target_col,
                start_time=t,
                h=h,
                lags=lags,
                freq=freq,
                add_month_mean=True,
                add_year_mean=True,
                weather_cols=weather_cols,
            )

            if np.isnan(pred):
                continue

            y_true_list.append(true_val)
            y_pred_list.append(pred)

        y_true_arr = np.array(y_true_list, dtype=float)
        y_pred_arr = np.array(y_pred_list, dtype=float)

        if len(y_true_arr) > 0:
            mae = mean_absolute_error(y_true_arr, y_pred_arr)
            rmse = np.sqrt(mean_squared_error(y_true_arr, y_pred_arr))
            smape_pct = smape(y_true_arr, y_pred_arr)
        else:
            # The sklearn metrics reject empty input; report the horizon as
            # not evaluable instead of aborting the whole run.
            mae = rmse = smape_pct = float("nan")

        rows.append({
            "strategy": "recursive_one_step_weather",
            "model": "RandomForest",
            "horizon": h,
            "val_MAE": mae,
            "val_RMSE": rmse,
            "val_sMAPE_pct": smape_pct,
            "n_eval_points": len(y_true_arr),
            "n_features": X.shape[1],
            "lags": list(lags),
            "rf_params": dict(rf_params),
        })

        print(
            f"[RF Recursive+Weather] h={h}: "
            f"val_MAE={mae:.2f}, val_RMSE={rmse:.2f}, val_sMAPE={smape_pct:.2f}% | n={len(y_true_arr)}"
        )

    return pd.DataFrame(rows).sort_values("horizon").reset_index(drop=True)


In [8]:
# Запуск эксперимента с RandomForest и погодными признаками

# Evaluation grid: forecast horizons (hours ahead) and target lags.
horizons = (1, 24, 168)
lags = (1, 2, 3, 4, 5, 6, 12, 24, 48, 72, 168)

# Fit the random forest on the train share and score the recursive forecasts
# on the last 500 validation timestamps.
results_recursive_weather_rf = eval_recursive_one_step_strategy_val_only_weather_rf(
    df_merged,
    "Usage_kWh",
    horizons,
    lags=lags,
    train_ratio=0.7,
    val_ratio=0.15,
    n_val_points=500,
)

results_recursive_weather_rf


[RF Recursive+Weather] h=1: val_MAE=102.60, val_RMSE=135.33, val_sMAPE=12.08% | n=499
[RF Recursive+Weather] h=24: val_MAE=108.01, val_RMSE=143.32, val_sMAPE=12.63% | n=476
[RF Recursive+Weather] h=168: val_MAE=113.50, val_RMSE=152.16, val_sMAPE=13.16% | n=332


Unnamed: 0,strategy,model,horizon,val_MAE,val_RMSE,val_sMAPE_pct,n_eval_points,n_features,lags,rf_params
0,recursive_one_step_weather,RandomForest,1,102.598092,135.329738,12.080915,499,21,"[1, 2, 3, 4, 5, 6, 12, 24, 48, 72, 168]","{'n_estimators': 500, 'max_depth': None, 'min_..."
1,recursive_one_step_weather,RandomForest,24,108.007429,143.3234,12.626402,476,21,"[1, 2, 3, 4, 5, 6, 12, 24, 48, 72, 168]","{'n_estimators': 500, 'max_depth': None, 'min_..."
2,recursive_one_step_weather,RandomForest,168,113.503348,152.158806,13.162466,332,21,"[1, 2, 3, 4, 5, 6, 12, 24, 48, 72, 168]","{'n_estimators': 500, 'max_depth': None, 'min_..."


In [9]:
# Оценка стратегии recursive one-step с XGBoost и погодными фичами

from xgboost import XGBRegressor
from typing import Tuple, Dict


def eval_recursive_one_step_strategy_val_only_weather_xgb(
    df: pd.DataFrame,
    target_col: str = "Usage_kWh",
    horizons: Tuple[int, ...] = (1, 24, 168),
    freq: str = "h",
    lags: Tuple[int, ...] = (1, 2, 3, 4, 5, 6, 12, 24, 48, 72, 168),
    train_ratio: float = 0.7,
    val_ratio: float = 0.15,
    xgb_params: Dict = None,
    n_val_points: int = 500,
    random_state: int = 42,
    weather_cols: Tuple[str, ...] = ("wind_speed", "T", "f"),
) -> pd.DataFrame:
    """Score the recursive one-step XGBoost forecaster (with weather) on validation.

    One XGBRegressor is fitted on the train part of the one-step feature
    matrix. For each horizon ``h`` it is rolled forward ``h`` steps from the
    last ``n_val_points`` validation timestamps, and the forecast for ``t + h``
    is compared with the observed value. Only targets inside the validation
    span are used.

    Args:
        df: Frame indexed by time with ``target_col`` and weather columns.
        target_col: Target column name.
        horizons: Forecast horizons in units of ``freq``.
        freq: Sampling frequency of the series.
        lags: Target lags used as features.
        train_ratio: Share of feature rows used for training.
        val_ratio: Share of feature rows used for validation.
        xgb_params: Keyword arguments for ``XGBRegressor``; a built-in
            configuration is used when ``None``.
        n_val_points: Maximum number of trailing validation timestamps used as
            forecast origins.
        random_state: Seed passed to the model.
        weather_cols: Weather columns to include when present in ``df``.

    Returns:
        One row per horizon (sorted by horizon) with MAE, RMSE, sMAPE, the
        parameters used and bookkeeping columns. A horizon without any usable
        evaluation point reports NaN metrics and ``n_eval_points == 0``.
    """
    if xgb_params is None:
        xgb_params = dict(
            n_estimators=800,
            learning_rate=0.05,
            max_depth=8,
            subsample=0.8,
            colsample_bytree=0.8,
            objective="reg:squarederror",
            tree_method="hist",
        )

    # 1) One-step features and target.
    X, y_next = build_features_recursive_step1_weather(
        df=df,
        target_col=target_col,
        freq=freq,
        lags=lags,
        add_month_mean=True,
        add_year_mean=True,
        weather_cols=weather_cols,
    )

    # 2) Chronological split on the feature index.
    train_idx, val_idx = time_split_index(X.index, train_ratio=train_ratio, val_ratio=val_ratio)

    X_train = X.loc[train_idx]
    y_train = y_next.loc[train_idx]

    # Full regular-grid series (target + weather) used by the recursion.
    full_series = df[[target_col] + [c for c in weather_cols if c in df.columns]].copy()
    full_series = full_series.sort_index().asfreq(freq)

    val_start = val_idx.min()
    val_end = val_idx.max()

    candidate_starts = val_idx
    if len(candidate_starts) > n_val_points:
        candidate_starts = candidate_starts[-n_val_points:]

    model = XGBRegressor(
        random_state=random_state,
        **xgb_params,
    )
    model.fit(X_train, y_train)

    rows = []

    for h in horizons:
        y_true_list = []
        y_pred_list = []

        for t in candidate_starts:
            t_h = t + pd.to_timedelta(h, unit=freq)

            if t_h < val_start or t_h > val_end:
                continue

            # A gap in the observed series cannot be scored; skipping it here
            # also avoids handing NaN to the metric functions.
            true_val = float(full_series.loc[t_h, target_col])
            if np.isnan(true_val):
                continue

            pred = _predict_recursive_h_steps_weather(
                model=model,
                full_df=full_series,
                target_col=target_col,
                start_time=t,
                h=h,
                lags=lags,
                freq=freq,
                add_month_mean=True,
                add_year_mean=True,
                weather_cols=weather_cols,
            )

            if np.isnan(pred):
                continue

            y_true_list.append(true_val)
            y_pred_list.append(pred)

        y_true_arr = np.array(y_true_list, dtype=float)
        y_pred_arr = np.array(y_pred_list, dtype=float)

        if len(y_true_arr) > 0:
            mae = mean_absolute_error(y_true_arr, y_pred_arr)
            rmse = np.sqrt(mean_squared_error(y_true_arr, y_pred_arr))
            smape_pct = smape(y_true_arr, y_pred_arr)
        else:
            # The sklearn metrics reject empty input; report the horizon as
            # not evaluable instead of aborting the whole run.
            mae = rmse = smape_pct = float("nan")

        rows.append({
            "strategy": "recursive_one_step_weather",
            "model": "XGBoost",
            "horizon": h,
            "val_MAE": mae,
            "val_RMSE": rmse,
            "val_sMAPE_pct": smape_pct,
            "n_eval_points": len(y_true_arr),
            "n_features": X.shape[1],
            "lags": list(lags),
            "xgb_params": dict(xgb_params),
        })

        print(
            f"[XGB Recursive+Weather] h={h}: "
            f"val_MAE={mae:.2f}, val_RMSE={rmse:.2f}, val_sMAPE={smape_pct:.2f}% | n={len(y_true_arr)}"
        )

    return pd.DataFrame(rows).sort_values("horizon").reset_index(drop=True)


In [10]:
# Запуск эксперимента с XGBoost и погодными признаками

# Evaluation grid: forecast horizons (hours ahead) and target lags.
horizons = (1, 24, 168)
lags = (1, 2, 3, 4, 5, 6, 12, 24, 48, 72, 168)

# Fit XGBoost on the train share and score the recursive forecasts on the
# last 500 validation timestamps.
results_recursive_weather_xgb = eval_recursive_one_step_strategy_val_only_weather_xgb(
    df_merged,
    "Usage_kWh",
    horizons,
    lags=lags,
    train_ratio=0.7,
    val_ratio=0.15,
    n_val_points=500,
)

results_recursive_weather_xgb


[XGB Recursive+Weather] h=1: val_MAE=103.31, val_RMSE=136.39, val_sMAPE=12.09% | n=499
[XGB Recursive+Weather] h=24: val_MAE=112.31, val_RMSE=148.27, val_sMAPE=13.04% | n=476
[XGB Recursive+Weather] h=168: val_MAE=115.21, val_RMSE=154.22, val_sMAPE=13.36% | n=332


Unnamed: 0,strategy,model,horizon,val_MAE,val_RMSE,val_sMAPE_pct,n_eval_points,n_features,lags,xgb_params
0,recursive_one_step_weather,XGBoost,1,103.309938,136.38548,12.094483,499,21,"[1, 2, 3, 4, 5, 6, 12, 24, 48, 72, 168]","{'n_estimators': 800, 'learning_rate': 0.05, '..."
1,recursive_one_step_weather,XGBoost,24,112.313172,148.270317,13.038686,476,21,"[1, 2, 3, 4, 5, 6, 12, 24, 48, 72, 168]","{'n_estimators': 800, 'learning_rate': 0.05, '..."
2,recursive_one_step_weather,XGBoost,168,115.210121,154.221657,13.360756,332,21,"[1, 2, 3, 4, 5, 6, 12, 24, 48, 72, 168]","{'n_estimators': 800, 'learning_rate': 0.05, '..."


In [21]:
import numpy as np
import pandas as pd

from typing import Tuple, Dict
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error


# =========================================================
# 1) sMAPE (как и раньше)
# =========================================================
def smape(y_true: np.ndarray, y_pred: np.ndarray, eps: float = 1e-8) -> float:
    """Symmetric mean absolute percentage error, expressed in percent.

    Each term is ``2 * |pred - true| / (|true| + |pred| + eps)``; ``eps``
    keeps the denominator non-zero when both values are zero.
    """
    truth = np.asarray(y_true, dtype=float)
    guess = np.asarray(y_pred, dtype=float)
    denominator = (np.abs(truth) + np.abs(guess) + eps)
    per_point = 2.0 * np.abs(guess - truth) / denominator
    return float(100.0 * np.mean(per_point))


# =========================================================
# 2) Фичи для direct strategy + погода
#    (X(t), y(t) и targets для горизонтов: y(t+h))
# =========================================================
def build_features_X_direct_with_weather(
    df: pd.DataFrame,
    target_col: str = "Usage_kWh",
    freq: str = "h",
    lags: Tuple[int, ...] = (1, 2, 3, 4, 5, 6, 12, 24, 48, 72, 168),
    weather_cols: Tuple[str, ...] = ("wind_speed", "T", "f"),
    add_month_mean: bool = True,
    add_year_mean: bool = True,
) -> Tuple[pd.DataFrame, pd.Series]:
    """Build the feature matrix ``X(t)`` for the direct strategy and return it with ``y(t)``.

    Features per row: calendar flags, optional running month/year means of the
    target, target lags and the weather values at ``t``.

    Leakage notes:
      - the running means are expanding means shifted by one row, so the value
        at ``t`` is built from earlier target values only;
      - lags look strictly backwards;
      - weather is used as an exogenous input at ``t`` (valid only if it is
        really known at that time).

    Returns:
        ``(X, y)`` restricted to rows where all features and ``y(t)`` are present.
    """
    selected = [target_col] + [name for name in weather_cols if name in df.columns]
    grid = df[selected].copy().sort_index().asfreq(freq)

    y = grid[target_col].astype(np.float32)
    stamps = grid.index

    # Calendar flags.
    X = pd.DataFrame(index=stamps)
    X["hour"] = stamps.hour.astype(np.int8)
    X["day_of_week"] = stamps.dayofweek.astype(np.int8)
    X["is_weekend"] = (stamps.dayofweek >= 5).astype(np.int8)
    X["peak"] = ((stamps.hour >= 6) & (stamps.hour <= 13)).astype(np.int8)
    X["peak_month"] = (stamps.month.isin([1, 2, 3, 9, 10, 11, 12])).astype(np.int8)

    # Running mean per (year, month), shifted one row to exclude y(t).
    if add_month_mean:
        month_table = pd.DataFrame(
            {"y": y, "year": stamps.year, "month": stamps.month}, index=stamps
        )
        month_running = (
            month_table.groupby(["year", "month"])["y"].expanding().mean()
        ).reset_index(level=[0, 1], drop=True)
        X["month_mean_hist"] = month_running.shift(1).astype(np.float32)

    # Running mean per year, shifted one row to exclude y(t).
    if add_year_mean:
        year_table = pd.DataFrame({"y": y, "year": stamps.year}, index=stamps)
        year_running = (
            year_table.groupby("year")["y"].expanding().mean()
        ).reset_index(level=0, drop=True)
        X["year_mean_hist"] = year_running.shift(1).astype(np.float32)

    # Target lags.
    X = X.assign(**{f"lag_{k}": y.shift(k) for k in lags})

    # Weather at t; non-numeric entries become NaN.
    for name in weather_cols:
        if name in grid.columns:
            X[name] = pd.to_numeric(grid[name], errors="coerce").astype(np.float32)

    # Keep rows with complete features and an observed y(t).
    complete = ~X.isna().any(axis=1) & y.notna()
    X = X.loc[complete]
    y = y.loc[complete]

    # Down-cast any float64 feature to float32 to save memory.
    wide = X.select_dtypes(include=["float64"]).columns
    if len(wide) > 0:
        X = X.astype({name: np.float32 for name in wide})

    return X, y


def build_targets_multi_horizon(y: pd.Series, horizons: Tuple[int, ...] = (1, 24, 168)) -> dict:
    """Create the shifted target and its validity mask for every horizon.

    For each ``h`` the result holds ``{"y": y(t+h), "mask": ...}`` where the
    mask is True on rows that have both ``y(t)`` and ``y(t+h)``.
    """
    def _entry(step: int) -> dict:
        future = y.shift(-step)
        return {"y": future, "mask": y.notna() & future.notna()}

    return {step: _entry(step) for step in horizons}


# =========================================================
# 3) Сплит по времени для каждого горизонта (train/val/test)
#    (логика "одинаковые правила" для всех моделей)
# =========================================================
def time_split_Xy(
    X: pd.DataFrame,
    y: pd.Series,
    train_ratio: float = 0.7,
    val_ratio: float = 0.15
):
    """Split ``X`` and ``y`` positionally into train, validation and test parts.

    The first ``int(n * train_ratio)`` rows are train, the next
    ``int(n * val_ratio)`` rows are validation, and the remainder is test.

    Returns:
        ``(X_train, y_train, X_val, y_val, X_test, y_test)``.
    """
    n_rows = len(X)
    val_from = int(n_rows * train_ratio)
    test_from = val_from + int(n_rows * val_ratio)

    train_part = slice(None, val_from)
    val_part = slice(val_from, test_from)
    test_part = slice(test_from, None)

    return (
        X.iloc[train_part], y.iloc[train_part],
        X.iloc[val_part], y.iloc[val_part],
        X.iloc[test_part], y.iloc[test_part],
    )


def prepare_splits_multi_horizon_direct(
    X: pd.DataFrame,
    targets: dict,
    horizons: Tuple[int, ...] = (1, 24, 168),
    train_ratio: float = 0.7,
    val_ratio: float = 0.15,
) -> dict:
    """Prepare per-horizon train/val/test splits for the direct strategy.

    For every horizon the rows flagged by ``targets[h]["mask"]`` are selected
    from ``X`` and from ``targets[h]["y"]`` and then cut chronologically with
    ``time_split_Xy``.

    Returns:
        ``{h: (X_train, y_train, X_val, y_val, X_test, y_test)}``.
    """
    def _split_for(step: int):
        usable = targets[step]["mask"]
        return time_split_Xy(
            X.loc[usable],
            targets[step]["y"].loc[usable],
            train_ratio=train_ratio,
            val_ratio=val_ratio,
        )

    return {step: _split_for(step) for step in horizons}




In [23]:
# =========================================================
# 4) Твой direct RF evaluator (добавили sMAPE; остальное как у тебя)
# =========================================================
def eval_rf_multi_horizon_from_splits(
    splits: dict,
    horizons: Tuple[int, ...] = (1, 24, 168),
    rf_params: Dict = None
):
    """Train one RandomForest per horizon (direct strategy) and score it on validation.

    Args:
        splits: ``{h: (X_train, y_train, X_val, y_val, X_test, y_test)}``.
        horizons: Horizons to evaluate; each must be a key of ``splits``.
        rf_params: Keyword arguments for ``RandomForestRegressor``; defaults to
            300 trees, ``random_state=42`` and ``n_jobs=-1`` when ``None``.

    Returns:
        ``{h: {...}}`` with validation MAE / RMSE / sMAPE, the split sizes and
        the fitted model. A summary line is printed for each horizon.
    """
    if rf_params is None:
        rf_params = {"n_estimators": 300, "random_state": 42, "n_jobs": -1}

    results = {}

    for h in horizons:
        X_train, y_train, X_val, y_val, X_test, _ = splits[h]

        model = RandomForestRegressor(**rf_params)
        model.fit(X_train, y_train)

        val_forecast = model.predict(X_val)

        mae = mean_absolute_error(y_val, val_forecast)
        rmse = float(np.sqrt(mean_squared_error(y_val, val_forecast)))
        smape_pct = smape(np.asarray(y_val), np.asarray(val_forecast))

        results[h] = dict(
            val_MAE=mae,
            val_RMSE=rmse,
            val_sMAPE_pct=smape_pct,
            n_train=len(X_train),
            n_val=len(X_val),
            n_test=len(X_test),
            model=model,
        )

        print(f"[RF direct | h={h}] val_MAE={mae:.2f}, val_RMSE={rmse:.2f}, val_sMAPE={smape_pct:.2f}%")

    return results


# =========================================================
# 5) Example run of the direct strategy on the merged dataset df_merged
# =========================================================
# df_merged:
# index=DateTime (hourly)
# columns: Usage_kWh, wind_speed, T, f

horizons = (1, 24, 168)
lags = (1, 2, 3, 4, 5, 6, 12, 24, 48, 72, 168)

# Features X(t) and the aligned target y(t).
X, y = build_features_X_direct_with_weather(
    df_merged,
    target_col="Usage_kWh",
    freq="h",
    lags=lags,
    weather_cols=("wind_speed", "T", "f"),
    add_month_mean=True,
    add_year_mean=True,
)

# Shifted targets y(t+h) with validity masks, one entry per horizon.
targets = build_targets_multi_horizon(y, horizons=horizons)

# Chronological train/val/test split for every horizon.
splits = prepare_splits_multi_horizon_direct(
    X,
    targets,
    horizons=horizons,
    train_ratio=0.7,
    val_ratio=0.15,
)

# One random forest per horizon, scored on the validation part.
rf_scores_direct_weather = eval_rf_multi_horizon_from_splits(
    splits,
    horizons=horizons,
    rf_params={"n_estimators": 300, "random_state": 42, "n_jobs": -1},
)

rf_scores_direct_weather

[RF direct | h=1] val_MAE=107.88, val_RMSE=141.90, val_sMAPE=11.84%
[RF direct | h=24] val_MAE=112.03, val_RMSE=145.66, val_sMAPE=12.28%
[RF direct | h=168] val_MAE=113.94, val_RMSE=148.13, val_sMAPE=12.54%


{1: {'val_MAE': 107.87974614801384,
  'val_RMSE': 141.90375820467543,
  'val_sMAPE_pct': 11.843141266963558,
  'n_train': 24426,
  'n_val': 5234,
  'n_test': 5235,
  'model': RandomForestRegressor(n_estimators=300, n_jobs=-1, random_state=42)},
 24: {'val_MAE': 112.02723767712254,
  'val_RMSE': 145.65594165927507,
  'val_sMAPE_pct': 12.282798605795273,
  'n_train': 24410,
  'n_val': 5230,
  'n_test': 5232,
  'model': RandomForestRegressor(n_estimators=300, n_jobs=-1, random_state=42)},
 168: {'val_MAE': 113.93871936398,
  'val_RMSE': 148.1269865863092,
  'val_sMAPE_pct': 12.538410383484832,
  'n_train': 24309,
  'n_val': 5209,
  'n_test': 5210,
  'model': RandomForestRegressor(n_estimators=300, n_jobs=-1, random_state=42)}}

In [25]:
# Feature importances of the direct model fitted for the h=168 horizon
importances = rf_scores_direct_weather[168]['model'].feature_importances_
feature_names = X.columns

# Pair each feature with its importance and keep the ten largest
# (sorting on the negated score keeps ties in column order)
top_features = sorted(
    zip(feature_names, importances),
    key=lambda pair: -pair[1],
)[:10]

In [27]:
# Display the ten (feature, importance) pairs with the highest importance.
top_features

[('lag_24', np.float64(0.3310471264428993)),
 ('hour', np.float64(0.2329976991386441)),
 ('lag_72', np.float64(0.13568888310346097)),
 ('lag_48', np.float64(0.0971242522877716)),
 ('lag_168', np.float64(0.04677975192118623)),
 ('month_mean_hist', np.float64(0.017460420900635973)),
 ('T', np.float64(0.014482685040253662)),
 ('lag_1', np.float64(0.014289209256894227)),
 ('year_mean_hist', np.float64(0.014022229929382969)),
 ('lag_5', np.float64(0.01310545146211853))]