In [13]:
# Imports and loading of the energy_weather_merged dataset
import sys
from pathlib import Path

# Make the directory two levels above the working directory importable
# (intended for notebooks that live in experiments/).
# The membership check keeps the cell idempotent: re-running it does not
# append the same entry to sys.path again.
project_root = Path().resolve().parent.parent
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 100)

# Path to the merged dataset (per the original note, produced by energy_weather_merge.ipynb).
# The path is relative to the current working directory.
merged_path = Path("..") / ".." / "data" / "views" / "energy_weather_merged.parquet"
print("Путь к energy_weather_merged:", merged_path)

df_merged = pd.read_parquet(merged_path)

# Make sure the index is a DatetimeIndex; convert it if it is not
if not isinstance(df_merged.index, pd.DatetimeIndex):
    df_merged.index = pd.to_datetime(df_merged.index)

# Chronological order is required by the time-based split and lag features below
df_merged = df_merged.sort_index().copy()
df_merged.index.name = "DateTime"

print("Форма df_merged:", df_merged.shape)
df_merged.head()


Путь к energy_weather_merged: ../../data/views/energy_weather_merged.parquet
Форма df_merged: (35064, 4)


Unnamed: 0_level_0,Usage_kWh,wind_speed,T,f
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-01-01 00:00:00,570.685479,2.0,2.6,88.0
2017-01-01 01:00:00,604.642705,2.0,2.6,88.0
2017-01-01 02:00:00,518.732113,2.0,2.6,88.0
2017-01-01 03:00:00,608.188829,4.0,2.6,82.0
2017-01-01 04:00:00,714.140572,4.0,2.6,82.0


In [15]:
import numpy as np
import pandas as pd

from typing import Tuple, Dict, Iterable
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error


# =========================================================
# 1) Метрика sMAPE (опционально, но полезно для сравнения)
# =========================================================
def smape(y_true: np.ndarray, y_pred: np.ndarray, eps: float = 1e-8) -> float:
    """
    Symmetric mean absolute percentage error, in percent.

    Every absolute error is doubled and divided by
    ``|y_true| + |y_pred| + eps``; ``eps`` keeps the ratio finite when both
    the actual and the predicted value are zero.

    Args:
        y_true: actual values (anything ``np.asarray`` can turn into floats).
        y_pred: predicted values, same length as ``y_true``.
        eps: small constant added to the denominator.

    Returns:
        The mean of the per-point ratios multiplied by 100, as a Python float.
    """
    actual = np.asarray(y_true, dtype=float)
    forecast = np.asarray(y_pred, dtype=float)

    abs_error = np.abs(forecast - actual)
    scale = np.abs(actual) + np.abs(forecast) + eps
    per_point = 2.0 * abs_error / scale

    return float(100.0 * np.mean(per_point))


# =========================================================
# 2) Деление по времени: train/val (без test для эксперимента)
# =========================================================
def time_split_index(
    index: pd.DatetimeIndex,
    train_ratio: float = 0.7,
    val_ratio: float = 0.15
) -> Tuple[pd.DatetimeIndex, pd.DatetimeIndex]:
    """
    Split an index into consecutive train and validation parts by position.

    The first ``int(n * train_ratio)`` entries form the train part and the
    entries up to ``int(n * (train_ratio + val_ratio))`` form the validation
    part. Whatever is left after that is not returned.

    Args:
        index: index to split; it is sliced in its existing order.
        train_ratio: share of entries assigned to train.
        val_ratio: share of entries assigned to validation.

    Returns:
        ``(train_idx, val_idx)`` as two slices of ``index``.
    """
    total = len(index)
    cut_train = int(total * train_ratio)
    cut_val = int(total * (train_ratio + val_ratio))
    return index[:cut_train], index[cut_train:cut_val]


# =========================================================
# 3) Фичи + one-step таргет (y(t+1)) для стратегии recursive
# =========================================================
def build_features_recursive_step1_calendar_lags(
    df: pd.DataFrame,
    target_col: str = "Usage_kWh",
    freq: str = "h",
    lags: Tuple[int, ...] = (1, 2, 3, 4, 5, 6, 12, 24, 48, 72, 168),
    add_month_mean: bool = True,
    add_year_mean: bool = True,
) -> Tuple[pd.DataFrame, pd.Series]:
    """
    Assemble the feature matrix X(t) and the one-step target y(t+1) used by
    the recursive (one step at a time) forecasting strategy.

    Columns of X, in this order:
      - hour, day_of_week, is_weekend
      - peak (hours 6..13), peak_month (months 1-3 and 9-12)
      - month_mean_hist / year_mean_hist (optional): running mean of the target
        inside the calendar (year, month) / year group, moved one row down the
        whole series. y(t) therefore never enters X(t), and the first row of a
        new month/year carries the final mean of the previous group.
      - lag_k = y(t-k) for every k in ``lags``

    Args:
        df: frame indexed by time; only ``target_col`` is read.
        target_col: name of the target column.
        freq: frequency string passed to ``asfreq`` to put the series on a regular grid.
        lags: lag sizes, in rows of the regular grid.
        add_month_mean: include ``month_mean_hist``.
        add_year_mean: include ``year_mean_hist``.

    Returns:
        (X, y_next): features and the target shifted by -1, restricted to the
        rows where every feature and the target are non-NaN.
    """
    # Sort and reindex to a regular grid, then keep the target as float32
    target = df[[target_col]].copy().sort_index().asfreq(freq)[target_col].astype(np.float32)
    stamps = target.index

    # --- calendar features -------------------------------------------------
    hours = stamps.hour
    weekdays = stamps.dayofweek

    X = pd.DataFrame(index=stamps)
    X["hour"] = hours.astype(np.int8)
    X["day_of_week"] = weekdays.astype(np.int8)
    X["is_weekend"] = (weekdays >= 5).astype(np.int8)
    X["peak"] = ((hours >= 6) & (hours <= 13)).astype(np.int8)
    X["peak_month"] = (stamps.month.isin([1, 2, 3, 9, 10, 11, 12])).astype(np.int8)

    # --- running group means, shifted so that y(t) is excluded from X(t) ---
    if add_month_mean:
        by_month = pd.DataFrame(
            {"y": target, "year": stamps.year, "month": stamps.month},
            index=stamps,
        )
        running_month = by_month.groupby(["year", "month"])["y"].expanding().mean()
        running_month = running_month.reset_index(level=[0, 1], drop=True)
        X["month_mean_hist"] = running_month.shift(1).astype(np.float32)

    if add_year_mean:
        by_year = pd.DataFrame({"y": target, "year": stamps.year}, index=stamps)
        running_year = by_year.groupby("year")["y"].expanding().mean()
        running_year = running_year.reset_index(level=0, drop=True)
        X["year_mean_hist"] = running_year.shift(1).astype(np.float32)

    # --- lagged target values ----------------------------------------------
    for k in lags:
        X[f"lag_{k}"] = target.shift(k)

    # --- one-step-ahead target ---------------------------------------------
    y_next = target.shift(-1)

    # Keep only rows where all features and the target are present
    keep = X.notna().all(axis=1) & y_next.notna()
    X = X.loc[keep]
    y_next = y_next.loc[keep].astype(np.float32)

    # Downcast any remaining float64 columns to save memory
    wide_cols = X.select_dtypes(include=["float64"]).columns
    if len(wide_cols) > 0:
        X[wide_cols] = X[wide_cols].astype(np.float32)

    return X, y_next



In [17]:
# =========================================================
# 4) Рекурсивный прогноз на h шагов (one-step модель)
# =========================================================
def predict_recursive_h_steps(
    model,
    y_hist: pd.Series,
    start_time: pd.Timestamp,
    h: int,
    freq: str,
    lags: Tuple[int, ...],
    add_month_mean: bool = True,
    add_year_mean: bool = True,
) -> float:
    """
    Recursive forecast ``h`` steps ahead of ``start_time`` with a one-step model.

    At every step the function:
      1. builds the feature row X(t) for the current timestamp,
      2. predicts y(t+1) with ``model.predict``,
      3. writes the prediction into a private copy of the history,
      4. moves one step forward.

    The feature row has the same columns, in the same order, as the matrix
    produced by ``build_features_recursive_step1_calendar_lags``.

    Historical means are computed over the calendar month / year of the
    PREVIOUS timestamp (t - 1 step), using values up to and including it.
    This mirrors the training features, where the expanding group mean is
    shifted by one row across the whole series: the first hour of a new
    month/year carries the final mean of the previous month/year. Grouping by
    the current timestamp instead would leave that first hour without any
    history and turn every forecast that crosses a month boundary into NaN.

    Args:
        model: fitted estimator exposing ``predict(DataFrame) -> array-like``.
        y_hist: target series indexed by time; it is copied, never modified.
            Values after ``start_time`` are overwritten by predictions in the
            copy before they can be read as lags.
        start_time: timestamp t of the first feature row.
        h: number of one-step predictions to chain.
        freq: time unit of one step, passed to ``pd.to_timedelta(..., unit=freq)``.
        lags: lag sizes (in steps) used as ``lag_k`` features.
        add_month_mean: include ``month_mean_hist``.
        add_year_mean: include ``year_mean_hist``.

    Returns:
        The prediction for ``start_time + h`` steps, or NaN as soon as any
        feature of a step cannot be computed (missing lag or empty history).
    """
    y_buf = y_hist.astype(np.float32).copy()

    # Loop-invariant time deltas, computed once
    step = pd.to_timedelta(1, unit=freq)
    lag_offsets = [(f"lag_{lag}", pd.to_timedelta(lag, unit=freq)) for lag in lags]

    current_t = start_time

    for _ in range(h):
        feat = {}

        # calendar features of the current timestamp
        feat["hour"] = np.int8(current_t.hour)
        feat["day_of_week"] = np.int8(current_t.dayofweek)
        feat["is_weekend"] = np.int8(1 if current_t.dayofweek >= 5 else 0)
        feat["peak"] = np.int8(1 if (current_t.hour >= 6 and current_t.hour <= 13) else 0)
        feat["peak_month"] = np.int8(1 if current_t.month in [1, 2, 3, 9, 10, 11, 12] else 0)

        # leakage-free historical means: only values up to t - 1 step are used,
        # and the calendar group is the one the previous timestamp belongs to
        prev_t = current_t - step
        if add_month_mean or add_year_mean:
            y_past = y_buf.loc[:prev_t].dropna()
            in_prev_year = y_past.index.year == prev_t.year

        if add_month_mean:
            same_month = y_past[in_prev_year & (y_past.index.month == prev_t.month)]
            feat["month_mean_hist"] = np.float32(same_month.mean()) if len(same_month) else np.float32(np.nan)

        if add_year_mean:
            same_year = y_past[in_prev_year]
            feat["year_mean_hist"] = np.float32(same_year.mean()) if len(same_year) else np.float32(np.nan)

        # lagged values; a timestamp missing from the history yields NaN
        for lag_name, lag_delta in lag_offsets:
            lag_time = current_t - lag_delta
            feat[lag_name] = np.float32(y_buf.loc[lag_time]) if lag_time in y_buf.index else np.float32(np.nan)

        # an incomplete feature row means no forecast can be made
        if any(pd.isna(v) for v in feat.values()):
            return np.nan

        X_t = pd.DataFrame([feat])
        y_next_pred = float(model.predict(X_t)[0])

        # store the prediction so that later steps read it as a lag / history value
        next_t = current_t + step
        y_buf.loc[next_t] = np.float32(y_next_pred)
        current_t = next_t

    return float(y_buf.loc[current_t])


# =========================================================
# 5) Оценка 3 режимов: 1 шаг, сутки (24), неделя (168)
# =========================================================
def eval_rf_recursive_three_modes_val_only(
    df: pd.DataFrame,
    target_col: str = "Usage_kWh",
    freq: str = "h",
    lags: Tuple[int, ...] = (1, 2, 3, 4, 5, 6, 12, 24, 48, 72, 168),
    train_ratio: float = 0.7,
    val_ratio: float = 0.15,
    horizons: Tuple[int, ...] = (1, 24, 168),   # 1 = one step, 24 = one day, 168 = one week
    n_val_points: int = 500,
    rf_params: Dict = None,
    random_state: int = 42,
) -> pd.DataFrame:
    """
    Train a one-step RandomForest and evaluate it recursively on the validation part.

    Steps:
      - build X(t) / y(t+1) with ``build_features_recursive_step1_calendar_lags``
      - split the feature index by time with ``time_split_index``
      - fit a RandomForestRegressor on the train part
      - for every horizon, chain one-step predictions from each start point
        with ``predict_recursive_h_steps`` and compare to the true value at t+h
      - report MAE / RMSE / sMAPE per horizon

    Horizon labels: 1 -> "one_step", 24 -> "day_ahead", 168 -> "week_ahead",
    anything else -> "h<horizon>".

    Args:
        df: frame indexed by time; only ``target_col`` is read.
        target_col: name of the target column.
        freq: step frequency (used for ``asfreq`` and for time deltas).
        lags: lag sizes used as features.
        train_ratio: share of feature rows used for training.
        val_ratio: share of feature rows used for validation.
        horizons: forecast horizons, in steps.
        n_val_points: at most this many start points are used, taken from the
            END of the validation part (to bound the run time).
        rf_params: keyword arguments for RandomForestRegressor; a default set
            is used when None.
        random_state: seed passed to RandomForestRegressor.

    Returns:
        One row per horizon, sorted by horizon. When a horizon has no usable
        evaluation points (for example when it is not shorter than the window
        of start points), its metrics are NaN and ``n_eval_points`` is 0
        instead of the metric functions raising on empty input.
    """
    if rf_params is None:
        rf_params = dict(
            n_estimators=500,
            max_depth=None,
            min_samples_split=2,
            min_samples_leaf=1,
            max_features="sqrt",
            bootstrap=True,
            n_jobs=-1,
        )

    # 1) X(t) and y(t+1)
    X, y_next = build_features_recursive_step1_calendar_lags(
        df=df,
        target_col=target_col,
        freq=freq,
        lags=lags,
        add_month_mean=True,
        add_year_mean=True,
    )

    # 2) time-ordered train / val split
    train_idx, val_idx = time_split_index(X.index, train_ratio=train_ratio, val_ratio=val_ratio)
    X_train = X.loc[train_idx]
    y_train = y_next.loc[train_idx]

    # 3) full target history on the regular grid (input of the recursion)
    y_full = df[[target_col]].copy().sort_index().asfreq(freq)[target_col].astype(np.float32)

    val_start = val_idx.min()
    val_end = val_idx.max()

    # Start points: the last n_val_points of the validation part.
    # An explicit start position is used because a slice of the form [-0:]
    # would select the whole index when n_val_points == 0.
    candidate_starts = val_idx
    if len(candidate_starts) > n_val_points:
        candidate_starts = candidate_starts[len(candidate_starts) - n_val_points:]

    # 4) fit the one-step RandomForest
    rf = RandomForestRegressor(random_state=random_state, **rf_params)
    rf.fit(X_train, y_train)

    mode_names = {1: "one_step", 24: "day_ahead", 168: "week_ahead"}
    rows = []

    # 5) evaluate every horizon
    for h in horizons:
        y_true_list, y_pred_list = [], []

        for t in candidate_starts:
            t_h = t + pd.to_timedelta(h, unit=freq)

            # the true value must lie inside the validation range
            if t_h < val_start or t_h > val_end:
                continue

            # skip targets that are missing in the data (gap filled by asfreq);
            # checked first so the expensive recursion is not run for nothing
            true_val = float(y_full.loc[t_h])
            if np.isnan(true_val):
                continue

            pred = predict_recursive_h_steps(
                model=rf,
                y_hist=y_full,
                start_time=t,
                h=h,
                freq=freq,
                lags=lags,
                add_month_mean=True,
                add_year_mean=True,
            )
            if np.isnan(pred):
                continue

            y_true_list.append(true_val)
            y_pred_list.append(pred)

        y_true_arr = np.asarray(y_true_list, dtype=float)
        y_pred_arr = np.asarray(y_pred_list, dtype=float)

        if len(y_true_arr) > 0:
            mae = float(mean_absolute_error(y_true_arr, y_pred_arr))
            rmse = float(np.sqrt(mean_squared_error(y_true_arr, y_pred_arr)))
            smape_pct = smape(y_true_arr, y_pred_arr)
        else:
            # nothing to score for this horizon: report NaN rather than crash
            mae = rmse = smape_pct = float("nan")

        mode = mode_names.get(h, f"h{h}")

        rows.append({
            "model": "RandomForest",
            "strategy": "recursive_one_step",
            "mode": mode,
            "horizon": h,
            "val_MAE": mae,
            "val_RMSE": rmse,
            "val_sMAPE_pct": smape_pct,
            "n_eval_points": int(len(y_true_arr)),
            "n_features": int(X.shape[1]),
            "lags": list(lags),
            "rf_params": dict(rf_params),
        })

        print(f"[RF | {mode} | h={h}] MAE={mae:.2f}, RMSE={rmse:.2f}, sMAPE={smape_pct:.2f}% | n={len(y_true_arr)}")

    return pd.DataFrame(rows).sort_values("horizon").reset_index(drop=True)



In [None]:
# =========================================================
# 6) Example run
# =========================================================
# df_merged is the DataFrame loaded in the first cell:
# index = DateTime (hourly), columns: Usage_kWh, wind_speed, T, f
# (the weather columns are NOT used in this test; only target_col is read)

results_rf_3modes = eval_rf_recursive_three_modes_val_only(
    df=df_merged,
    target_col="Usage_kWh",
    freq="h",
    lags=(1, 2, 3, 4, 5, 6, 12, 24, 48, 72, 168),
    train_ratio=0.7,
    val_ratio=0.15,
    horizons=(1, 24, 168),     # one step / one day / one week
    n_val_points=500,
    rf_params=None
)
results_rf_3modes

[RF | one_step | h=1] MAE=102.62, RMSE=135.72, sMAPE=12.08% | n=499
[RF | day_ahead | h=24] MAE=107.82, RMSE=143.43, sMAPE=12.65% | n=476
