# 9. Forecasting pipeline

## Setup and imports

In [1]:
from functions import *

In [2]:
df = load_data()
df = df.sort_index()
df = df.asfreq("h")
df = df.interpolate(method="time", limit_direction="both")
df = df.reset_index()

df["hour"] = df["timestamp"].dt.hour
df["weekday"] = df["timestamp"].dt.dayofweek
df["is_weekend"] = (df["weekday"] >= 5).astype(int)
df["hour_sin"] = np.sin(2 * np.pi * df["hour"] / 24)
df["hour_cos"] = np.cos(2 * np.pi * df["hour"] / 24)
df["cooling_degree"] = np.clip(df["temperature"] - 18, 0, None)
df["heating_degree"] = np.clip(18 - df["temperature"], 0, None)

selected_features = [
    "timestamp", "demand", "hour_sin", "hour_cos", "is_weekend",
    "cooling_degree", "heating_degree", "temperature", "pressure (hPa)",
    "cloud_cover (%)", "wind_speed_10m (km/h)", "shortwave_radiation (W/m²)",
    "direct_radiation (W/m²)", "diffuse_radiation (W/m²)",
    "direct_normal_irradiance (W/m²)", "price"
]

df = df[selected_features].round(5)
df.head()

Unnamed: 0,timestamp,demand,hour_sin,hour_cos,is_weekend,cooling_degree,heating_degree,temperature,pressure (hPa),cloud_cover (%),wind_speed_10m (km/h),shortwave_radiation (W/m²),direct_radiation (W/m²),diffuse_radiation (W/m²),direct_normal_irradiance (W/m²),price
0,2013-07-01 00:00:00+00:00,0.27,0.0,1.0,0,0.0,4.5,13.5,1011.3,4.0,10.5,0.0,0.0,0.0,0.0,0.01605
1,2013-07-01 01:00:00+00:00,0.23,0.25882,0.96593,0,0.0,4.8,13.2,1010.8,27.0,11.9,0.0,0.0,0.0,0.0,0.00095
2,2013-07-01 02:00:00+00:00,0.26,0.5,0.86603,0,0.0,4.9,13.1,1010.3,33.0,11.6,0.0,0.0,0.0,0.0,0.0006
3,2013-07-01 03:00:00+00:00,0.28,0.70711,0.70711,0,0.0,5.0,13.0,1010.3,28.0,11.2,51.45455,2.0,7.0,30.1,0.00046
4,2013-07-01 04:00:00+00:00,0.29,0.86603,0.5,0,0.0,4.2,13.8,1010.2,16.0,11.7,102.90909,30.0,31.0,252.0,0.00046


In [3]:
VALIDATION_DAYS = 7
HORIZON = 24

cutoff = df["timestamp"].max() - pd.Timedelta(days=VALIDATION_DAYS)

train = df[df["timestamp"] < cutoff].copy()
test_week = df[df["timestamp"] >= cutoff].copy().head(VALIDATION_DAYS * HORIZON)

X_train = train.drop(columns=["demand", "timestamp"])
y_train = train["demand"]
X_test_week = test_week.drop(columns=["demand", "timestamp"])
y_test_week = test_week["demand"]

model, eval_history = train_xgboost(X_train, y_train)
y_pred_week = model.predict(X_test_week)

# nädal kokku
metrics_week = evaluate_forecast(y_test_week.values, y_pred_week)
metrics_week_df = pd.DataFrame([{**metrics_week, "model": "XGBoost", "evaluation": "week"}])
save_table(metrics_week_df, "09_xgb_metrics_week.csv")
print(metrics_week_df)


        MAE      RMSE     nRMSE     MAPE    model evaluation
0  0.208362  0.354556  0.161162  0.38668  XGBoost       week


In [4]:
test_week = test_week.copy()
test_week["y_true"] = y_test_week.values
test_week["y_pred"] = y_pred_week
test_week["day"] = test_week["timestamp"].dt.date

def _agg_day(g):
    m = evaluate_forecast(g["y_true"].values, g["y_pred"].values)
    return pd.Series(m)

metrics_day_df = test_week.groupby("day", as_index=False).apply(_agg_day)
metrics_day_df.insert(0, "model", "XGBoost")
metrics_day_df.insert(1, "evaluation", "Per-day over last week")

save_table(metrics_day_df, "09_xgb_metrics_daywise.csv")
metrics_day_df


  metrics_day_df = test_week.groupby("day", as_index=False).apply(_agg_day)


Unnamed: 0,model,evaluation,day,MAE,RMSE,nRMSE,MAPE
0,XGBoost,Per-day over last week,2014-06-23,0.097865,0.097865,97864510000.0,0.362461
1,XGBoost,Per-day over last week,2014-06-24,0.150632,0.190568,0.161498,0.281714
2,XGBoost,Per-day over last week,2014-06-25,0.163124,0.241969,0.2630093,0.346612
3,XGBoost,Per-day over last week,2014-06-26,0.148697,0.197085,0.2463563,0.282343
4,XGBoost,Per-day over last week,2014-06-27,0.362002,0.645595,0.2975093,0.324133
5,XGBoost,Per-day over last week,2014-06-28,0.169598,0.236734,0.4383972,0.496073
6,XGBoost,Per-day over last week,2014-06-29,0.159807,0.204997,0.2733295,0.470534
7,XGBoost,Per-day over last week,2014-06-30,0.313662,0.490964,0.234911,0.511562


In [5]:
# näita kogu nädala prognoosi
fig = plot_forecast(test_week["timestamp"], test_week["y_true"], test_week["y_pred"],
                    "XGBoost - Week", "ex09_fig1_week_forecast.svg")
fig.show()

# esimese päeva overlay
first_day = test_week["day"].iloc[0]
d1 = test_week[test_week["day"] == first_day]
fig = plot_forecast(d1["timestamp"], d1["y_true"], d1["y_pred"],
                    f"XGBoost – {first_day}", "ex09_fig2_day1_forecast.svg")
fig.show()


In [7]:
train_df = load_data().asfreq("h").interpolate(method="time", limit_direction="both").reset_index()
forecast_df = load_forecast_data().reset_index()

train_df = add_time_related_features(train_df)
forecast_df = add_time_related_features(forecast_df)

pred_df, metrics_day_df, metrics_sum_df = rolling_forecast_7days(
    train_full_df=train_df,
    forecast_df=forecast_df,
    feature_cols=FORECAST_FEATURES,
    target="demand",
    arima_order=(2,1,2),
    seasonal_order=(1,1,1,24),
    xgb_params=dict(
        n_estimators=500,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_lambda=1.0,
        tree_method="hist",
        eval_metric="rmse",
        random_state=42,
    ),
)




No frequency information was provided, so inferred frequency h will be used.



TypeError: xgboost.sklearn.XGBRegressor() got multiple values for keyword argument 'n_estimators'