# 8. Machine learning

## Setup and imports

In [1]:
from functions import *

In [2]:
df = load_data()
df = df.sort_index()
df = df.asfreq("h")
df = df.interpolate(method="time", limit_direction="both")
df = df.reset_index()  # siin on juba veerg nimega 'timestamp', ei ole vaja rename

df["hour"] = df["timestamp"].dt.hour
df["weekday"] = df["timestamp"].dt.dayofweek
df["is_weekend"] = (df["weekday"] >= 5).astype(int)
df["hour_sin"] = np.sin(2 * np.pi * df["hour"] / 24)
df["hour_cos"] = np.cos(2 * np.pi * df["hour"] / 24)
df["cooling_degree"] = np.clip(df["temperature"] - 18, 0, None)
df["heating_degree"] = np.clip(18 - df["temperature"], 0, None)

selected_features = [
    "timestamp", "demand", "hour_sin", "hour_cos", "is_weekend",
    "cooling_degree", "heating_degree", "temperature", "pressure (hPa)",
    "cloud_cover (%)", "wind_speed_10m (km/h)", "shortwave_radiation (W/m²)",
    "direct_radiation (W/m²)", "diffuse_radiation (W/m²)",
    "direct_normal_irradiance (W/m²)", "price"
]

df = df[selected_features].round(5)


In [3]:
show_table_info(df, "Training data")


TRAINING DATA SUMMARY
Shape: 8,760 rows × 16 columns
Time span: 2013-07-01 00:00:00+00:00 -> 2014-06-30 23:00:00+00:00

                         Column    Type   NA %
                         demand float64  0.00%
                       hour_sin float64  0.00%
                       hour_cos float64  0.00%
                     is_weekend   int64  0.00%
                 cooling_degree float64  0.00%
                 heating_degree float64  0.00%
                    temperature float64  0.00%
                 pressure (hPa) float64  0.00%
                cloud_cover (%) float64  0.00%
          wind_speed_10m (km/h) float64  0.00%
     shortwave_radiation (W/m²) float64  0.00%
        direct_radiation (W/m²) float64  0.00%
       diffuse_radiation (W/m²) float64  0.00%
direct_normal_irradiance (W/m²) float64  0.00%
                          price float64  0.00%



Unnamed: 0,Column,Type,NA %
0,demand,float64,0.00%
1,hour_sin,float64,0.00%
2,hour_cos,float64,0.00%
3,is_weekend,int64,0.00%
4,cooling_degree,float64,0.00%
5,heating_degree,float64,0.00%
6,temperature,float64,0.00%
7,pressure (hPa),float64,0.00%
8,cloud_cover (%),float64,0.00%
9,wind_speed_10m (km/h),float64,0.00%


In [11]:
VALIDATION_DAYS = 7
FORECAST_HORIZON = 24
cutoff = df["timestamp"].max() - pd.Timedelta(days=VALIDATION_DAYS)

train = df[df["timestamp"] < cutoff].copy()
test = df[df["timestamp"] >= cutoff].copy()

X_train = train.drop(columns=["demand", "timestamp"])
y_train = train["demand"]
X_test = test.drop(columns=["demand", "timestamp"])
y_test = test["demand"]

## 1. Train XGBoost forecasting model

In [12]:
model, eval_history = train_xgboost(X_train, y_train, X_test, y_test)

[0]	validation_0-rmse:0.37109	validation_1-rmse:0.36378
[1]	validation_0-rmse:0.36689	validation_1-rmse:0.36093
[2]	validation_0-rmse:0.36401	validation_1-rmse:0.36053
[3]	validation_0-rmse:0.36130	validation_1-rmse:0.35909
[4]	validation_0-rmse:0.35781	validation_1-rmse:0.35722
[5]	validation_0-rmse:0.35611	validation_1-rmse:0.35614
[6]	validation_0-rmse:0.35295	validation_1-rmse:0.35409
[7]	validation_0-rmse:0.34996	validation_1-rmse:0.35231
[8]	validation_0-rmse:0.34789	validation_1-rmse:0.35185
[9]	validation_0-rmse:0.34510	validation_1-rmse:0.35040
[10]	validation_0-rmse:0.34260	validation_1-rmse:0.34900
[11]	validation_0-rmse:0.34028	validation_1-rmse:0.34795
[12]	validation_0-rmse:0.33833	validation_1-rmse:0.34682
[13]	validation_0-rmse:0.33620	validation_1-rmse:0.34647
[14]	validation_0-rmse:0.33444	validation_1-rmse:0.34620
[15]	validation_0-rmse:0.33300	validation_1-rmse:0.34563
[16]	validation_0-rmse:0.33129	validation_1-rmse:0.34484
[17]	validation_0-rmse:0.32944	validation

## Single-day validation

In [13]:
# First 24h
test_day = test.head(FORECAST_HORIZON)
X_test_day = test_day.drop(columns=["demand", "timestamp"])
y_test_day = test_day["demand"]
y_pred_day = model.predict(X_test_day)

metrics_day = evaluate_forecast(y_test_day.values, y_pred_day)
metrics_day_df = pd.DataFrame([{**metrics_day, "model": "XGBoost", "validation": "Single 24h"}])

## Week validation

In [14]:
y_pred_week = model.predict(X_test)
metrics_week = evaluate_forecast(y_test.values, y_pred_week)
metrics_week_df = pd.DataFrame([{**metrics_week, "model": "XGBoost", "validation": "Week"}])

## Summary table for single day and week

In [16]:
combined_metrics = pd.concat([metrics_day_df, metrics_week_df], ignore_index=True)
combined_metrics.round(5)

Unnamed: 0,MAE,RMSE,nRMSE,MAPE,model,validation
0,0.15238,0.19127,0.16209,0.28818,XGBoost,Single 24h
1,0.20718,0.35351,0.16068,0.38457,XGBoost,Week


In [17]:
fig_day = plot_forecast(test_day["timestamp"], y_test_day, y_pred_day, "XGBoost – 24h", "ex8_fig1_xgb_forecast_day.svg")
fig_week = plot_forecast(test["timestamp"], y_test, y_pred_week, "XGBoost – Week", "ex8_fig2_xgb_forecast_week.svg")
fig_day.show()
fig_week.show()

## 2. Table with selected hyperparameters
These values were chosen to balance accuracy and generalization capability.
A small learning rate with sufficient estimators allows gradual convergence.
Moderate depth and subsampling prevent overfitting.
Regularization (reg_lambda) improves robustness.
The histogram tree method provides faster computation without major accuracy loss.

| Parameter         | Value | Description |
|-------------------|--------|-------------|
| n_estimators      | 500 | A large number ensures model stability with a small learning rate. |
| learning_rate     | 0.05 | A small step size allows gradual learning and better generalization. |
| max_depth         | 6 | Moderate depth prevents overfitting while capturing key interactions. |
| subsample         | 0.8 | Sampling rows improves generalization and reduces variance. |
| colsample_bytree  | 0.8 | Feature-level sampling helps reduce correlation between trees. |
| reg_lambda        | 1.0 | L2 regularization adds stability and controls model complexity. |
| tree_method       | hist | Efficient histogram-based algorithm, faster for large datasets. |
| objective         | reg:squarederror | Appropriate for continuous-value regression tasks. |
| eval_metric       | rmse | Root Mean Squared Error used as primary performance metric. |