# 03 — 24h Load Forecast vs TSO
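Compares a Gradient Boosting model trained on weather and calendar features against the TSO's own load forecast. Train: 2015–2017; test: 2018 (the year split is applied to UTC timestamps).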

In [1]:
import json
import os

import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Load the cleaned dataset and parse `time` as timezone-aware UTC timestamps,
# so the `.dt` accessors and date comparisons in later cells operate on UTC.
df = pd.read_parquet('../cleaned_data.parquet').assign(
    time=lambda frame: pd.to_datetime(frame['time'], utc=True)
)
print(f"Shape: {df.shape}")

Shape: (35056, 80)


In [2]:
# Train/test split on the UTC calendar year: everything up to and including
# 2017 trains the model, 2018 is held out for evaluation.
# NOTE: `<= 2017` has no lower bound, so rows stamped before 2015 in UTC also
# end up in the training set (the printed year range shows the actual span).
train = df.loc[df['time'].dt.year.le(2017)]
test = df.loc[df['time'].dt.year.eq(2018)]

print(f"Train: {train.shape[0]} rows ({train['time'].dt.year.min()}-{train['time'].dt.year.max()})")
print(f"Test: {test.shape[0]} rows (2018)")

Train: 26297 rows (2014-2017)
Test: 8759 rows (2018)


In [3]:
# Feature matrix = every weather column (selected by name prefix) + calendar columns.
# `str.startswith` accepts a tuple, so one call checks all prefixes at once.
weather_cols = [
    col for col in df.columns
    if col.startswith(('temp', 'pressure', 'humidity', 'wind_speed', 'wind_deg',
                       'rain_1h', 'rain_3h', 'snow_3h', 'clouds_all'))
]
time_cols = ['hour', 'day_of_week', 'month', 'is_weekend']
feature_cols = [*weather_cols, *time_cols]

target = 'total load actual'

# Missing feature values are replaced with 0 in both splits; the target is left untouched.
X_train, y_train = train[feature_cols].fillna(0), train[target]
X_test, y_test = test[feature_cols].fillna(0), test[target]

print(f"Features: {len(feature_cols)}")

Features: 59


In [4]:
# Fit a Gradient Boosting regressor with default hyper-parameters on the
# training split; the fixed seed keeps re-runs reproducible.
gb = GradientBoostingRegressor(random_state=42).fit(X_train, y_train)

# Two competing forecasts for the test year: the TSO forecast column that is
# already in the data, and our model's predictions.
y_pred_tso = test['total load forecast'].values
y_pred_gb = gb.predict(X_test)

def calc_metrics(y_true, y_pred, name):
    """Print and return MAE, RMSE and MAPE for one forecast.

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Forecast values, aligned position-by-position with ``y_true``.
    name : str
        Label used as the prefix of the printed summary line.

    Returns
    -------
    dict
        ``{'mae': ..., 'rmse': ..., 'mape': ...}`` as plain Python floats.
        MAE and RMSE are rounded to 1 decimal; MAPE is a percentage rounded
        to 2 decimals, computed only over points whose actual value is
        non-zero (``nan`` if there are none).
    """
    # Work on float arrays so lists / Series / ndarrays all behave the same
    # and the element-wise MAPE arithmetic is purely positional.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    # float(...) strips NumPy scalar types so the returned dict is uniformly
    # plain floats regardless of what the metric functions hand back.
    mae = float(mean_absolute_error(y_true, y_pred))
    rmse = float(np.sqrt(mean_squared_error(y_true, y_pred)))

    # Percentage error is undefined where the actual value is 0; skip those
    # points instead of letting a division by zero turn the metric into inf/nan.
    nonzero = y_true != 0
    if nonzero.any():
        rel_err = np.abs((y_true[nonzero] - y_pred[nonzero]) / y_true[nonzero])
        mape = float(np.mean(rel_err) * 100)
    else:
        mape = float('nan')

    print(f"{name}: MAE={mae:.0f}, RMSE={rmse:.0f}, MAPE={mape:.2f}%")
    return {'mae': round(mae, 1), 'rmse': round(rmse, 1), 'mape': round(mape, 2)}

# Score both forecasts against the same test-year actuals (TSO first, then ours).
metrics_tso, metrics_gb = (
    calc_metrics(y_test.values, y_pred_tso, 'TSO'),
    calc_metrics(y_test.values, y_pred_gb, 'Our Model'),
)

TSO: MAE=270, RMSE=389, MAPE=0.93%
Our Model: MAE=2050, RMSE=2765, MAPE=7.06%


In [5]:
# Sample week for time series chart (first full week of March 2018):
# half-open window [2018-03-05, 2018-03-12), i.e. seven days of rows.
sample = test[(test['time'] >= '2018-03-05') & (test['time'] < '2018-03-12')].copy()
sample_pred = gb.predict(sample[feature_cols].fillna(0))

# One record per row: timestamp label plus actual, TSO and model values,
# each rounded to one decimal. The columns are walked in lockstep with the
# prediction array instead of indexing it by row position.
timeseries = [
    {
        'time': stamp.strftime('%Y-%m-%d %H:%M'),
        'actual': round(actual, 1),
        'tso_forecast': round(tso, 1),
        'our_forecast': round(ours, 1),
    }
    for stamp, actual, tso, ours in zip(
        sample['time'], sample[target], sample['total load forecast'], sample_pred
    )
]

print(f"Sample week: {len(timeseries)} hours")

Sample week: 168 hours


In [6]:
# Export JSON: metrics for both forecasts plus the sample-week series.
# (`os` and `json` are imported in the notebook's import cell.)
OUTPUT_DIR = '../dashboard/public/data'
output_path = os.path.join(OUTPUT_DIR, 'forecast_comparison.json')
os.makedirs(OUTPUT_DIR, exist_ok=True)

output = {
    'metrics': {
        'tso': metrics_tso,
        'our_model': metrics_gb,
    },
    'sample_week': timeseries,
}

# Serialize before opening the file so a failure cannot leave a truncated
# file behind. allow_nan=False: NaN/Infinity are not valid JSON, so raise
# here rather than write a file that strict JSON parsers reject.
payload = json.dumps(output, indent=2, allow_nan=False)
with open(output_path, 'w', encoding='utf-8') as f:
    f.write(payload)

print('Saved forecast_comparison.json')
print(f"TSO:  {metrics_tso}")
print(f"Ours: {metrics_gb}")

Saved forecast_comparison.json
TSO:  {'mae': 269.9, 'rmse': np.float64(389.3), 'mape': np.float64(0.93)}
Ours: {'mae': 2050.3, 'rmse': np.float64(2765.0), 'mape': np.float64(7.06)}
