In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import itertools
import optuna

from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.forecasting.theta import ThetaModel
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from statsmodels.regression.linear_model import OLS

from datetime import datetime, timedelta

from sklearn.metrics import root_mean_squared_error
from sklearn.preprocessing import MinMaxScaler

import warnings
warnings.filterwarnings('once')

Initialize time series

In [2]:
# Consumption data
df = pd.read_csv('../Dataset/ConsumptionIndustry.csv', sep=';')
df['HourDK'] = pd.to_datetime(df['HourDK'])
df['ConsumptionkWh'] = df['ConsumptionkWh'].str.replace(",", ".").astype(float)
df.index = df['HourDK']
df.drop(columns=['HourUTC', 'HourDK', 'MunicipalityNo', 'Branche'], inplace=True)
print(df)

                     ConsumptionkWh
HourDK                             
2021-01-01 00:00:00       37842.849
2021-01-01 01:00:00       35086.772
2021-01-01 02:00:00       31777.762
2021-01-01 03:00:00       28423.659
2021-01-01 04:00:00       25675.926
...                             ...
2024-12-01 19:00:00       52799.179
2024-12-01 20:00:00       48321.570
2024-12-01 21:00:00       44818.234
2024-12-01 22:00:00       40716.144
2024-12-01 23:00:00       36954.822

[34344 rows x 1 columns]


In [3]:
# El-spot prices
df2 = pd.read_csv('../Dataset/ELSpotPrices.csv', sep=';')
df2['HourDK'] = pd.to_datetime(df2['HourDK'])
df2['SpotPriceDKK'] = df2['SpotPriceDKK'].str.replace(",", ".").astype(float)
df2.index = df2['HourDK']
df2 = df2.iloc[1:] # remove first row, since the measurement at that time is not present in other dataset
df2.drop(columns=['HourUTC', 'HourDK', 'PriceArea', 'SpotPriceEUR'], inplace=True)
print(df2)

                     SpotPriceDKK
HourDK                           
2021-01-01 01:00:00    358.579987
2021-01-01 02:00:00    332.459991
2021-01-01 03:00:00    319.369995
2021-01-01 04:00:00    300.540009
2021-01-01 05:00:00    299.130005
...                           ...
2024-12-01 19:00:00    622.979980
2024-12-01 20:00:00    501.920013
2024-12-01 21:00:00    438.660004
2024-12-01 22:00:00    374.140015
2024-12-01 23:00:00    338.559998

[34343 rows x 1 columns]


Functions

In [11]:
def sample_data_with_train_window(df, start_date, end_date, train_window_size):
  start_date = datetime.strptime(start_date, '%Y-%m-%d') - timedelta(hours=train_window_size) + timedelta(hours=24)
  end_date = datetime.strptime(end_date, '%Y-%m-%d') + timedelta(hours=24)
  return df[(df.index >= start_date) & (df.index <= end_date)]

def get_next_window(data, train_window_size, forecast_horizon):
  return data[:train_window_size], data[train_window_size:train_window_size + forecast_horizon]

Objective functions

In [12]:
def objective_SARIMAX(trial, data_train, data_test, forecast_horizon, exog_data_train, exog_data_test, scaler):
  data_train_scaled = scaler.fit_transform(data_train[['ConsumptionkWh']])
  data_train = pd.DataFrame(data_train_scaled, columns=['ConsumptionkWh'], index=data_train.index)
  p = d = q = range(0, 3)
  pdq = list(itertools.product(p, d, q))
  pdqs = [(x[0], x[1], x[2], 12) for x in list(itertools.product(p, d, q))]
  order = trial.suggest_categorical('order', pdq)
  seasonal_order = trial.suggest_categorical('seasonal_order', pdqs)
  model = SARIMAX(data_train, order=order, seasonal_order=seasonal_order, exog=exog_data_train)
  mdl = model.fit(disp=0)
  predictions_scaled = mdl.forecast(steps=forecast_horizon, exog=exog_data_test)
  predictions = scaler.inverse_transform(predictions_scaled.values.reshape(-1, 1))
  predictions = pd.Series(predictions.flatten(), index=data_test.index)
  return root_mean_squared_error(data_test, predictions)

def objective_SARIMA(trial, data_train, data_test, forecast_horizon, scaler):
  data_train_scaled = scaler.fit_transform(data_train[['ConsumptionkWh']])
  data_train = pd.DataFrame(data_train_scaled, columns=['ConsumptionkWh'], index=data_train.index)
  p = d = q = range(0, 3)
  pdq = list(itertools.product(p, d, q))
  pdqs = [(x[0], x[1], x[2], 12) for x in list(itertools.product(p, d, q))]
  order = trial.suggest_categorical('order', pdq)
  seasonal_order = trial.suggest_categorical('seasonal_order', pdqs)
  model = SARIMAX(data_train, order=order, seasonal_order=seasonal_order)
  mdl = model.fit(disp=0)
  predictions_scaled = mdl.forecast(steps=forecast_horizon)
  predictions = scaler.inverse_transform(predictions_scaled.values.reshape(-1, 1))
  predictions = pd.Series(predictions.flatten(), index=data_test.index)
  return root_mean_squared_error(data_test, predictions)

Optimizing through first time series window

In [13]:
date_start = '2023-11-01'
date_end = '2024-11-01'
window_train_size = 1440 #hours
forecast_horizon = 336 #hours
# 336_24, 1440_336, 17520_8760
trials = 100
model_name = f'SARIMAX_{window_train_size}_{forecast_horizon}'
scaler = MinMaxScaler()

data = sample_data_with_train_window(df, date_start, date_end, window_train_size)
exog_data = sample_data_with_train_window(df2, date_start, date_end, window_train_size)

data_train, data_test = get_next_window(data, window_train_size, forecast_horizon)
exog_data_train, exog_data_test = get_next_window(exog_data, window_train_size, forecast_horizon)

def safe_objective(trial):
  try:
    return objective_SARIMAX(trial, data_train, data_test, forecast_horizon, exog_data_train, exog_data_test, scaler)
  except Exception as e:
    print(f"Failed trial: {e}. Skipped this trial.")
    return float('inf')

warnings.filterwarnings("ignore")
study1 = optuna.create_study(direction='minimize')
study1.optimize(safe_objective, n_trials=trials)

trial=study1.best_trial
print(f"Accuracy: {trial.value}")
print(f"best params for {model_name}: {trial.params}")

warnings.filterwarnings("default")

# Save the results in CSV
if trial.value != float('inf'):
  try:
    df_tuning = pd.read_csv('../Results/whitebox_tuning.csv')
  except:
    df_tuning = pd.DataFrame(columns=['model', 'accuracy', 'params'])

  new_row = {'model': model_name, 'accuracy': trial.value, 'params': str(trial.params)}
  new_row_df = pd.DataFrame([new_row]).dropna(axis=1, how='all')
  df_tuning = pd.concat([df_tuning, new_row_df], ignore_index=True)
  df_tuning = df_tuning.sort_values(by=['model', 'accuracy', 'params'], ascending=True).reset_index(drop=True)
  df_tuning.to_csv('../Results/whitebox_tuning.csv', index=False)

[I 2025-01-08 12:50:13,130] A new study created in memory with name: no-name-a6f3c6d4-b336-413c-bec9-640970133b71
[I 2025-01-08 12:50:24,226] Trial 0 finished with value: 517168.38781283156 and parameters: {'order': (1, 2, 0), 'seasonal_order': (0, 2, 2, 12)}. Best is trial 0 with value: 517168.38781283156.
[I 2025-01-08 12:50:31,624] Trial 1 finished with value: 26779.070814572045 and parameters: {'order': (0, 2, 0), 'seasonal_order': (2, 1, 1, 12)}. Best is trial 1 with value: 26779.070814572045.
[I 2025-01-08 12:50:35,674] Trial 2 finished with value: 8738.124172877233 and parameters: {'order': (2, 1, 1), 'seasonal_order': (1, 0, 2, 12)}. Best is trial 2 with value: 8738.124172877233.
[I 2025-01-08 12:50:43,160] Trial 3 finished with value: 12709.389662271855 and parameters: {'order': (0, 1, 0), 'seasonal_order': (2, 2, 1, 12)}. Best is trial 2 with value: 8738.124172877233.
[I 2025-01-08 12:50:48,339] Trial 4 finished with value: 386557.10181466764 and parameters: {'order': (0, 1, 

Accuracy: 4093.8992251829745
best params for SARIMA_1440_336: {'order': (1, 0, 2), 'seasonal_order': (2, 1, 2, 12)}
