In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import itertools
import optuna

from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.statespace.dynamic_factor_mq import DynamicFactorMQ
from statsmodels.tsa.forecasting.theta import ThetaModel
from datetime import datetime, timedelta

from sklearn.metrics import root_mean_squared_error
from sklearn.preprocessing import MinMaxScaler

import warnings
warnings.filterwarnings('once')

  from .autonotebook import tqdm as notebook_tqdm
Importing plotly failed. Interactive plots will not work.


Initialize time series

In [2]:
# Consumption data
df = pd.read_csv('../Dataset/ConsumptionIndustry.csv', sep=';')
df['HourDK'] = pd.to_datetime(df['HourDK'])
df['ConsumptionkWh'] = df['ConsumptionkWh'].str.replace(",", ".").astype(float)
df.index = df['HourDK']
df.drop(columns=['HourUTC', 'HourDK', 'MunicipalityNo', 'Branche'], inplace=True)

In [3]:
# El-spot prices
df2 = pd.read_csv('../Dataset/ELSpotPrices.csv', sep=';')
df2['HourDK'] = pd.to_datetime(df2['HourDK'])
df2['SpotPriceDKK'] = df2['SpotPriceDKK'].str.replace(",", ".").astype(float)
df2.index = df2['HourDK']
df2 = df2.iloc[1:] # remove first row, since the measurement at that time is not present in other dataset
df2.drop(columns=['HourUTC', 'HourDK', 'PriceArea', 'SpotPriceEUR'], inplace=True)

Functions

In [5]:
def sample_data_with_train_window(df, start_date, train_window_size):
  start_date = datetime.strptime(start_date, '%Y-%m-%d') - timedelta(hours=train_window_size)
  end_date = df.index[-1]
  return df[(df.index >= start_date) & (df.index <= end_date)]

def get_next_window(data, train_window_size, forecast_horizon):
  return data[:train_window_size], data[train_window_size:train_window_size + forecast_horizon]

def forecast_whitebox_model(model, forecast_horizon, model_name, exog_data_test=None):
  model_res = model.fit()

  if "SARIMA" in model_name and "STL" not in model_name:
    return model_res.get_forecast(steps=forecast_horizon, exog=exog_data_test).predicted_mean
  else:
    return model_res.forecast(steps=forecast_horizon)

Objective functions

In [16]:
def objective_SARIMAX(trial, data_train, data_test, forecast_horizon, exog_data_train, exog_data_test, scaler):
  data_train_scaled = scaler.fit_transform(data_train[['ConsumptionkWh']])
  data_train = pd.DataFrame(data_train_scaled, columns=['ConsumptionkWh'], index=data_train.index)
  p = d = q = range(0, 3)
  pdq = list(itertools.product(p, d, q))
  pdqs = [(x[0], x[1], x[2], 12) for x in list(itertools.product(p, d, q))]
  order = trial.suggest_categorical('order', pdq)
  seasonal_order = trial.suggest_categorical('seasonal_order', pdqs)
  model = SARIMAX(data_train, order=order, seasonal_order=seasonal_order, exog=exog_data_train)
  mdl = model.fit(disp=0)
  predictions_scaled = mdl.forecast(steps=forecast_horizon, exog=exog_data_test)
  predictions = scaler.inverse_transform(predictions_scaled.values.reshape(-1, 1))
  predictions = pd.Series(predictions.flatten(), index=data_test.index)
  return root_mean_squared_error(data_test, predictions)

def objective_SARIMA(trial, data_train, data_test, forecast_horizon, scaler):
  data_train_scaled = scaler.fit_transform(data_train[['ConsumptionkWh']])
  data_train = pd.DataFrame(data_train_scaled, columns=['ConsumptionkWh'], index=data_train.index)
  p = d = q = range(0, 3)
  pdq = list(itertools.product(p, d, q))
  pdqs = [(x[0], x[1], x[2], 12) for x in list(itertools.product(p, d, q))]
  order = trial.suggest_categorical('order', pdq)
  seasonal_order = trial.suggest_categorical('seasonal_order', pdqs)
  model = SARIMAX(data_train, order=order, seasonal_order=seasonal_order)
  mdl = model.fit(disp=0)
  predictions_scaled = mdl.forecast(steps=forecast_horizon)
  predictions = scaler.inverse_transform(predictions_scaled.values.reshape(-1, 1))
  predictions = pd.Series(predictions.flatten(), index=data_test.index)
  return root_mean_squared_error(data_test, predictions)


Optimizing through whole dataset

In [17]:
date_start = '2023-11-01'
window_train_size = 1440 #hours
forecast_horizon = 336 #hours
# 336_24, 1440_336, 17520_8760
trials = 100
model_name = f'SARIMAX_{window_train_size}_{forecast_horizon}'
scaler = MinMaxScaler()

data = sample_data_with_train_window(df, date_start, window_train_size)
exog_data = sample_data_with_train_window(df2, date_start, window_train_size)

data_train, data_test = get_next_window(data, window_train_size, forecast_horizon)
exog_data_train, exog_data_test = get_next_window(exog_data, window_train_size, forecast_horizon)

def safe_objective(trial):
  try:
    return objective_SARIMAX(trial, data_train, data_test, forecast_horizon, exog_data_train, exog_data_test, scaler)
  except Exception as e:
    print(f"Failed trial: {e}. Skipped this trial.")
    return float('inf')

warnings.filterwarnings("ignore")
study1 = optuna.create_study(direction='minimize')
study1.optimize(safe_objective, n_trials=trials)

trial=study1.best_trial
print(f"Accuracy: {trial.value}")
print(f"best params for {model_name}: {trial.params}")

warnings.filterwarnings("default")

# Save the results in CSV
try:
  df_tuning = pd.read_csv('../Results/Whitebox/whitebox_tuning.csv')
except:
  df_tuning = pd.DataFrame(columns=['model', 'accuracy', 'params'])

new_row = {'model': model_name, 'accuracy': trial.value, 'params': str(trial.params)}
df_tuning = pd.concat([df_tuning, pd.DataFrame([new_row])], ignore_index=True)
df_tuning = df_tuning.sort_values(by=['model', 'accuracy', 'params'], ascending=True).reset_index(drop=True)
df_tuning.to_csv('../Results/Whitebox/whitebox_tuning.csv', index=False)

[I 2024-12-26 16:31:30,832] A new study created in memory with name: no-name-3e22c28e-18ec-4c66-b218-78d8eab5b406
[I 2024-12-26 16:31:30,859] Trial 0 finished with value: inf and parameters: {}. Best is trial 0 with value: inf.
[I 2024-12-26 16:31:30,862] Trial 1 finished with value: inf and parameters: {}. Best is trial 0 with value: inf.
[I 2024-12-26 16:31:30,864] Trial 2 finished with value: inf and parameters: {}. Best is trial 0 with value: inf.
[I 2024-12-26 16:31:30,868] Trial 3 finished with value: inf and parameters: {}. Best is trial 0 with value: inf.
[I 2024-12-26 16:31:30,870] Trial 4 finished with value: inf and parameters: {}. Best is trial 0 with value: inf.
[I 2024-12-26 16:31:30,873] Trial 5 finished with value: inf and parameters: {}. Best is trial 0 with value: inf.
[I 2024-12-26 16:31:30,876] Trial 6 finished with value: inf and parameters: {}. Best is trial 0 with value: inf.
[I 2024-12-26 16:31:30,879] Trial 7 finished with value: inf and parameters: {}. Best is

[I 2024-12-26 16:31:30,881] Trial 8 finished with value: inf and parameters: {}. Best is trial 0 with value: inf.


Failed trial: "None of [Index(['ConsumptionkWh'], dtype='object')] are in the [columns]". Skipped this trial.
Failed trial: "None of [Index(['ConsumptionkWh'], dtype='object')] are in the [columns]". Skipped this trial.
Failed trial: "None of [Index(['ConsumptionkWh'], dtype='object')] are in the [columns]". Skipped this trial.
Failed trial: "None of [Index(['ConsumptionkWh'], dtype='object')] are in the [columns]". Skipped this trial.
Failed trial: "None of [Index(['ConsumptionkWh'], dtype='object')] are in the [columns]". Skipped this trial.
Failed trial: "None of [Index(['ConsumptionkWh'], dtype='object')] are in the [columns]". Skipped this trial.
Failed trial: "None of [Index(['ConsumptionkWh'], dtype='object')] are in the [columns]". Skipped this trial.
Failed trial: "None of [Index(['ConsumptionkWh'], dtype='object')] are in the [columns]". Skipped this trial.
Failed trial: "None of [Index(['ConsumptionkWh'], dtype='object')] are in the [columns]". Skipped this trial.


[I 2024-12-26 16:31:30,884] Trial 9 finished with value: inf and parameters: {}. Best is trial 0 with value: inf.
[I 2024-12-26 16:31:30,887] Trial 10 finished with value: inf and parameters: {}. Best is trial 0 with value: inf.
[I 2024-12-26 16:31:30,889] Trial 11 finished with value: inf and parameters: {}. Best is trial 0 with value: inf.
[I 2024-12-26 16:31:30,892] Trial 12 finished with value: inf and parameters: {}. Best is trial 0 with value: inf.
[I 2024-12-26 16:31:30,895] Trial 13 finished with value: inf and parameters: {}. Best is trial 0 with value: inf.
[I 2024-12-26 16:31:30,899] Trial 14 finished with value: inf and parameters: {}. Best is trial 0 with value: inf.
[I 2024-12-26 16:31:30,902] Trial 15 finished with value: inf and parameters: {}. Best is trial 0 with value: inf.
[I 2024-12-26 16:31:30,905] Trial 16 finished with value: inf and parameters: {}. Best is trial 0 with value: inf.
[I 2024-12-26 16:31:30,910] Trial 17 finished with value: inf and parameters: {}.

Failed trial: "None of [Index(['ConsumptionkWh'], dtype='object')] are in the [columns]". Skipped this trial.
Failed trial: "None of [Index(['ConsumptionkWh'], dtype='object')] are in the [columns]". Skipped this trial.
Failed trial: "None of [Index(['ConsumptionkWh'], dtype='object')] are in the [columns]". Skipped this trial.
Failed trial: "None of [Index(['ConsumptionkWh'], dtype='object')] are in the [columns]". Skipped this trial.
Failed trial: "None of [Index(['ConsumptionkWh'], dtype='object')] are in the [columns]". Skipped this trial.
Failed trial: "None of [Index(['ConsumptionkWh'], dtype='object')] are in the [columns]". Skipped this trial.
Failed trial: "None of [Index(['ConsumptionkWh'], dtype='object')] are in the [columns]". Skipped this trial.
Failed trial: "None of [Index(['ConsumptionkWh'], dtype='object')] are in the [columns]". Skipped this trial.
Failed trial: "None of [Index(['ConsumptionkWh'], dtype='object')] are in the [columns]". Skipped this trial.
Failed tri

[I 2024-12-26 16:31:31,039] Trial 60 finished with value: inf and parameters: {}. Best is trial 0 with value: inf.
[I 2024-12-26 16:31:31,042] Trial 61 finished with value: inf and parameters: {}. Best is trial 0 with value: inf.
[I 2024-12-26 16:31:31,044] Trial 62 finished with value: inf and parameters: {}. Best is trial 0 with value: inf.
[I 2024-12-26 16:31:31,047] Trial 63 finished with value: inf and parameters: {}. Best is trial 0 with value: inf.
[I 2024-12-26 16:31:31,050] Trial 64 finished with value: inf and parameters: {}. Best is trial 0 with value: inf.
[I 2024-12-26 16:31:31,052] Trial 65 finished with value: inf and parameters: {}. Best is trial 0 with value: inf.
[I 2024-12-26 16:31:31,055] Trial 66 finished with value: inf and parameters: {}. Best is trial 0 with value: inf.
[I 2024-12-26 16:31:31,057] Trial 67 finished with value: inf and parameters: {}. Best is trial 0 with value: inf.
[I 2024-12-26 16:31:31,061] Trial 68 finished with value: inf and parameters: {}

Failed trial: "None of [Index(['ConsumptionkWh'], dtype='object')] are in the [columns]". Skipped this trial.
Failed trial: "None of [Index(['ConsumptionkWh'], dtype='object')] are in the [columns]". Skipped this trial.
Failed trial: "None of [Index(['ConsumptionkWh'], dtype='object')] are in the [columns]". Skipped this trial.
Failed trial: "None of [Index(['ConsumptionkWh'], dtype='object')] are in the [columns]". Skipped this trial.
Failed trial: "None of [Index(['ConsumptionkWh'], dtype='object')] are in the [columns]". Skipped this trial.
Failed trial: "None of [Index(['ConsumptionkWh'], dtype='object')] are in the [columns]". Skipped this trial.
Failed trial: "None of [Index(['ConsumptionkWh'], dtype='object')] are in the [columns]". Skipped this trial.
Failed trial: "None of [Index(['ConsumptionkWh'], dtype='object')] are in the [columns]". Skipped this trial.
Failed trial: "None of [Index(['ConsumptionkWh'], dtype='object')] are in the [columns]". Skipped this trial.
Failed tri

[I 2024-12-26 16:31:31,072] Trial 72 finished with value: inf and parameters: {}. Best is trial 0 with value: inf.
[I 2024-12-26 16:31:31,074] Trial 73 finished with value: inf and parameters: {}. Best is trial 0 with value: inf.
[I 2024-12-26 16:31:31,077] Trial 74 finished with value: inf and parameters: {}. Best is trial 0 with value: inf.
[I 2024-12-26 16:31:31,079] Trial 75 finished with value: inf and parameters: {}. Best is trial 0 with value: inf.
[I 2024-12-26 16:31:31,082] Trial 76 finished with value: inf and parameters: {}. Best is trial 0 with value: inf.
[I 2024-12-26 16:31:31,085] Trial 77 finished with value: inf and parameters: {}. Best is trial 0 with value: inf.
[I 2024-12-26 16:31:31,087] Trial 78 finished with value: inf and parameters: {}. Best is trial 0 with value: inf.
[I 2024-12-26 16:31:31,094] Trial 79 finished with value: inf and parameters: {}. Best is trial 0 with value: inf.
[I 2024-12-26 16:31:31,098] Trial 80 finished with value: inf and parameters: {}

Failed trial: "None of [Index(['ConsumptionkWh'], dtype='object')] are in the [columns]". Skipped this trial.
Failed trial: "None of [Index(['ConsumptionkWh'], dtype='object')] are in the [columns]". Skipped this trial.
Failed trial: "None of [Index(['ConsumptionkWh'], dtype='object')] are in the [columns]". Skipped this trial.
Failed trial: "None of [Index(['ConsumptionkWh'], dtype='object')] are in the [columns]". Skipped this trial.
Failed trial: "None of [Index(['ConsumptionkWh'], dtype='object')] are in the [columns]". Skipped this trial.
Failed trial: "None of [Index(['ConsumptionkWh'], dtype='object')] are in the [columns]". Skipped this trial.
Failed trial: "None of [Index(['ConsumptionkWh'], dtype='object')] are in the [columns]". Skipped this trial.
Failed trial: "None of [Index(['ConsumptionkWh'], dtype='object')] are in the [columns]". Skipped this trial.
Failed trial: "None of [Index(['ConsumptionkWh'], dtype='object')] are in the [columns]". Skipped this trial.
Failed tri