In [75]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import itertools
import optuna

from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.forecasting.theta import ThetaModel
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from statsmodels.regression.linear_model import OLS

from datetime import datetime, timedelta

from sklearn.metrics import root_mean_squared_error
from sklearn.preprocessing import MinMaxScaler

import warnings
warnings.filterwarnings('once')

Initialize time series

In [61]:
# Consumption data, indexed by the parsed HourDK timestamp
df = (
  pd.read_csv('../Dataset/ConsumptionIndustry.csv', sep=';')
  .assign(
    HourDK=lambda raw: pd.to_datetime(raw['HourDK']),
    # The file stores numbers with a decimal comma; swap it for a dot before casting.
    ConsumptionkWh=lambda raw: raw['ConsumptionkWh'].str.replace(",", ".").astype(float),
  )
  .set_index('HourDK')
  .drop(columns=['HourUTC', 'MunicipalityNo', 'Branche'])
)

In [62]:
# El-spot prices, indexed by the parsed HourDK timestamp
df2 = (
  pd.read_csv('../Dataset/ELSpotPrices.csv', sep=';')
  .assign(
    HourDK=lambda raw: pd.to_datetime(raw['HourDK']),
    # The file stores numbers with a decimal comma; swap it for a dot before casting.
    SpotPriceDKK=lambda raw: raw['SpotPriceDKK'].str.replace(",", ".").astype(float),
  )
  .set_index('HourDK')
  .iloc[1:]  # remove first row, since the measurement at that time is not present in other dataset
  .drop(columns=['HourUTC', 'PriceArea', 'SpotPriceEUR'])
)

Functions

In [63]:
def sample_data_with_train_window(df, start_date, end_date, train_window_size):
  """Slice `df` to the evaluation period plus the training history before it.

  Args:
    df: frame whose index can be compared against `datetime` values.
    start_date: first evaluation day as a 'YYYY-MM-DD' string.
    end_date: last evaluation day as a 'YYYY-MM-DD' string.
    train_window_size: length of the training window, in hours.

  Returns:
    The rows of `df` whose index lies between
    `start_date - train_window_size hours + 24 hours` and
    `end_date + 24 hours`, both bounds inclusive.
  """
  day_format = '%Y-%m-%d'
  one_day = timedelta(hours=24)

  lower_bound = datetime.strptime(start_date, day_format) - timedelta(hours=train_window_size) + one_day
  upper_bound = datetime.strptime(end_date, day_format) + one_day

  inside_window = (df.index >= lower_bound) & (df.index <= upper_bound)
  return df[inside_window]

def get_next_window(data, train_window_size, forecast_horizon):
  """Split the head of `data` into a training slice and the forecast slice after it.

  Args:
    data: any sliceable sequence or frame.
    train_window_size: number of leading items that form the training slice.
    forecast_horizon: number of items directly after the training slice to
      return as the test slice.

  Returns:
    A `(train, test)` tuple of slices of `data`.
  """
  split_at = train_window_size
  stop_at = split_at + forecast_horizon

  train_part = data[:split_at]
  test_part = data[split_at:stop_at]
  return train_part, test_part

def forecast_whitebox_model(model, forecast_horizon, model_name, exog_data_test=None):
  """Fit `model` and forecast `forecast_horizon` steps ahead.

  Args:
    model: unfitted model exposing `fit()`.
    forecast_horizon: number of steps to forecast.
    model_name: label used to pick the forecasting call (see below).
    exog_data_test: exogenous values for the forecast steps; only passed on
      when the SARIMA path is taken.

  Returns:
    For names containing "SARIMA" but not "STL": the `predicted_mean` of
    `get_forecast(steps=..., exog=exog_data_test)`. For every other name:
    the result of `forecast(steps=...)`.
  """
  fitted = model.fit()

  takes_sarima_path = "SARIMA" in model_name and "STL" not in model_name
  if not takes_sarima_path:
    return fitted.forecast(steps=forecast_horizon)

  forecast_result = fitted.get_forecast(steps=forecast_horizon, exog=exog_data_test)
  return forecast_result.predicted_mean

Objective functions

In [88]:
def objective_SARIMAX(trial, data_train, data_test, forecast_horizon, exog_data_train, exog_data_test, scaler):
  """Optuna objective: RMSE of a SARIMAX forecast with exogenous regressors.

  The 'ConsumptionkWh' column of the training window is scaled with `scaler`
  (which is refitted on every call), a SARIMAX model is fitted with the orders
  sampled from `trial`, and the forecast is mapped back to the original scale
  before it is scored against `data_test`.

  Args:
    trial: Optuna trial used to sample 'order' and 'seasonal_order'.
    data_train: training window; must contain a 'ConsumptionkWh' column.
    data_test: observed values for the forecast steps; its index labels the predictions.
    forecast_horizon: number of steps to forecast.
    exog_data_train: exogenous regressors passed to the model at construction.
    exog_data_test: exogenous regressors passed to the forecast call.
    scaler: object exposing `fit_transform` and `inverse_transform`.

  Returns:
    Root mean squared error between `data_test` and the inverse-scaled forecast.
  """
  train_scaled = pd.DataFrame(
    scaler.fit_transform(data_train[['ConsumptionkWh']]),
    columns=['ConsumptionkWh'],
    index=data_train.index,
  )

  # Search grids: every (p, d, q) with components in {0, 1, 2}; the seasonal
  # grid reuses the same triples with a fixed period of 12.
  # NOTE(review): Optuna documents categorical choices as None/bool/int/float/str;
  # tuple choices may trigger a warning - confirm this is acceptable.
  levels = range(0, 3)
  order_grid = list(itertools.product(levels, levels, levels))
  seasonal_grid = [(*triple, 12) for triple in order_grid]
  order = trial.suggest_categorical('order', order_grid)
  seasonal_order = trial.suggest_categorical('seasonal_order', seasonal_grid)

  fitted = SARIMAX(
    train_scaled,
    order=order,
    seasonal_order=seasonal_order,
    exog=exog_data_train,
  ).fit(disp=0)
  forecast_scaled = fitted.forecast(steps=forecast_horizon, exog=exog_data_test)

  # Undo the scaling so the error is reported in the units of data_test.
  forecast_values = scaler.inverse_transform(forecast_scaled.values.reshape(-1, 1))
  predictions = pd.Series(forecast_values.flatten(), index=data_test.index)
  return root_mean_squared_error(data_test, predictions)

def objective_SARIMA(trial, data_train, data_test, forecast_horizon, scaler):
  """Optuna objective: RMSE of a SARIMA forecast (no exogenous regressors).

  The 'ConsumptionkWh' column of the training window is scaled with `scaler`
  (which is refitted on every call), a SARIMAX model without `exog` is fitted
  with the orders sampled from `trial`, and the forecast is mapped back to the
  original scale before it is scored against `data_test`.

  Args:
    trial: Optuna trial used to sample 'order' and 'seasonal_order'.
    data_train: training window; must contain a 'ConsumptionkWh' column.
    data_test: observed values for the forecast steps; its index labels the predictions.
    forecast_horizon: number of steps to forecast.
    scaler: object exposing `fit_transform` and `inverse_transform`.

  Returns:
    Root mean squared error between `data_test` and the inverse-scaled forecast.
  """
  train_scaled = pd.DataFrame(
    scaler.fit_transform(data_train[['ConsumptionkWh']]),
    columns=['ConsumptionkWh'],
    index=data_train.index,
  )

  # Search grids: every (p, d, q) with components in {0, 1, 2}; the seasonal
  # grid reuses the same triples with a fixed period of 12.
  # NOTE(review): Optuna documents categorical choices as None/bool/int/float/str;
  # tuple choices may trigger a warning - confirm this is acceptable.
  levels = range(0, 3)
  order_grid = list(itertools.product(levels, levels, levels))
  seasonal_grid = [(*triple, 12) for triple in order_grid]
  order = trial.suggest_categorical('order', order_grid)
  seasonal_order = trial.suggest_categorical('seasonal_order', seasonal_grid)

  fitted = SARIMAX(train_scaled, order=order, seasonal_order=seasonal_order).fit(disp=0)
  forecast_scaled = fitted.forecast(steps=forecast_horizon)

  # Undo the scaling so the error is reported in the units of data_test.
  forecast_values = scaler.inverse_transform(forecast_scaled.values.reshape(-1, 1))
  predictions = pd.Series(forecast_values.flatten(), index=data_test.index)
  return root_mean_squared_error(data_test, predictions)

def objective_ThetaModel(trial, data_train, data_test, forecast_horizon, scaler):
  """Optuna objective: RMSE of a ThetaModel forecast.

  The 'ConsumptionkWh' column of the training window is scaled with `scaler`
  (which is refitted on every call), a ThetaModel is fitted with the options
  sampled from `trial`, and the forecast is mapped back to the original scale
  before it is scored against `data_test`.

  Args:
    trial: Optuna trial used to sample 'deseasonalize', 'use_test', 'method'
      and 'difference'.
    data_train: training window; must contain a 'ConsumptionkWh' column.
    data_test: observed values for the forecast steps; its index labels the predictions.
    forecast_horizon: number of steps to forecast.
    scaler: object exposing `fit_transform` and `inverse_transform`.

  Returns:
    Root mean squared error between `data_test` and the inverse-scaled forecast.
  """
  train_scaled = pd.DataFrame(
    scaler.fit_transform(data_train[['ConsumptionkWh']]),
    columns=['ConsumptionkWh'],
    index=data_train.index,
  )

  # Each entry is (ThetaModel keyword, candidate values); sampled in this order.
  search_space = (
    ('deseasonalize', [True, False]),
    ('use_test', [True, False]),
    ('method', ['additive', 'multiplicative']),
    ('difference', [True, False]),
  )
  theta_options = {}
  for option, candidates in search_space:
    theta_options[option] = trial.suggest_categorical(option, candidates)

  fitted = ThetaModel(train_scaled, **theta_options).fit()
  forecast_scaled = fitted.forecast(steps=forecast_horizon)

  # Undo the scaling so the error is reported in the units of data_test.
  forecast_values = scaler.inverse_transform(forecast_scaled.values.reshape(-1, 1))
  predictions = pd.Series(forecast_values.flatten(), index=data_test.index)
  return root_mean_squared_error(data_test, predictions)

# def objective_OLS(trial, data_train, data_test, forecast_horizon, exog_data_train, exog_data_test, scaler):
#   data_train_scaled = scaler.fit_transform(data_train[['ConsumptionkWh']])
#   data_train = pd.DataFrame(data_train_scaled, columns=['ConsumptionkWh'], index=data_train.index)
#   model = OLS(data_train, exog_data_train)
#   mdl = model.fit(disp=0)
#   predictions_scaled = mdl.forecast(steps=forecast_horizon, exog=exog_data_test)
#   predictions = scaler.inverse_transform(predictions_scaled.values.reshape(-1, 1))
#   predictions = pd.Series(predictions.flatten(), index=data_test.index)
#   return root_mean_squared_error(data_test, predictions)

Hyperparameter optimization on the first time series window

In [89]:
# --- Experiment configuration ---
date_start = '2023-11-01'
date_end = '2024-11-01'
window_train_size = 1440 #hours
forecast_horizon = 336 #hours
# 336_24, 1440_336, 17520_8760
trials = 100

# Objective being tuned and the label its results are stored under; the two must
# name the same model family. Binding the objective here (outside the per-trial
# try/except) makes a missing definition fail immediately with a NameError
# instead of being reported as a series of "failed trials".
# NOTE(review): objective_ExponentialSmoothing is not defined in the visible
# cells of this notebook - confirm it is defined before this cell runs.
objective_fn = objective_ExponentialSmoothing
model_name = f'ExponentialSmoothing_{window_train_size}_{forecast_horizon}'
scaler = MinMaxScaler()

# --- First train/test window ---
# sample_data_with_train_window expects (df, start_date, end_date, train_window_size).
data = sample_data_with_train_window(df, date_start, date_end, window_train_size)
exog_data = sample_data_with_train_window(df2, date_start, date_end, window_train_size)

data_train, data_test = get_next_window(data, window_train_size, forecast_horizon)
exog_data_train, exog_data_test = get_next_window(exog_data, window_train_size, forecast_horizon)

def safe_objective(trial):
  """Evaluate one trial; a failing fit is reported and scored as +inf.

  Returning +inf (instead of raising) lets the study continue with the
  remaining trials when a sampled configuration cannot be fitted.
  """
  try:
    return objective_fn(trial, data_train, data_test, forecast_horizon, scaler)
  except Exception as e:
    print(f"Failed trial: {e}. Skipped this trial.")
    return float('inf')

# --- Run the study ---
# Silence warnings only while the study runs; the context manager restores the
# previously configured filters afterwards, even if the study raises.
with warnings.catch_warnings():
  warnings.simplefilter("ignore")
  study1 = optuna.create_study(direction='minimize')
  study1.optimize(safe_objective, n_trials=trials)

trial=study1.best_trial
print(f"Accuracy: {trial.value}")
print(f"best params for {model_name}: {trial.params}")

# Save the results in CSV
# Skipped when every trial failed (best value is still +inf).
if trial.value != float('inf'):
  results_path = '../Results/whitebox_tuning.csv'
  try:
    df_tuning = pd.read_csv(results_path)
  except (FileNotFoundError, pd.errors.EmptyDataError):
    # No usable results file yet: start a fresh table. Any other read error is
    # left to propagate so an existing file is not silently overwritten.
    df_tuning = pd.DataFrame(columns=['model', 'accuracy', 'params'])

  new_row = {'model': model_name, 'accuracy': trial.value, 'params': str(trial.params)}
  new_row_df = pd.DataFrame([new_row]).dropna(axis=1, how='all')
  df_tuning = pd.concat([df_tuning, new_row_df], ignore_index=True)
  df_tuning = df_tuning.sort_values(by=['model', 'accuracy', 'params'], ascending=True).reset_index(drop=True)
  df_tuning.to_csv(results_path, index=False)

[I 2024-12-29 23:19:26,769] A new study created in memory with name: no-name-52fd49dd-aa0d-4480-96a7-6a43c684a5b9
[I 2024-12-29 23:19:26,789] Trial 0 finished with value: 9007.786131872 and parameters: {}. Best is trial 0 with value: 9007.786131872.
[I 2024-12-29 23:19:26,807] Trial 1 finished with value: 9007.786131872 and parameters: {}. Best is trial 0 with value: 9007.786131872.
[I 2024-12-29 23:19:26,852] Trial 2 finished with value: 9007.786131872 and parameters: {}. Best is trial 0 with value: 9007.786131872.
[I 2024-12-29 23:19:26,872] Trial 3 finished with value: 9007.786131872 and parameters: {}. Best is trial 0 with value: 9007.786131872.


[I 2024-12-29 23:19:26,897] Trial 4 finished with value: 9007.786131872 and parameters: {}. Best is trial 0 with value: 9007.786131872.
[I 2024-12-29 23:19:26,913] Trial 5 finished with value: 9007.786131872 and parameters: {}. Best is trial 0 with value: 9007.786131872.
[I 2024-12-29 23:19:26,929] Trial 6 finished with value: 9007.786131872 and parameters: {}. Best is trial 0 with value: 9007.786131872.
[I 2024-12-29 23:19:26,950] Trial 7 finished with value: 9007.786131872 and parameters: {}. Best is trial 0 with value: 9007.786131872.
[I 2024-12-29 23:19:26,968] Trial 8 finished with value: 9007.786131872 and parameters: {}. Best is trial 0 with value: 9007.786131872.
[I 2024-12-29 23:19:26,983] Trial 9 finished with value: 9007.786131872 and parameters: {}. Best is trial 0 with value: 9007.786131872.
[I 2024-12-29 23:19:26,999] Trial 10 finished with value: 9007.786131872 and parameters: {}. Best is trial 0 with value: 9007.786131872.
[I 2024-12-29 23:19:27,021] Trial 11 finished w

Accuracy: 9007.786131872
best params for ExponentialSmoothing_336_24: {}
