In [11]:
!pip install numpy pandas statsmodels matplotlib seaborn prophet sklearn optuna

Collecting sklearn
  Using cached sklearn-0.0.post12.tar.gz (2.6 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'error'


  error: subprocess-exited-with-error
  
  × python setup.py egg_info did not run successfully.
  │ exit code: 1
  ╰─> [15 lines of output]
      The 'sklearn' PyPI package is deprecated, use 'scikit-learn'
      rather than 'sklearn' for pip commands.
      
      Here is how to fix this error in the main use cases:
      - use 'pip install scikit-learn' rather than 'pip install sklearn'
      - replace 'sklearn' by 'scikit-learn' in your pip requirements files
        (requirements.txt, setup.py, setup.cfg, Pipfile, etc ...)
      - if the 'sklearn' package is used by one of your dependencies,
        it would be great if you take some time to track which package uses
        'sklearn' instead of 'scikit-learn' and report it to their issue tracker
      - as a last resort, set the environment variable
        SKLEARN_ALLOW_DEPRECATED_SKLEARN_PACKAGE_INSTALL=True to avoid this error
      
      More information is available at
      https://github.com/scikit-learn/sklearn-pypi-packag

In [12]:
import itertools
import os
from datetime import datetime, timedelta

import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd

from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.statespace.dynamic_factor_mq import DynamicFactorMQ
from statsmodels.tsa.forecasting.theta import ThetaModel

from sklearn.metrics import root_mean_squared_error
from sklearn.preprocessing import MinMaxScaler

from prophet import Prophet

try:
  from google.colab import files
  from google.colab import drive
  uploaded = files.upload()
  !mkdir -p "/content/drive/My Drive/p9"
  drive.mount('/content/drive')
  IN_COLAB = True
except:
  IN_COLAB = False
import warnings
warnings.filterwarnings('once')

  del self._target, self._args, self._kwargs
  del self._target, self._args, self._kwargs
  del self._target, self._args, self._kwargs


Initialize time series

In [13]:
# Consumption data: read the semicolon-separated CSV (local path depends on IN_COLAB)
df = pd.read_csv('ConsumptionIndustry.csv' if IN_COLAB else '../Dataset/ConsumptionIndustry.csv', sep=';')

# Parse the timestamp column; consumption values use a decimal comma, so swap
# it for a dot before casting to float
df['HourDK'] = pd.to_datetime(df['HourDK'])
df['ConsumptionkWh'] = df['ConsumptionkWh'].str.replace(",", ".").astype(float)

# Index by HourDK and keep only the consumption column
df = df.set_index('HourDK').drop(columns=['HourUTC', 'MunicipalityNo', 'Branche'])
print(df)

                     ConsumptionkWh
HourDK                             
2021-01-01 00:00:00       37842.849
2021-01-01 01:00:00       35086.772
2021-01-01 02:00:00       31777.762
2021-01-01 03:00:00       28423.659
2021-01-01 04:00:00       25675.926
...                             ...
2024-12-01 19:00:00       52799.179
2024-12-01 20:00:00       48321.570
2024-12-01 21:00:00       44818.234
2024-12-01 22:00:00       40716.144
2024-12-01 23:00:00       36954.822

[34344 rows x 1 columns]


In [14]:
# El-spot prices: read the semicolon-separated CSV (local path depends on IN_COLAB)
df2 = pd.read_csv('../Dataset/ELSpotPrices.csv' if not IN_COLAB else 'ELSpotPrices.csv', sep=';')
df2['HourDK'] = pd.to_datetime(df2['HourDK'])
# Prices use a decimal comma, so swap it for a dot before casting to float
df2['SpotPriceDKK'] = df2['SpotPriceDKK'].str.replace(",", ".").astype(float)

# Build the final frame in one chain instead of dropping columns in place on
# the result of `iloc[1:]`: in-place mutation of a slice is what pandas flags
# with SettingWithCopyWarning.
df2 = (
  df2.set_index('HourDK')
     .iloc[1:]  # remove first row, since the measurement at that time is not present in other dataset
     .drop(columns=['HourUTC', 'PriceArea', 'SpotPriceEUR'])
)

Functions

In [15]:
def plot_data(data_train, data_test, predictions, save_at=''):
  """Plot the training data, the test data and the predictions in one figure.

  Args:
    data_train: Indexed training observations; plotted against its own index.
    data_test: Indexed test observations; plotted against its own index.
    predictions: Predicted values, plotted against ``data_test.index`` (so it
      must have as many entries as ``data_test``).
    save_at: Optional file path. When non-empty, the figure is written there
      before it is shown.
  """
  plt.figure(figsize=(7, 3))
  # Legend entries carry the first and last index value of each split
  plt.plot(data_train.index, data_train, label=f'Train ({data_train.index[0]} - {data_train.index[-1]})')
  plt.plot(data_test.index, data_test, label=f'Test ({data_test.index[0]} - {data_test.index[-1]})')
  plt.plot(data_test.index, predictions, label='Prediction')
  # NOTE(review): the title mentions private households while the notebook
  # loads 'ConsumptionIndustry.csv' -- confirm which wording is intended.
  plt.title('Consumption in danish private households with prediction')
  plt.xlabel('Measurements')
  # The plotted series is the 'ConsumptionkWh' column, i.e. energy in kWh; the
  # previous label ('Power (kW / charger)') described a different quantity.
  plt.ylabel('Consumption (kWh)')
  plt.legend()
  if save_at:
    # Save before show(): some backends release the figure once it is shown
    plt.savefig(save_at)
  plt.show()

def sample_data_with_train_window(df, start_date, train_window_size):
  """Return the rows of ``df`` from ``train_window_size`` hours before ``start_date`` onwards.

  Args:
    df: Frame (or series) with a datetime-comparable index.
    start_date: Date string in ``'%Y-%m-%d'`` format.
    train_window_size: Number of hours to reach back before ``start_date``.

  Returns:
    The rows whose index lies between the shifted start (inclusive) and the
    last index entry of ``df`` (inclusive).
  """
  lookback = timedelta(hours=train_window_size)
  window_start = datetime.strptime(start_date, '%Y-%m-%d') - lookback
  window_end = df.index[-1]
  in_window = (df.index >= window_start) & (df.index <= window_end)
  return df[in_window]

def get_next_window(data, train_window_size, forecast_horizon):
  """Split ``data`` positionally into a training slice and the forecast slice right after it.

  Returns:
    A ``(train, test)`` tuple: the first ``train_window_size`` items, followed
    by the next ``forecast_horizon`` items.
  """
  test_end = train_window_size + forecast_horizon
  train_part = data[:train_window_size]
  test_part = data[train_window_size:test_end]
  return train_part, test_part

def forecast_whitebox_model(model, forecast_horizon, model_name, exog_data_test=None):
  """Fit ``model`` and return its forecast for the next ``forecast_horizon`` steps.

  Args:
    model: Unfitted model exposing ``fit()``.
    forecast_horizon: Number of steps to forecast.
    model_name: Selects the forecasting call. Names containing "SARIMA" but
      not "STL" use ``get_forecast`` with exogenous data; every other name
      uses ``forecast`` without it.
    exog_data_test: Exogenous data handed to ``get_forecast``; unused on the
      ``forecast`` path.

  Returns:
    ``predicted_mean`` of the ``get_forecast`` result, or the value returned
    by ``forecast``.
  """
  fitted = model.fit()

  uses_exog_forecast = "SARIMA" in model_name and "STL" not in model_name
  if not uses_exog_forecast:
    return fitted.forecast(steps=forecast_horizon)

  forecast_result = fitted.get_forecast(steps=forecast_horizon, exog=exog_data_test)
  return forecast_result.predicted_mean

def create_result_table(results, columns=None):
  """Build a result table sorted by ascending ``rmse``.

  Args:
    results: Anything ``pd.DataFrame`` accepts (e.g. a list of tuples or a
      list of dicts).
    columns: Optional column labels to assign to the table. When omitted, the
      labels inferred by pandas are kept (previously the default was a shared
      mutable ``[]``, which raised a length mismatch for any non-empty
      ``results``).

  Returns:
    The table sorted by the ``rmse`` column with a fresh 0..n-1 index.

  Raises:
    KeyError: If the table has no ``rmse`` column.
    ValueError: If ``columns`` does not match the number of table columns.
  """
  result_table = pd.DataFrame(results)
  if columns is not None:
    result_table.columns = columns
  result_table = result_table.sort_values(by='rmse', ascending=True).reset_index(drop=True)
  return result_table


Objective functions

In [16]:
def objective_SARIMAX(trial, data_train, data_test, forecast_horizon, exog_data_train, exog_data_test, scaler):
  """Optuna objective for SARIMAX with exogenous data: RMSE of the forecast on ``data_test``.

  The 'ConsumptionkWh' column of ``data_train`` is scaled with ``scaler``
  (which is re-fitted on every call), a SARIMAX model with the trial's
  ``order`` and ``seasonal_order`` is fitted on it, and the forecast is
  transformed back to the original scale before scoring.

  Args:
    trial: Optuna trial used to pick ``order`` and ``seasonal_order``.
    data_train: Frame containing a 'ConsumptionkWh' column.
    data_test: Observations to score against; its index labels the forecast.
    forecast_horizon: Number of steps to forecast.
    exog_data_train: Exogenous data passed to the model constructor.
    exog_data_test: Exogenous data passed to the forecast call.
    scaler: Object with ``fit_transform`` / ``inverse_transform``.

  Returns:
    ``root_mean_squared_error(data_test, predictions)``.
  """
  # Scale the training target and rebuild a frame on the original index
  scaled_values = scaler.fit_transform(data_train[['ConsumptionkWh']])
  train_scaled = pd.DataFrame(scaled_values, columns=['ConsumptionkWh'], index=data_train.index)

  # Candidate (p, d, q) triples with each term in 0..2; the seasonal
  # candidates reuse the same triples with a fixed period of 12
  orders = list(itertools.product(range(0, 3), repeat=3))
  seasonal_orders = [(p, d, q, 12) for (p, d, q) in orders]
  order = trial.suggest_categorical('order', orders)
  seasonal_order = trial.suggest_categorical('seasonal_order', seasonal_orders)

  sarimax = SARIMAX(train_scaled, order=order, seasonal_order=seasonal_order, exog=exog_data_train)
  fitted = sarimax.fit(disp=0)
  forecast_scaled = fitted.forecast(steps=forecast_horizon, exog=exog_data_test)

  # Undo the scaling and label the forecast with the test index
  forecast_values = scaler.inverse_transform(forecast_scaled.values.reshape(-1, 1))
  predictions = pd.Series(forecast_values.flatten(), index=data_test.index)
  return root_mean_squared_error(data_test, predictions)

def objective_SARIMA(trial, data_train, data_test, forecast_horizon, scaler):
  """Optuna objective for SARIMA (no exogenous data): RMSE of the forecast on ``data_test``.

  The 'ConsumptionkWh' column of ``data_train`` is scaled with ``scaler``
  (which is re-fitted on every call), a SARIMAX model with the trial's
  ``order`` and ``seasonal_order`` is fitted on it, and the forecast is
  transformed back to the original scale before scoring.

  Args:
    trial: Optuna trial used to pick ``order`` and ``seasonal_order``.
    data_train: Frame containing a 'ConsumptionkWh' column.
    data_test: Observations to score against; its index labels the forecast.
    forecast_horizon: Number of steps to forecast.
    scaler: Object with ``fit_transform`` / ``inverse_transform``.

  Returns:
    ``root_mean_squared_error(data_test, predictions)``.
  """
  # Scale the training target and rebuild a frame on the original index
  scaled_values = scaler.fit_transform(data_train[['ConsumptionkWh']])
  train_scaled = pd.DataFrame(scaled_values, columns=['ConsumptionkWh'], index=data_train.index)

  # Candidate (p, d, q) triples with each term in 0..2; the seasonal
  # candidates reuse the same triples with a fixed period of 12
  orders = list(itertools.product(range(0, 3), repeat=3))
  seasonal_orders = [(p, d, q, 12) for (p, d, q) in orders]
  order = trial.suggest_categorical('order', orders)
  seasonal_order = trial.suggest_categorical('seasonal_order', seasonal_orders)

  sarima = SARIMAX(train_scaled, order=order, seasonal_order=seasonal_order)
  fitted = sarima.fit(disp=0)
  forecast_scaled = fitted.forecast(steps=forecast_horizon)

  # Undo the scaling and label the forecast with the test index
  forecast_values = scaler.inverse_transform(forecast_scaled.values.reshape(-1, 1))
  predictions = pd.Series(forecast_values.flatten(), index=data_test.index)
  return root_mean_squared_error(data_test, predictions)

def objective_DynamicFactorMQ(trial, data_train, data_test, forecast_horizon):
  """Fit a default DynamicFactorMQ on ``data_train`` and return the forecast RMSE on ``data_test``.

  ``trial`` is accepted so the function has the shape of an Optuna objective,
  but no hyperparameters are drawn from it.
  """
  fitted = DynamicFactorMQ(data_train).fit(disp=0)
  forecast = fitted.forecast(steps=forecast_horizon)
  return root_mean_squared_error(data_test, forecast)

Optimizing through whole dataset

In [17]:
date_start = '2023-11-01'
window_train_size = 1440 #hours
forecast_horizon = 336 #hours
# 336_24, 1440_336, 17520_8760
trials = 100
seed = 42 # fixed sampler seed so a re-run explores the same hyperparameter sequence
model_name = f'SARIMAX_{window_train_size}_{forecast_horizon}_minmax'
scaler = MinMaxScaler()
tuning_csv = '../Results/Whitebox/Tuning/whitebox_tuning.csv'

data = sample_data_with_train_window(df, date_start, window_train_size) # start: date_start - window_train_size, end: last date in df
exog_data = sample_data_with_train_window(df2, date_start, window_train_size)

data_train, data_test = get_next_window(data, window_train_size, forecast_horizon)
exog_data_train, exog_data_test = get_next_window(exog_data, window_train_size, forecast_horizon)

def safe_objective(trial):
  """Run the SARIMAX objective; a failing trial is reported and scored as +inf so the study continues."""
  try:
    return objective_SARIMAX(trial, data_train, data_test, forecast_horizon, exog_data_train, exog_data_test, scaler)
  except Exception as e:
    print(f"Failed trial: {e}. Skipped this trial.")
    return float('inf')

# Silence warnings only while the study runs; the previous filter
# configuration is restored when the block exits.
with warnings.catch_warnings():
  warnings.simplefilter("ignore")
  study1 = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=seed))
  study1.optimize(safe_objective, n_trials=trials)

trial = study1.best_trial
# If every trial failed, the "best" value is the +inf placeholder returned by
# safe_objective. Stop here instead of recording a meaningless row in the CSV.
if not np.isfinite(trial.value):
  raise RuntimeError(
    f"All {trials} trials for {model_name} failed (best value is {trial.value}); "
    "fix the errors printed above before saving tuning results."
  )
print(f"Accuracy: {trial.value}")
print(f"best params for {model_name}: {trial.params}")

# Save the results in CSV
new_row = {'model': model_name, 'accuracy': trial.value, 'params': str(trial.params)}
try:
  df_tuning = pd.concat([pd.read_csv(tuning_csv), pd.DataFrame([new_row])], ignore_index=True)
except (FileNotFoundError, pd.errors.EmptyDataError):
  # No previous results yet: start the table with this run only
  df_tuning = pd.DataFrame([new_row], columns=['model', 'accuracy', 'params'])

df_tuning = df_tuning.sort_values(by=['model', 'accuracy', 'params'], ascending=True).reset_index(drop=True)
# Make sure the output folder exists so a finished study is not lost on write
os.makedirs(os.path.dirname(tuning_csv), exist_ok=True)
df_tuning.to_csv(tuning_csv, index=False)

[I 2024-12-26 16:31:30,832] A new study created in memory with name: no-name-3e22c28e-18ec-4c66-b218-78d8eab5b406
[I 2024-12-26 16:31:30,859] Trial 0 finished with value: inf and parameters: {}. Best is trial 0 with value: inf.
[I 2024-12-26 16:31:30,862] Trial 1 finished with value: inf and parameters: {}. Best is trial 0 with value: inf.
[I 2024-12-26 16:31:30,864] Trial 2 finished with value: inf and parameters: {}. Best is trial 0 with value: inf.
[I 2024-12-26 16:31:30,868] Trial 3 finished with value: inf and parameters: {}. Best is trial 0 with value: inf.
[I 2024-12-26 16:31:30,870] Trial 4 finished with value: inf and parameters: {}. Best is trial 0 with value: inf.
[I 2024-12-26 16:31:30,873] Trial 5 finished with value: inf and parameters: {}. Best is trial 0 with value: inf.
[I 2024-12-26 16:31:30,876] Trial 6 finished with value: inf and parameters: {}. Best is trial 0 with value: inf.
[I 2024-12-26 16:31:30,879] Trial 7 finished with value: inf and parameters: {}. Best is

[I 2024-12-26 16:31:30,881] Trial 8 finished with value: inf and parameters: {}. Best is trial 0 with value: inf.


Failed trial: "None of [Index(['ConsumptionkWh'], dtype='object')] are in the [columns]". Skipped this trial.
Failed trial: "None of [Index(['ConsumptionkWh'], dtype='object')] are in the [columns]". Skipped this trial.
Failed trial: "None of [Index(['ConsumptionkWh'], dtype='object')] are in the [columns]". Skipped this trial.
Failed trial: "None of [Index(['ConsumptionkWh'], dtype='object')] are in the [columns]". Skipped this trial.
Failed trial: "None of [Index(['ConsumptionkWh'], dtype='object')] are in the [columns]". Skipped this trial.
Failed trial: "None of [Index(['ConsumptionkWh'], dtype='object')] are in the [columns]". Skipped this trial.
Failed trial: "None of [Index(['ConsumptionkWh'], dtype='object')] are in the [columns]". Skipped this trial.
Failed trial: "None of [Index(['ConsumptionkWh'], dtype='object')] are in the [columns]". Skipped this trial.
Failed trial: "None of [Index(['ConsumptionkWh'], dtype='object')] are in the [columns]". Skipped this trial.


[I 2024-12-26 16:31:30,884] Trial 9 finished with value: inf and parameters: {}. Best is trial 0 with value: inf.
[I 2024-12-26 16:31:30,887] Trial 10 finished with value: inf and parameters: {}. Best is trial 0 with value: inf.
[I 2024-12-26 16:31:30,889] Trial 11 finished with value: inf and parameters: {}. Best is trial 0 with value: inf.
[I 2024-12-26 16:31:30,892] Trial 12 finished with value: inf and parameters: {}. Best is trial 0 with value: inf.
[I 2024-12-26 16:31:30,895] Trial 13 finished with value: inf and parameters: {}. Best is trial 0 with value: inf.
[I 2024-12-26 16:31:30,899] Trial 14 finished with value: inf and parameters: {}. Best is trial 0 with value: inf.
[I 2024-12-26 16:31:30,902] Trial 15 finished with value: inf and parameters: {}. Best is trial 0 with value: inf.
[I 2024-12-26 16:31:30,905] Trial 16 finished with value: inf and parameters: {}. Best is trial 0 with value: inf.
[I 2024-12-26 16:31:30,910] Trial 17 finished with value: inf and parameters: {}.

Failed trial: "None of [Index(['ConsumptionkWh'], dtype='object')] are in the [columns]". Skipped this trial.
Failed trial: "None of [Index(['ConsumptionkWh'], dtype='object')] are in the [columns]". Skipped this trial.
Failed trial: "None of [Index(['ConsumptionkWh'], dtype='object')] are in the [columns]". Skipped this trial.
Failed trial: "None of [Index(['ConsumptionkWh'], dtype='object')] are in the [columns]". Skipped this trial.
Failed trial: "None of [Index(['ConsumptionkWh'], dtype='object')] are in the [columns]". Skipped this trial.
Failed trial: "None of [Index(['ConsumptionkWh'], dtype='object')] are in the [columns]". Skipped this trial.
Failed trial: "None of [Index(['ConsumptionkWh'], dtype='object')] are in the [columns]". Skipped this trial.
Failed trial: "None of [Index(['ConsumptionkWh'], dtype='object')] are in the [columns]". Skipped this trial.
Failed trial: "None of [Index(['ConsumptionkWh'], dtype='object')] are in the [columns]". Skipped this trial.
Failed tri

[I 2024-12-26 16:31:31,039] Trial 60 finished with value: inf and parameters: {}. Best is trial 0 with value: inf.
[I 2024-12-26 16:31:31,042] Trial 61 finished with value: inf and parameters: {}. Best is trial 0 with value: inf.
[I 2024-12-26 16:31:31,044] Trial 62 finished with value: inf and parameters: {}. Best is trial 0 with value: inf.
[I 2024-12-26 16:31:31,047] Trial 63 finished with value: inf and parameters: {}. Best is trial 0 with value: inf.
[I 2024-12-26 16:31:31,050] Trial 64 finished with value: inf and parameters: {}. Best is trial 0 with value: inf.
[I 2024-12-26 16:31:31,052] Trial 65 finished with value: inf and parameters: {}. Best is trial 0 with value: inf.
[I 2024-12-26 16:31:31,055] Trial 66 finished with value: inf and parameters: {}. Best is trial 0 with value: inf.
[I 2024-12-26 16:31:31,057] Trial 67 finished with value: inf and parameters: {}. Best is trial 0 with value: inf.
[I 2024-12-26 16:31:31,061] Trial 68 finished with value: inf and parameters: {}

Failed trial: "None of [Index(['ConsumptionkWh'], dtype='object')] are in the [columns]". Skipped this trial.
Failed trial: "None of [Index(['ConsumptionkWh'], dtype='object')] are in the [columns]". Skipped this trial.
Failed trial: "None of [Index(['ConsumptionkWh'], dtype='object')] are in the [columns]". Skipped this trial.
Failed trial: "None of [Index(['ConsumptionkWh'], dtype='object')] are in the [columns]". Skipped this trial.
Failed trial: "None of [Index(['ConsumptionkWh'], dtype='object')] are in the [columns]". Skipped this trial.
Failed trial: "None of [Index(['ConsumptionkWh'], dtype='object')] are in the [columns]". Skipped this trial.
Failed trial: "None of [Index(['ConsumptionkWh'], dtype='object')] are in the [columns]". Skipped this trial.
Failed trial: "None of [Index(['ConsumptionkWh'], dtype='object')] are in the [columns]". Skipped this trial.
Failed trial: "None of [Index(['ConsumptionkWh'], dtype='object')] are in the [columns]". Skipped this trial.
Failed tri

[I 2024-12-26 16:31:31,072] Trial 72 finished with value: inf and parameters: {}. Best is trial 0 with value: inf.
[I 2024-12-26 16:31:31,074] Trial 73 finished with value: inf and parameters: {}. Best is trial 0 with value: inf.
[I 2024-12-26 16:31:31,077] Trial 74 finished with value: inf and parameters: {}. Best is trial 0 with value: inf.
[I 2024-12-26 16:31:31,079] Trial 75 finished with value: inf and parameters: {}. Best is trial 0 with value: inf.
[I 2024-12-26 16:31:31,082] Trial 76 finished with value: inf and parameters: {}. Best is trial 0 with value: inf.
[I 2024-12-26 16:31:31,085] Trial 77 finished with value: inf and parameters: {}. Best is trial 0 with value: inf.
[I 2024-12-26 16:31:31,087] Trial 78 finished with value: inf and parameters: {}. Best is trial 0 with value: inf.
[I 2024-12-26 16:31:31,094] Trial 79 finished with value: inf and parameters: {}. Best is trial 0 with value: inf.
[I 2024-12-26 16:31:31,098] Trial 80 finished with value: inf and parameters: {}

Failed trial: "None of [Index(['ConsumptionkWh'], dtype='object')] are in the [columns]". Skipped this trial.
Failed trial: "None of [Index(['ConsumptionkWh'], dtype='object')] are in the [columns]". Skipped this trial.
Failed trial: "None of [Index(['ConsumptionkWh'], dtype='object')] are in the [columns]". Skipped this trial.
Failed trial: "None of [Index(['ConsumptionkWh'], dtype='object')] are in the [columns]". Skipped this trial.
Failed trial: "None of [Index(['ConsumptionkWh'], dtype='object')] are in the [columns]". Skipped this trial.
Failed trial: "None of [Index(['ConsumptionkWh'], dtype='object')] are in the [columns]". Skipped this trial.
Failed trial: "None of [Index(['ConsumptionkWh'], dtype='object')] are in the [columns]". Skipped this trial.
Failed trial: "None of [Index(['ConsumptionkWh'], dtype='object')] are in the [columns]". Skipped this trial.
Failed trial: "None of [Index(['ConsumptionkWh'], dtype='object')] are in the [columns]". Skipped this trial.
Failed tri