In [2]:
!pip install numpy pandas statsmodels matplotlib seaborn prophet sklearn optuna

Collecting sklearn
  Using cached sklearn-0.0.post12.tar.gz (2.6 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'error'


  error: subprocess-exited-with-error
  
  × python setup.py egg_info did not run successfully.
  │ exit code: 1
  ╰─> [15 lines of output]
      The 'sklearn' PyPI package is deprecated, use 'scikit-learn'
      rather than 'sklearn' for pip commands.
      
      Here is how to fix this error in the main use cases:
      - use 'pip install scikit-learn' rather than 'pip install sklearn'
      - replace 'sklearn' by 'scikit-learn' in your pip requirements files
        (requirements.txt, setup.py, setup.cfg, Pipfile, etc ...)
      - if the 'sklearn' package is used by one of your dependencies,
        it would be great if you take some time to track which package uses
        'sklearn' instead of 'scikit-learn' and report it to their issue tracker
      - as a last resort, set the environment variable
        SKLEARN_ALLOW_DEPRECATED_SKLEARN_PACKAGE_INSTALL=True to avoid this error
      
      More information is available at
      https://github.com/scikit-learn/sklearn-pypi-packag

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import itertools
import optuna

import statsmodels.api as sm
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.stattools import kpss
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.holtwinters import SimpleExpSmoothing
from statsmodels.tsa.holtwinters import Holt
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.statespace.dynamic_factor_mq import DynamicFactorMQ
from statsmodels.tsa.forecasting.stl import STLForecast
from statsmodels.tsa.seasonal import STL
from statsmodels.tsa.forecasting.theta import ThetaModel
from datetime import datetime, timedelta

from sklearn.metrics import root_mean_squared_error

from prophet import Prophet

import os
import warnings

# Detect Google Colab by attempting the Colab-only import. Only an import
# failure means "not in Colab"; any other error (failed upload, failed mount)
# is a real problem and is allowed to surface instead of being silently
# reinterpreted as a local run.
try:
  from google.colab import files
  from google.colab import drive
  IN_COLAB = True
except ImportError:
  IN_COLAB = False

if IN_COLAB:
  uploaded = files.upload()
  # Mount Drive BEFORE creating the project folder: creating a directory under
  # /content/drive first leaves the mountpoint non-empty, which drive.mount
  # rejects.
  drive.mount('/content/drive')
  os.makedirs('/content/drive/My Drive/p9', exist_ok=True)

warnings.filterwarnings('once')

  from .autonotebook import tqdm as notebook_tqdm
Importing plotly failed. Interactive plots will not work.


Initialize time series

In [4]:
# Consumption data: read the semicolon-separated CSV (local path or Colab upload).
consumption_csv = 'ConsumptionIndustry.csv' if IN_COLAB else '../Dataset/ConsumptionIndustry.csv'
df = pd.read_csv(consumption_csv, sep=';')

# Parse the timestamp column and convert decimal-comma strings to floats.
df['HourDK'] = pd.to_datetime(df['HourDK'])
df['ConsumptionkWh'] = df['ConsumptionkWh'].str.replace(",", ".").astype(float)

# Index by local time and keep only the consumption column.
df = df.set_index('HourDK').drop(columns=['HourUTC', 'MunicipalityNo', 'Branche'])
print(df)

                     ConsumptionkWh
HourDK                             
2021-01-01 00:00:00       37842.849
2021-01-01 01:00:00       35086.772
2021-01-01 02:00:00       31777.762
2021-01-01 03:00:00       28423.659
2021-01-01 04:00:00       25675.926
...                             ...
2024-11-29 19:00:00       48200.189
2024-11-29 20:00:00       43489.862
2024-11-29 21:00:00       41323.328
2024-11-29 22:00:00       38119.640
2024-11-29 23:00:00       35699.765

[34296 rows x 1 columns]


In [5]:
# El-spot prices: read the semicolon-separated CSV (local path or Colab upload).
spot_csv = 'ELSpotPrices.csv' if IN_COLAB else '../Dataset/ELSpotPrices.csv'
df2 = pd.read_csv(spot_csv, sep=';')

# Parse the timestamp column and convert decimal-comma strings to floats.
df2['HourDK'] = pd.to_datetime(df2['HourDK'])
df2['SpotPriceDKK'] = df2['SpotPriceDKK'].str.replace(",", ".").astype(float)

# Build the final frame as one non-mutating chain. The previous version called
# drop(inplace=True) on an iloc slice, i.e. mutated a slice of another frame.
df2 = (
  df2.set_index('HourDK')
  .iloc[1:]  # remove first row, since the measurement at that time is not present in other dataset
  .drop(columns=['HourUTC', 'PriceArea', 'SpotPriceEUR'])
)

Functions

In [6]:
def plot_data(data_train, data_test, predictions, save_at=''):
  """Plot the training series, the test series and the predictions in one figure.

  Args:
    data_train: Indexed series/frame; its first and last index values are
      shown in the legend.
    data_test: Indexed series/frame covering the forecast period.
    predictions: Values plotted against ``data_test.index``.
    save_at: Optional file path; when non-empty the figure is saved there
      before being shown.
  """
  # NOTE(review): title/axis texts look copied from another project (they
  # mention households and chargers) - confirm they match the plotted data.
  fig, ax = plt.subplots(figsize=(7, 3))

  train_label = f'Train ({data_train.index[0]} - {data_train.index[-1]})'
  ax.plot(data_train.index, data_train, label=train_label)

  test_label = f'Test ({data_test.index[0]} - {data_test.index[-1]})'
  ax.plot(data_test.index, data_test, label=test_label)

  ax.plot(data_test.index, predictions, label='Prediction')

  ax.set_title('Consumption in danish private households with prediction')
  ax.set_xlabel('Measurements')
  ax.set_ylabel('Power (kW / charger)')
  ax.legend()

  if save_at:
    fig.savefig(save_at)
  plt.show()

def sample_data_with_train_window(df, start_date, train_window_size):
  """Return the rows of ``df`` from ``train_window_size`` hours before ``start_date`` onwards.

  Args:
    df: Frame/series with a datetime-comparable index.
    start_date: Date string in ``'%Y-%m-%d'`` format.
    train_window_size: Number of hours to step back from ``start_date``.

  Returns:
    The rows whose index lies between the shifted start and the last index
    value of ``df`` (both inclusive).
  """
  window_start = datetime.strptime(start_date, '%Y-%m-%d') - timedelta(hours=train_window_size)
  window_end = df.index[-1]
  in_window = (df.index >= window_start) & (df.index <= window_end)
  return df[in_window]

def get_next_window(data, train_window_size, forecast_horizon):
  """Split ``data`` into a leading training slice and the test slice right after it.

  Returns:
    ``(train, test)`` where ``train`` is the first ``train_window_size``
    items and ``test`` the following ``forecast_horizon`` items.
  """
  split_at = train_window_size
  stop_at = train_window_size + forecast_horizon
  train_part = data[:split_at]
  test_part = data[split_at:stop_at]
  return train_part, test_part

def forecast_whitebox_model(model, forecast_horizon, model_name, exog_data_test=None):
  """Fit ``model`` and return its forecast for ``forecast_horizon`` steps.

  Models whose name contains "SARIMA" but not "STL" are forecast through
  ``get_forecast`` (passing ``exog_data_test``) and the predicted mean is
  returned; every other model uses the plain ``forecast`` method.
  """
  fitted = model.fit()

  uses_get_forecast = "SARIMA" in model_name and "STL" not in model_name
  if not uses_get_forecast:
    return fitted.forecast(steps=forecast_horizon)

  forecast = fitted.get_forecast(steps=forecast_horizon, exog=exog_data_test)
  return forecast.predicted_mean

def create_result_table(results, columns=None):
  """Build a result table sorted by ascending RMSE.

  Args:
    results: Rows of results, e.g. a list of ``[params, rmse]`` lists.
    columns: Column names for the rows. When omitted or empty the names are
      left to pandas (so ``results`` must already provide an ``'rmse'`` key).

  Returns:
    A DataFrame sorted by the ``'rmse'`` column with a fresh 0..n-1 index.
    An empty ``results`` with explicit ``columns`` yields an empty table
    instead of raising.
  """
  # Passing the names to the constructor (instead of assigning them
  # afterwards) also works when `results` is empty, e.g. when every
  # candidate model failed to fit.
  if columns:
    result_table = pd.DataFrame(results, columns=list(columns))
  else:
    result_table = pd.DataFrame(results)
  return result_table.sort_values(by='rmse', ascending=True).reset_index(drop=True)


Optimization functions

In [8]:
def optimize_SARIMA(data_train, data_test, forecast_horizon, model_name):
  """Grid-search SARIMA orders and return the RMSE table plus the best forecast.

  Every combination of (p, d, q) in 0..2 and seasonal (P, D, Q, 12) in 0..2 is
  fitted on ``data_train``; the forecast is scored against ``data_test``.

  Args:
    data_train: Training series.
    data_test: Hold-out series used to score each forecast.
    forecast_horizon: Number of steps to forecast.
    model_name: Name forwarded to ``forecast_whitebox_model``.

  Returns:
    ``(result_table, best_prediction)``: the table lists every successful
    configuration sorted by RMSE; ``best_prediction`` is the forecast with the
    lowest RMSE, or ``None`` when no configuration could be fitted.
  """
  # NOTE(review): the seasonal period is fixed at 12 - confirm this matches
  # the sampling frequency of the data being modelled.
  results = []
  best_rmse = float('inf')
  best_prediction = None
  orders = list(itertools.product(range(0, 3), repeat=3))
  seasonal_orders = [(P, D, Q, 12) for P, D, Q in orders]

  for param, seasonal_param in itertools.product(orders, seasonal_orders):
    try:
      model = SARIMAX(data_train, order=param, seasonal_order=seasonal_param)
      predictions = forecast_whitebox_model(model, forecast_horizon, model_name)
    except Exception:
      # Some order combinations cannot be fitted; skip them. `Exception`
      # (not a bare except) keeps Ctrl-C / kernel interrupt working.
      continue

    rmse = root_mean_squared_error(data_test, predictions)
    results.append([f"{param}x{seasonal_param}", rmse])
    print(f"{param}x{seasonal_param} - RMSE: {rmse}")

    # Track the running minimum; previously best_rmse was never updated, so
    # the "best" prediction was simply the last successful one.
    if rmse < best_rmse:
      best_rmse = rmse
      best_prediction = predictions

  result_table = create_result_table(results, columns=['params', 'rmse'])
  return result_table, best_prediction

def optimize_SARIMAX(data_train, data_test, forecast_horizon, model_name, exog_data_train=None, exog_data_test=None):
  """Grid-search SARIMAX orders (with optional exogenous data) and return the RMSE table plus the best forecast.

  Every combination of (p, d, q) in 0..2 and seasonal (P, D, Q, 12) in 0..2 is
  fitted on ``data_train``; the forecast is scored against ``data_test``.

  Args:
    data_train: Training series.
    data_test: Hold-out series used to score each forecast.
    forecast_horizon: Number of steps to forecast.
    model_name: Name forwarded to ``forecast_whitebox_model``.
    exog_data_train: Optional exogenous regressors for the training period.
    exog_data_test: Optional exogenous regressors for the forecast period.

  Returns:
    ``(result_table, best_prediction)``: the table lists every successful
    configuration sorted by RMSE; ``best_prediction`` is the forecast with the
    lowest RMSE, or ``None`` when no configuration could be fitted.
  """
  # NOTE(review): the seasonal period is fixed at 12 - confirm this matches
  # the sampling frequency of the data being modelled.
  results = []
  best_rmse = float('inf')
  best_prediction = None
  orders = list(itertools.product(range(0, 3), repeat=3))
  seasonal_orders = [(P, D, Q, 12) for P, D, Q in orders]

  for param, seasonal_param in itertools.product(orders, seasonal_orders):
    try:
      model = SARIMAX(data_train, order=param, seasonal_order=seasonal_param, exog=exog_data_train)
      predictions = forecast_whitebox_model(model, forecast_horizon, model_name, exog_data_test=exog_data_test)
    except Exception:
      # Some order combinations cannot be fitted; skip them. `Exception`
      # (not a bare except) keeps Ctrl-C / kernel interrupt working.
      continue

    rmse = root_mean_squared_error(data_test, predictions)
    results.append([f"{param}x{seasonal_param}", rmse])
    print(f"{param}x{seasonal_param} - RMSE: {rmse}")

    # Track the running minimum; previously best_rmse was never updated, so
    # the "best" prediction was simply the last successful one.
    if rmse < best_rmse:
      best_rmse = rmse
      best_prediction = predictions

  result_table = create_result_table(results, columns=['params', 'rmse'])
  return result_table, best_prediction

def optimize_Theta_model(data_train, data_test, forecast_horizon, model_name):
  """Grid-search ThetaModel settings and return the RMSE table plus the best forecast.

  The grid covers ``period`` 1..24 and both values of ``deseasonalize``,
  ``use_test`` and ``difference`` as well as both seasonal ``method`` options.

  Args:
    data_train: Training series.
    data_test: Hold-out series used to score each forecast.
    forecast_horizon: Number of steps to forecast.
    model_name: Name forwarded to ``forecast_whitebox_model``.

  Returns:
    ``(result_table, best_prediction)``: the table lists every evaluated
    configuration sorted by RMSE; ``best_prediction`` is the forecast with the
    lowest RMSE, or ``None`` when no configuration could be constructed.
  """
  results = []
  best_rmse = float('inf')
  best_prediction = None
  periods = range(1, 25)
  deseasonalize_options = [True, False]
  use_test_options = [True, False]
  methods = ['additive', 'multiplicative']
  difference_options = [True, False]

  grid = itertools.product(periods, deseasonalize_options, use_test_options, methods, difference_options)
  for param in grid:
    try:
      model = ThetaModel(data_train, period=param[0], deseasonalize=param[1], use_test=param[2], method=param[3], difference=param[4])
    except Exception:
      # Invalid parameter combinations are skipped. `Exception` (not a bare
      # except) keeps Ctrl-C / kernel interrupt working.
      continue

    predictions = forecast_whitebox_model(model, forecast_horizon, model_name)
    rmse = root_mean_squared_error(data_test, predictions)
    results.append([param, rmse])
    print(f"{param} - RMSE: {rmse}")

    # Track the running minimum; previously best_rmse was never updated, so
    # the "best" prediction was simply the last evaluated one.
    if rmse < best_rmse:
      best_rmse = rmse
      best_prediction = predictions

  result_table = create_result_table(results, columns=['params', 'rmse'])
  return result_table, best_prediction

def optimize_DynamicFactorMQ(data_train, data_test, forecast_horizon, model_name):
  """Grid-search DynamicFactorMQ settings and return the RMSE table plus the best forecast.

  The grid covers ``factors`` 20..59 and both values of ``idiosyncratic_ar1``
  and ``standardize``.

  Args:
    data_train: Training data.
    data_test: Hold-out data used to score each forecast.
    forecast_horizon: Number of steps to forecast.
    model_name: Name forwarded to ``forecast_whitebox_model``.

  Returns:
    ``(result_table, best_prediction)``: the table lists every evaluated
    configuration sorted by RMSE; ``best_prediction`` is the forecast with the
    lowest RMSE, or ``None`` when no configuration could be constructed.
  """
  results = []
  best_rmse = float('inf')
  best_prediction = None
  factor_counts = range(20, 60)
  ar1_options = [True, False]
  standardize_options = [True, False]

  for param in itertools.product(factor_counts, ar1_options, standardize_options):
    try:
      model = DynamicFactorMQ(data_train, factors=param[0], idiosyncratic_ar1=param[1], standardize=param[2])
    except Exception:
      # Invalid parameter combinations are skipped. `Exception` (not a bare
      # except) keeps Ctrl-C / kernel interrupt working.
      continue

    predictions = forecast_whitebox_model(model, forecast_horizon, model_name)
    rmse = root_mean_squared_error(data_test, predictions)
    results.append([param, rmse])
    print(f"{param} - RMSE: {rmse}")

    # Track the running minimum; previously best_rmse was never updated, so
    # the "best" prediction was simply the last evaluated one.
    if rmse < best_rmse:
      best_rmse = rmse
      best_prediction = predictions

  result_table = create_result_table(results, columns=['params', 'rmse'])
  return result_table, best_prediction

In [60]:
def objective_SARIMAX(trial, data_train, data_test, forecast_horizon, exog_data_train=None, exog_data_test=None):
  """Optuna objective: fit a SARIMAX model with sampled orders/trend and return its forecast RMSE.

  The trial samples ``order`` from all (p, d, q) in 0..2, ``seasonal_order``
  from all (P, D, Q, 12) in 0..2 and ``trend`` from 'n', 'c', 't', 'ct' or
  None. Fitting errors are not caught here.
  """
  order_choices = list(itertools.product(range(0, 3), repeat=3))
  seasonal_choices = [(*combo, 12) for combo in order_choices]

  # Keep the sampling order (order, seasonal_order, trend) stable.
  sampled_order = trial.suggest_categorical('order', order_choices)
  sampled_seasonal = trial.suggest_categorical('seasonal_order', seasonal_choices)
  sampled_trend = trial.suggest_categorical('trend', ['n', 'c', 't', 'ct', None])

  fitted = SARIMAX(
    data_train,
    order=sampled_order,
    seasonal_order=sampled_seasonal,
    exog=exog_data_train,
    trend=sampled_trend,
  ).fit(disp=0)
  forecast = fitted.forecast(steps=forecast_horizon, exog=exog_data_test)
  return root_mean_squared_error(data_test, forecast)

def objective_SARIMA(trial, data_train, data_test, forecast_horizon):
  """Optuna objective: fit a SARIMA model with sampled orders and return its forecast RMSE.

  The trial samples ``order`` from all (p, d, q) in 0..2 and
  ``seasonal_order`` from all (P, D, Q, 12) in 0..2. Fitting errors are not
  caught here.
  """
  order_choices = list(itertools.product(range(0, 3), repeat=3))
  seasonal_choices = [(*combo, 12) for combo in order_choices]

  # Keep the sampling order (order, then seasonal_order) stable.
  sampled_order = trial.suggest_categorical('order', order_choices)
  sampled_seasonal = trial.suggest_categorical('seasonal_order', seasonal_choices)

  fitted = SARIMAX(data_train, order=sampled_order, seasonal_order=sampled_seasonal).fit(disp=0)
  forecast = fitted.forecast(steps=forecast_horizon)
  return root_mean_squared_error(data_test, forecast)

def objective_DynamicFactorMQ(trial, data_train, data_test, forecast_horizon):
  """Optuna objective: fit a DynamicFactorMQ model with sampled settings and return its forecast RMSE.

  The trial samples ``factors`` as an integer in 20..60 and the boolean
  options ``idiosyncratic_ar1`` and ``standardize``. Fitting errors are not
  caught here.
  """
  n_factors = trial.suggest_int('factors', 20, 60)
  use_ar1 = trial.suggest_categorical('idiosyncratic_ar1', [True, False])
  use_standardize = trial.suggest_categorical('standardize', [True, False])

  fitted = DynamicFactorMQ(
    data_train,
    factors=n_factors,
    idiosyncratic_ar1=use_ar1,
    standardize=use_standardize,
  ).fit(disp=0)
  forecast = fitted.forecast(steps=forecast_horizon)
  return root_mean_squared_error(data_test, forecast)

Optimizing over the whole dataset

In [59]:
import os

date_start = '2023-11-01'
window_train_size = 1440 #hours
forecast_horizon = 336 #hours
# 336_24, 1440_336
trials = 100
model_name = f'SARIMA_{window_train_size}_{forecast_horizon}'
tuning_csv = '../Results/Whitebox/Tuning/whitebox_tuning.csv'

data = sample_data_with_train_window(df, date_start, window_train_size) # start: date_start - window_train_size, end: last date in df
# exog_data = sample_data_with_train_window(df2, date_start, window_train_size)

data_train, data_test = get_next_window(data, window_train_size, forecast_horizon)
# exog_data_train, exog_data_test = get_next_window(exog_data, window_train_size, forecast_horizon)

def safe_objective(trial):
  """Run the SARIMA objective; a failing fit is reported and scored as +inf."""
  try:
    return objective_SARIMA(trial, data_train, data_test, forecast_horizon)
  except Exception as e:
    print(f"Failed trial: {e}. Skipped this trial.")
    return float('inf')

# Silence warnings only while the study runs. The context manager restores the
# previously configured filters afterwards instead of resetting them to "default".
with warnings.catch_warnings():
  warnings.simplefilter("ignore")
  study1 = optuna.create_study(direction='minimize')
  study1.optimize(safe_objective, n_trials=trials)

trial = study1.best_trial
# The objective value is an RMSE (lower is better), and the label follows the
# model actually tuned rather than a hard-coded name.
print("Best RMSE: {}".format(trial.value))
print("best params for {}: {}".format(model_name, trial.params))

# Save the results in CSV (the 'accuracy' column holds the RMSE; the name is
# kept so previously written files stay compatible).
new_row = pd.DataFrame([{'model': model_name, 'accuracy': trial.value, 'params': str(trial.params)}])
try:
  df_tuning = pd.concat([pd.read_csv(tuning_csv), new_row], ignore_index=True)
except (FileNotFoundError, pd.errors.EmptyDataError):
  # Only a missing/empty file starts a fresh history. Any other read error is
  # raised so an existing results file is never silently overwritten.
  df_tuning = new_row

df_tuning = df_tuning.sort_values(by=['model', 'accuracy', 'params'], ascending=True).reset_index(drop=True)
os.makedirs(os.path.dirname(tuning_csv), exist_ok=True)
df_tuning.to_csv(tuning_csv, index=False)


[I 2024-12-19 09:41:44,889] A new study created in memory with name: no-name-84396405-2e50-409a-9f90-dea0f36e1647


[I 2024-12-19 09:42:26,136] Trial 0 finished with value: 137320.63824184958 and parameters: {'order': (0, 0, 0), 'seasonal_order': (2, 2, 0, 12), 'trend': None}. Best is trial 0 with value: 137320.63824184958.
[I 2024-12-19 09:42:35,709] Trial 1 finished with value: 11476.371266957362 and parameters: {'order': (0, 1, 0), 'seasonal_order': (1, 1, 1, 12), 'trend': 't'}. Best is trial 1 with value: 11476.371266957362.
[I 2024-12-19 09:42:54,149] Trial 2 finished with value: 6718.742078358796 and parameters: {'order': (2, 1, 1), 'seasonal_order': (1, 1, 0, 12), 'trend': 'c'}. Best is trial 2 with value: 6718.742078358796.
[I 2024-12-19 09:42:58,114] Trial 3 finished with value: 16009.658158831662 and parameters: {'order': (0, 0, 1), 'seasonal_order': (0, 0, 1, 12), 'trend': 't'}. Best is trial 2 with value: 6718.742078358796.
[I 2024-12-19 09:43:33,792] Trial 4 finished with value: 1177135.1792750997 and parameters: {'order': (2, 2, 0), 'seasonal_order': (2, 1, 1, 12), 'trend': 'ct'}. Best

Failed trial: LU decomposition error.. Skipped this trial.


[I 2024-12-19 09:51:08,976] Trial 20 finished with value: 499117.8820942635 and parameters: {'order': (0, 2, 0), 'seasonal_order': (1, 0, 1, 12), 'trend': None}. Best is trial 2 with value: 6718.742078358796.
[I 2024-12-19 09:51:27,405] Trial 21 finished with value: 6718.742078358796 and parameters: {'order': (2, 1, 1), 'seasonal_order': (1, 1, 0, 12), 'trend': 'c'}. Best is trial 2 with value: 6718.742078358796.
[I 2024-12-19 09:51:46,074] Trial 22 finished with value: 6718.742078358796 and parameters: {'order': (2, 1, 1), 'seasonal_order': (1, 1, 0, 12), 'trend': 'c'}. Best is trial 2 with value: 6718.742078358796.
[I 2024-12-19 09:51:56,694] Trial 23 finished with value: 644766.5633404262 and parameters: {'order': (1, 2, 0), 'seasonal_order': (2, 0, 1, 12), 'trend': 'c'}. Best is trial 2 with value: 6718.742078358796.
[I 2024-12-19 09:53:07,260] Trial 24 finished with value: 10928.623904181877 and parameters: {'order': (2, 1, 1), 'seasonal_order': (1, 2, 2, 12), 'trend': 't'}. Best 

Failed trial: LU decomposition error.. Skipped this trial.


[I 2024-12-19 10:12:39,464] Trial 87 finished with value: 15565775054752.207 and parameters: {'order': (2, 0, 1), 'seasonal_order': (2, 0, 2, 12), 'trend': 't'}. Best is trial 48 with value: 3711.1313582108182.
[I 2024-12-19 10:13:05,432] Trial 88 finished with value: 10823.683677148598 and parameters: {'order': (0, 1, 2), 'seasonal_order': (2, 1, 0, 12), 'trend': 't'}. Best is trial 48 with value: 3711.1313582108182.
[I 2024-12-19 10:13:27,761] Trial 89 finished with value: 11701.448617946096 and parameters: {'order': (2, 2, 1), 'seasonal_order': (1, 2, 0, 12), 'trend': 't'}. Best is trial 48 with value: 3711.1313582108182.
[I 2024-12-19 10:13:43,246] Trial 90 finished with value: 46068.684007421616 and parameters: {'order': (1, 2, 2), 'seasonal_order': (1, 1, 1, 12), 'trend': 't'}. Best is trial 48 with value: 3711.1313582108182.
[I 2024-12-19 10:13:48,235] Trial 91 finished with value: 5881.543864187514 and parameters: {'order': (0, 1, 2), 'seasonal_order': (1, 0, 1, 12), 'trend': N

Accuracy: 3711.1313582108182
best params for SARIMAX: {'order': (2, 0, 0), 'seasonal_order': (1, 1, 0, 12), 'trend': 'ct'}
