In [None]:
!pip install numpy pandas statsmodels matplotlib seaborn prophet sklearn 

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import itertools

import statsmodels.api as sm
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.stattools import kpss
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.holtwinters import SimpleExpSmoothing
from statsmodels.tsa.holtwinters import Holt
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.statespace.dynamic_factor_mq import DynamicFactorMQ
from statsmodels.tsa.forecasting.stl import STLForecast
from statsmodels.tsa.seasonal import STL
from statsmodels.tsa.forecasting.theta import ThetaModel
from datetime import datetime, timedelta

from sklearn.metrics import mean_squared_error
from sklearn.metrics import root_mean_squared_error

from prophet import Prophet

import warnings

# Detect Google Colab by whether its helper package can be imported. Only a
# missing package means "not in Colab"; any other failure (a broken or cancelled
# upload, a kernel interrupt) is allowed to surface instead of silently
# flipping IN_COLAB to False.
try:
  from google.colab import files
except ImportError:
  IN_COLAB = False
else:
  IN_COLAB = True
  # Ask for the input file(s); the loading cell reads the CSV from the working directory in Colab.
  uploaded = files.upload()

# Show each distinct warning once rather than on every occurrence.
warnings.filterwarnings('once')

  from .autonotebook import tqdm as notebook_tqdm
Importing plotly failed. Interactive plots will not work.


Initialize time series

In [5]:
# In Colab the uploaded CSV sits in the working directory; otherwise it is read from ../Dataset.
df = pd.read_csv(
  'ConsumptionIndustry.csv' if IN_COLAB else '../Dataset/ConsumptionIndustry.csv',
  sep=';',
)

# Parse the timestamp column and turn decimal-comma strings into floats.
df['HourDK'] = pd.to_datetime(df['HourDK'])
df['ConsumptionkWh'] = df['ConsumptionkWh'].str.replace(",", ".").astype(float)

# Index the frame by HourDK and discard the columns that are not modelled.
df = df.set_index('HourDK').drop(columns=['HourUTC', 'MunicipalityNo', 'Branche'])

Functions

In [25]:
def plot_data(data_train, data_test, title='Consumption in dk private households',
              xlabel='Measurements', ylabel='Consumption (kWh)'):
  """Plot the train and test splits on one axis, labelled with their index ranges.

  Parameters
  ----------
  data_train, data_test : indexed data (e.g. pandas Series/DataFrame)
      Each is plotted against its own ``.index``; the first and last index
      values are shown in the legend.
  title, xlabel, ylabel : str, optional
      Figure texts. The y-axis default names the plotted quantity (the
      notebook's value column is ``ConsumptionkWh``).
      NOTE(review): the default title mentions private households while the
      notebook loads ``ConsumptionIndustry.csv`` -- confirm it is still accurate.

  Returns
  -------
  None. The figure is shown with ``plt.show()``.
  """
  def _span(data):
    # An empty split has no first/last index value to report.
    if len(data.index) == 0:
      return 'empty'
    return f'{data.index[0]} - {data.index[-1]}'

  fig, ax = plt.subplots(figsize=(7, 3))
  ax.plot(data_train.index, data_train, label=f'Train ({_span(data_train)})')
  ax.plot(data_test.index, data_test, label=f'Test ({_span(data_test)})')
  ax.set_title(title)
  ax.set_xlabel(xlabel)
  ax.set_ylabel(ylabel)
  ax.legend()
  plt.show()

def split_data(df, train_window_size):
  """Split a sequence into a leading part and a trailing hold-out part.

  Despite its name, ``train_window_size`` is the length of the *second*
  (trailing) part: the last ``train_window_size`` items are returned as the
  second element and everything before them as the first.

  Parameters
  ----------
  df : sliceable sequence (DataFrame, Series, list, ...)
  train_window_size : int
      Number of trailing items to hold out. ``0`` keeps everything in the
      first part; a value larger than ``len(df)`` puts everything in the
      second part.

  Returns
  -------
  tuple
      ``(leading_part, trailing_part)``.

  Raises
  ------
  ValueError
      If ``train_window_size`` is negative.
  """
  if train_window_size < 0:
    raise ValueError(f"train_window_size must be >= 0, got {train_window_size}")

  # Slice on an explicit position: df[:-0] would be empty and df[-0:] would be
  # the whole input, which inverts the split when the size is 0.
  split_at = max(len(df) - train_window_size, 0)
  return df[:split_at], df[split_at:]

Optimize functions

In [7]:
def optimize_SARIMAX(endog, seasonal_period=12):
  """Grid-search SARIMAX orders and rank the candidates by in-sample RMSE.

  Every combination of non-seasonal (p, d, q) and seasonal (P, D, Q) terms
  with each term in {0, 1, 2} is fitted -- 27 x 27 = 729 candidates -- so
  this is expensive on long series.

  Parameters
  ----------
  endog : array-like
      Series to fit; passed unchanged to ``SARIMAX``.
  seasonal_period : int, optional
      Season length used as the fourth element of ``seasonal_order``.
      Defaults to 12, the previously hard-coded value.
      NOTE(review): the notebook's data is indexed by hour (``HourDK``), so
      12 may not be the intended cycle -- confirm.

  Returns
  -------
  pandas.DataFrame
      Columns ``parameters``, ``aic`` and ``rmse``, sorted by ascending
      ``rmse``. Empty (but with those columns) when no candidate could be fitted.
  """
  columns = ['parameters', 'aic', 'rmse']
  orders = list(itertools.product(range(3), repeat=3))
  results = []

  for param in orders:
    for seasonal_terms in orders:
      seasonal_param = seasonal_terms + (seasonal_period,)
      try:
        model = SARIMAX(
          endog,
          order=param,
          seasonal_order=seasonal_param,
          enforce_stationarity=False,
          enforce_invertibility=False,
        ).fit(maxiter=200, disp=0)  # disp=0 to reduce output verbosity
      except Exception:
        # Skip candidates that cannot be fitted. Catching Exception (not a
        # bare except) keeps KeyboardInterrupt working, so the long search
        # can still be stopped.
        continue

      aic = model.aic
      # RMSE is measured against the model's own fitted (in-sample) values.
      rmse = root_mean_squared_error(endog, model.fittedvalues)
      results.append([f"{param}x{seasonal_param}", aic, rmse])
      print(f"{param}x{seasonal_param} - AIC: {aic} - RMSE: {rmse}")

  # Naming the columns at construction keeps this valid when `results` is empty.
  result_table = pd.DataFrame(results, columns=columns)
  return result_table.sort_values(by='rmse', ascending=True).reset_index(drop=True)

Optimizing on the training split (all data except the final forecast horizon)

In [24]:
# date_start and window_train_size are not used in this cell; they are kept
# because cells outside this view may read them.
date_start = df.index[0]
window_train_size = 0 #hours
forecast_horizon = 24*366 #hours (2024 was a leap year)

data = df
# The last forecast_horizon rows become the test set; everything before is training data.
data_train, data_test = split_data(data, forecast_horizon)

# Silence warnings only while the grid search runs. catch_warnings restores the
# previously configured filters on exit instead of overwriting them with "default".
with warnings.catch_warnings():
  warnings.simplefilter("ignore")
  sarimax_results = optimize_SARIMAX(data_train)

# Keep and show the ranked candidates rather than discarding the search result.
sarimax_results.head(10)

25055 8784 2023-11-11 00:00:00 2024-11-10 23:00:00
