## Imports: libraries and dataset

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np
import itertools
from statsmodels.tsa.stattools import adfuller

In [2]:
# Load the dataset
df = pd.read_csv('../../dataset/KAG_energydata_complete.csv')

# Convert the 'date' column to datetime format and set it as the index
df['date'] = pd.to_datetime(df['date'])
df.set_index('date', inplace=True)

## Arima model

In [3]:
# Focusing on the 'Appliances' energy consumption
series = df['Appliances']

In [4]:
# Perform ADF test to check for stationarity
result = adfuller(series)
print('ADF Statistic:', result[0])
print('p-value:', result[1])

ADF Statistic: -21.61637819803621
p-value: 0.0


In [5]:
# If the p-value is greater than 0.05, difference the series
if result[1] > 0.05:
    series = series.diff().dropna()

In [6]:
# Parameter tuning for ARIMA using a smaller range for efficiency
p = d = q = range(0, 3)
best_aic = float("inf")
best_params = None
num_combinations = len(list(itertools.product(p, d, q)))

print(f"Total combinations to evaluate: {num_combinations}")

Total combinations to evaluate: 27


In [7]:
count = 0

for param in itertools.product(p, d, q):
    count += 1
    print(f"Evaluating combination {count}/{num_combinations}: ARIMA{param}")
    try:
        model = ARIMA(series, order=param)
        results = model.fit()
        if results.aic < best_aic:
            best_aic = results.aic
            best_params = param
    except Exception as e:
        print(f"Combination {param} failed with error: {e}")
        continue

print('Best ARIMA parameters:', best_params)
print('Best AIC:', best_aic)


Evaluating combination 1/27: ARIMA(0, 0, 0)


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Evaluating combination 2/27: ARIMA(0, 0, 1)


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Evaluating combination 3/27: ARIMA(0, 0, 2)


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Evaluating combination 4/27: ARIMA(0, 1, 0)


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Evaluating combination 5/27: ARIMA(0, 1, 1)


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Evaluating combination 6/27: ARIMA(0, 1, 2)


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Evaluating combination 7/27: ARIMA(0, 2, 0)


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Evaluating combination 8/27: ARIMA(0, 2, 1)


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Evaluating combination 9/27: ARIMA(0, 2, 2)


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  warn('Non-invertible starting MA parameters found.'


Evaluating combination 10/27: ARIMA(1, 0, 0)


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Evaluating combination 11/27: ARIMA(1, 0, 1)


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Evaluating combination 12/27: ARIMA(1, 0, 2)


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Evaluating combination 13/27: ARIMA(1, 1, 0)


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Evaluating combination 14/27: ARIMA(1, 1, 1)


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Evaluating combination 15/27: ARIMA(1, 1, 2)


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Evaluating combination 16/27: ARIMA(1, 2, 0)


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Evaluating combination 17/27: ARIMA(1, 2, 1)


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Evaluating combination 18/27: ARIMA(1, 2, 2)


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  warn('Non-invertible starting MA parameters found.'


Evaluating combination 19/27: ARIMA(2, 0, 0)


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Evaluating combination 20/27: ARIMA(2, 0, 1)


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'


Evaluating combination 21/27: ARIMA(2, 0, 2)


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Evaluating combination 22/27: ARIMA(2, 1, 0)


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Evaluating combination 23/27: ARIMA(2, 1, 1)


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Evaluating combination 24/27: ARIMA(2, 1, 2)


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Evaluating combination 25/27: ARIMA(2, 2, 0)


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Evaluating combination 26/27: ARIMA(2, 2, 1)


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Evaluating combination 27/27: ARIMA(2, 2, 2)


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Best ARIMA parameters: (2, 0, 2)
Best AIC: 221345.8773147678


In [8]:
# Fit the ARIMA model with the best parameters
model = ARIMA(series, order=best_params)
model_fit = model.fit()

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


In [9]:
# Forecast
forecast_steps = 30  # Forecast for 30 time steps ahead
forecast = model_fit.forecast(steps=forecast_steps)

In [10]:
# Evaluate the model
y_train = series[:-forecast_steps]
y_test = series[-forecast_steps:]
y_pred = model_fit.predict(start=len(series)-forecast_steps, end=len(series)-1)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print('Mean Absolute Error (MAE):', mae)
print('Mean Squared Error (MSE):', mse)
print('Root Mean Squared Error (RMSE):', rmse)

Mean Absolute Error (MAE): 39.73663515185555
Mean Squared Error (MSE): 3713.644219823917
Root Mean Squared Error (RMSE): 60.93967689300557


## Sarima model

In [11]:
# Focusing on the 'Appliances' energy consumption and selecting exogenous variables
series = df['Appliances']
# exog = df[['T1', 'RH_1', 'T2', 'RH_2', 'T_out', 'RH_out']]

In [12]:
# Fit the SARIMAX model with the selected parameters and exogenous variables
model = SARIMAX(series,
                order=(2, 0, 2),
                seasonal_order=(1, 0, 1, 24),
                #exog=exog,
                enforce_stationarity=False,
                enforce_invertibility=False)
model_fit = model.fit(disp=False)

# Print summary of the model
print(model_fit.summary())

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


                                      SARIMAX Results                                       
Dep. Variable:                           Appliances   No. Observations:                19735
Model:             SARIMAX(2, 0, 2)x(1, 0, [1], 24)   Log Likelihood             -110623.187
Date:                              Sat, 06 Jul 2024   AIC                         221260.373
Time:                                      23:21:07   BIC                         221315.595
Sample:                                  01-11-2016   HQIC                        221278.453
                                       - 05-27-2016                                         
Covariance Type:                                opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1          1.4069      0.009    154.128      0.000       1.389       1.425
ar.L2         -0.41

In [14]:
# Forecast
forecast_steps = 30  # Forecast for 30 time steps ahead
# exog_forecast = exog.iloc[-forecast_steps:].reset_index(drop=True)
# forecast = model_fit.forecast(steps=forecast_steps, exog=exog_forecast)
forecast = model_fit.forecast(steps=forecast_steps)

In [16]:
# Evaluate the model
y_train = series[:-forecast_steps]
y_test = series[-forecast_steps:]
# y_pred = model_fit.predict(start=len(series)-forecast_steps, end=len(series)-1, exog=exog_forecast)
y_pred = model_fit.predict(start=len(series)-forecast_steps, end=len(series)-1)


mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print('Mean Absolute Error (MAE):', mae)
print('Mean Squared Error (MSE):', mse)
print('Root Mean Squared Error (RMSE):', rmse)

Mean Absolute Error (MAE): 42.12906344304756
Mean Squared Error (MSE): 3892.527091971792
Root Mean Squared Error (RMSE): 62.39012014711778


By removing the other independent variables, the sarima model improved marginally