# SARIMAX forecasting on PV

#### Import data

In [35]:
import warnings
warnings.filterwarnings("ignore")
import plotly.graph_objects as go
import pandas as pd

from statsmodels.tsa.stattools import acf, pacf
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error

import DataRetriever as dr

# Load the attribute list and the hourly measurements through the project's DataRetriever.
retriever = dr.DataRetriever()

PV_ATTRIBUTES = retriever.get_attributes(file_name='producing_attributes.pkl')

# Sum the PV_ATTRIBUTES columns per row, clip negative totals to zero and
# scale by 1/1000 (presumably a unit conversion -- TODO confirm against the data source).
# `rename` is called without `inplace=True`: the in-place form is documented to
# return None, so chaining `.to_frame()` on it was relying on undocumented behaviour.
gen_df = (
    retriever.get_data(file_name='All-Subsystems-hour-Year2.pkl')[PV_ATTRIBUTES]
    .sum(axis=1)
    .clip(lower=0)
    / 1000
)
gen_df = gen_df.rename("Generated Energy").to_frame()
gen_df

Unnamed: 0_level_0,Generated Energy
Timestamp,Unnamed: 1_level_1
2015-02-01 00:00:00,0.018496
2015-02-01 01:00:00,0.018332
2015-02-01 02:00:00,0.018385
2015-02-01 03:00:00,0.018502
2015-02-01 04:00:00,0.018524
...,...
2016-01-31 19:00:00,0.017710
2016-01-31 20:00:00,0.017940
2016-01-31 21:00:00,0.018149
2016-01-31 22:00:00,0.018209


#### Check if PV is stationary + Differencing

In [36]:
from statsmodels.tsa.stattools import adfuller

# Augmented Dickey-Fuller unit-root test on the generation series.
# Element 1 of the returned tuple is the p-value.
ADF_test = adfuller(gen_df["Generated Energy"])
adf_p_value = ADF_test[1]

# Only claim stationarity when the test actually supports it (5 % level), so the
# printed conclusion stays truthful if the notebook is re-run on other data.
if adf_p_value < 0.05:
    print(f"The p-value from the Augmented Dickey-Fuller test is {adf_p_value}. \n This suggest that the TS is stationary, and d=0 and D=0.")
else:
    print(f"The p-value from the Augmented Dickey-Fuller test is {adf_p_value}. \n The unit-root hypothesis cannot be rejected at the 5% level, so the TS may need differencing.")

The p-value from the Augmented Dickey-Fuller test is 4.569102377280038e-19. 
 This suggest that the TS is stationary, and d=0 and D=0.


#### Determine order of autoregressive terms (p) and moving average terms (q)

In [37]:
# Autocorrelation for lags 0..73 together with the 95 % confidence bounds.
acf_values, acf_conf = acf(gen_df['Generated Energy'], nlags=73, alpha=0.05)

# Subtract the estimate from both bounds so the band is centred on zero.
# statsmodels returns the bounds as [lower, upper] per lag, so the columns are
# labelled in that order (they were previously labelled the other way round).
acf_conf = pd.DataFrame(acf_conf - acf_values[:, None], columns=['Lower', 'Upper'])

fig = go.Figure(go.Bar(
    y = acf_values
))

# Closed polygon: upper bound left-to-right, then lower bound right-to-left.
fig.add_trace(go.Scatter(
    x = list(acf_conf.index) + list(acf_conf.index[::-1]),
    y = list(acf_conf['Upper']) + list(acf_conf['Lower'][::-1]),
    fill='toself',
    fillcolor='rgba(84,84,84,0.2)',
    line=dict(color='rgba(255,255,255,0)'),
    hoverinfo="skip",
))

fig.update_yaxes(title="Autocorrelation", range=[-1, 1])
fig.update_xaxes(title="Lag")
fig.update_layout(template='plotly', showlegend=False)

fig.show()

In [38]:
# Partial autocorrelation for lags 0..73 together with the 95 % confidence bounds.
pacf_values, pacf_conf = pacf(gen_df['Generated Energy'], nlags=73, alpha=0.05)

# Subtract the estimate from both bounds so the band is centred on zero.
# statsmodels returns the bounds as [lower, upper] per lag.
pacf_conf = pd.DataFrame(pacf_conf - pacf_values[:, None], columns=['Lower', 'Upper'])

fig = go.Figure(go.Bar(
    y = pacf_values
))

# Closed polygon built from the PACF bounds (the ACF bounds were used here before,
# which drew the wrong confidence band on this figure).
fig.add_trace(go.Scatter(
    x = list(pacf_conf.index) + list(pacf_conf.index[::-1]),
    y = list(pacf_conf['Upper']) + list(pacf_conf['Lower'][::-1]),
    fill='toself',
    fillcolor='rgba(84,84,84,0.2)',
    line=dict(color='rgba(255,255,255,0)'),
    hoverinfo="skip"
))

fig.update_yaxes(title="Partial Autocorrelation", range=[-1, 1])
fig.update_xaxes(title="Lag")
fig.update_layout(template='plotly', showlegend=False)

fig.show()

#### Train, validation and test set

In [39]:
# Chronological split of gen_df: the first 70 % of the rows are used for training,
# the next 20 % for validation and the final 10 % for testing.
split_train_end = int(len(gen_df) * 0.7)
split_validation_end = int(len(gen_df) * 0.9)

train_df = gen_df.iloc[:split_train_end]
validation_df = gen_df.iloc[split_train_end:split_validation_end]
test_df = gen_df.iloc[split_validation_end:]

In [40]:
# Training and validation rows stacked back together (used as history in the test-set figures).
train_validation_df = pd.concat((train_df, validation_df), axis=0)
train_validation_df

Unnamed: 0_level_0,Generated Energy
Timestamp,Unnamed: 1_level_1
2015-02-01 00:00:00,0.018496
2015-02-01 01:00:00,0.018332
2015-02-01 02:00:00,0.018385
2015-02-01 03:00:00,0.018502
2015-02-01 04:00:00,0.018524
...,...
2015-12-26 07:00:00,0.018100
2015-12-26 08:00:00,0.039769
2015-12-26 09:00:00,0.317566
2015-12-26 10:00:00,0.596980


In [41]:
# First timestamp of history shown in the forecast figures.
september_start = "2015-09-01 00:00:00"

#### ARIMA(4, 0, 0)

In [42]:
# Non-seasonal AR(4) baseline without a trend term, fitted on the training split.
# The target column is selected explicitly so the fit keeps working when gen_df
# (and therefore train_df) carries additional columns.
arima_model = SARIMAX(endog=train_df['Generated Energy'], trend='n', order=(4, 0, 0))
fitted_arima_model = arima_model.fit(low_memory=True)

fitted_arima_model.summary()

 This problem is unconstrained.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            5     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  1.29345D+00    |proj g|=  7.52300D-04

At iterate    5    f=  1.29345D+00    |proj g|=  3.86471D-05

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    5      6     10      1     0     0   4.171D-05   1.293D+00
  F =   1.2934507905747399     

CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH             


0,1,2,3
Dep. Variable:,Generated Energy,No. Observations:,6132.0
Model:,"SARIMAX(4, 0, 0)",Log Likelihood,-7931.44
Date:,"Wed, 18 May 2022",AIC,15872.88
Time:,16:21:30,BIC,15906.487
Sample:,02-01-2015,HQIC,15884.538
,- 10-14-2015,,
Covariance Type:,approx,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ar.L1,1.3085,0.013,102.919,0.000,1.284,1.333
ar.L2,-0.2608,0.021,-12.420,0.000,-0.302,-0.220
ar.L3,-0.0522,0.021,-2.488,0.013,-0.093,-0.011
ar.L4,-0.0988,0.013,-7.770,0.000,-0.124,-0.074
sigma2,0.7777,0.014,55.372,0.000,0.750,0.805

0,1,2,3
Ljung-Box (L1) (Q):,11.53,Jarque-Bera (JB):,24534.1
Prob(Q):,0.0,Prob(JB):,0.0
Heteroskedasticity (H):,1.05,Skew:,0.3
Prob(H) (two-sided):,0.27,Kurtosis:,12.78


In [43]:
# Validation may not be needed here, since this ARIMA model is not tuned any further.
# NOTE(review): the model is fitted on train_df only, so the test forecast starts
# after the whole validation span -- confirm that this long horizon is intended.
arima_val_prediction = fitted_arima_model.predict(
    start=validation_df.index[0],
    end=validation_df.index[-1],
)
forecast_arima_val = pd.DataFrame(arima_val_prediction).rename(columns={'predicted_mean': 'Prediction'})

arima_test_prediction = fitted_arima_model.predict(
    start=test_df.index[0],
    end=test_df.index[-1],
)
forecast_arima_test = pd.DataFrame(arima_test_prediction).rename(columns={'predicted_mean': 'Prediction'})

forecast_arima_test

Unnamed: 0,Prediction
2015-12-26 12:00:00,-6.067986e-112
2015-12-26 13:00:00,-5.011304e-112
2015-12-26 14:00:00,-3.878284e-112
2015-12-26 15:00:00,-2.766398e-112
2015-12-26 16:00:00,-1.746862e-112
...,...
2016-01-31 19:00:00,1.742245e-167
2016-01-31 20:00:00,1.392524e-167
2016-01-31 21:00:00,1.035863e-167
2016-01-31 22:00:00,6.985972e-168


In [45]:
# Observed generation over the test span.
y_true = gen_df['Generated Energy'][test_df.index[0] : test_df.index[-1]]

# RMSE as the square root of the MSE. The `squared=False` keyword was deprecated
# in scikit-learn 1.4 and removed in 1.6, so the root is taken explicitly.
arima_model_rmse = mean_squared_error(y_true, forecast_arima_test) ** 0.5

arima_model_rmse

2.1857233393815316

In [51]:
# ARIMA test-set figure: train+validation history from september_start onward,
# the observed test values and the ARIMA prediction for the test span.
arima_history = train_validation_df[september_start:]

fig = go.Figure(data=[
    go.Scattergl(
        x=arima_history.index,
        y=arima_history['Generated Energy'],
        name="Training Data",
        line={'color': 'rgb(84, 84, 84)'},
    ),
    go.Scattergl(
        x=test_df.index,
        y=test_df['Generated Energy'],
        name="Observed",
        line={'color': 'rgb(234,143,129)'},
    ),
    go.Scattergl(
        x=forecast_arima_test.index,
        y=forecast_arima_test['Prediction'],
        name="Predicted",
        line={'color': 'rgb(32,115,171)'},
    ),
])

fig.update_yaxes(title="Generated Energy [kWh]")
# Horizontal legend anchored at the top-left corner of the plot area.
fig.update_layout(
    template="plotly",
    legend={"orientation": "h", "yanchor": "bottom", "y": 1, "xanchor": "left", "x": 0},
)

fig.write_html("NEW_ARIMA_FIGS/arima_production.html")
fig.show()

## SARIMA(4, 0, 0, 2, 0, 0, 24)

In [47]:
# AR(4) with two seasonal AR terms at a 24-step period, no differencing and no trend,
# fitted on the training split. The target column is selected explicitly so the fit
# keeps working when gen_df (and therefore train_df) carries additional columns.
first_sarima_model = SARIMAX(
    endog=train_df['Generated Energy'],
    trend='n',
    order=(4, 0, 0),
    seasonal_order=(2, 0, 0, 24),
)
fitted_first_sarima_model = first_sarima_model.fit(low_memory=True)

fitted_first_sarima_model.summary()

RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            7     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  1.30782D+00    |proj g|=  1.43999D-01


 This problem is unconstrained.



At iterate    5    f=  1.24355D+00    |proj g|=  6.68119D-02

At iterate   10    f=  1.20033D+00    |proj g|=  1.08307D-01

At iterate   15    f=  1.19252D+00    |proj g|=  7.80547D-03

At iterate   20    f=  1.19224D+00    |proj g|=  7.69568D-05

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    7     22     27      1     0     0   3.545D-06   1.192D+00
  F =   1.1922358981388930     

CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL            


0,1,2,3
Dep. Variable:,Generated Energy,No. Observations:,6132.0
Model:,"SARIMAX(4, 0, 0)x(2, 0, 0, 24)",Log Likelihood,-7310.791
Date:,"Wed, 18 May 2022",AIC,14635.581
Time:,16:29:14,BIC,14682.63
Sample:,02-01-2015,HQIC,14651.902
,- 10-14-2015,,
Covariance Type:,approx,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ar.L1,1.0256,0.014,71.241,0.000,0.997,1.054
ar.L2,-0.1010,0.018,-5.505,0.000,-0.137,-0.065
ar.L3,0.0381,0.018,2.080,0.037,0.002,0.074
ar.L4,-0.1232,0.013,-9.660,0.000,-0.148,-0.098
ar.S.L24,0.3301,0.013,25.069,0.000,0.304,0.356
ar.S.L48,0.3025,0.013,23.630,0.000,0.277,0.328
sigma2,0.6342,0.011,55.371,0.000,0.612,0.657

0,1,2,3
Ljung-Box (L1) (Q):,2.97,Jarque-Bera (JB):,24603.36
Prob(Q):,0.08,Prob(JB):,0.0
Heteroskedasticity (H):,0.89,Skew:,0.4
Prob(H) (two-sided):,0.01,Kurtosis:,12.78


In [48]:
# Prediction of the first SARIMA model over the validation span, stored in a
# one-column frame named 'Prediction'.
first_sarima_val_prediction = fitted_first_sarima_model.predict(
    start=validation_df.index[0],
    end=validation_df.index[-1],
)
forecast_first_sarima_val = pd.DataFrame(first_sarima_val_prediction).rename(
    columns={'predicted_mean': 'Prediction'}
)

forecast_first_sarima_val

Unnamed: 0,Prediction
2015-10-14 12:00:00,4.052269e+00
2015-10-14 13:00:00,3.270607e+00
2015-10-14 14:00:00,2.335440e+00
2015-10-14 15:00:00,1.616067e+00
2015-10-14 16:00:00,6.541827e-01
...,...
2015-12-26 07:00:00,4.230791e-11
2015-12-26 08:00:00,1.913395e-10
2015-12-26 09:00:00,6.360259e-10
2015-12-26 10:00:00,9.990246e-10


In [49]:
# Observed generation over the validation span.
y_true = gen_df['Generated Energy'][validation_df.index[0]: validation_df.index[-1]]

# RMSE as the square root of the MSE. The `squared=False` keyword was deprecated
# in scikit-learn 1.4 and removed in 1.6, so the root is taken explicitly.
first_sarima_model_rmse = mean_squared_error(y_true, forecast_first_sarima_val) ** 0.5

first_sarima_model_rmse

2.18020503231293

In [50]:
# First SARIMA validation figure: training history from september_start onward,
# the observed validation values and the model's prediction for that span.
first_sarima_history = train_df[september_start:]

fig = go.Figure(data=[
    go.Scattergl(
        x=first_sarima_history.index,
        y=first_sarima_history['Generated Energy'],
        name="Training Data",
        line={'color': 'rgb(84, 84, 84)'},
    ),
    go.Scattergl(
        x=validation_df.index,
        y=validation_df['Generated Energy'],
        name="Observed",
        line={'color': 'rgb(234,143,129)'},
    ),
    go.Scattergl(
        x=forecast_first_sarima_val.index,
        y=forecast_first_sarima_val['Prediction'],
        name="Predicted",
        line={'color': 'rgb(32,115,171)'},
    ),
])

fig.update_yaxes(title="Generated Energy [kWh]")
# Horizontal legend anchored at the top-left corner of the plot area.
fig.update_layout(
    template="plotly",
    legend={"orientation": "h", "yanchor": "bottom", "y": 1, "xanchor": "left", "x": 0},
)

fig.write_html("NEW_ARIMA_FIGS/first_sarima_production.html")
fig.show()

#### Calculate seasonal difference

In [31]:
# Daily seasonal difference: each value minus the value 24 rows earlier
# (the first 24 rows have no predecessor and are NaN).
gen_df["Seasonal Difference"] = gen_df["Generated Energy"].diff(periods=24)
gen_df

Unnamed: 0_level_0,Generated Energy,1dif,Prediction,Seasonal Difference
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2015-02-01 00:00:00,0.018496,,,
2015-02-01 01:00:00,0.018332,-0.000165,,
2015-02-01 02:00:00,0.018385,0.000053,,
2015-02-01 03:00:00,0.018502,0.000117,,
2015-02-01 04:00:00,0.018524,0.000022,,
...,...,...,...,...
2016-01-31 19:00:00,0.017710,0.000288,,0.000077
2016-01-31 20:00:00,0.017940,0.000230,,0.000123
2016-01-31 21:00:00,0.018149,0.000208,,0.000199
2016-01-31 22:00:00,0.018209,0.000060,,0.000154


## SARIMA(4, 0, 0, 0, 1, 2, 24)

In [None]:
# AR(4) with one seasonal difference and two seasonal MA terms at a 24-step period,
# no trend, fitted on the training split. The target column is selected explicitly so
# the fit keeps working when gen_df (and therefore train_df) carries additional columns.
second_sarima_model = SARIMAX(
    endog=train_df['Generated Energy'],
    trend='n',
    order=(4, 0, 0),
    seasonal_order=(0, 1, 2, 24),
)
fitted_second_sarima_model = second_sarima_model.fit(low_memory=True)

fitted_second_sarima_model.summary()

In [None]:
# Prediction of the second SARIMA model over the validation span, stored in a
# one-column frame named 'Prediction'.
second_sarima_val_prediction = fitted_second_sarima_model.predict(
    start=validation_df.index[0],
    end=validation_df.index[-1],
)
forecast_second_sarima_val = pd.DataFrame(second_sarima_val_prediction).rename(
    columns={'predicted_mean': 'Prediction'}
)

forecast_second_sarima_val

In [None]:
# Observed generation over the validation span.
y_true = gen_df['Generated Energy'][validation_df.index[0]: validation_df.index[-1]]

# RMSE as the square root of the MSE. The `squared=False` keyword was deprecated
# in scikit-learn 1.4 and removed in 1.6, so the root is taken explicitly.
second_sarima_model_rmse = mean_squared_error(y_true, forecast_second_sarima_val) ** 0.5

second_sarima_model_rmse

In [None]:
# Second SARIMA validation figure: training history from september_start onward,
# the observed validation values and the model's prediction for that span.
second_sarima_history = train_df[september_start:]

fig = go.Figure(data=[
    go.Scattergl(
        x=second_sarima_history.index,
        y=second_sarima_history['Generated Energy'],
        name="Training Data",
        line={'color': 'rgb(84, 84, 84)'},
    ),
    go.Scattergl(
        x=validation_df.index,
        y=validation_df['Generated Energy'],
        name="Observed",
        line={'color': 'rgb(234,143,129)'},
    ),
    go.Scattergl(
        x=forecast_second_sarima_val.index,
        y=forecast_second_sarima_val['Prediction'],
        name="Predicted",
        line={'color': 'rgb(32,115,171)'},
    ),
])

fig.update_yaxes(title="Generated Energy [kWh]")
# Horizontal legend anchored at the top-left corner of the plot area.
fig.update_layout(
    template="plotly",
    legend={"orientation": "h", "yanchor": "bottom", "y": 1, "xanchor": "left", "x": 0},
)

fig.write_html("NEW_ARIMA_FIGS/second_sarima_production.html")
fig.show()

#### Forecast test set on the best SARIMA model

In [None]:
# Prediction of the first SARIMA model over the test span, stored in a
# one-column frame named 'Prediction'.
first_sarima_test_prediction = fitted_first_sarima_model.predict(
    start=test_df.index[0],
    end=test_df.index[-1],
)
forecast_first_sarima_test = pd.DataFrame(first_sarima_test_prediction).rename(
    columns={'predicted_mean': 'Prediction'}
)

In [None]:
# Prediction of the second SARIMA model over the test span, stored in a
# one-column frame named 'Prediction'.
second_sarima_test_prediction = fitted_second_sarima_model.predict(
    start=test_df.index[0],
    end=test_df.index[-1],
)
forecast_second_sarima_test = pd.DataFrame(second_sarima_test_prediction).rename(
    columns={'predicted_mean': 'Prediction'}
)

In [None]:
# TODO: Whichever SARIMA model achieved the best RMSE on the validation set earlier in this file, its test-set forecast should be used in the figure below. Both forecasts are computed above, but only one of them should be used.

In [None]:
# Test-set figure for the SARIMA model: train+validation history from
# september_start onward, the observed test values and the second SARIMA
# model's prediction for the test span.
best_sarima_history = train_validation_df[september_start:]

fig = go.Figure(data=[
    go.Scattergl(
        x=best_sarima_history.index,
        y=best_sarima_history['Generated Energy'],
        name="Training Data",
        line={'color': 'rgb(84, 84, 84)'},
    ),
    go.Scattergl(
        x=test_df.index,
        y=test_df['Generated Energy'],
        name="Observed",
        line={'color': 'rgb(234,143,129)'},
    ),
    go.Scattergl(
        x=forecast_second_sarima_test.index,
        y=forecast_second_sarima_test['Prediction'],
        name="Predicted",
        line={'color': 'rgb(32,115,171)'},
    ),
])

fig.update_yaxes(title="Generated Energy [kWh]")
# Horizontal legend anchored at the top-left corner of the plot area.
fig.update_layout(
    template="plotly",
    legend={"orientation": "h", "yanchor": "bottom", "y": 1, "xanchor": "left", "x": 0},
)

fig.write_html("NEW_ARIMA_FIGS/best_sarima_production_testset.html")
fig.show()

### SARIMAX(4, 0, 0, 0, 1, 2, 24)

In [38]:
from sklearn.preprocessing import OneHotEncoder

# Hour of day taken from the timestamp index.
gen_df["Hour"] = gen_df.index.hour

# One-hot encode the hour. The encoder is left at its default (sparse) output and
# densified with .toarray(): the `sparse=False` keyword was deprecated in
# scikit-learn 1.2 and removed in 1.4, whereas this form works on every version.
ohe = OneHotEncoder()
hot_np = ohe.fit_transform(gen_df[["Hour"]]).toarray()
hot = pd.DataFrame(data=hot_np, columns=ohe.get_feature_names_out())
hot

Unnamed: 0,Hour_0,Hour_1,Hour_2,Hour_3,Hour_4,Hour_5,Hour_6,Hour_7,Hour_8,Hour_9,...,Hour_14,Hour_15,Hour_16,Hour_17,Hour_18,Hour_19,Hour_20,Hour_21,Hour_22,Hour_23
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8755,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
8756,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
8757,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
8758,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [39]:
# Energy column only, up to the last test timestamp, with the timestamp index
# moved into an ordinary 'Timestamp' column (rows get a default integer index).
gen_df_no_changes = gen_df.loc[:test_df.index[-1], ["Generated Energy"]].reset_index()
gen_df_no_changes

Unnamed: 0,Timestamp,Generated Energy
0,2015-02-01 00:00:00,0.018496
1,2015-02-01 01:00:00,0.018332
2,2015-02-01 02:00:00,0.018385
3,2015-02-01 03:00:00,0.018502
4,2015-02-01 04:00:00,0.018524
...,...,...
8540,2016-01-22 20:00:00,0.018398
8541,2016-01-22 21:00:00,0.018487
8542,2016-01-22 22:00:00,0.018362
8543,2016-01-22 23:00:00,0.018211


In [40]:
# Weather conditions read from CSV; the first column becomes a datetime index named 'Timestamp'.
weather_forecast = pd.read_csv("../_05Forecasting/CLEANED_GAI_2015_2016.csv", index_col=0)
weather_forecast.index = pd.to_datetime(weather_forecast.index).rename("Timestamp")
weather_forecast

Unnamed: 0_level_0,Condition
Timestamp,Unnamed: 1_level_1
2015-01-31 01:00:00,Fair
2015-01-31 02:00:00,Fair
2015-01-31 03:00:00,Fair
2015-01-31 04:00:00,Fair
2015-01-31 05:00:00,Fair
...,...
2016-01-31 20:00:00,Fair
2016-01-31 21:00:00,Fair
2016-01-31 22:00:00,Fair
2016-01-31 23:00:00,Fair


In [41]:
# Pair every generation timestamp with its weather condition (inner join on the index).
hour_and_weather = gen_df["Hour"].to_frame().merge(weather_forecast, left_index=True, right_index=True)

# df_ohe below drops the timestamps and is later combined with the energy frame by
# row position, so any missing or duplicated weather timestamp would silently shift
# the regressors. Fail loudly instead.
if not hour_and_weather.index.equals(gen_df.index):
    raise ValueError(
        "Weather data does not cover exactly the timestamps of gen_df; "
        "the one-hot rows would be misaligned with the energy series."
    )

# One-hot encode hour and condition. The encoder is left at its default (sparse)
# output and densified with .toarray(): the `sparse=False` keyword was deprecated
# in scikit-learn 1.2 and removed in 1.4.
ohe = OneHotEncoder()
hot = ohe.fit_transform(hour_and_weather).toarray()
df_ohe = pd.DataFrame(data=hot, columns=ohe.get_feature_names_out())
df_ohe

Unnamed: 0,Hour_0,Hour_1,Hour_2,Hour_3,Hour_4,Hour_5,Hour_6,Hour_7,Hour_8,Hour_9,...,Hour_19,Hour_20,Hour_21,Hour_22,Hour_23,Condition_Cloudy,Condition_Fair,Condition_Mostly Cloudy,Condition_Partly Cloudy,Condition_Thunder
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8755,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
8756,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
8757,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
8758,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


In [42]:
# Attach the one-hot columns to the energy frame. Both frames carry a default
# integer index at this point, so the rows are matched by position.
exog_df = gen_df_no_changes.join(df_ohe)

In [43]:
# Restore the timestamp index. Guarded so that re-running this cell is a no-op
# instead of raising KeyError once 'Timestamp' has already become the index.
if 'Timestamp' in exog_df.columns:
    exog_df = exog_df.set_index('Timestamp')
exog_df

Unnamed: 0_level_0,Generated Energy,Hour_0,Hour_1,Hour_2,Hour_3,Hour_4,Hour_5,Hour_6,Hour_7,Hour_8,...,Hour_19,Hour_20,Hour_21,Hour_22,Hour_23,Condition_Cloudy,Condition_Fair,Condition_Mostly Cloudy,Condition_Partly Cloudy,Condition_Thunder
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-02-01 00:00:00,0.018496,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2015-02-01 01:00:00,0.018332,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2015-02-01 02:00:00,0.018385,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2015-02-01 03:00:00,0.018502,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2015-02-01 04:00:00,0.018524,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2016-01-22 20:00:00,0.018398,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2016-01-22 21:00:00,0.018487,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2016-01-22 22:00:00,0.018362,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
2016-01-22 23:00:00,0.018211,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0


In [44]:
# Same 70 % / 20 % / 10 % positional split as for gen_df, applied to the frame
# that also holds the exogenous columns (cut points are derived from len(gen_df)).
exog_train_end = int(len(gen_df) * 0.7)
exog_validation_end = int(len(gen_df) * 0.9)

exog_train = exog_df.iloc[:exog_train_end]
exog_validation = exog_df.iloc[exog_train_end:exog_validation_end]
exog_test = exog_df.iloc[exog_validation_end:]

exog_train_validation = pd.concat((exog_train, exog_validation), axis=0)

exog_train

Unnamed: 0_level_0,Generated Energy,Hour_0,Hour_1,Hour_2,Hour_3,Hour_4,Hour_5,Hour_6,Hour_7,Hour_8,...,Hour_19,Hour_20,Hour_21,Hour_22,Hour_23,Condition_Cloudy,Condition_Fair,Condition_Mostly Cloudy,Condition_Partly Cloudy,Condition_Thunder
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-02-01 00:00:00,0.018496,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2015-02-01 01:00:00,0.018332,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2015-02-01 02:00:00,0.018385,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2015-02-01 03:00:00,0.018502,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2015-02-01 04:00:00,0.018524,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2016-01-16 20:00:00,0.018212,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2016-01-16 21:00:00,0.018364,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2016-01-16 22:00:00,0.018465,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
2016-01-16 23:00:00,0.018533,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0


In [45]:
# Names of the exogenous regressors: every column after the first one of exog_df.
exog_attributes = exog_df.columns[1:].tolist()
exog_attributes

['Hour_0',
 'Hour_1',
 'Hour_2',
 'Hour_3',
 'Hour_4',
 'Hour_5',
 'Hour_6',
 'Hour_7',
 'Hour_8',
 'Hour_9',
 'Hour_10',
 'Hour_11',
 'Hour_12',
 'Hour_13',
 'Hour_14',
 'Hour_15',
 'Hour_16',
 'Hour_17',
 'Hour_18',
 'Hour_19',
 'Hour_20',
 'Hour_21',
 'Hour_22',
 'Hour_23',
 'Condition_Cloudy',
 'Condition_Fair',
 'Condition_Mostly Cloudy',
 'Condition_Partly Cloudy',
 'Condition_Thunder']

In [46]:
# SARIMA(4, 0, 0)x(0, 1, 2, 24) without trend, with the one-hot hour and weather
# columns as exogenous regressors, fitted on the exogenous training split.
# NOTE(review): the hour dummies repeat with the same 24-step period that the
# seasonal difference removes, so their coefficients may not be identifiable -- confirm.
sarimax_endog = exog_train['Generated Energy']
sarimax_exog = exog_train[exog_attributes]

sarimax_model = SARIMAX(
    endog=sarimax_endog,
    exog=sarimax_exog,
    trend='n',
    order=(4, 0, 0),
    seasonal_order=(0, 1, 2, 24),
)

fitted_sarimax_model = sarimax_model.fit()
fitted_sarimax_model.summary()

 This problem is unconstrained.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =           36     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  1.16348D+00    |proj g|=  4.12509D-01

At iterate    5    f=  1.04600D+00    |proj g|=  4.97625D-02

At iterate   10    f=  1.02874D+00    |proj g|=  2.09261D-02

At iterate   15    f=  1.02700D+00    |proj g|=  3.68893D-03

At iterate   20    f=  1.02697D+00    |proj g|=  2.54995D-04

At iterate   25    f=  1.02697D+00    |proj g|=  4.76095D-05

At iterate   30    f=  1.02697D+00    |proj g|=  1.90936D-05

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nac

0,1,2,3
Dep. Variable:,Generated Energy,No. Observations:,8401.0
Model:,"SARIMAX(4, 0, 0)x(0, 1, [1, 2], 24)",Log Likelihood,-8627.559
Date:,"Tue, 17 May 2022",AIC,17327.118
Time:,12:20:12,BIC,17580.315
Sample:,02-01-2015,HQIC,17413.584
,- 01-17-2016,,
Covariance Type:,opg,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Hour_0,1.232e-09,3.09e-12,399.094,0.000,1.23e-09,1.24e-09
Hour_1,1.188e-09,2.68e-12,442.768,0.000,1.18e-09,1.19e-09
Hour_2,1.287e-09,2.14e-12,601.433,0.000,1.28e-09,1.29e-09
Hour_3,1.604e-09,1.62e-12,987.818,0.000,1.6e-09,1.61e-09
Hour_4,1.502e-09,1.22e-12,1232.557,0.000,1.5e-09,1.5e-09
Hour_5,-5.396e-09,4.45e-13,-1.21e+04,0.000,-5.4e-09,-5.4e-09
Hour_6,2.441e-09,1.35e-12,1813.007,0.000,2.44e-09,2.44e-09
Hour_7,1.899e-09,1.13e-12,1676.696,0.000,1.9e-09,1.9e-09
Hour_8,-3.898e-09,3.46e-12,-1127.112,0.000,-3.9e-09,-3.89e-09

0,1,2,3
Ljung-Box (L1) (Q):,0.08,Jarque-Bera (JB):,44151.99
Prob(Q):,0.78,Prob(JB):,0.0
Heteroskedasticity (H):,0.66,Skew:,-0.09
Prob(H) (two-sided):,0.0,Kurtosis:,14.25


In [None]:
# Predict from the first validation timestamp to the last test timestamp,
# supplying the exogenous columns for exactly that span.
sarimax_forecast_start = exog_validation.index[0]
sarimax_forecast_end = exog_test.index[-1]

sarimax_prediction = fitted_sarimax_model.predict(
    start=sarimax_forecast_start,
    end=sarimax_forecast_end,
    exog=exog_df.loc[sarimax_forecast_start:sarimax_forecast_end, exog_attributes],
)
forecast_sarimax_val_test = pd.DataFrame(sarimax_prediction).rename(
    columns={'predicted_mean': 'Prediction'}
)

forecast_sarimax_val_test

In [None]:
# Observed generation and the matching slice of the forecast over the test span.
y_true = exog_df['Generated Energy'][exog_test.index[0]: exog_test.index[-1]]
sarimax_test_forecast = forecast_sarimax_val_test[exog_test.index[0]: exog_test.index[-1]]

# RMSE as the square root of the MSE. The `squared=False` keyword was deprecated
# in scikit-learn 1.4 and removed in 1.6, so the root is taken explicitly.
sarimax_model_rmse = mean_squared_error(y_true, sarimax_test_forecast) ** 0.5

sarimax_model_rmse

In [52]:
# SARIMAX test-set figure: train+validation history from september_start onward,
# the observed test values and the SARIMAX prediction restricted to the test span.
fig = go.Figure(go.Scattergl(
    x = exog_train_validation[september_start : ].index,
    y = exog_train_validation['Generated Energy'][september_start : ],
    name = "Training Data",
    line=dict(color='rgb(84, 84, 84)')
))

fig.add_trace(go.Scattergl(
    x = exog_test.index,
    y = exog_test['Generated Energy'],
    name = "Observed",
    line=dict(color='rgb(234,143,129)')
))

# Slice the combined validation+test forecast down to the test span once.
# (The original x-values used `exog_test[0].index`, a column lookup for key 0
# that raises KeyError; the first test timestamp is `exog_test.index[0]`.)
forecast_sarimax_test = forecast_sarimax_val_test[exog_test.index[0] : exog_test.index[-1]]

fig.add_trace(go.Scattergl(
    x = forecast_sarimax_test.index,
    y = forecast_sarimax_test['Prediction'],
    name = "Predicted",
    line=dict(color='rgb(32,115,171)')
))

fig.update_yaxes(title="Generated Energy [kWh]")
# Horizontal legend anchored at the top-left corner of the plot area.
fig.update_layout(template="plotly",
                legend=dict(orientation="h",
                yanchor="bottom",
                y=1,
                xanchor="left",
                x=0))

fig.write_html("NEW_ARIMA_FIGS/sarimax_production.html")
fig.show()