# SARIMAX forecasting on PV

#### Import data

In [113]:
import warnings
warnings.filterwarnings("ignore")
import plotly.graph_objects as go
import pandas as pd

from statsmodels.tsa.stattools import acf, pacf
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder

import DataRetriever as dr

retriever = dr.DataRetriever()

PV_ATTRIBUTES = retriever.get_attributes(file_name='producing_attributes.pkl')

gen_df = retriever.get_data(file_name='All-Subsystems-hour-Year2.pkl')[PV_ATTRIBUTES].sum(axis=1).clip(lower=0) / 1000
gen_df = gen_df.rename("Generated Energy", inplace=True).to_frame()
gen_df

Unnamed: 0_level_0,Generated Energy
Timestamp,Unnamed: 1_level_1
2015-02-01 00:00:00,0.018496
2015-02-01 01:00:00,0.018332
2015-02-01 02:00:00,0.018385
2015-02-01 03:00:00,0.018502
2015-02-01 04:00:00,0.018524
...,...
2016-01-31 19:00:00,0.017710
2016-01-31 20:00:00,0.017940
2016-01-31 21:00:00,0.018149
2016-01-31 22:00:00,0.018209


#### Check if PV is stationary + Differencing

In [114]:
from statsmodels.tsa.stattools import adfuller

ADF_test = adfuller(gen_df["Generated Energy"])

print(f"The p-value from the Augmented Dickey-Fuller test is {ADF_test[1]}. \n This suggest that the TS is stationary, and d=0 and D=0.")

The p-value from the Augmented Dickey-Fuller test is 4.569102377280038e-19. 
 This suggest that the TS is stationary, and d=0 and D=0.


#### Determine order of autoregressive terms (p) and moving average terms (q)

In [115]:
acf_values, acf_conf = acf(gen_df['Generated Energy'], nlags=73, alpha=0.05)

for i in range(len(acf_values)):
    acf_conf[i] = acf_conf[i] - acf_values[i]

acf_conf = pd.DataFrame(acf_conf, columns=['Upper', 'Lower'])

fig = go.Figure(go.Bar(
    y = acf_values
))

fig.add_trace(go.Scatter(
    x = list(acf_conf.index) + list(acf_conf.index[::-1]),
    y = list(acf_conf['Upper']) + list(acf_conf['Lower'][::-1]),
    fill='toself',
    fillcolor='rgba(84,84,84,0.2)',
    line=dict(color='rgba(255,255,255,0)'),
    hoverinfo="skip",
))

fig.update_yaxes(title="Autocorrelation", range=[-1, 1])
fig.update_xaxes(title="Lag")
fig.update_layout(template='plotly', showlegend=False)

fig.show()

In [116]:
pacf_values, pacf_conf = pacf(gen_df['Generated Energy'], nlags=73, alpha=0.05)

for i in range(len(pacf_values)):
    pacf_conf[i] = pacf_conf[i] - pacf_values[i]

pacf_conf = pd.DataFrame(pacf_conf, columns=['Upper', 'Lower'])

fig = go.Figure(go.Bar(
    y = pacf_values
))

fig.add_trace(go.Scatter(
    x = list(pacf_conf.index) + list(pacf_conf.index[::-1]),
    y = list(acf_conf['Upper']) + list(acf_conf['Lower'][::-1]),
    fill='toself',
    fillcolor='rgba(84,84,84,0.2)',
    line=dict(color='rgba(255,255,255,0)'),
    hoverinfo="skip"
))

fig.update_yaxes(title="Partial Autocorrelation", range=[-1, 1])
fig.update_xaxes(title="Lag")
fig.update_layout(template='plotly', showlegend=False)

fig.show()

#### Train, validation and test set

In [117]:
#Split gen_df into train, validation and test sets. Corresponds to 70%, 20% and 10% of data, respectively.
train_df = gen_df[: 24*7] #  gen_df[ : int(len(gen_df)*0.7)]
validation_df = gen_df[int(len(gen_df)*0.7) : int(len(gen_df)*0.9)]
test_df = gen_df[int(len(gen_df)*0.9) : ]

In [118]:
train_df

Unnamed: 0_level_0,Generated Energy
Timestamp,Unnamed: 1_level_1
2015-02-01 00:00:00,0.018496
2015-02-01 01:00:00,0.018332
2015-02-01 02:00:00,0.018385
2015-02-01 03:00:00,0.018502
2015-02-01 04:00:00,0.018524
...,...
2015-02-07 19:00:00,0.017949
2015-02-07 20:00:00,0.018130
2015-02-07 21:00:00,0.018347
2015-02-07 22:00:00,0.018529


In [119]:
train_validation_df = pd.concat([train_df, validation_df])
train_validation_df

Unnamed: 0_level_0,Generated Energy
Timestamp,Unnamed: 1_level_1
2015-02-01 00:00:00,0.018496
2015-02-01 01:00:00,0.018332
2015-02-01 02:00:00,0.018385
2015-02-01 03:00:00,0.018502
2015-02-01 04:00:00,0.018524
...,...
2015-12-26 07:00:00,0.018100
2015-12-26 08:00:00,0.039769
2015-12-26 09:00:00,0.317566
2015-12-26 10:00:00,0.596980


In [120]:
september_start = '2015-09-01 00:00:00'

#### ARIMA(4, 0, 0)

In [121]:
arima_model = SARIMAX(endog=train_df, trend='n', order=(4, 0, 0))
fitted_arima_model = arima_model.fit(low_memory=True)

fitted_arima_model.summary()

RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            5     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  1.02617D+00    |proj g|=  3.51936D-02

At iterate    5    f=  1.02596D+00    |proj g|=  1.90027D-03

At iterate   10    f=  1.02595D+00    |proj g|=  4.56150D-05

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    5     10     13      1     0     0   4.562D-05   1.026D+00
  F =   1.0259538030207622     

CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH             


 This problem is unconstrained.


0,1,2,3
Dep. Variable:,Generated Energy,No. Observations:,168.0
Model:,"SARIMAX(4, 0, 0)",Log Likelihood,-172.36
Date:,"Thu, 19 May 2022",AIC,354.72
Time:,06:43:56,BIC,370.34
Sample:,02-01-2015,HQIC,361.06
,- 02-07-2015,,
Covariance Type:,approx,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ar.L1,1.5758,0.077,20.540,0.000,1.425,1.726
ar.L2,-0.7576,0.144,-5.267,0.000,-1.040,-0.476
ar.L3,0.0642,0.144,0.447,0.655,-0.217,0.346
ar.L4,-0.0045,0.076,-0.059,0.953,-0.154,0.145
sigma2,0.4474,0.049,9.164,0.000,0.352,0.543

0,1,2,3
Ljung-Box (L1) (Q):,0.16,Jarque-Bera (JB):,732.14
Prob(Q):,0.69,Prob(JB):,0.0
Heteroskedasticity (H):,1.83,Skew:,1.58
Prob(H) (two-sided):,0.03,Kurtosis:,12.73


In [122]:
# Maybe validation is not need as we dont really adjust this ARIMA model.
forecast_arima_val = pd.DataFrame(fitted_arima_model.predict(start=validation_df.index[0], end=validation_df.index[-1]))
forecast_arima_val.rename(columns={'predicted_mean': 'Prediction'}, inplace=True)

forecast_arima_test = pd.DataFrame(fitted_arima_model.predict(start=test_df.index[0], end=test_df.index[-1]))
forecast_arima_test.rename(columns={'predicted_mean': 'Prediction'}, inplace=True)

forecast_arima_test

Unnamed: 0,Prediction
2015-12-26 12:00:00,0.000000e+00
2015-12-26 13:00:00,-4.940656e-324
2015-12-26 14:00:00,-9.881313e-324
2015-12-26 15:00:00,-9.881313e-324
2015-12-26 16:00:00,-4.940656e-324
...,...
2016-01-31 19:00:00,-4.940656e-324
2016-01-31 20:00:00,-9.881313e-324
2016-01-31 21:00:00,-9.881313e-324
2016-01-31 22:00:00,-4.940656e-324


In [123]:
y_true = gen_df['Generated Energy'][test_df.index[0] : test_df.index[-1]]

#squared=False returns RMSE, True returns MSE
arima_model_rmse = mean_squared_error(y_true, forecast_arima_test, squared=False)

arima_model_rmse

2.1857233393815316

In [124]:
fig = go.Figure(go.Scattergl(
    x = train_validation_df[september_start : ].index,
    y = train_validation_df['Generated Energy'][september_start : ],
    name = "Training Data",
    line=dict(color='rgb(84, 84, 84)')
))

fig.add_trace(go.Scattergl(
    x = test_df.index,
    y = test_df['Generated Energy'],
    name = "Observed",
    line=dict(color='rgb(234,143,129)')
))

fig.add_trace(go.Scattergl(
    x = forecast_arima_test.index,
    y = forecast_arima_test['Prediction'],
    name = "Predicted",
    line=dict(color='rgb(32,115,171)')
))

fig.update_yaxes(title="Generated Energy [kWh]")
fig.update_layout(template="plotly",
                legend=dict(orientation="h",
                yanchor="bottom",
                y=1,
                xanchor="left",
                x=0))

fig.write_html("NEW_ARIMA_FIGS/arima_production.html")
fig.show()

## SARIMA(4, 0, 0, 2, 0, 0, 24)

In [125]:
first_sarima_model = SARIMAX(endog=train_df, trend='n', order=(4, 0, 0), seasonal_order=(2, 0, 0, 24))
fitted_first_sarima_model = first_sarima_model.fit(low_memory=True)

fitted_first_sarima_model.summary()

RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            7     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  1.11622D+00    |proj g|=  4.98252D-01


 This problem is unconstrained.



At iterate    5    f=  9.35617D-01    |proj g|=  1.06795D-01

At iterate   10    f=  8.68246D-01    |proj g|=  6.23244D-03

At iterate   15    f=  8.68027D-01    |proj g|=  2.67907D-03

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    7     19     22      1     0     0   7.492D-07   8.680D-01
  F =  0.86802330187725241     

CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL            


0,1,2,3
Dep. Variable:,Generated Energy,No. Observations:,168.0
Model:,"SARIMAX(4, 0, 0)x(2, 0, 0, 24)",Log Likelihood,-145.828
Date:,"Thu, 19 May 2022",AIC,305.656
Time:,06:43:59,BIC,327.524
Sample:,02-01-2015,HQIC,314.531
,- 02-07-2015,,
Covariance Type:,approx,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ar.L1,1.5176,0.084,18.046,0.000,1.353,1.682
ar.L2,-0.7581,0.148,-5.131,0.000,-1.048,-0.468
ar.L3,0.1467,0.141,1.041,0.298,-0.129,0.423
ar.L4,-0.0485,0.077,-0.633,0.526,-0.199,0.102
ar.S.L24,0.1105,0.065,1.696,0.090,-0.017,0.238
ar.S.L48,0.4790,0.063,7.563,0.000,0.355,0.603
sigma2,0.3015,0.034,8.973,0.000,0.236,0.367

0,1,2,3
Ljung-Box (L1) (Q):,0.02,Jarque-Bera (JB):,2394.77
Prob(Q):,0.9,Prob(JB):,0.0
Heteroskedasticity (H):,1.31,Skew:,2.6
Prob(H) (two-sided):,0.31,Kurtosis:,20.75


In [126]:
forecast_first_sarima_val = pd.DataFrame(
    fitted_first_sarima_model.predict(
        start=validation_df.index[0],
        end=validation_df.index[-1]
    )
)
forecast_first_sarima_val.rename(columns={'predicted_mean': 'Prediction'}, inplace=True)

forecast_first_sarima_val

Unnamed: 0,Prediction
2015-10-14 12:00:00,2.143679e-31
2015-10-14 13:00:00,1.413382e-31
2015-10-14 14:00:00,1.096125e-31
2015-10-14 15:00:00,7.532775e-32
2015-10-14 16:00:00,4.010060e-32
...,...
2015-12-26 07:00:00,5.017561e-42
2015-12-26 08:00:00,3.261597e-41
2015-12-26 09:00:00,6.984249e-41
2015-12-26 10:00:00,1.329422e-40


In [127]:
y_true = gen_df['Generated Energy'][validation_df.index[0]: validation_df.index[-1]]

#squared=False returns RMSE, True returns MSE
first_sarima_model_rmse = mean_squared_error(y_true, forecast_first_sarima_val, squared=False)

first_sarima_model_rmse

2.255867932993165

In [128]:
fig = go.Figure(go.Scattergl(
    x=train_df[september_start:].index,
    y=train_df['Generated Energy'][september_start:],
    name="Training Data",
    line=dict(color='rgb(84, 84, 84)')
))

fig.add_trace(go.Scattergl(
    x=validation_df.index,
    y=validation_df['Generated Energy'],
    name="Observed",
    line=dict(color='rgb(234,143,129)')
))

fig.add_trace(go.Scattergl(
    x=forecast_first_sarima_val.index,
    y=forecast_first_sarima_val['Prediction'],
    name="Predicted",
    line=dict(color='rgb(32,115,171)')
))

fig.update_yaxes(title="Generated Energy [kWh]")
fig.update_layout(template="plotly",
                legend=dict(orientation="h",
                yanchor="bottom",
                y=1,
                xanchor="left",
                x=0))

fig.write_html("NEW_ARIMA_FIGS/first_sarima_production.html")
fig.show()

#### Calculate seasonal difference

In [129]:
# Calculate the seasonal difference as a daily seasonality.
# gen_df["Seasonal Difference"] = gen_df["Generated Energy"] - gen_df["Generated Energy"].shift(24)
# gen_df

## SARIMA(4, 0, 0, 0, 1, 2, 24)

In [130]:
second_sarima_model = SARIMAX(endog=train_df, trend='n', order=(4, 0, 0), seasonal_order=(0, 1, 2, 24))
fitted_second_sarima_model = second_sarima_model.fit(low_memory=True)

fitted_second_sarima_model.summary()

RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            7     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  1.02645D+00    |proj g|=  4.59259D-01


 This problem is unconstrained.



At iterate    5    f=  8.46339D-01    |proj g|=  3.83869D-02

At iterate   10    f=  8.38152D-01    |proj g|=  5.15787D-03

At iterate   15    f=  8.37779D-01    |proj g|=  1.81253D-03

At iterate   20    f=  8.37761D-01    |proj g|=  6.74911D-05

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    7     23     30      1     0     0   2.904D-05   8.378D-01
  F =  0.83776073412343033     

CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH             


0,1,2,3
Dep. Variable:,Generated Energy,No. Observations:,168.0
Model:,"SARIMAX(4, 0, 0)x(0, 1, [1, 2], 24)",Log Likelihood,-140.744
Date:,"Thu, 19 May 2022",AIC,295.488
Time:,06:44:11,BIC,316.276
Sample:,02-01-2015,HQIC,303.935
,- 02-07-2015,,
Covariance Type:,approx,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ar.L1,1.3969,0.098,14.307,0.000,1.206,1.588
ar.L2,-0.5705,0.154,-3.694,0.000,-0.873,-0.268
ar.L3,0.0221,0.145,0.153,0.878,-0.261,0.306
ar.L4,-0.0106,0.084,-0.126,0.900,-0.176,0.155
ma.S.L24,-0.9777,0.288,-3.392,0.001,-1.543,-0.413
ma.S.L48,0.1181,0.107,1.099,0.272,-0.093,0.329
sigma2,0.3233,0.088,3.664,0.000,0.150,0.496

0,1,2,3
Ljung-Box (L1) (Q):,0.0,Jarque-Bera (JB):,825.25
Prob(Q):,1.0,Prob(JB):,0.0
Heteroskedasticity (H):,0.47,Skew:,1.9
Prob(H) (two-sided):,0.01,Kurtosis:,14.09


In [131]:
forecast_second_sarima_val = pd.DataFrame(
    fitted_second_sarima_model.predict(
        start=validation_df.index[0],
        end=validation_df.index[-1]
    )
)
forecast_second_sarima_val.rename(columns={'predicted_mean': 'Prediction'}, inplace=True)

forecast_second_sarima_val

Unnamed: 0,Prediction
2015-10-14 12:00:00,4.902398
2015-10-14 13:00:00,4.316071
2015-10-14 14:00:00,3.540779
2015-10-14 15:00:00,2.351759
2015-10-14 16:00:00,0.989464
...,...
2015-12-26 07:00:00,0.071793
2015-12-26 08:00:00,0.603249
2015-12-26 09:00:00,1.472331
2015-12-26 10:00:00,3.146768


In [132]:
y_true = gen_df['Generated Energy'][validation_df.index[0]: validation_df.index[-1]]

#squared=False returns RMSE, True returns MSE
second_sarima_model_rmse = mean_squared_error(y_true, forecast_second_sarima_val, squared=False)

second_sarima_model_rmse

1.3536620571992535

In [133]:
fig = go.Figure(go.Scattergl(
    x=train_df[september_start : ].index,
    y=train_df['Generated Energy'][september_start :],
    name="Training Data",
    line=dict(color='rgb(84, 84, 84)')
))

fig.add_trace(go.Scattergl(
    x=validation_df.index,
    y=validation_df['Generated Energy'],
    name="Observed",
    line=dict(color='rgb(234,143,129)')
))

fig.add_trace(go.Scattergl(
    x=forecast_second_sarima_val.index,
    y=forecast_second_sarima_val['Prediction'],
    name="Predicted",
    line=dict(color='rgb(32,115,171)')
))

fig.update_yaxes(title="Generated Energy [kWh]")
fig.update_layout(template="plotly",
                legend=dict(orientation="h",
                yanchor="bottom",
                y=1,
                xanchor="left",
                x=0))

fig.write_html("NEW_ARIMA_FIGS/second_sarima_production.html")
fig.show()

#### Forecast test set on the best SARIMA model

In [134]:
forecast_first_sarima_test = pd.DataFrame(
    fitted_first_sarima_model.predict(
        start=test_df.index[0],
        end=test_df.index[-1]
    )
)
forecast_first_sarima_test.rename(columns={'predicted_mean': 'Prediction'}, inplace=True)

In [135]:
forecast_second_sarima_test = pd.DataFrame(
    fitted_second_sarima_model.predict(
        start=test_df.index[0],
        end=test_df.index[-1]
    )
)
forecast_second_sarima_test.rename(columns={'predicted_mean': 'Prediction'}, inplace=True)

In [136]:
# TO DO: Alt efter hvilken SARIMA MODEL der fik den bedste RMSE på val tidligere i filen, dens forecast på test skal bruges i figuren nedenfor. Har lavet begge forecasts ovenfor, men kun den ene skal bruges.

In [137]:
fig = go.Figure(go.Scattergl(
    x=train_validation_df[september_start : ].index,
    y=train_validation_df['Generated Energy'][september_start :],
    name="Training Data",
    line=dict(color='rgb(84, 84, 84)')
))

fig.add_trace(go.Scattergl(
    x=test_df.index,
    y=test_df['Generated Energy'],
    name="Observed",
    line=dict(color='rgb(234,143,129)')
))

fig.add_trace(go.Scattergl(
    x=forecast_second_sarima_test.index,
    y=forecast_second_sarima_test['Prediction'],
    name="Predicted",
    line=dict(color='rgb(32,115,171)')
))

fig.update_yaxes(title="Generated Energy [kWh]")
fig.update_layout(template="plotly",
                legend=dict(orientation="h",
                yanchor="bottom",
                y=1,
                xanchor="left",
                x=0))

fig.write_html("NEW_ARIMA_FIGS/best_sarima_production_testset.html")
fig.show()

### SARIMAX(4, 0, 0, 0, 1, 2, 24)

In [138]:
# gen_df["Hour"] = gen_df.index.hour
#
# ohe = OneHotEncoder(sparse=False)
# hot_np = ohe.fit_transform(gen_df[["Hour"]])
# hot = pd.DataFrame(data=hot_np, columns=ohe.get_feature_names_out())
# hot

In [139]:
gen_df_no_changes = gen_df["Generated Energy"][:test_df.index[-1]].to_frame().reset_index()
gen_df_no_changes

Unnamed: 0,Timestamp,Generated Energy
0,2015-02-01 00:00:00,0.018496
1,2015-02-01 01:00:00,0.018332
2,2015-02-01 02:00:00,0.018385
3,2015-02-01 03:00:00,0.018502
4,2015-02-01 04:00:00,0.018524
...,...,...
8755,2016-01-31 19:00:00,0.017710
8756,2016-01-31 20:00:00,0.017940
8757,2016-01-31 21:00:00,0.018149
8758,2016-01-31 22:00:00,0.018209


In [140]:
weather_forecast = pd.read_csv("../_05Forecasting/CLEANED_GAI_2015_2016.csv", index_col=0)
weather_forecast.index.rename("Timestamp", inplace=True)
weather_forecast.index = pd.to_datetime(weather_forecast.index)
weather_forecast

Unnamed: 0_level_0,Condition
Timestamp,Unnamed: 1_level_1
2015-01-31 01:00:00,Fair
2015-01-31 02:00:00,Fair
2015-01-31 03:00:00,Fair
2015-01-31 04:00:00,Fair
2015-01-31 05:00:00,Fair
...,...
2016-01-31 20:00:00,Fair
2016-01-31 21:00:00,Fair
2016-01-31 22:00:00,Fair
2016-01-31 23:00:00,Fair


In [141]:
ohe = OneHotEncoder(sparse=False)
hot = ohe.fit_transform(weather_forecast)
df_ohe = pd.DataFrame(data=hot, columns=ohe.get_feature_names_out())
df_ohe

Unnamed: 0,Condition_Cloudy,Condition_Fair,Condition_Mostly Cloudy,Condition_Partly Cloudy,Condition_Thunder
0,0.0,1.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...
8779,0.0,1.0,0.0,0.0,0.0
8780,0.0,1.0,0.0,0.0,0.0
8781,0.0,1.0,0.0,0.0,0.0
8782,0.0,1.0,0.0,0.0,0.0


In [142]:
# ohe = OneHotEncoder(sparse=False)
# hot = ohe.fit_transform(gen_df["Hour"].to_frame().merge(weather_forecast, left_index=True, right_index=True))
# df_ohe = pd.DataFrame(data=hot, columns=ohe.get_feature_names_out())
# df_ohe

In [143]:
exog_df = gen_df_no_changes.join(df_ohe)

In [144]:
exog_df.set_index('Timestamp', inplace=True)
exog_df

Unnamed: 0_level_0,Generated Energy,Condition_Cloudy,Condition_Fair,Condition_Mostly Cloudy,Condition_Partly Cloudy,Condition_Thunder
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2015-02-01 00:00:00,0.018496,0.0,1.0,0.0,0.0,0.0
2015-02-01 01:00:00,0.018332,0.0,1.0,0.0,0.0,0.0
2015-02-01 02:00:00,0.018385,0.0,1.0,0.0,0.0,0.0
2015-02-01 03:00:00,0.018502,0.0,1.0,0.0,0.0,0.0
2015-02-01 04:00:00,0.018524,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...
2016-01-31 19:00:00,0.017710,0.0,1.0,0.0,0.0,0.0
2016-01-31 20:00:00,0.017940,0.0,1.0,0.0,0.0,0.0
2016-01-31 21:00:00,0.018149,0.0,1.0,0.0,0.0,0.0
2016-01-31 22:00:00,0.018209,0.0,1.0,0.0,0.0,0.0


In [145]:
exog_train = exog_df[: 24*7] #  exog_df[: int(len(gen_df)* 0.7)]
exog_validation = exog_df[int(len(gen_df)* 0.7) : int(len(gen_df)* 0.9)]
exog_test = exog_df[int(len(gen_df)* 0.9) : ]

exog_train_validation = pd.concat([exog_train, exog_validation])

exog_train

Unnamed: 0_level_0,Generated Energy,Condition_Cloudy,Condition_Fair,Condition_Mostly Cloudy,Condition_Partly Cloudy,Condition_Thunder
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2015-02-01 00:00:00,0.018496,0.0,1.0,0.0,0.0,0.0
2015-02-01 01:00:00,0.018332,0.0,1.0,0.0,0.0,0.0
2015-02-01 02:00:00,0.018385,0.0,1.0,0.0,0.0,0.0
2015-02-01 03:00:00,0.018502,0.0,1.0,0.0,0.0,0.0
2015-02-01 04:00:00,0.018524,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...
2015-02-07 19:00:00,0.017949,0.0,1.0,0.0,0.0,0.0
2015-02-07 20:00:00,0.018130,0.0,1.0,0.0,0.0,0.0
2015-02-07 21:00:00,0.018347,0.0,1.0,0.0,0.0,0.0
2015-02-07 22:00:00,0.018529,0.0,1.0,0.0,0.0,0.0


In [146]:
exog_attributes = list(exog_df.columns)[1:]
exog_attributes

['Condition_Cloudy',
 'Condition_Fair',
 'Condition_Mostly Cloudy',
 'Condition_Partly Cloudy',
 'Condition_Thunder']

In [147]:
sarimax_model = SARIMAX(endog=exog_train['Generated Energy'],
                        exog=exog_train[exog_attributes],
                        trend='n',
                        order=(4, 0, 0),
                        seasonal_order=(0, 1, 2, 24)
                        )

fitted_sarimax_model = sarimax_model.fit(low_memory=True)
fitted_sarimax_model.summary()

 This problem is unconstrained.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =           12     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  1.02466D+00    |proj g|=  4.48723D-01

At iterate    5    f=  8.39973D-01    |proj g|=  3.43463D-02

At iterate   10    f=  8.33054D-01    |proj g|=  3.49727D-03

At iterate   15    f=  8.32712D-01    |proj g|=  3.46699D-03

At iterate   20    f=  8.32695D-01    |proj g|=  2.63054D-04

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
   12     24     31      1     0     0   2.969D-05   8.327D-01
  F =  0.83269456067981462     

CONVERG

0,1,2,3
Dep. Variable:,Generated Energy,No. Observations:,168.0
Model:,"SARIMAX(4, 0, 0)x(0, 1, [1, 2], 24)",Log Likelihood,-139.893
Date:,"Thu, 19 May 2022",AIC,303.785
Time:,06:44:41,BIC,339.423
Sample:,02-01-2015,HQIC,318.267
,- 02-07-2015,,
Covariance Type:,approx,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Condition_Cloudy,0.0626,,,,,
Condition_Fair,-0.0868,,,,,
Condition_Mostly Cloudy,0.0715,,,,,
Condition_Partly Cloudy,-0.0484,,,,,
Condition_Thunder,0,,,,,
ar.L1,1.4120,0.086,16.453,0.000,1.244,1.580
ar.L2,-0.6067,0.156,-3.890,0.000,-0.912,-0.301
ar.L3,0.0540,0.156,0.346,0.730,-0.252,0.360
ar.L4,-0.0209,0.086,-0.244,0.807,-0.189,0.147

0,1,2,3
Ljung-Box (L1) (Q):,0.0,Jarque-Bera (JB):,820.24
Prob(Q):,0.99,Prob(JB):,0.0
Heteroskedasticity (H):,0.47,Skew:,1.92
Prob(H) (two-sided):,0.01,Kurtosis:,14.04


In [148]:
forecast_sarimax_val_test = pd.DataFrame(
    fitted_sarimax_model.predict(
    start=exog_validation.index[0],
    end=exog_test.index[-1],
    exog=exog_df[exog_attributes][exog_validation.index[0] : exog_test.index[-1]]
    )
)
forecast_sarimax_val_test.rename(columns={'predicted_mean': 'Prediction'}, inplace=True)

forecast_sarimax_val_test

ValueError: Provided exogenous values are not of the appropriate shape. Required (8592, 5), got (2628, 5).

In [None]:
y_true = exog_df['Generated Energy'][exog_test.index[0]: exog_test.index[-1]]

#squared=False returns RMSE, True returns MSE
sarimax_model_rmse = mean_squared_error(y_true,
                                        forecast_sarimax_val_test[exog_test.index[0]: exog_test.index[-1]],
                                        squared=False
                                        )

sarimax_model_rmse

In [None]:
fig = go.Figure(go.Scattergl(
    x = exog_train_validation[september_start : ].index,
    y = exog_train_validation['Generated Energy'][september_start : ],
    name = "Training Data",
    line=dict(color='rgb(84, 84, 84)')
))

fig.add_trace(go.Scattergl(
    x = exog_test.index,
    y = exog_test['Generated Energy'],
    name = "Observed",
    line=dict(color='rgb(234,143,129)')
))

fig.add_trace(go.Scattergl(
    x = forecast_sarimax_val_test[exog_test.index[0] : exog_test.index[-1]].index,
    y = forecast_sarimax_val_test['Prediction'][exog_test.index[0] : exog_test.index[-1]],
    name = "Predicted",
    line=dict(color='rgb(32,115,171)')
))

fig.update_yaxes(title="Generated Energy [kWh]")
fig.update_layout(template="plotly",
                legend=dict(orientation="h",
                yanchor="bottom",
                y=1,
                xanchor="left",
                x=0))

fig.write_html("NEW_ARIMA_FIGS/sarimax_production.html")
fig.show()