#### Load and Show Data

In [19]:
import warnings
warnings.filterwarnings("ignore")
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import pandas as pd
import numpy as np

from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.stattools import acf, pacf
from sktime.forecasting.arima import ARIMA

import DataRetriever as dr

# Project-local helper used to load the stored datasets.
# NOTE(review): where DataRetriever resolves file names is not visible here — confirm.
retriever = dr.DataRetriever()
# Year 1 is not referenced by the cells visible in this notebook, so its load stays disabled.
#year1 = retriever.get_data("All-Subsystems-hour-Year1.pkl")
# Year 2 data; the following cells use it as a timestamp-indexed DataFrame
# containing the PV power columns (presumably an hourly pickled frame — TODO confirm).
year2 = retriever.get_data("All-Subsystems-hour-Year2.pkl")

In [20]:
# The two PV power channels whose sum is treated as the generated energy.
PV_POWER_COLUMNS = ["PV_Watts3PhTotalW3PhT1", "PV_Watts3PhTotalW3PhT2"]

# Keep only the rows where both PV channels have a reading
# (87 rows were dropped in the recorded run).
pv_valid = year2.dropna(subset=PV_POWER_COLUMNS)

print(f"A total of {len(year2) - len(pv_valid)} rows have been dropped since they have NaN values.")

# Build a brand-new single-column frame rather than assigning a column into a
# filtered slice of `year2` (which is chained assignment and only "worked"
# because warnings are silenced globally).
generated_kwh = (
    pv_valid[PV_POWER_COLUMNS[0]].to_numpy() + pv_valid[PV_POWER_COLUMNS[1]].to_numpy()
) / 1e3  # Convert Wh to kWh
gen_df = pd.DataFrame({"Generated Energy": generated_kwh}, index=pv_valid.index)

# Resample to 'h' again to get a regular hourly frequency.
# NOTE: `.sum()` over an empty hour yields 0, so the rows dropped above come
# back as 0 kWh rather than NaN — downstream cells rely on a gap-free series.
gen_df = gen_df.resample('h').sum()
gen_df

A total of 87 rows have been dropped since they have NaN values.


Unnamed: 0_level_0,Generated Energy
Timestamp,Unnamed: 1_level_1
2015-02-01 00:00:00,0.018496
2015-02-01 01:00:00,0.018332
2015-02-01 02:00:00,0.018385
2015-02-01 03:00:00,0.018502
2015-02-01 04:00:00,0.018524
...,...
2016-01-31 19:00:00,0.017710
2016-01-31 20:00:00,0.017940
2016-01-31 21:00:00,0.018149
2016-01-31 22:00:00,0.018209


In [21]:
# Overview of the full hourly generation series.
fig = go.Figure(go.Scattergl(
    x = gen_df.index,
    y = gen_df["Generated Energy"]
))

# Label the axes so the figure can be read on its own; the y-axis title matches
# the one used by the forecast figure at the end of the notebook.
fig.update_xaxes(title="Timestamp")
fig.update_yaxes(title="Generated Energy [kWh]")
fig.update_layout(template='plotly')

fig.show()

#### Create train, validation and test sets

In [9]:
prediction_range_generation = 24 * 3  # Length of each hold-out window in hourly rows (three days)

# Modelling period; label-based slicing includes both end timestamps.
X = gen_df.copy()['2015-02-01 00:00:00' : '2016-01-23 00:00:00']

# Test set: the final three days of the modelling period.
test_start = len(X) - prediction_range_generation
test_df = X.iloc[test_start:]

# Remove the test rows so the next split only sees train + validation data.
X = X.drop(test_df.index)

# Validation set: the final three days of what is left; everything before it is training data.
validation_start = len(X) - prediction_range_generation
validation_df = X.iloc[validation_start:]
train_df = X.iloc[:validation_start]

# Timestamp located 24*7 rows before the last training row; later figures
# start their "last training week" trace from here.
train_df_1week = train_df.index[-(24 * 7) - 1]

validation_df, test_df

(                     Generated Energy
 Timestamp                            
 2016-01-17 01:00:00          0.018406
 2016-01-17 02:00:00          0.018367
 2016-01-17 03:00:00          0.018435
 2016-01-17 04:00:00          0.018469
 2016-01-17 05:00:00          0.018402
 ...                               ...
 2016-01-19 20:00:00          0.018206
 2016-01-19 21:00:00          0.018338
 2016-01-19 22:00:00          0.018323
 2016-01-19 23:00:00          0.018418
 2016-01-20 00:00:00          0.018550
 
 [72 rows x 1 columns],
                      Generated Energy
 Timestamp                            
 2016-01-20 01:00:00          0.018541
 2016-01-20 02:00:00          0.018427
 2016-01-20 03:00:00          0.018375
 2016-01-20 04:00:00          0.018486
 2016-01-20 05:00:00          0.018467
 ...                               ...
 2016-01-22 20:00:00          0.018398
 2016-01-22 21:00:00          0.018487
 2016-01-22 22:00:00          0.018362
 2016-01-22 23:00:00          0.018211

#### Stationarity hypothesis test

In [10]:
from statsmodels.tsa.stattools import adfuller

# Significance level used to decide whether the unit-root null hypothesis is rejected.
ADF_SIGNIFICANCE_LEVEL = 0.05

# adfuller returns a tuple; element 1 is the p-value of the test.
ADF_test = adfuller(gen_df["Generated Energy"])
adf_p_value = ADF_test[1]

# Derive the printed conclusion from the p-value instead of hard-coding it, so
# the message cannot contradict the test result if the data changes.
if adf_p_value < ADF_SIGNIFICANCE_LEVEL:
    adf_conclusion = "This suggest that the TS is stationary, and d=0."
else:
    adf_conclusion = "The unit-root hypothesis cannot be rejected, so the TS may be non-stationary; consider d>=1."

print(f"The p-value from the Augmented Dickey-Fuller test is {adf_p_value}. \n {adf_conclusion}")

The p-value from the Augmented Dickey-Fuller test is 4.569102377280038e-19. 
 This suggest that the TS is stationary, and d=0.


In [11]:
# First-order difference of the series, kept available in case d=1 is needed later.
# The first row has no predecessor and is therefore NaN.
gen_df['1dif'] = gen_df["Generated Energy"].diff()
gen_df

Unnamed: 0_level_0,Generated Energy,1dif
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2015-02-01 00:00:00,0.018496,
2015-02-01 01:00:00,0.018332,-0.000165
2015-02-01 02:00:00,0.018385,0.000053
2015-02-01 03:00:00,0.018502,0.000117
2015-02-01 04:00:00,0.018524,0.000022
...,...,...
2016-01-31 19:00:00,0.017710,0.000288
2016-01-31 20:00:00,0.017940,0.000230
2016-01-31 21:00:00,0.018149,0.000208
2016-01-31 22:00:00,0.018209,0.000060


#### ACF & PACF plots for p and q parameters

In [12]:
# Autocorrelation for 73 hourly lags (just over three days) with 95% confidence intervals.
acf_values, acf_interval = acf(gen_df['Generated Energy'].dropna(), nlags=73, alpha=0.05)

# statsmodels returns the interval as [lower, upper] columns around each
# estimate. Subtracting the estimate (vectorised, one row per lag) re-centres
# the band on zero so bars that leave the band are the significant lags.
acf_conf = pd.DataFrame(acf_interval - acf_values[:, None], columns=['Lower', 'Upper'])

fig = go.Figure(go.Bar(
    y = acf_values
))

# Shaded confidence band: trace the upper edge left-to-right, then the lower
# edge right-to-left, and let plotly fill the closed polygon.
fig.add_trace(go.Scatter(
    x = list(acf_conf.index) + list(acf_conf.index[::-1]),
    y = list(acf_conf['Upper']) + list(acf_conf['Lower'][::-1]),
    fill='toself',
    fillcolor='rgba(84,84,84,0.2)',
    line=dict(color='rgba(255,255,255,0)'),
    hoverinfo="skip",
))

fig.update_yaxes(title="Autocorrelation", range=[-1, 1])
fig.update_xaxes(title="Lag")
fig.update_layout(template="plotly",
                  legend=dict(orientation="h", yanchor="top", y=1.11, xanchor="left", x=0),
                  showlegend=False
                  )

#fig.write_html("ARIMA_figs/ACF.html")

fig.show()

#autocorrelation_no_difference = plot_acf(gen_df["Generated Energy"].dropna())
# autocorrelation_first_difference = plot_acf(gen_df["1dif"].dropna())

In [13]:
# Partial autocorrelation for 73 hourly lags (just over three days) with 95% confidence intervals.
pacf_values, pacf_interval = pacf(gen_df['Generated Energy'].dropna(), nlags=73, alpha=0.05)

# statsmodels returns the interval as [lower, upper] columns around each
# estimate. Subtracting the estimate (vectorised, one row per lag) re-centres
# the band on zero so bars that leave the band are the significant lags.
pacf_conf = pd.DataFrame(pacf_interval - pacf_values[:, None], columns=['Lower', 'Upper'])

fig = go.Figure(go.Bar(
    y = pacf_values
))

# Shaded confidence band built from the PACF interval itself. The band was
# previously drawn from the ACF interval, which shows the wrong significance
# limits and fails on a fresh kernel if the ACF cell has not been run.
fig.add_trace(go.Scatter(
    x = list(pacf_conf.index) + list(pacf_conf.index[::-1]),
    y = list(pacf_conf['Upper']) + list(pacf_conf['Lower'][::-1]),
    fill='toself',
    fillcolor='rgba(84,84,84,0.2)',
    line=dict(color='rgba(255,255,255,0)'),
    hoverinfo="skip",
))

fig.update_yaxes(title="Partial Autocorrelation", range=[-1, 1])
fig.update_xaxes(title="Lag")
fig.update_layout(template="plotly",
                  legend=dict(orientation="h", yanchor="top", y=1.11, xanchor="left", x=0),
                  showlegend=False
                  )

#fig.write_html("ARIMA_figs/PACF.html")
fig.show()

#partialautocorrelation_no_difference = plot_pacf(gen_df['Generated Energy'].dropna())
#partialautocorrelation_first_difference = plot_pacf(gen_df['1dif'].dropna())

In [14]:
#### Production ARIMA(4, 0, 0) model

In [15]:
from statsmodels.tsa.statespace.sarimax import SARIMAX

# ARIMA(4, 0, 0) with a constant term and an all-zero seasonal order.
sarimax_settings = dict(
    trend='c',
    order=(4, 0, 0),
    seasonal_order=(0, 0, 0, 0),
)
production_arima = SARIMAX(train_df['Generated Energy'], **sarimax_settings)

# Fit on the training set only, capping the optimiser at 15 iterations.
fitted_model = production_arima.fit(maxiter=15)
fitted_model.summary()

 This problem is unconstrained.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            6     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  1.20982D+00    |proj g|=  5.21012D-04

At iterate    5    f=  1.20982D+00    |proj g|=  2.65292D-05

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    6      5      8      1     0     0   2.653D-05   1.210D+00
  F =   1.2098169949542752     

CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH             


0,1,2,3
Dep. Variable:,Generated Energy,No. Observations:,8401.0
Model:,"SARIMAX(4, 0, 0)",Log Likelihood,-10163.673
Date:,"Tue, 17 May 2022",AIC,20339.345
Time:,06:59:53,BIC,20381.562
Sample:,02-01-2015,HQIC,20353.76
,- 01-17-2016,,
Covariance Type:,opg,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
intercept,0.2718,0.019,13.964,0.000,0.234,0.310
ar.L1,1.2654,0.007,173.515,0.000,1.251,1.280
ar.L2,-0.2736,0.009,-31.027,0.000,-0.291,-0.256
ar.L3,-0.0090,0.011,-0.825,0.409,-0.030,0.012
ar.L4,-0.1511,0.008,-18.787,0.000,-0.167,-0.135
sigma2,0.6580,0.006,110.405,0.000,0.646,0.670

0,1,2,3
Ljung-Box (L1) (Q):,0.03,Jarque-Bera (JB):,33497.66
Prob(Q):,0.86,Prob(JB):,0.0
Heteroskedasticity (H):,0.7,Skew:,0.77
Prob(H) (two-sided):,0.0,Kurtosis:,12.66


In [16]:
# Forecast from the first validation timestamp through the last test timestamp.
forecast_start = validation_df.index[0]
forecast_end = test_df.index[-1]
forecast = fitted_model.predict(start=forecast_start, end=forecast_end)

# Column assignment aligns on the timestamp index, so rows outside the
# forecast window are left as NaN.
gen_df['Prediction'] = forecast
gen_df

Unnamed: 0_level_0,Generated Energy,1dif,Prediction
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2015-02-01 00:00:00,0.018496,,
2015-02-01 01:00:00,0.018332,-0.000165,
2015-02-01 02:00:00,0.018385,0.000053,
2015-02-01 03:00:00,0.018502,0.000117,
2015-02-01 04:00:00,0.018524,0.000022,
...,...,...,...
2016-01-31 19:00:00,0.017710,0.000288,
2016-01-31 20:00:00,0.017940,0.000230,
2016-01-31 21:00:00,0.018149,0.000208,
2016-01-31 22:00:00,0.018209,0.000060,


In [17]:
from sklearn.metrics import mean_squared_error

# Score the forecast on the validation window only (both end timestamps inclusive).
validation_window = slice(validation_df.index[0], validation_df.index[-1])
y_true = gen_df.loc[validation_window, 'Generated Energy']
y_pred = gen_df.loc[validation_window, 'Prediction']

# Take the square root of the MSE explicitly: the `squared=False` argument is
# deprecated in scikit-learn 1.4 and removed in 1.6, whereas this form returns
# the same RMSE on every version.
model_rmse = np.sqrt(mean_squared_error(y_true, y_pred))
model_rmse

2.4224131014006867

In [18]:
# Time windows shown in the figure. The history window ends on the first
# validation timestamp, so the training trace meets the observed trace.
history_window = slice(train_df_1week, validation_df.index[0])
validation_window = slice(validation_df.index[0], validation_df.index[-1])

# One entry per trace: (legend name, source column, window, line colour).
trace_specs = [
    ("Training Data", 'Generated Energy', history_window, 'rgb(84, 84, 84)'),
    ("Observed", 'Generated Energy', validation_window, 'rgb(234,143,129)'),
    ("Predicted", 'Prediction', validation_window, 'rgb(32,115,171)'),
]

fig = go.Figure()
for trace_name, column, window, colour in trace_specs:
    fig.add_trace(go.Scattergl(
        x=gen_df[window].index,
        y=gen_df[column][window],
        name=trace_name,
        line=dict(color=colour),
    ))

fig.update_yaxes(title="Generated Energy [kWh]")
fig.update_layout(
    template="plotly",
    legend=dict(orientation="h", yanchor="top", y=1.11, xanchor="left", x=0),
)

#fig.write_html("ARIMA_figs/ARIMA(4,0,0).html")
fig.show()