#### Load and Show Data

In [26]:
import warnings
warnings.filterwarnings("ignore")
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import pandas as pd
import numpy as np

from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.stattools import acf, pacf
from sktime.forecasting.arima import ARIMA

import DataRetriever as dr

retriever = dr.DataRetriever()
#year1 = retriever.get_data("All-Subsystems-hour-Year1.pkl")
year2 = retriever.get_data("All-Subsystems-hour-Year2.pkl")

In [27]:
gen_df = year2[year2["PV_Watts3PhTotalW3PhT1"].isna() == False]
gen_df = gen_df[gen_df["PV_Watts3PhTotalW3PhT2"].isna() == False] #There are 87 rows with NaN values.

print(f"A total of {len(year2) - len(gen_df)} rows have been dropped since they have NaN values.")

gen_df["Generated Energy"] = (gen_df["PV_Watts3PhTotalW3PhT1"].to_numpy() + gen_df["PV_Watts3PhTotalW3PhT2"].to_numpy())
gen_df = gen_df[["Generated Energy"]] / 1e3 #Convert Wh to kWh

gen_df = gen_df.resample('h').sum() #Resample to 'h' again to get correct freq + former dropped rows now are back with 0 gen energy
gen_df

A total of 87 rows have been dropped since they have NaN values.


Unnamed: 0_level_0,Generated Energy
Timestamp,Unnamed: 1_level_1
2015-02-01 00:00:00,0.018496
2015-02-01 01:00:00,0.018332
2015-02-01 02:00:00,0.018385
2015-02-01 03:00:00,0.018502
2015-02-01 04:00:00,0.018524
...,...
2016-01-31 19:00:00,0.017710
2016-01-31 20:00:00,0.017940
2016-01-31 21:00:00,0.018149
2016-01-31 22:00:00,0.018209


In [28]:
all_consuming = retriever.get_attributes(file_name='consuming_attributes.pkl')
flex_consuming = ["Load_ClothesWasherPowerWithStandby", "Elec_PowerDishwasher", "Load_DryerPowerTotal"]
fixed_consuming = list(set(all_consuming) - set(flex_consuming))

con_df = pd.DataFrame(retriever.get_data(
    file_name='All-Subsystems-hour-Year2.pkl')[fixed_consuming].sum(axis=1).clip(lower=0) / 1e3
                      ) #Convert til kWh
con_df.columns = ['Fixed Consumed']

#con_df = con_df.resample('4h').sum()

con_df

Unnamed: 0_level_0,Fixed Consumed
Timestamp,Unnamed: 1_level_1
2015-02-01 00:00:00,1.748570
2015-02-01 01:00:00,2.216490
2015-02-01 02:00:00,1.941349
2015-02-01 03:00:00,1.750880
2015-02-01 04:00:00,1.979749
...,...
2016-01-31 19:00:00,1.016179
2016-01-31 20:00:00,0.647665
2016-01-31 21:00:00,0.962249
2016-01-31 22:00:00,0.653362


In [29]:
fig = make_subplots(rows=2, cols=1,
                    subplot_titles=(" ", "Matrix Profile")
                    )

fig.add_trace(
    go.Scatter(x=gen_df.index, y=gen_df['Generated Energy'], name="Generated Energy [kWh]"),
    row=1, col=1)

fig.add_trace(
    go.Scatter(y=con_df['Fixed Consumed'], name="Fixed Consumed Energy [kWh]"),
    row=2, col=1)

fig.update_layout(template='plotly')

#### Test for Stationarity

In [30]:
from statsmodels.tsa.stattools import adfuller

#We know from ARIMA that Generated Energy is already stationary.
ADF_test_consumed = adfuller(con_df["Fixed Consumed"])

print(f"The p-value from the Augmented Dickey-Fuller test is {ADF_test_consumed[1]}.")

The p-value from the Augmented Dickey-Fuller test is 1.5944098012798294e-09.


In [31]:
#### Todo? Insert differenced data + acf and pacf plots

#### Create train, validation and test sets

In [32]:
X = gen_df.copy()['2015-02-01 00:00:00' : '2016-01-23 00:00:00']

prediction_range = 24 * 3  #Three days

test_df = X[len(X) - prediction_range : ] #The last 3 days of data

X.drop(test_df.index, inplace=True)

validation_df = X[len(X) - prediction_range : ] #The last 3 days of data after dropping the test_df indexes

train_df = X[ : len(X)-prediction_range] #Rest of the data that is not validation or test
training_data = train_df.index[-1 - (24*7)] # Used for figures later to show the last training week

validation_df, test_df

(                     Generated Energy
 Timestamp                            
 2016-01-17 01:00:00          0.018406
 2016-01-17 02:00:00          0.018367
 2016-01-17 03:00:00          0.018435
 2016-01-17 04:00:00          0.018469
 2016-01-17 05:00:00          0.018402
 ...                               ...
 2016-01-19 20:00:00          0.018206
 2016-01-19 21:00:00          0.018338
 2016-01-19 22:00:00          0.018323
 2016-01-19 23:00:00          0.018418
 2016-01-20 00:00:00          0.018550
 
 [72 rows x 1 columns],
                      Generated Energy
 Timestamp                            
 2016-01-20 01:00:00          0.018541
 2016-01-20 02:00:00          0.018427
 2016-01-20 03:00:00          0.018375
 2016-01-20 04:00:00          0.018486
 2016-01-20 05:00:00          0.018467
 ...                               ...
 2016-01-22 20:00:00          0.018398
 2016-01-22 21:00:00          0.018487
 2016-01-22 22:00:00          0.018362
 2016-01-22 23:00:00          0.018211

#### SARIMA for Production

In [33]:
from statsmodels.tsa.statespace.sarimax import SARIMAX

# Production SAR models parameters are determined by looking at ACF and PACF plots
production_SAR_model = SARIMAX(train_df['Generated Energy'], trend='n', p=4, d=0, q=0, P=2, D=0, Q=0, m=24)
#Production SMA model is found heuristically by searching through best parameter combination
production_SMA_model = SARIMAX(train_df['Generated Energy'], trend='n', p=4, d=0, q=0, P=0, D=1, Q=2, m=24)

fitted_model_SAR = production_SAR_model.fit(maxiter=15)
fitted_model_SMA = production_SMA_model.fit(maxiter=15)

RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            2     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  1.41039D+00    |proj g|=  1.30231D-04

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    2      1      4      1     0     0   8.225D-05   1.410D+00
  F =   1.4103909450234806     

CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH             
RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            2     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  1.4

 This problem is unconstrained.
 This problem is unconstrained.


In [34]:
gen_df['Production SAR prediction'] = fitted_model_SAR.predict(start = validation_df.index[0], end = test_df.index[-1])
gen_df['Production SMA prediction'] = fitted_model_SMA.predict(start = validation_df.index[0], end = test_df.index[-1])
gen_df

Unnamed: 0_level_0,Generated Energy,Production SAR prediction,Production SMA prediction
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2015-02-01 00:00:00,0.018496,,
2015-02-01 01:00:00,0.018332,,
2015-02-01 02:00:00,0.018385,,
2015-02-01 03:00:00,0.018502,,
2015-02-01 04:00:00,0.018524,,
...,...,...,...
2016-01-31 19:00:00,0.017710,,
2016-01-31 20:00:00,0.017940,,
2016-01-31 21:00:00,0.018149,,
2016-01-31 22:00:00,0.018209,,


In [35]:
from sklearn.metrics import mean_squared_error

model_rmse_values = []

for prediction in gen_df[['Production SAR prediction', 'Production SMA prediction']]:
    y_true = gen_df['Generated Energy'][validation_df.index[0] : validation_df.index[-1]]
    y_pred = gen_df[prediction][validation_df.index[0] : validation_df.index[-1]]

    model_rmse = mean_squared_error(y_true,y_pred, squared=False) #squared=False returns RMSE, True returns MSE

    model_rmse_values.append(model_rmse)

model_rmse_values

[2.753775951854885, 2.753775951854885]