# Load Data

In [7]:
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from statsmodels.tsa.statespace.sarimax import SARIMAX
import plotly.graph_objects as go

import DataRetriever as dr

retriever = dr.DataRetriever()

PV_ATTRIBUTES = retriever.get_attributes(file_name='producing_attributes.pkl')

gen_df = retriever.get_data(file_name='All-Subsystems-hour-Year2.pkl')[PV_ATTRIBUTES].sum(axis=1).clip(lower=0) / 1000
gen_df = gen_df.rename("Generated Energy", inplace=True).to_frame()

weather_forecast = pd.read_csv("../_05Forecasting/CLEANED_GAI_2015_2016.csv", index_col=0)
weather_forecast.index.rename("Timestamp", inplace=True)
weather_forecast.index = pd.to_datetime(weather_forecast.index)

# OneHotEncode the weather data

In [8]:
weather_forecast = weather_forecast[weather_forecast.index.isin(gen_df.index)]

ohe = OneHotEncoder(sparse=False)
hot = ohe.fit_transform(weather_forecast)
weather_df = pd.DataFrame(data=hot, columns=ohe.get_feature_names_out(), index=gen_df.index)

In [9]:
data = gen_df.merge(weather_df, left_index=True, right_index=True, how='left')
condition_columns = data.columns[1:]

# Rolling Window Function

In [10]:
def sliding_forecast(endog: pd.DataFrame, trend: str, order: tuple,
                     seasonal_order: tuple=None, exog: pd.DataFrame=None,
                     train_length: int=0, val_length: int=0, shift: int=0):
    train_start = 0
    train_end = train_start + (train_length * 24)
    val_start = train_end
    val_end = val_start + (val_length * 24)
    progress = 1

    rmse_list = list()

    while len(endog[: val_end]) < len(endog):
        if exog is not None:
            model = SARIMAX(endog=endog[train_start : train_end],
                exog=exog[train_start : train_end],
                trend=trend,
                order=order,
                seasonal_order=seasonal_order)\
                .fit(low_memory=True, disp=False, full_output=False)

            predictions = pd.DataFrame(model.predict(start=endog.index[val_start],
                                                     end=endog.index[val_end - 1],
                                                     exog=exog[val_start: val_end]))

        else:
            model = SARIMAX(endog=endog[train_start : train_end],
                trend=trend,
                order=order,
                seasonal_order=seasonal_order)\
                .fit(low_memory=True, disp=False, full_output=False)

            predictions = pd.DataFrame(model.predict(start=endog.index[val_start],
                                                     end=endog.index[val_end - 1]))

        rmse_list.append(mean_squared_error(y_true=endog[val_start: val_end],
                                            y_pred=predictions,
                                            squared=False))

        train_start += shift * 24
        train_end = train_start + (train_length * 24)
        val_start = train_end
        val_end = val_start + (val_length * 24)

        progress += 1

    return sum(rmse_list) / len(rmse_list)

In [None]:
ARIMA_RMSE = sliding_forecast(endog=data["Generated Energy"],
                 trend='n', order=(4, 0, 0),
                 train_length=28, val_length=3, shift=5)

In [20]:
print(f"RMSE for ARIMA(4, 0, 0) is: {round(ARIMA_RMSE, 5)}")

RMSE for ARIMA(4, 0, 0) is: 2.89995


In [12]:
sarima_params = [(4, 0, 0, 2, 0, 0, 24), (4, 1, 0, 2, 0, 0, 24),
                 (4, 0, 0, 2, 1, 0, 24), (4, 1, 0, 2, 1, 0, 24),
                 (4, 0, 0, 0, 0, 2, 24), (4, 1, 0, 0, 0, 2, 24),
                 (4, 0, 0, 0, 1, 2, 24), (4, 1, 0, 0, 1, 2, 24)]

sarima_rmses = dict()
key = 1

for params in sarima_params:
    result = sliding_forecast(endog=data["Generated Energy"],
                              trend='n', order=params[0: 3], seasonal_order=params[3: ],
                              train_length=28, val_length=3, shift=5)
    sarima_rmses[key] = result
    print(f"RMSE for SARIMA{params} is: {round(result, 5)}")
    key += 1

RMSE for SARIMA(4, 0, 0, 2, 0, 0, 24) is: 1.82085
RMSE for SARIMA(4, 1, 0, 2, 0, 0, 24) is: 1.82261
RMSE for SARIMA(4, 0, 0, 2, 1, 0, 24) is: 1.38226
RMSE for SARIMA(4, 1, 0, 2, 1, 0, 24) is: 1.38196
RMSE for SARIMA(4, 0, 0, 0, 0, 2, 24) is: 2.55492
RMSE for SARIMA(4, 1, 0, 0, 0, 2, 24) is: 2.54235


  warn('Non-invertible starting seasonal moving average'
  warn('Non-invertible starting seasonal moving average'
  warn('Non-invertible starting seasonal moving average'
  warn('Non-invertible starting seasonal moving average'
  warn('Non-invertible starting seasonal moving average'
  warn('Non-invertible starting seasonal moving average'
  warn('Non-invertible starting seasonal moving average'


RMSE for SARIMA(4, 0, 0, 0, 1, 2, 24) is: 1.26489
RMSE for SARIMA(4, 1, 0, 0, 1, 2, 24) is: 1.26529


In [18]:
best_rmse = min(sarima_rmses.values())
best_param_key = [key for key in sarima_rmses if sarima_rmses[key] == best_rmse]
best_params = sarima_params[best_param_key[0]] #Access index 0 as the best_param_key returns a list

SARIMAX_RMSE = sliding_forecast(endog=data["Generated Energy"], exog=data[condition_columns],
                 trend='n', order=best_params[0: 3], seasonal_order=best_params[3: ],
                 train_length=28, val_length=3, shift=5)

print(round(SARIMAX_RMSE, 5))

1.23177


# Plot the predictions from a single window of the best model

In [36]:
start_train = 24*40
end_train = start_train + 24*28
start_test = end_train
end_test = start_test + 3*24

In [42]:
data["Generated Energy"][data.index[start_train: end_train]]

Timestamp
2015-03-13 00:00:00    0.018342
2015-03-13 01:00:00    0.018371
2015-03-13 02:00:00    0.018317
2015-03-13 03:00:00    0.018410
2015-03-13 04:00:00    0.018493
                         ...   
2015-04-09 19:00:00    0.017451
2015-04-09 20:00:00    0.017950
2015-04-09 21:00:00    0.018141
2015-04-09 22:00:00    0.018210
2015-04-09 23:00:00    0.018347
Freq: H, Name: Generated Energy, Length: 672, dtype: float64

In [43]:
data[condition_columns][start_train : end_train]

Unnamed: 0_level_0,Condition_Cloudy,Condition_Fair,Condition_Mostly Cloudy,Condition_Partly Cloudy,Condition_Thunder
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2015-03-13 00:00:00,0.0,1.0,0.0,0.0,0.0
2015-03-13 01:00:00,0.0,1.0,0.0,0.0,0.0
2015-03-13 02:00:00,0.0,1.0,0.0,0.0,0.0
2015-03-13 03:00:00,0.0,1.0,0.0,0.0,0.0
2015-03-13 04:00:00,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...
2015-04-09 19:00:00,1.0,0.0,0.0,0.0,0.0
2015-04-09 20:00:00,1.0,0.0,0.0,0.0,0.0
2015-04-09 21:00:00,1.0,0.0,0.0,0.0,0.0
2015-04-09 22:00:00,1.0,0.0,0.0,0.0,0.0


In [45]:
model = SARIMAX(endog=data["Generated Energy"][data.index[start_train: end_train]],
                exog=data[condition_columns][start_train : end_train],
                trend='n',
                order=(4,0,0),
                seasonal_order=(0,1,2,24))\
    .fit(low_memory=True, disp=False, full_output=False)

predictions = pd.DataFrame(model.predict(start=data.index[start_test],
                                         end=data.index[end_test],
                                         exog=data[condition_columns][start_test: end_test + 1]))

In [46]:
predictions

Unnamed: 0,predicted_mean
2015-04-10 00:00:00,-0.309337
2015-04-10 01:00:00,-0.417880
2015-04-10 02:00:00,-0.456798
2015-04-10 03:00:00,-0.529654
2015-04-10 04:00:00,-0.613430
...,...
2015-04-12 20:00:00,0.356377
2015-04-12 21:00:00,0.331444
2015-04-12 22:00:00,0.299383
2015-04-12 23:00:00,0.287629


In [47]:
fig = go.Figure(go.Scattergl(
    x=data[start_train: end_train].index,
    y=data['Generated Energy'][start_train: end_train],
    name="Training Data",
    line=dict(color='rgb(84, 84, 84)')
))

fig.add_trace(go.Scattergl(
    x=data[start_test: end_test].index,
    y=data['Generated Energy'][start_test: end_test],
    name="Observed",
    line=dict(color='rgb(234,143,129)')
))

fig.add_trace(go.Scattergl(
    x=predictions.index,
    y=predictions['predicted_mean'],
    name="Predicted",
    line=dict(color='rgb(32,115,171)')
))

fig.update_yaxes(title="Generated Energy [kWh]")
fig.update_layout(template="plotly",
                  legend=dict(orientation="h",
                              yanchor="bottom",
                              y=1,
                              xanchor="left",
                              x=0))

fig.write_html("slidingwindowarima/window_sarimax_producing.html")
fig.show()

print("Done")

Done
