# Load Data

In [121]:
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from statsmodels.tsa.statespace.sarimax import SARIMAX
import plotly.graph_objects as go

import DataRetriever as dr

# Project-local helper used to read the pickled attribute list and measurements.
retriever = dr.DataRetriever()

# Names of the columns that are summed into the generation series below.
PV_ATTRIBUTES = retriever.get_attributes(file_name='producing_attributes.pkl')

# Total generation per timestamp: sum the selected columns row-wise, clamp
# negative totals to zero and scale by 1/1000 (presumably Wh -> kWh, matching
# the axis label used for the final plot - TODO confirm the source unit).
# `rename` is used WITHOUT `inplace=True`: the documented return value of an
# in-place rename is None, so chaining `.to_frame()` onto it is not reliable.
gen_df = (
    retriever.get_data(file_name='All-Subsystems-hour-Year2.pkl')[PV_ATTRIBUTES]
    .sum(axis=1)
    .clip(lower=0)
    .div(1000)
    .rename("Generated Energy")
    .to_frame()
)

# Weather data: the first CSV column becomes the index, which is then parsed
# to datetimes and named "Timestamp" in a single, non-mutating expression.
weather_forecast = pd.read_csv("../_05Forecasting/CLEANED_GAI_2015_2016.csv", index_col=0)
weather_forecast.index = pd.to_datetime(weather_forecast.index).rename("Timestamp")

# One-hot encode the weather data

In [122]:
# Keep only the weather rows whose timestamp also occurs in the generation data.
weather_forecast = weather_forecast[weather_forecast.index.isin(gen_df.index)]

# The encoder is left at its default (sparse) output and densified explicitly.
# The `sparse=False` keyword was renamed to `sparse_output` in newer
# scikit-learn releases, so avoiding the keyword keeps this cell working on
# both old and new versions.
ohe = OneHotEncoder()
hot = ohe.fit_transform(weather_forecast).toarray()

# The encoded rows correspond one-to-one to the rows of `weather_forecast`, so
# they must carry that index. Labelling them with `gen_df.index` fails when the
# two frames differ in length and mislabels rows when they differ in order.
weather_df = pd.DataFrame(data=hot,
                          columns=ohe.get_feature_names_out(),
                          index=weather_forecast.index)

In [123]:
# Attach the one-hot weather columns to the generation frame by index label.
# The left join keeps every generation row, even without a matching weather row.
data = pd.merge(gen_df, weather_df,
                how='left',
                left_index=True,
                right_index=True)

# Every column after the first one (the single column contributed by gen_df)
# is used as an exogenous regressor later on.
condition_columns = data.columns[1:]

# Rolling Window Function

In [131]:
def sliding_forecast(endog: pd.DataFrame, trend: str, order: tuple,
                     seasonal_order: tuple=None, exog: pd.DataFrame=None,
                     train_length: int=0, val_length: int=0, shift: int=0):
    """Evaluate a SARIMAX configuration with a sliding train/validation window.

    Starting at the first row, a model is fitted on ``train_length * 24`` rows
    and used to predict the ``val_length * 24`` rows that follow. The window is
    then moved forward by ``shift * 24`` rows and the procedure is repeated for
    as long as a complete validation window fits into ``endog``.

    Parameters
    ----------
    endog
        Observed series (or single-column frame) to model. Its index labels
        are used as the prediction start/end.
    trend
        Passed through to ``SARIMAX(trend=...)``.
    order
        Passed through to ``SARIMAX(order=...)``.
    seasonal_order
        Passed through to ``SARIMAX(seasonal_order=...)``. When ``None`` the
        argument is omitted so that the library default applies.
    exog
        Optional exogenous regressors, row-aligned with ``endog``. The
        training slice is used for fitting, the validation slice for
        prediction.
    train_length, val_length, shift
        Training window length, validation window length and step between
        consecutive windows. Each is multiplied by 24 to obtain a row count
        (i.e. they are days if the data is hourly). All must be positive.

    Returns
    -------
    float
        Mean of the per-window RMSE values.

    Raises
    ------
    ValueError
        If ``train_length``, ``val_length`` or ``shift`` is not positive, or
        if ``endog`` is too short to hold a single complete window.
    """
    # The defaults of 0 cannot work: shift=0 never advances the window (endless
    # loop) and a zero-length train/validation window has nothing to fit/score.
    if train_length <= 0 or val_length <= 0 or shift <= 0:
        raise ValueError("train_length, val_length and shift must all be positive")

    rows_per_unit = 24
    train_size = train_length * rows_per_unit
    val_size = val_length * rows_per_unit
    step = shift * rows_per_unit
    n_obs = len(endog)

    # Only forward seasonal_order when the caller supplied one.
    model_kwargs = dict(trend=trend, order=order)
    if seasonal_order is not None:
        model_kwargs["seasonal_order"] = seasonal_order

    rmse_list = list()
    train_start = 0
    progress = 1

    # `<=` so that a window ending exactly on the last observation is still
    # evaluated; its final label is endog.index[val_end - 1], which exists.
    while train_start + train_size + val_size <= n_obs:
        train_end = train_start + train_size
        val_start = train_end
        val_end = val_start + val_size

        # exog=None is the default for both SARIMAX and predict, so a single
        # code path serves the runs with and without regressors.
        train_exog = None if exog is None else exog.iloc[train_start: train_end]
        val_exog = None if exog is None else exog.iloc[val_start: val_end]

        model = SARIMAX(endog=endog.iloc[train_start: train_end],
                        exog=train_exog,
                        **model_kwargs)\
            .fit(low_memory=True, disp=False, full_output=False)

        # `end` is inclusive, hence val_end - 1.
        predictions = model.predict(start=endog.index[val_start],
                                    end=endog.index[val_end - 1],
                                    exog=val_exog)

        # Square root of the MSE instead of the `squared=False` flag, which is
        # deprecated/removed in newer scikit-learn releases.
        rmse_list.append(mean_squared_error(y_true=endog.iloc[val_start: val_end],
                                            y_pred=predictions) ** 0.5)

        train_start += step

        print(f"Completed a run - {progress}")

        progress += 1

    if not rmse_list:
        raise ValueError("endog is too short for a single train/validation window")

    return sum(rmse_list) / len(rmse_list)

In [126]:
# Baseline without seasonal terms or weather regressors: order (4, 0, 0),
# no trend term. Window lengths are given in units of 24 rows
# (28 for training, 3 for validation, moved forward by 5 per run).
ARIMA_RMSE = sliding_forecast(
    endog=data["Generated Energy"],
    trend='n',
    order=(4, 0, 0),
    train_length=28,
    val_length=3,
    shift=5,
)

Completed 1/53 runs!
Completed 2/53 runs!
Completed 3/53 runs!
Completed 4/53 runs!
Completed 5/53 runs!
Completed 6/53 runs!
Completed 7/53 runs!
Completed 8/53 runs!
Completed 9/53 runs!
Completed 10/53 runs!
Completed 11/53 runs!
Completed 12/53 runs!
Completed 13/53 runs!
Completed 14/53 runs!
Completed 15/53 runs!
Completed 16/53 runs!
Completed 17/53 runs!
Completed 18/53 runs!
Completed 19/53 runs!
Completed 20/53 runs!
Completed 21/53 runs!
Completed 22/53 runs!
Completed 23/53 runs!
Completed 24/53 runs!
Completed 25/53 runs!
Completed 26/53 runs!
Completed 27/53 runs!
Completed 28/53 runs!
Completed 29/53 runs!
Completed 30/53 runs!
Completed 31/53 runs!
Completed 32/53 runs!
Completed 33/53 runs!
Completed 34/53 runs!
Completed 35/53 runs!
Completed 36/53 runs!
Completed 37/53 runs!
Completed 38/53 runs!
Completed 39/53 runs!
Completed 40/53 runs!
Completed 41/53 runs!
Completed 42/53 runs!
Completed 43/53 runs!
Completed 44/53 runs!
Completed 45/53 runs!
Completed 46/53 run

In [None]:
# Candidate configurations. Each tuple is split below: the first three entries
# are passed as `order`, the remaining four as `seasonal_order`.
sarima_params = [(4, 0, 0, 2, 0, 0, 24), (4, 1, 0, 2, 0, 0, 24),
                 (4, 0, 0, 2, 1, 0, 24), (4, 1, 0, 2, 1, 0, 24),
                 (4, 0, 0, 0, 0, 2, 24), (4, 1, 0, 0, 0, 2, 24),
                 (4, 0, 0, 0, 1, 2, 24), (4, 1, 0, 0, 1, 2, 24)]

# Maps the 1-based position of a candidate in `sarima_params` to its mean RMSE.
sarima_rmses = dict()

# enumerate(start=1) replaces the hand-maintained counter while keeping the
# same 1-based integer keys.
for key, params in enumerate(sarima_params, start=1):
    result = sliding_forecast(endog=data["Generated Energy"],
                              trend='n', order=params[0: 3], seasonal_order=params[3: ],
                              train_length=28, val_length=3, shift=5)
    sarima_rmses[key] = result
    print(f"RMSE for SARIMA{params} is: {round(result, 5)}")

In [None]:
# Pick the candidate with the lowest mean RMSE (first one wins on ties).
# `best_param_key` is a single integer key, not a list, so it can be used to
# index `sarima_params`.
best_param_key = min(sarima_rmses, key=sarima_rmses.get)
best_rmse = sarima_rmses[best_param_key]

# The keys of `sarima_rmses` are 1-based positions, the list is 0-based.
best_params = sarima_params[best_param_key - 1]

# Re-evaluate the best configuration with the weather columns as regressors.
SARIMAX_RMSE = sliding_forecast(endog=data["Generated Energy"], exog=data[condition_columns],
                 trend='n', order=best_params[0: 3], seasonal_order=best_params[3: ],
                 train_length=28, val_length=3, shift=5)

print(round(SARIMAX_RMSE, 5))

# Plot the predictions from a single window of the best model

In [None]:
# Row positions of one example window: 28*24 training rows starting at row
# 24*40, immediately followed by 3*24 held-out rows.
start_train = 24*40
end_train = start_train + 24*28
start_test = end_train
end_test = start_test + 3*24

# Rows are selected by position with .iloc. Indexing a DataFrame with a
# DatetimeIndex (`df[index_slice]`) is interpreted as a column lookup and fails.
model = SARIMAX(endog=data["Generated Energy"].iloc[start_train: end_train],
                exog=data[condition_columns].iloc[start_train: end_train],
                trend='n',
                order=best_params[: 3],
                seasonal_order=best_params[3: ])\
    .fit(low_memory=True, disp=False, full_output=False)

# `end` is inclusive, so the last predicted label is end_test - 1; this makes
# the number of predicted steps equal the number of exog rows supplied.
predictions = pd.DataFrame(model.predict(start=data.index[start_test],
                                         end=data.index[end_test - 1],
                                         exog=data[condition_columns].iloc[start_test: end_test]))

fig = go.Figure(go.Scattergl(
    x=data.index[start_train: end_train],
    y=data['Generated Energy'].iloc[start_train: end_train],
    name="Training Data",
    line=dict(color='rgb(84, 84, 84)')
))

fig.add_trace(go.Scattergl(
    x=data.index[start_test: end_test],
    y=data['Generated Energy'].iloc[start_test: end_test],
    name="Observed",
    line=dict(color='rgb(234,143,129)')
))

# Pass the single prediction column as a 1-D series rather than the whole frame.
fig.add_trace(go.Scattergl(
    x=predictions.index,
    y=predictions.iloc[:, 0],
    name="Predicted",
    line=dict(color='rgb(32,115,171)')
))

fig.update_yaxes(title="Generated Energy [kWh]")
fig.update_layout(template="plotly",
                  legend=dict(orientation="h",
                              yanchor="bottom",
                              y=1,
                              xanchor="left",
                              x=0))

# NOTE: the target directory must already exist; write_html does not create it.
fig.write_html("ARIMA_figs/best_producing_arima.html")
fig.show()