In [425]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
%matplotlib inline
plt.style.use('seaborn-v0_8-darkgrid')

import os
import datetime
from sklearn.svm import SVR
from skforecast.ForecasterBaseline import ForecasterEquivalentDate
from skforecast.ForecasterAutoreg import ForecasterAutoreg
from sklearn.metrics import mean_absolute_error, mean_squared_error

parameters = {
    "dataset":{
        "path": "../data/Processed_Data/Demand_All_Exogenous.csv",
        "trainingSize": .70,
        "validationSize": .15,
        "testSize": .15
    }
}

In [426]:
#Read Dataset {Path Dataset}
df = pd.read_csv(parameters["dataset"]["path"])
df["Date"] = pd.to_datetime(df["Date"])
df.head(5)

Unnamed: 0,Date,Demand,Year,Month,Hour,DayOfWeek,DayOfYear,Day,Minute,Temperature,...,Sunshine_Duration,Shortwave_Radiation,Direct_Shortwave_Radiation,Diffuse_Shortwave_Radiation,Demand_7d,isWeekend,Hour_sen,Hour_cos,Demand_24h,isHoliday
0,2018-01-01 00:15:00,8,2018,1,0,0,1,1,15,12.967492,...,0.0,0.0,0.0,0.0,0.0,False,0.0,0.261799,0,True
1,2018-01-01 00:30:00,8,2018,1,0,0,1,1,30,12.972492,...,0.0,0.0,0.0,0.0,0.0,False,0.0,0.261799,0,True
2,2018-01-01 00:45:00,8,2018,1,0,0,1,1,45,12.977492,...,0.0,0.0,0.0,0.0,0.0,False,0.0,0.261799,0,True
3,2018-01-01 01:00:00,8,2018,1,1,0,1,1,0,12.982492,...,0.0,0.0,0.0,0.0,0.0,False,0.220297,0.141451,0,True
4,2018-01-01 01:15:00,8,2018,1,1,0,1,1,15,12.709992,...,0.0,0.0,0.0,0.0,0.0,False,0.220297,0.141451,0,True


In [427]:
#Dataset Split
#Podriamos probar otra estrategia para hacer el split
trainingSize = int(parameters["dataset"]["trainingSize"] * df.shape[0])
validationSize = trainingSize + int(parameters["dataset"]["validationSize"] * df.shape[0])
trainingLastDate = str(df.loc[[trainingSize]]["Date"].values[0])
validationLastDate = str(df.loc[[validationSize]]["Date"].values[0])

x_train = df.loc[:trainingSize, :].copy()
x_val = df.loc[trainingSize : validationSize, :].copy()
x_test = df.loc[validationSize:, :].copy()

df = df.set_index("Date")
df = df.asfreq("15min")
x_train = x_train.set_index("Date")
x_train = x_train.asfreq("15min")
x_val = x_val.set_index("Date")
x_val = x_val.asfreq("15min")
x_test = x_test.set_index("Date")
x_test = x_test.asfreq("15min")

print(f"Whole Dataset Size: {df.shape[0]}")
print(f"Trainig Dataset Size: {x_train.shape[0]} From: {x_train.index.min()} to {x_train.index.max()}")
print(f"Validation Dataset Size: {x_val.shape[0]} From: {x_val.index.min()} to {x_val.index.max()}")
print(f"Test Dataset Size: {x_test.shape[0]} From: {x_test.index.min()} to {x_test.index.max()}")

Whole Dataset Size: 70080
Trainig Dataset Size: 49057 From: 2018-01-01 00:15:00 to 2019-05-27 00:15:00
Validation Dataset Size: 10513 From: 2019-05-27 00:15:00 to 2019-09-13 12:15:00
Test Dataset Size: 10512 From: 2019-09-13 12:15:00 to 2020-01-01 00:00:00


In [428]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=x_train.index, y=x_train["Demand"], mode="lines", name="Train"))
fig.add_trace(go.Scatter(x=x_val.index, y=x_val["Demand"], mode="lines", name="Validation"))
fig.add_trace(go.Scatter(x=x_test.index, y=x_test["Demand"], mode="lines", name="Test"))
fig.update_layout(
    title = "Dataset Partition",
    xaxis_title = "Date",
    yaxis_title ="Demand (MWh)",
    width = 1000,
    height = 400,
    margin = dict(l=30, r=20, t=35, b=60),
    legend = dict(
        orientation = "h",
        yanchor = "bottom",
        y = 1.05,
        xanchor = "right",
        x=1
    )
)
fig.show()

In [429]:
#Equivalent Date (Baseline Model)
baselineForecaster = ForecasterEquivalentDate(
    offset= pd.DateOffset(weeks=1),
    n_offsets=1
)
baselineForecaster.fit(y=x_train["Demand"])
predictions = baselineForecaster.predict(steps = 96*7)
predictions = predictions.to_frame()


In [430]:
def plotPredictions(preds, tests):
    fig = go.Figure()
    trace1 = go.Scatter(x=tests.index, y=tests["Demand"], name="test", mode="lines")
    trace2 = go.Scatter(x=preds.index, y=preds["pred"], name="predictions", mode="lines")
    fig.add_trace(trace1)
    fig.add_trace(trace2)
    fig.update_layout(
        title="Real value vs Predicted in Test Data",
        xaxis_title="Date Time",
        yaxis_title="Demand",
        width=1020,
        height=450,
        margin = dict(l=70, r=20, t=55, b=20),
        legend = dict(
            orientation = "h",
            yanchor="top",
            y=1.1,
            xanchor="left",
            x=0.76
        )
    )
    fig.show()

In [431]:
def getMAE(y_pred, y_true):
    return mean_absolute_error(
        y_pred = y_pred["pred"],
        y_true = y_true["Demand"]
    )

def getMSE(y_pred, y_true):
    return mean_squared_error(
        y_pred = y_pred['pred'],
        y_true = y_true["Demand"]
    )
#print(f"MSE: {getMSE(predictions, df[trainingLastDate:])}, MAE: {getMAE(predictions, df[trainingLastDate:])}")
x = df[()]
plotPredictions(predictions, df[trainingLastDate:])

In [432]:
# Modelos
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor, XGBRFRegressor
from sklearn.ensemble import GradientBoostingRegressor
from skforecast.model_selection import backtesting_forecaster
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
forecaster1 = ForecasterAutoreg(
    regressor = LGBMRegressor(random_state=123, verbose=-1),
    transformer_y = StandardScaler(),
    lags=24*7
)
forecaster2 = ForecasterAutoreg(
    regressor = KNeighborsRegressor(n_neighbors=5, weights= 'distance'),
    transformer_y = StandardScaler(),
    lags=24*7
)
#Better Results and Faster Exceution with LGBMRegressor
metrics, predictions = backtesting_forecaster(
    forecaster=forecaster2,
    y=df["Demand"],
    steps = 96, 
    metric = ['mean_absolute_error', 'mean_squared_error'],
    exog=None,
    initial_train_size=len(x_train)+len(x_val),
    refit=False,
    fixed_train_size=False,
    n_jobs= 'auto',
    verbose=False,
    show_progress=False
)

KeyboardInterrupt: 

In [None]:
print(f"Mean Absolute Error (MAE): {metrics[0]},  Mean Squared Error (MSE): {metrics[1]}")

Mean Absolute Error (MAE): 17.26929772774955,  Mean Squared Error (MSE): 1064.9523202264097


In [None]:
plotPredictions(preds=predictions, tests=df[validationLastDate:])