### Libraries

In [24]:
#Data Manipulation
import pandas as pd
import numpy as np

#Plots
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
from plotly.subplots import make_subplots
%matplotlib inline
plt.style.use('seaborn-v0_8-darkgrid')

#Utilities
import time
import warnings

#Models
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, AdaBoostRegressor
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from skforecast.ForecasterAutoreg import ForecasterAutoreg

#Functionalities
from skforecast.model_selection import backtesting_forecaster
from sklearn.metrics import mean_absolute_error, root_mean_squared_error
from skforecast.model_selection import grid_search_forecaster


# Suppress all warnings for the rest of the session (both calls install an
# "ignore" filter).
warnings.filterwarnings("ignore")
warnings.simplefilter('ignore')

# Notebook-wide configuration.
parameters = dict(
    # Source CSV and the fractions used to partition it.
    dataset=dict(
        path="../data/data_casal_montserratina/LaMonserratina_2022_2023_2024.csv",
        trainingSize=.70,
        validationSize=.15,
        testSize=.15,
    ),
    # Backtesting options; `refit` and `fixedTrainSize` are passed to the
    # skforecast calls further down.
    # NOTE(review): `steps` is not read by any cell reviewed here - confirm it is still needed.
    backtesting=dict(
        steps=96,
        fixedTrainSize=False,
        refit=False,
    ),
    # NOTE(review): `n_splits` is not read by any cell reviewed here.
    validation=dict(
        n_splits=10,
    ),
)

## AutoRegressive Models (SKForecast)

### Split Dataset

In [25]:
# Load the CSV named in the configuration, keep rows up to the cut-off date,
# drop the columns that are not used as predictors and rename the target.
# The cut-off literal is written year-day-month, i.e. 30 December 2023 00:00.
last_date_kept = pd.to_datetime('2023-30-12 0:00', format='%Y-%d-%m %H:%M')
unused_columns = ["Minute", "Consumo_total [kW]_-7d", "Consumo_total [kW]_-24h", "Consumo_total [kW]"]

df = (
    pd.read_csv(parameters["dataset"]["path"])
    .assign(Date=lambda frame: pd.to_datetime(frame["Date"]))
    .loc[lambda frame: frame["Date"] <= last_date_kept]
    .drop(columns=unused_columns)
    .rename(columns={'Consumo_red [kW]': 'Demand'})
)

# Preview the first rows as the cell output.
df.head(5)

Unnamed: 0,Date,Demand,pres,slp,wind_dir,wind_gust_spd,wind_spd,temp,app_temp,rh,...,Year,Month,Hour,DayOfWeek,DayOfYear,Day,isWeekend,Hour_sen,Hour_cos,isHoliday
0,2022-01-01 00:00:00,6.0,1027.0,1028.0,360.0,4.0,1.0,10.7,10.7,92.0,...,2022,1,0,5,1,1,True,0.0,1.0,True
1,2022-01-01 00:15:00,6.0,1027.0,1028.0,357.5,4.1,1.4,10.7,10.7,91.0,...,2022,1,0,5,1,1,True,0.0,1.0,True
2,2022-01-01 00:30:00,6.0,1027.0,1028.0,355.0,4.2,1.8,10.7,10.7,90.0,...,2022,1,0,5,1,1,True,0.0,1.0,True
3,2022-01-01 00:45:00,6.0,1027.0,1028.0,352.5,4.3,2.2,10.7,10.7,89.0,...,2022,1,0,5,1,1,True,0.0,1.0,True
4,2022-01-01 01:00:00,6.0,1027.0,1028.0,350.0,4.4,2.6,10.7,10.7,88.0,...,2022,1,1,5,1,1,True,0.258819,0.965926,True


In [26]:
#Setting Dataset Separators
# The training and validation fractions together form the fitting set; the
# remaining rows are held out as the test set.
trainingSize = int((parameters["dataset"]["validationSize"] + parameters["dataset"]["trainingSize"] ) * df.shape[0])
# Select the split row by POSITION. `df` was row-filtered when it was loaded,
# so a label lookup (`df.loc[[trainingSize]]`) is only correct while the index
# labels happen to coincide with row positions.
trainingLastDate = str(df["Date"].values[trainingSize])

#Set Date as Index and Data Frequency to 15 mins
df = df.set_index("Date")
df = df.asfreq("15min")

#Split Dataset
# Train: everything up to and including the split timestamp.
X_train = df.loc[:trainingLastDate, :].copy()
# Test: strictly after the split timestamp, so the boundary row is not repeated.
X_test = df.loc[df.index > pd.Timestamp(trainingLastDate), :].copy()

# The two partitions must cover the dataset exactly once.
assert len(X_train) + len(X_test) == len(df), "train/test partition does not cover the dataset exactly once"

#Prints
print(f"Whole Dataset Size: {df.shape[0]}")
print(f"Trainig Dataset Size: {X_train.shape[0]} From: {X_train.index.min()} to {X_train.index.max()}")
print(f"Test Dataset Size: {X_test.shape[0]} From: {X_test.index.min()} to {X_test.index.max()}")

Whole Dataset Size: 69889
Trainig Dataset Size: 59406 From: 2022-01-01 00:00:00 to 2023-09-11 19:15:00
Test Dataset Size: 10483 From: 2023-09-11 19:30:00 to 2023-12-30 00:00:00


In [27]:
# Plot the train and test partitions of the target as two line traces.
fig = go.Figure()
fig.add_trace(go.Scatter(x=X_train.index, y=X_train["Demand"], mode="lines", name="Train"))
fig.add_trace(go.Scatter(x=X_test.index, y=X_test["Demand"], mode="lines", name="Test"))
fig.update_layout(
    title = "Dataset Partition",
    xaxis_title = "Date",
    # "Demand" is the renamed "Consumo_red [kW]" column, so the unit is kW.
    yaxis_title ="Demand (kW)",
    width = 1000,
    height = 400,
    margin = dict(l=30, r=20, t=35, b=60),
    # Horizontal legend placed above the plot area, aligned to the right.
    legend = dict(
        orientation = "h",
        yanchor = "bottom",
        y = 1.05,
        xanchor = "right",
        x=1
    )
)
fig.show()

In [28]:
def plotPredictions(dates, y_pred, y_test):
    """Show an interactive line chart of real versus predicted values.

    Parameters
    ----------
    dates : x-axis values shared by both series.
    y_pred : predicted values, drawn as the "predictions" trace.
    y_test : observed values, drawn as the "test" trace.

    The figure is displayed with ``fig.show()``; nothing is returned.
    """
    fig = go.Figure()
    # Observed series first, then predictions, so trace order is preserved.
    for label, values in (("test", y_test), ("predictions", y_pred)):
        fig.add_trace(go.Scatter(x=dates, y=values, name=label, mode="lines"))

    legend_options = dict(orientation="h", yanchor="top", y=1.1, xanchor="left", x=0.76)
    margin_options = dict(l=70, r=20, t=55, b=20)
    fig.update_layout(
        title="Real value vs Predicted in Test Data",
        xaxis_title="Date Time",
        yaxis_title="Demand",
        width=1020,
        height=450,
        margin=margin_options,
        legend=legend_options,
    )
    fig.show()

### Backtesting

In [29]:
def run_backtesting(model, data, initial_train_size, exog, scaler, params):
    init = time.time()
    forecaster = ForecasterAutoreg(
        regressor = model,
        transformer_y = scaler,
        lags=24
    )
    metrics, predictions = backtesting_forecaster(
        forecaster = forecaster,
        steps = (len(data) - initial_train_size),
        y = data["Demand"],
        metric = ['mean_absolute_error', 'mean_squared_error'],
        exog = exog,
        initial_train_size = initial_train_size,
        refit = params["backtesting"]["refit"],
        fixed_train_size = parameters["backtesting"]["fixedTrainSize"],
        verbose = False,
        show_progress = True,
        n_jobs='auto'
    )
    end = time.time()
    return predictions, metrics, (end-init)

In [31]:
# Candidate regressors to compare with a single backtest each.
models = {
    'LGBM': LGBMRegressor(n_estimators=100, random_state=123, verbose=-1),
    'XGBoost': XGBRegressor(n_estimators=100, random_state=123, max_depth=10),
    'KNNR': KNeighborsRegressor(n_neighbors=20, weights='distance'),
    "SVR": SVR(kernel='rbf'),
    'GBM': GradientBoostingRegressor(n_estimators=100, random_state=123),
    'RF': RandomForestRegressor(n_estimators=100, random_state=123),
    'ADA': AdaBoostRegressor(n_estimators=100, random_state=123)
}
# Target scaler for each model, in the same order as `models`: only KNNR and
# SVR get a MinMaxScaler. The list previously had six entries for seven
# models, so zip() silently skipped 'ADA'; it now gets no scaler, like the
# other tree ensembles.
scalers = [None, None, MinMaxScaler(), MinMaxScaler(), None, None, None]
assert len(scalers) == len(models), "each model needs exactly one scaler entry"
results = {}
df_exog = df.drop("Demand", axis=1)

for (model_name, model), scaler in zip(models.items(), scalers):
    predictions, metrics, model_time = run_backtesting(model, df, len(X_train), df_exog, scaler, parameters)
    # metrics[0] is the MAE; metrics[1] is the MSE, reported here as RMSE.
    mae = metrics[0]
    rmse = np.sqrt(metrics[1])
    results[model_name] = {
        'predictions': predictions,
        'MAE': mae,
        'RMSE': rmse,
        'Time': model_time
    }
    print(f"Model {model_name}: MAE: {mae}, RMSE: {rmse}, Time: {model_time:.6f} seconds")

100%|██████████| 1/1 [00:05<00:00,  5.03s/it]


Model LGBM: MAE: 2.6729064359828354, RMSE: 4.615129180367066, Time: 5.770355 seconds


100%|██████████| 1/1 [00:02<00:00,  2.51s/it]


Model XGBoost: MAE: 2.9984428830649286, RMSE: 4.792911662633772, Time: 5.369140 seconds


100%|██████████| 1/1 [00:25<00:00, 25.60s/it]


Model KNNR: MAE: 3.013485377692518, RMSE: 4.592002013040774, Time: 25.682983 seconds


### Hyperparameter Tuning

In [None]:
# Candidate lag settings tried by the search.
lags_grid = [1, 48] + [96 * multiple for multiple in (1, 2, 7)]

# KNN hyperparameters explored for every lag setting.
param_grid = dict(
    n_neighbors=list(range(10, 51, 10)),
    weights=['uniform', 'distance'],
)

# Base forecaster handed to the search.
# NOTE(review): presumably the regressor settings and `lags` given here are
# only starting values that the search overrides - confirm against the
# skforecast docs.
forecaster = ForecasterAutoreg(
    lags=24*7,
    transformer_y=MinMaxScaler(),
    regressor=KNeighborsRegressor(weights='distance', n_neighbors=5),
)

# NOTE(review): this rebinds `results`, replacing the dict of backtesting
# results built earlier in the notebook.
results = grid_search_forecaster(
    forecaster=forecaster,
    y=df["Demand"],
    exog=df_exog,
    # Search space
    lags_grid=lags_grid,
    param_grid=param_grid,
    # Backtesting layout: one fold covering the whole test partition
    initial_train_size=len(X_train),
    steps=len(X_test),
    refit=parameters["backtesting"]["refit"],
    fixed_train_size=parameters['backtesting']["fixedTrainSize"],
    # Scoring and execution
    metric=["mean_squared_error", 'mean_absolute_error'],
    return_best=True,
    n_jobs='auto',
    show_progress=True,
    verbose=False,
)
results.head(10)

In [None]:
# Fit a manually configured KNN forecaster on the training partition and
# evaluate it on the held-out test partition.
# NOTE(review): n_neighbors=25 is not among the grid-search candidates and the
# scaler differs from the MinMaxScaler used in the search - confirm these are
# the intended final settings.
forecaster = ForecasterAutoreg(
    regressor = KNeighborsRegressor(n_neighbors=25, weights= 'uniform'),
    transformer_y = StandardScaler(),
    lags=24*7
)
X_train_exog = X_train.drop("Demand", axis=1)
forecaster.fit(y=X_train["Demand"], exog=X_train_exog)
X_test_exog = X_test.drop("Demand", axis=1)
predictions = forecaster.predict(steps=len(X_test_exog), exog=X_test_exog)

# Compute the metrics outside the f-string: reusing double quotes inside an
# f-string replacement field is a SyntaxError on Python versions before 3.12.
y_true = X_test["Demand"].values
y_hat = predictions.values
test_mae = mean_absolute_error(y_true, y_hat)
test_rmse = root_mean_squared_error(y_true, y_hat)
print(f"TEST MAE: {test_mae}, RMSE: {test_rmse}")
plotPredictions(predictions.index, np.asarray(y_hat), np.asarray(y_true))

In [None]:
#Best Forecaster
# Display the forecaster object as the cell output.
# NOTE(review): `forecaster` was rebound in the previous cell to the manually
# configured KNN model, so this shows that object rather than the one the grid
# search refitted with return_best=True - confirm which one is intended.
forecaster