This notebook applies the same testing scheme used in notebooks 4.0 and 4.1 to a linear regression model, using the Darts package.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from darts.timeseries import TimeSeries
from darts.dataprocessing.transformers.scaler import Scaler
from darts.models.forecasting.linear_regression_model import LinearRegressionModel
from darts.metrics.metrics import mae, mape, rmsle, quantile_loss
from sklearn.preprocessing import MinMaxScaler

  from tqdm.autonotebook import tqdm
  "ds": pd.date_range(start="1949-01-01", periods=len(AirPassengers), freq="M"),


In [2]:
# Seed passed to the model as `random_state`
random_state = 1923

In [3]:
# Plot settings: matplotlib defaults are set in one call, then the seaborn theme
plt.rcParams.update({
    "figure.autolayout": True,
    "figure.dpi": 100,
})
sns.set_style("darkgrid")

In [4]:
# Directory the training data is read from. The trailing slash is required
# because the file path is built by plain string concatenation.
output_dir = "./OutputData/"

In [5]:
# Load the training data and convert the "time" column from strings to datetimes
df = pd.read_csv(f"{output_dir}train_data.csv").assign(
    time = lambda frame: pd.to_datetime(frame["time"])
)

In [6]:
# Display the loaded frame (pandas renders a truncated view of a frame this large)
df

Unnamed: 0,time,consumption_MWh,consumption_lag2,trend,hour_sin,hour_cos,day_sin,day_cos,month_sin,month_cos
0,2018-01-01 02:00:00,24635.32,27412.81,2,7.071068e-01,7.071068e-01,7.818315e-01,0.62349,5.000000e-01,0.866025
1,2018-01-01 03:00:00,23872.12,26324.39,3,8.660254e-01,5.000000e-01,7.818315e-01,0.62349,5.000000e-01,0.866025
2,2018-01-01 04:00:00,23194.89,24635.32,4,9.659258e-01,2.588190e-01,7.818315e-01,0.62349,5.000000e-01,0.866025
3,2018-01-01 05:00:00,23071.96,23872.12,5,1.000000e+00,6.123234e-17,7.818315e-01,0.62349,5.000000e-01,0.866025
4,2018-01-01 06:00:00,23267.90,23194.89,6,9.659258e-01,-2.588190e-01,7.818315e-01,0.62349,5.000000e-01,0.866025
...,...,...,...,...,...,...,...,...,...,...
52577,2023-12-31 19:00:00,35090.93,34549.42,52579,-8.660254e-01,5.000000e-01,-2.449294e-16,1.00000,-2.449294e-16,1.000000
52578,2023-12-31 20:00:00,33310.94,36193.59,52580,-7.071068e-01,7.071068e-01,-2.449294e-16,1.00000,-2.449294e-16,1.000000
52579,2023-12-31 21:00:00,32083.96,35090.93,52581,-5.000000e-01,8.660254e-01,-2.449294e-16,1.00000,-2.449294e-16,1.000000
52580,2023-12-31 22:00:00,30469.49,33310.94,52582,-2.588190e-01,9.659258e-01,-2.449294e-16,1.00000,-2.449294e-16,1.000000


In [7]:
# Target series: the consumption values, indexed by the "time" column
ts_target = TimeSeries.from_dataframe(df, time_col = "time", value_cols = "consumption_MWh")

# Future known covariates: every column except the saved CSV index, the
# timestamp and the target. Selecting by name instead of by position keeps the
# covariate set correct even if the CSV is written without the index column
# (a positional slice would then silently drop the first covariate).
non_covariate_cols = ["Unnamed: 0", "time", "consumption_MWh"]
covariate_cols = [col for col in df.columns if col not in non_covariate_cols]
ts_covariates = TimeSeries.from_dataframe(df, time_col = "time", value_cols = covariate_cols)

In [8]:
# Match the sequence2sequence testing scheme

# Prediction window: number of steps forecast from each prediction point
output_length = 32

# First prediction point
first_t = pd.Timestamp("2022-10-18 16:00:00")

# Number of time steps between consecutive prediction points
stride = 24

# Quantiles to predict: the median plus a lower and an upper bound
quantiles = [0.025, 0.5, 0.975]

In [9]:
# Check first prediction point: index the target series by the timestamp
# and show the resulting time index to confirm the point exists in the series
ts_target[first_t].time_index

DatetimeIndex(['2022-10-18 16:00:00'], dtype='datetime64[ns]', name='time', freq='h')

In [10]:
# Create scaler: a Darts Scaler wrapping a MinMaxScaler with range (-1, 1)
scaler = Scaler(
    scaler = MinMaxScaler(feature_range = (-1, 1))
)

In [11]:
# Create model: a quantile linear regression that forecasts the whole
# prediction window (output_length steps) at once.
# NOTE(review): add_encoders supplies only a transformer and no encoders;
# confirm the scaler is actually applied to anything.
model = LinearRegressionModel(
    lags = 2,
    lags_future_covariates = [0],
    output_chunk_length = output_length,
    add_encoders = {"transformer": scaler},
    likelihood = "quantile",
    quantiles = quantiles,
    random_state = random_state,
    alpha = 0 # No regularization in QuantileRegressor
)

In [None]:
# Perform backtesting: starting at first_t, forecast output_length steps ahead
# every `stride` steps. The model is retrained before each forecast, which is
# what makes this cell slow (over an hour).
ts_hist = model.historical_forecasts(
    series = ts_target,
    future_covariates = ts_covariates,
    start = first_t,
    stride = stride,
    forecast_horizon = output_length,
    last_points_only = False,
    retrain = True
)



Note: the backtesting cell above takes longer than 1 hour to run, because the model is retrained at each evaluation step.