In [2]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.insert(1, '/home/ximo/Documents/GitHub/skforecast')
%config Completer.use_jedi = False

In [10]:
# Libraries
# ==============================================================================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from skforecast.ForecasterAutoreg import ForecasterAutoreg

By default, when using `predict` method on a trained forecaster object, predictions starts right after the last training observation.

In [17]:
# Download data
# ==============================================================================
url = ('https://raw.githubusercontent.com/JoaquinAmatRodrigo/skforecast/master/data/h2o.csv')
data = pd.read_csv(url, sep=',', header=0, names=['y', 'date'])

# Data preprocessing
# ==============================================================================
data['date'] = pd.to_datetime(data['date'], format='%Y/%m/%d')
data = data.set_index('date')
data = data.asfreq('MS')
data_train = data.loc[:'2005-01-01']
data_train.tail()

Unnamed: 0_level_0,y
date,Unnamed: 1_level_1
2004-09-01,1.134432
2004-10-01,1.181011
2004-11-01,1.216037
2004-12-01,1.257238
2005-01-01,1.17069


In [15]:
# Create and fit forecaster
# ==============================================================================
forecaster = ForecasterAutoreg(
                    regressor = RandomForestRegressor(),
                    lags = 5
                )

forecaster.fit(y=data_train['y'])
forecaster

ForecasterAutoreg 
Regressor: RandomForestRegressor() 
Lags: [1 2 3 4 5] 
Window size: 5 
Included exogenous: False 
Type of exogenous variable: None 
Exogenous variables names: None 
Training range: [Timestamp('1991-07-01 00:00:00'), Timestamp('2005-01-01 00:00:00')] 
Training index type: DatetimeIndex 
Training index frequency: MS 
Regressor parameters: {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False} 
Creation date: 2021-12-30 16:20:13 
Last fit date: 2021-12-30 16:20:13 
Skforecast version: 0.4.2dev 

In [16]:
# Predict
# ==============================================================================
forecaster.predict(steps=3).to_markdown(tablefmt="github")

2005-02-01    0.929271
2005-03-01    0.745145
2005-04-01    0.695416
Freq: MS, Name: pred, dtype: float64


As expected, predictions follow directly from the end of training data.

Altought this is the most comom behaviour, when the model is deployed in production, it is desired to generate prediction without retraining the model each time.

It is posible to generate predictions starting time ahead of training date using the argument `last_window`. When `last_window` is provided, the forecaster use this data to generate the lads needed as predictors.

In [21]:
# Predict
# ==============================================================================
forecaster.predict(steps=3, last_window=data['y'].tail(5)).to_markdown(tablefmt="github")

2008-07-01    0.821809
2008-08-01    0.895657
2008-09-01    0.913411
Freq: MS, Name: pred, dtype: float64

Since the provided `last_window` contains values from 2008-02-01 to 2008-06-01, the forecaster is able to create the needed lags and predict the next 5 steps.


> **⚠ WARNING:**  
> It is important to note that the lenght of last windows must be enought to include the maximum lag used by the forecaster. Fore example, if the forecaster uses lags 1, 24, 48, `last_window` must include the last 72 values of the series.

