In [1]:
# Reload edited modules automatically before executing each cell
%load_ext autoreload
%autoreload 2
import sys
from pathlib import Path
# Put the parent of the current working directory on the import path
# (position 1, i.e. right after the first entry of sys.path)
sys.path.insert(1, str(Path.cwd().parent))
# Echo the path that was just inserted as the cell output
str(Path.cwd().parent)

'c:\\Users\\Joaquín Amat\\Documents\\GitHub\\skforecast'

In [2]:
from typing import Union, Optional
import pandas as pd
import numpy as np
from sklearn.feature_selection import RFECV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge
import re

from skforecast.ForecasterAutoreg import ForecasterAutoreg
from skforecast.ForecasterAutoregCustom import ForecasterAutoregCustom
from skforecast.datasets import fetch_dataset
import numpy as np
import pandas as pd
from astral.sun import sun
from astral import LocationInfo
from skforecast.datasets import fetch_dataset
from skforecast.model_selection import select_features


In [3]:
# Data download
# ==============================================================================
# raw=False is passed through to fetch_dataset; the dataset description and its
# shape are printed by the call itself.
data = fetch_dataset(
    'bike_sharing_extended_features',
    raw=False,
)

bike_sharing_extended_features
------------------------------
Hourly usage of the bike share system in the city of Washington D.C. during the
years 2011 and 2012. In addition to the number of users per hour, the dataset
was enriched by introducing supplementary features. Addition includes calendar-
based variables (day of the week, hour of the day, month, etc.), indicators for
sunlight, incorporation of rolling temperature averages, and the creation of
polynomial features generated from variable pairs. All cyclic variables are
encoded using sine and cosine functions to ensure accurate representation.
Fanaee-T,Hadi. (2013). Bike Sharing Dataset. UCI Machine Learning Repository.
https://doi.org/10.24432/C5W894.
Shape of the dataset: (17352, 90)


In [4]:
# Exogenous variables to be included in the model
# ==============================================================================
df_exogenous_features = data.drop(columns=['users'])

# Column names are collected pattern by pattern, in this order:
#   1. names ending in _sin or _cos
#   2. names starting with temp_
#   3. names starting with holiday_
# and finally the plain 'temp' and 'holiday' columns.
exog_features = [
    column
    for pattern in ('_sin$|_cos$', '^temp_.*', '^holiday_.*')
    for column in df_exogenous_features.filter(regex=pattern).columns.tolist()
]
exog_features += ['temp', 'holiday']

# Keep only the collected columns and append a constant column, which the
# selection cells force into the result through `force_inclusion`.
df_exogenous_features = (
    df_exogenous_features
    .filter(items=exog_features, axis=1)
    .assign(dummy_variable=1)
)

In [5]:
# Feature selection with RFECV and ForecasterAutoreg (100 lags)
# ==============================================================================
forecaster = ForecasterAutoreg(regressor=Ridge(random_state=123), lags=100)

# The selector wraps the same regressor instance held by the forecaster
selector = RFECV(estimator=forecaster.regressor, min_features_to_select=1, cv=3)

# Lags and exogenous variables are both candidates (select_only_exog=False);
# columns matching the regex in `force_inclusion` are requested to be kept.
selected_lags, selected_exog = select_features(
    selector=selector,
    forecaster=forecaster,
    y=data['users'],
    exog=df_exogenous_features,
    select_only_exog=False,
    subsample=0.1,
    force_inclusion="^dummy_variable",
    verbose=True,
)

Recursive feature elimination
-----------------------------
Total number of features available: 189
Total number of records available: 17252
Total number of records used for feature selection: 1725
Number of features selected: 133
    Selected lags: [1, 2, 3, 5, 7, 8, 9, 10, 11, 12, 14, 16, 17, 18, 19, 23, 24, 25, 31, 32, 35, 36, 38, 43, 46, 47, 49, 50, 51, 55, 57, 59, 60, 62, 70, 71, 72, 73, 74, 75, 82, 84, 85, 91, 96, 97]
    Selected exog : 
 ['month_sin', 'month_cos', 'week_of_year_sin', 'week_of_year_cos', 'week_day_sin', 'week_day_cos', 'hour_day_sin', 'hour_day_cos', 'sunrise_hour_sin', 'sunrise_hour_cos', 'sunset_hour_sin', 'sunset_hour_cos', 'poly_month_sin__month_cos', 'poly_month_sin__week_of_year_sin', 'poly_month_sin__week_of_year_cos', 'poly_month_sin__week_day_sin', 'poly_month_sin__week_day_cos', 'poly_month_sin__hour_day_sin', 'poly_month_sin__hour_day_cos', 'poly_month_sin__sunrise_hour_sin', 'poly_month_sin__sunrise_hour_cos', 'poly_month_sin__sunset_hour_sin', 'poly

In [7]:
def create_predictors(y):
    """
    Build the custom predictors from a window of the series.

    Returns the last (up to) five values of `y` in reverse order, i.e. the
    most recent observation first.
    """
    newest_first_window = slice(-1, -6, -1)
    return y[newest_first_window]


# Feature selection with RFECV and ForecasterAutoregCustom
# ==============================================================================
# window_size matches the five values consumed by create_predictors
forecaster = ForecasterAutoregCustom(
    regressor=Ridge(random_state=123),
    fun_predictors=create_predictors,
    window_size=5,
)

selector = RFECV(estimator=forecaster.regressor, min_features_to_select=1, cv=3)

selected_lags, selected_exog = select_features(
    selector=selector,
    forecaster=forecaster,
    y=data['users'],
    exog=df_exogenous_features,
    select_only_exog=False,
    subsample=0.1,
    force_inclusion="^dummy_variable",
    verbose=True,
)

Recursive feature elimination
-----------------------------
Total number of features available: 94
Total number of records available: 17347
Total number of records used for feature selection: 1734
Number of features selected: 93
    Selected lags: ['custom_predictor_0', 'custom_predictor_1', 'custom_predictor_2', 'custom_predictor_3', 'custom_predictor_4']
    Selected exog : 
 ['month_sin', 'month_cos', 'week_of_year_sin', 'week_of_year_cos', 'week_day_sin', 'week_day_cos', 'hour_day_sin', 'hour_day_cos', 'sunrise_hour_sin', 'sunrise_hour_cos', 'sunset_hour_sin', 'sunset_hour_cos', 'poly_month_sin__month_cos', 'poly_month_sin__week_of_year_sin', 'poly_month_sin__week_of_year_cos', 'poly_month_sin__week_day_sin', 'poly_month_sin__week_day_cos', 'poly_month_sin__hour_day_sin', 'poly_month_sin__hour_day_cos', 'poly_month_sin__sunrise_hour_sin', 'poly_month_sin__sunrise_hour_cos', 'poly_month_sin__sunset_hour_sin', 'poly_month_sin__sunset_hour_cos', 'poly_month_cos__week_of_year_sin', 'po

In [8]:
from sklearn.feature_selection import SelectFromModel

# Feature selection with SelectFromModel and ForecasterAutoreg (100 lags)
# ==============================================================================
forecaster = ForecasterAutoreg(regressor=Ridge(random_state=123), lags=100)

# Importance threshold of 0.25, capped at 25 selected features
selector = SelectFromModel(
    estimator=forecaster.regressor,
    threshold=0.25,
    max_features=25,
)

selected_lags, selected_exog = select_features(
    selector=selector,
    forecaster=forecaster,
    y=data['users'],
    exog=df_exogenous_features,
    select_only_exog=False,
    subsample=0.1,
    force_inclusion="^dummy_variable",
    verbose=True,
)

Recursive feature elimination
-----------------------------
Total number of features available: 189
Total number of records available: 17252
Total number of records used for feature selection: 1725
Number of features selected: 25
    Selected lags: []
    Selected exog : 
 ['week_of_year_sin', 'hour_day_sin', 'poly_month_sin__week_day_sin', 'poly_month_sin__week_day_cos', 'poly_month_sin__hour_day_sin', 'poly_month_sin__sunrise_hour_cos', 'poly_month_cos__sunrise_hour_cos', 'poly_month_cos__sunset_hour_cos', 'poly_week_of_year_sin__week_day_cos', 'poly_week_of_year_sin__hour_day_sin', 'poly_week_of_year_cos__week_day_sin', 'poly_week_of_year_cos__week_day_cos', 'poly_week_day_sin__sunrise_hour_cos', 'poly_week_day_sin__sunset_hour_cos', 'poly_week_day_cos__sunrise_hour_cos', 'poly_week_day_cos__sunset_hour_cos', 'poly_hour_day_sin__hour_day_cos', 'poly_hour_day_sin__sunrise_hour_sin', 'poly_hour_day_sin__sunset_hour_sin', 'poly_hour_day_sin__sunset_hour_cos', 'poly_hour_day_cos__sunris

In [9]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import ShuffleSplit

# Feature selection with SequentialFeatureSelector and ForecasterAutoreg (10 lags)
# ==============================================================================
forecaster = ForecasterAutoreg(regressor=Ridge(random_state=123), lags=10)

# Forward selection of 25 features, scored by negative MAE on a single
# shuffled 70/30 split with a fixed seed.
selector = SequentialFeatureSelector(
    estimator=forecaster.regressor,
    n_features_to_select=25,
    direction='forward',
    cv=ShuffleSplit(n_splits=1, test_size=0.3, random_state=951),
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
)

selected_lags, selected_exog = select_features(
    selector=selector,
    forecaster=forecaster,
    y=data['users'],
    exog=df_exogenous_features,
    select_only_exog=False,
    subsample=0.1,
    force_inclusion="^dummy_variable",
    verbose=True,
)

Recursive feature elimination
-----------------------------
Total number of features available: 99
Total number of records available: 17342
Total number of records used for feature selection: 1734
Number of features selected: 25
    Selected lags: [1, 2, 4, 5]
    Selected exog : 
 ['week_day_cos', 'hour_day_cos', 'poly_month_sin__hour_day_cos', 'poly_month_sin__sunset_hour_sin', 'poly_week_of_year_sin__hour_day_cos', 'poly_week_of_year_sin__sunset_hour_sin', 'poly_week_of_year_sin__sunset_hour_cos', 'poly_week_of_year_cos__hour_day_sin', 'poly_week_of_year_cos__hour_day_cos', 'poly_week_day_sin__hour_day_cos', 'poly_week_day_sin__sunset_hour_cos', 'poly_week_day_cos__hour_day_sin', 'poly_week_day_cos__sunrise_hour_sin', 'poly_hour_day_sin__hour_day_cos', 'poly_hour_day_sin__sunset_hour_cos', 'poly_hour_day_cos__sunrise_hour_sin', 'poly_sunrise_hour_cos__sunset_hour_cos', 'temp_roll_max_7_day', 'holiday_previous_day', 'holiday_next_day', 'holiday', 'dummy_variable']


In [10]:
# Simulate time series with hourly frequency with reproducible results
# ==============================================================================
from sklearn.feature_selection import RFE
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from skforecast.ForecasterAutoregDirect import ForecasterAutoregDirect
import pytest

# make_regression returns a feature matrix and a target vector; both are wrapped
# in pandas objects that share the same hourly DatetimeIndex. The seed is fixed
# so the simulated data is reproducible.
exog, y = make_regression(n_samples=500, n_features=5, n_informative=2, random_state=123)
exog = pd.DataFrame(
    exog,
    # Lowercase 'h' is the current pandas alias for hourly frequency; the
    # uppercase 'H' alias is deprecated in recent pandas releases.
    index = pd.date_range(start='2020-01-01', periods=len(exog), freq='h'),
    columns=[f"exog_{i}" for i in range(exog.shape[1])]
)
y = pd.Series(y, index=exog.index, name="y")

In [11]:
def test_select_features_when_selector_is_RFE_and_select_only_exog_is_True():
    """
    Test select_features with an RFE selector and select_only_exog=True: every
    lag of the forecaster is returned and only the exogenous variables are
    filtered down to `n_features_to_select`.
    """
    forecaster = ForecasterAutoreg(
                    regressor = LinearRegression(),
                    lags      = 5,
                )

    selector = RFE(estimator=forecaster.regressor, n_features_to_select=3)

    selected_lags, selected_exog = select_features(
        selector             = selector,
        forecaster           = forecaster,
        y                    = y,
        exog                 = exog,
        select_only_exog     = True,
        verbose              = False,
    )

    assert selected_lags == [1, 2, 3, 4, 5]
    # Expected value taken from the output recorded in this notebook when this
    # exact configuration is run outside the test. The previous expectation
    # (['exog_1', 'exog_2', 'exog_4']) contradicted that output.
    # NOTE(review): the simulated data has only 2 informative features while 3
    # are requested, so the third pick may be numerically fragile - confirm.
    assert selected_exog == ['exog_0', 'exog_1', 'exog_2']

def test_select_features_when_selector_is_RFE_and_select_only_exog_is_True_regressor():
    """
    Test select_features with select_only_exog=True when the RFE selector is
    built from its own LinearRegression instance instead of the regressor held
    by the forecaster.
    """
    forecaster = ForecasterAutoreg(
                    regressor = LinearRegression(),
                    lags      = 5,
                )

    selector = RFE(estimator=LinearRegression(), n_features_to_select=3)

    selected_lags, selected_exog = select_features(
        selector             = selector,
        forecaster           = forecaster,
        y                    = y,
        exog                 = exog,
        select_only_exog     = True,
        verbose              = False,
    )

    assert selected_lags == [1, 2, 3, 4, 5]
    # Same data, forecaster and selector settings as the run recorded in this
    # notebook, which returned ['exog_0', 'exog_1', 'exog_2']; only the
    # (equivalent, default-configured) estimator instance differs. The previous
    # expectation (['exog_1', 'exog_2', 'exog_4']) contradicted that output.
    # NOTE(review): confirm by re-running the cell.
    assert selected_exog == ['exog_0', 'exog_1', 'exog_2']


def test_select_features_when_selector_is_RFE_and_select_only_exog_is_True_ForecasterAutoregCustom():
    """
    Test select_features with an RFE selector and select_only_exog=True when
    the forecaster is a ForecasterAutoregCustom: all custom predictors are
    expected back and only the exogenous variables are filtered.
    """
    model = ForecasterAutoregCustom(
        regressor=LinearRegression(),
        fun_predictors=lambda y: y[-1:-6:-1],
        window_size=5,
    )
    rfe = RFE(estimator=model.regressor, n_features_to_select=3)

    predictors_kept, exog_kept = select_features(
        selector=rfe,
        forecaster=model,
        y=y,
        exog=exog,
        select_only_exog=True,
        verbose=False,
    )

    expected_predictors = [f'custom_predictor_{i}' for i in range(5)]
    assert predictors_kept == expected_predictors
    # NOTE(review): the notebook's recorded run of the analogous
    # ForecasterAutoreg configuration returned ['exog_0', 'exog_1', 'exog_2'];
    # confirm this expectation is still current.
    assert exog_kept == ['exog_1', 'exog_2', 'exog_4']


def test_select_features_when_selector_is_RFE_and_select_only_exog_is_False():
    """
    Test select_features with an RFE selector and select_only_exog=False, so
    lags and exogenous variables compete for the 5 selected features.
    """
    model = ForecasterAutoreg(regressor=LinearRegression(), lags=5)
    rfe = RFE(estimator=model.regressor, n_features_to_select=5)

    lags_kept, exog_kept = select_features(
        selector=rfe,
        forecaster=model,
        y=y,
        exog=exog,
        select_only_exog=False,
        verbose=False,
    )

    assert lags_kept == [5]
    assert exog_kept == ['exog_0', 'exog_1', 'exog_2', 'exog_3']


def test_select_features_when_selector_is_RFE_and_select_only_exog_is_False_ForecasterAutoregCustom():
    """
    Test select_features with an RFE selector and select_only_exog=False when
    the forecaster is a ForecasterAutoregCustom, so custom predictors and
    exogenous variables compete for the 5 selected features.
    """
    model = ForecasterAutoregCustom(
        regressor=LinearRegression(),
        fun_predictors=lambda y: y[-1:-6:-1],
        window_size=5,
    )
    rfe = RFE(estimator=model.regressor, n_features_to_select=5)

    predictors_kept, exog_kept = select_features(
        selector=rfe,
        forecaster=model,
        y=y,
        exog=exog,
        select_only_exog=False,
        verbose=False,
    )

    assert predictors_kept == ['custom_predictor_4']
    assert exog_kept == ['exog_0', 'exog_1', 'exog_2', 'exog_3']


def test_select_features_when_selector_is_RFE_select_only_exog_is_True_and_force_inclusion_is_regex():
    """
    Test select_features with an RFE selector, select_only_exog=True and
    `force_inclusion` given as a regex that matches 'exog_3'.
    """
    model = ForecasterAutoreg(regressor=LinearRegression(), lags=5)
    rfe = RFE(estimator=model.regressor, n_features_to_select=3)

    lags_kept, exog_kept = select_features(
        selector=rfe,
        forecaster=model,
        y=y,
        exog=exog,
        select_only_exog=True,
        force_inclusion="^exog_3",
        verbose=False,
    )

    assert lags_kept == [1, 2, 3, 4, 5]
    # NOTE(review): without force_inclusion, the notebook's recorded run of this
    # configuration returned ['exog_0', 'exog_1', 'exog_2']; confirm this
    # expectation is still current.
    assert exog_kept == ['exog_1', 'exog_2', 'exog_3', 'exog_4']


def test_select_features_when_selector_is_RFE_select_only_exog_is_True_and_force_inclusion_is_regex_ForecasterAutoregCustom():
    """
    Test select_features with an RFE selector, select_only_exog=True and
    `force_inclusion` given as a regex that matches 'exog_3', when the
    forecaster is a ForecasterAutoregCustom.
    """
    model = ForecasterAutoregCustom(
        regressor=LinearRegression(),
        fun_predictors=lambda y: y[-1:-6:-1],
        window_size=5,
    )
    rfe = RFE(estimator=model.regressor, n_features_to_select=3)

    predictors_kept, exog_kept = select_features(
        selector=rfe,
        forecaster=model,
        y=y,
        exog=exog,
        select_only_exog=True,
        force_inclusion="^exog_3",
        verbose=False,
    )

    expected_predictors = [f'custom_predictor_{i}' for i in range(5)]
    assert predictors_kept == expected_predictors
    # NOTE(review): the notebook's recorded run of the analogous
    # ForecasterAutoreg configuration (without force_inclusion) returned
    # ['exog_0', 'exog_1', 'exog_2']; confirm this expectation is still current.
    assert exog_kept == ['exog_1', 'exog_2', 'exog_3', 'exog_4']


def test_select_features_when_selector_is_RFE_select_only_exog_is_True_and_force_inclusion_is_list():
    """
    Test select_features with an RFE selector, select_only_exog=True and
    `force_inclusion` given as a list of column names (['exog_3']).
    """
    model = ForecasterAutoreg(regressor=LinearRegression(), lags=5)
    rfe = RFE(estimator=model.regressor, n_features_to_select=3)

    lags_kept, exog_kept = select_features(
        selector=rfe,
        forecaster=model,
        y=y,
        exog=exog,
        select_only_exog=True,
        force_inclusion=['exog_3'],
        verbose=False,
    )

    assert lags_kept == [1, 2, 3, 4, 5]
    # NOTE(review): without force_inclusion, the notebook's recorded run of this
    # configuration returned ['exog_0', 'exog_1', 'exog_2']; confirm this
    # expectation is still current.
    assert exog_kept == ['exog_1', 'exog_2', 'exog_3', 'exog_4']


def test_select_features_when_selector_is_RFE_select_only_exog_is_True_and_force_inclusion_is_list_ForecasterAutoregCustom():
    """
    Test select_features with an RFE selector, select_only_exog=True and
    `force_inclusion` given as a list of column names (['exog_3']), when the
    forecaster is a ForecasterAutoregCustom.
    """
    model = ForecasterAutoregCustom(
        regressor=LinearRegression(),
        fun_predictors=lambda y: y[-1:-6:-1],
        window_size=5,
    )
    rfe = RFE(estimator=model.regressor, n_features_to_select=3)

    predictors_kept, exog_kept = select_features(
        selector=rfe,
        forecaster=model,
        y=y,
        exog=exog,
        select_only_exog=True,
        force_inclusion=['exog_3'],
        verbose=False,
    )

    expected_predictors = [f'custom_predictor_{i}' for i in range(5)]
    assert predictors_kept == expected_predictors
    # NOTE(review): the notebook's recorded run of the analogous
    # ForecasterAutoreg configuration (without force_inclusion) returned
    # ['exog_0', 'exog_1', 'exog_2']; confirm this expectation is still current.
    assert exog_kept == ['exog_1', 'exog_2', 'exog_3', 'exog_4']

def test_select_features_raise_error_when_forecaster_is_not_supported():
    """
    Check that select_features raises, with the expected message, when it is
    given a ForecasterAutoregDirect.
    """
    unsupported_forecaster = ForecasterAutoregDirect(
        regressor=LinearRegression(),
        lags=5,
        steps=3,
    )
    rfe = RFE(estimator=unsupported_forecaster.regressor, n_features_to_select=3)

    # The message is escaped because it contains regex metacharacters
    err_msg = re.escape(
            "`forecaster` must be one of the following classes: ['ForecasterAutoreg', "
            "'ForecasterAutoregCustom']."
        )
    with pytest.raises(Exception, match = err_msg):
        lags_kept, exog_kept = select_features(
            selector=rfe,
            forecaster=unsupported_forecaster,
            y=y,
            exog=exog,
            select_only_exog=True,
            verbose=False,
        )

def test_select_features_raise_error_when_subsamle_is_not_between_0_and_1():
    """
    Check that select_features raises, with the expected message, when
    `subsample` is outside the 0-1 range (here, 2).
    """
    model = ForecasterAutoreg(regressor=LinearRegression(), lags=5)
    rfe = RFE(estimator=model.regressor, n_features_to_select=3)

    err_msg = re.escape(
            "`subsample` must be a number between 0 and 1."
        )
    with pytest.raises(Exception, match = err_msg):
        lags_kept, exog_kept = select_features(
            selector=rfe,
            forecaster=model,
            y=y,
            exog=exog,
            select_only_exog=True,
            subsample=2,
            verbose=False,
        )

# Run every test defined in this cell, in definition order. The calls are plain
# function calls (no pytest runner), so the first failing assertion raises and
# the remaining tests are not executed.
test_select_features_when_selector_is_RFE_and_select_only_exog_is_True()
test_select_features_when_selector_is_RFE_and_select_only_exog_is_True_regressor()
test_select_features_when_selector_is_RFE_and_select_only_exog_is_True_ForecasterAutoregCustom()
test_select_features_when_selector_is_RFE_and_select_only_exog_is_False()
test_select_features_when_selector_is_RFE_and_select_only_exog_is_False_ForecasterAutoregCustom()
test_select_features_when_selector_is_RFE_select_only_exog_is_True_and_force_inclusion_is_regex()
test_select_features_when_selector_is_RFE_select_only_exog_is_True_and_force_inclusion_is_regex_ForecasterAutoregCustom()
test_select_features_when_selector_is_RFE_select_only_exog_is_True_and_force_inclusion_is_list()
test_select_features_when_selector_is_RFE_select_only_exog_is_True_and_force_inclusion_is_list_ForecasterAutoregCustom()
# Error-path tests: these expect select_features to raise
test_select_features_raise_error_when_forecaster_is_not_supported()
test_select_features_raise_error_when_subsamle_is_not_between_0_and_1()

AssertionError: 

In [13]:
# Stand-alone reproduction of the RFE / select_only_exog=True configuration,
# printing the selection instead of asserting on it
# ==============================================================================
forecaster = ForecasterAutoreg(regressor=LinearRegression(), lags=5)

selector = RFE(estimator=forecaster.regressor, n_features_to_select=3)

selected_lags, selected_exog = select_features(
    selector=selector,
    forecaster=forecaster,
    y=y,
    exog=exog,
    select_only_exog=True,
    verbose=False,
)

# One list per line: lags first, exogenous variables second
print(selected_lags, selected_exog, sep="\n")

[1, 2, 3, 4, 5]
['exog_0', 'exog_1', 'exog_2']
