In [1]:
%load_ext autoreload
%autoreload 2
import sys
from pathlib import Path
sys.path.insert(1, str(Path.cwd().parent))
str(Path.cwd().parent)

'c:\\Users\\Joaquín Amat\\Documents\\GitHub\\skforecast'

In [2]:
from typing import Union, Optional
import pandas as pd
import numpy as np
from sklearn.feature_selection import RFECV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge
import re

from skforecast.ForecasterAutoreg import ForecasterAutoreg
from skforecast.ForecasterAutoregCustom import ForecasterAutoregCustom
from skforecast.datasets import fetch_dataset
import numpy as np
import pandas as pd
from astral.sun import sun
from astral import LocationInfo
from skforecast.datasets import fetch_dataset
from skforecast.model_selection import select_features


In [4]:
# Download the raw dataset and prepare it as an hourly time series
# ==============================================================================
selected_columns = [
    'date_time', 'users', 'holiday', 'weather', 'temp', 'atemp', 'hum', 'windspeed'
]
data = (
    fetch_dataset('bike_sharing', raw=True)
    .loc[:, selected_columns]
    # Parse the timestamp column so it can serve as a DatetimeIndex
    .assign(
        date_time=lambda df: pd.to_datetime(df['date_time'], format='%Y-%m-%d %H:%M:%S')
    )
    .set_index('date_time')
    # Enforce an hourly frequency, then order chronologically
    .asfreq('H')
    .sort_index()
)
data.head()

# Calendar features derived from the datetime index of `data`
# ==============================================================================
# week_day and hour_day are shifted by one, so they start at 1 instead of 0.
calendar_features = pd.DataFrame(index=data.index).assign(
    month=lambda df: df.index.month,
    week_of_year=lambda df: df.index.isocalendar().week,
    week_day=lambda df: df.index.day_of_week + 1,
    hour_day=lambda df: df.index.hour + 1,
)

# Sunlight features
# ==============================================================================
# NOTE(review): Washington DC lies at roughly 38.9 N; the latitude used below
# (40.52) looks off. Confirm it before relying on exact sunrise/sunset times.
location = LocationInfo(
    name='Washington DC',
    region='USA',
    timezone='US/Eastern',
    latitude=40.516666666666666,
    longitude=-77.03333333333333
)

# Evaluate the sun times once per timestamp and reuse the result for both the
# sunrise and the sunset hour, instead of running the same computation twice.
sun_times = [
    sun(location.observer, date=date, tzinfo=location.timezone)
    for date in data.index
]
sunrise_hour = [times['sunrise'].hour for times in sun_times]
sunset_hour = [times['sunset'].hour for times in sun_times]

sun_light_features = pd.DataFrame(
    {
        'sunrise_hour': sunrise_hour,
        'sunset_hour': sunset_hour
    },
    index=data.index
)

# Difference between the sunset and sunrise hours (whole hours only)
sun_light_features['daylight_hours'] = (
    sun_light_features['sunset_hour'] - sun_light_features['sunrise_hour']
)

# 1 when the hour of the observation falls in [sunrise_hour, sunset_hour), else 0
hour_of_day = data.index.hour
sun_light_features['is_daylight'] = np.where(
    (hour_of_day >= sun_light_features['sunrise_hour'])
    & (hour_of_day < sun_light_features['sunset_hour']),
    1,
    0
)

# Holiday features
# ==============================================================================
# The data is hourly, so a shift of 24 rows moves the holiday flag by one day:
# +24 brings the value from the previous day, -24 the value from the next day.
holiday_features = (
    data[['holiday']]
    .astype(int)
    .assign(
        holiday_previous_day=lambda df: df['holiday'].shift(24),
        holiday_next_day=lambda df: df['holiday'].shift(-24),
    )
)

# Temperature features
# ==============================================================================
temp_features = data[['temp']].copy()

# (statistic, label, window in hours), listed in the order the columns are created.
# closed='left' excludes the current observation from its own window.
rolling_specs = [
    ('mean', '1_day', 24),
    ('mean', '7_day', 24*7),
    ('max', '1_day', 24),
    ('min', '1_day', 24),
    ('max', '7_day', 24*7),
    ('min', '7_day', 24*7),
]
for statistic, label, window in rolling_specs:
    past_window = temp_features['temp'].rolling(window, closed='left')
    temp_features[f'temp_roll_{statistic}_{label}'] = getattr(past_window, statistic)()


# Combine every group of exogenous variables column-wise
# ==============================================================================
exogenous_blocks = [
    calendar_features,
    sun_light_features,
    temp_features,
    holiday_features,
]
df_exogenous_features = pd.concat(exogenous_blocks, axis=1)

df_exogenous_features.head(4)

# Cyclical encoding of calendar and sunlight features
# ==============================================================================
def cyclical_encoding(data: pd.Series, cycle_length: int) -> pd.DataFrame:
    """
    Map a cyclical feature onto the unit circle as a sine/cosine pair.

    Each value is converted to the angle ``2 * pi * value / cycle_length``,
    so values that differ by a whole cycle receive the same encoding.

    Parameters
    ----------
    data : pd.Series
        Feature to encode. Its name is used as the prefix of the new columns.
    cycle_length : int
        Number of units in one full cycle. For example, 12 for months,
        24 for hours, etc.

    Returns
    -------
    result : pd.DataFrame
        Dataframe with the columns ``<name>_sin`` and ``<name>_cos``.

    """

    angle = 2 * np.pi * data / cycle_length
    prefix = data.name
    result = pd.DataFrame({
        f"{prefix}_sin": np.sin(angle),
        f"{prefix}_cos": np.cos(angle),
    })

    return result


# Cycle length used for each calendar / sunlight column. The dictionary order
# defines the order of the encoded columns.
cycle_lengths = {
    'month': 12,
    'week_of_year': 52,
    'week_day': 7,
    'hour_day': 24,
    'sunrise_hour': 24,
    'sunset_hour': 24,
}
(
    month_encoded,
    week_of_year_encoded,
    week_day_encoded,
    hour_day_encoded,
    sunrise_hour_encoded,
    sunset_hour_encoded,
) = (
    cyclical_encoding(df_exogenous_features[column], cycle_length=length)
    for column, length in cycle_lengths.items()
)

cyclical_features = pd.concat(
    [
        month_encoded,
        week_of_year_encoded,
        week_day_encoded,
        hour_day_encoded,
        sunrise_hour_encoded,
        sunset_hour_encoded,
    ],
    axis=1
)

# Append the sine/cosine columns to the existing exogenous features
df_exogenous_features = pd.concat([df_exogenous_features, cyclical_features], axis=1)
df_exogenous_features.head(3)

# Pairwise interactions between exogenous variables
# ==============================================================================
# degree=2 with interaction_only=True and include_bias=False is requested;
# the output is configured to be a pandas DataFrame.
transformer_poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
transformer_poly = transformer_poly.set_output(transform="pandas")

# Sine/cosine pairs of the cyclical variables, followed by the remaining inputs
cyclical_bases = [
    'month', 'week_of_year', 'week_day', 'hour_day', 'sunrise_hour', 'sunset_hour'
]
poly_cols = [
    f"{base}_{suffix}" for base in cyclical_bases for suffix in ('sin', 'cos')
] + [
    'daylight_hours',
    'is_daylight',
    'holiday_previous_day',
    'holiday_next_day',
    'temp_roll_mean_1_day',
    'temp_roll_mean_7_day',
    'temp_roll_max_1_day',
    'temp_roll_min_1_day',
    'temp_roll_max_7_day',
    'temp_roll_min_7_day',
    'temp',
    'holiday',
]

# Rows with missing values are removed before fitting the transformer
poly_input = df_exogenous_features[poly_cols].dropna()
# Drop the original input columns so only the interaction terms remain
poly_features = transformer_poly.fit_transform(poly_input).drop(columns=poly_cols)
# Prefix the new columns with "poly_" and replace spaces in their names with "__"
poly_features.columns = [
    f"poly_{col}".replace(" ", "__") for col in poly_features.columns
]
df_exogenous_features = pd.concat([df_exogenous_features, poly_features], axis=1)
df_exogenous_features.head(4)

# Select exogenous variables to be included in the model
# ==============================================================================
# Patterns, in order: columns that end with _sin or _cos, columns that start
# with temp_, and columns that start with holiday_.
selection_patterns = ['_sin$|_cos$', '^temp_.*', '^holiday_.*']
exog_features = [
    column
    for pattern in selection_patterns
    for column in df_exogenous_features.filter(regex=pattern).columns
]
# The raw temperature and holiday columns are always kept
exog_features += ['temp', 'holiday']

df_exogenous_features = df_exogenous_features.filter(items=exog_features, axis=1)

bike_sharing
------------
Hourly usage of the bike share system in the city of Washington D.C. during the
years 2011 and 2012. In addition to the number of users per hour, information
about weather conditions and holidays is available.
Fanaee-T,Hadi. (2013). Bike Sharing Dataset. UCI Machine Learning Repository.
https://doi.org/10.24432/C5W894.
Shape of the dataset: (17544, 12)


In [5]:
# Discard rows with missing exogenous values and add a constant column
# (it is the column targeted by force_inclusion="^dummy_variable" in later cells)
df_exogenous_features = df_exogenous_features.dropna().assign(dummy_variable=1)
# Keep only the observations of `data` that still have exogenous features
data = data.loc[df_exogenous_features.index]

In [7]:
# Feature selection with RFECV on a ForecasterAutoreg
# ==============================================================================
forecaster = ForecasterAutoreg(regressor=Ridge(random_state=123), lags=169)

# The selector wraps the regressor held by the forecaster
selector = RFECV(estimator=forecaster.regressor, min_features_to_select=1, cv=3)

selected_lags, selected_exog = select_features(
    selector=selector,
    forecaster=forecaster,
    y=data['users'],
    exog=df_exogenous_features,
    select_only_exog=True,
    subsample=0.1,
    force_inclusion="^dummy_variable",
    verbose=True,
)

Recursive feature elimination
-----------------------------
Total number of features available: 88
Total number of records available: 17183
Total number of records used for feature selection: 1718
Number of features selected: 87
Selected lags: [  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18
  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36
  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54
  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72
  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90
  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107 108
 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126
 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144
 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162
 163 164 165 166 167 168 169]
Selected exog : 
 ['month_sin', 'month_cos', 'week_of_year_sin', 'wee

In [8]:
def create_predictors(y):
    """
    Return the last five values of `y` in reverse order (most recent first).

    If `y` holds fewer than five values, all of them are returned reversed.
    """
    return y[:-6:-1]


# Feature selection with RFECV on a ForecasterAutoregCustom
# ==============================================================================
# window_size matches the five values consumed by create_predictors
forecaster = ForecasterAutoregCustom(
    regressor=Ridge(random_state=123),
    fun_predictors=create_predictors,
    window_size=5,
)

# The selector wraps the regressor held by the forecaster
selector = RFECV(estimator=forecaster.regressor, min_features_to_select=1, cv=3)

selected_lags, selected_exog = select_features(
    selector=selector,
    forecaster=forecaster,
    y=data['users'],
    exog=df_exogenous_features,
    select_only_exog=True,
    subsample=0.1,
    force_inclusion="^dummy_variable",
    verbose=True,
)

Recursive feature elimination
-----------------------------
Total number of features available: 88
Total number of records available: 17347
Total number of records used for feature selection: 1734
Number of features selected: 78
Selected lags: ['custom_predictor_0', 'custom_predictor_1', 'custom_predictor_2', 'custom_predictor_3', 'custom_predictor_4']
Selected exog : 
 ['month_sin', 'month_cos', 'week_of_year_sin', 'week_of_year_cos', 'week_day_sin', 'hour_day_sin', 'sunrise_hour_sin', 'sunrise_hour_cos', 'sunset_hour_sin', 'sunset_hour_cos', 'poly_month_sin__month_cos', 'poly_month_sin__week_of_year_sin', 'poly_month_sin__week_of_year_cos', 'poly_month_sin__week_day_sin', 'poly_month_sin__hour_day_sin', 'poly_month_sin__hour_day_cos', 'poly_month_sin__sunrise_hour_sin', 'poly_month_sin__sunrise_hour_cos', 'poly_month_sin__sunset_hour_sin', 'poly_month_sin__sunset_hour_cos', 'poly_month_cos__week_of_year_sin', 'poly_month_cos__week_of_year_cos', 'poly_month_cos__week_day_sin', 'poly_m

In [9]:
from sklearn.feature_selection import SelectFromModel

# Feature selection with SelectFromModel, reusing the forecaster defined in a
# previous cell. Custom predictors and exogenous variables are both candidates
# (select_only_exog=False).
# ==============================================================================
selector = SelectFromModel(
    estimator=forecaster.regressor,
    threshold=0.25,
    max_features=25,
)

selected_lags, selected_exog = select_features(
    selector=selector,
    forecaster=forecaster,
    y=data['users'],
    exog=df_exogenous_features,
    select_only_exog=False,
    subsample=0.1,
    force_inclusion="^dummy_variable",
    verbose=True,
)

Recursive feature elimination
-----------------------------
Total number of features available: 93
Total number of records available: 17347
Total number of records used for feature selection: 1734
Number of features selected: 25
Selected lags: ['custom_predictor_0', 'custom_predictor_1', 'custom_predictor_2', 'custom_predictor_3', 'custom_predictor_4']
Selected exog : 
 ['month_sin', 'sunrise_hour_sin', 'poly_month_sin__week_of_year_sin', 'poly_month_sin__hour_day_sin', 'poly_month_sin__hour_day_cos', 'poly_month_sin__sunset_hour_sin', 'poly_month_cos__week_day_sin', 'poly_month_cos__hour_day_cos', 'poly_week_of_year_sin__week_of_year_cos', 'poly_week_of_year_sin__week_day_sin', 'poly_week_of_year_sin__sunset_hour_sin', 'poly_week_of_year_cos__week_day_sin', 'poly_week_of_year_cos__hour_day_sin', 'poly_week_of_year_cos__hour_day_cos', 'poly_week_of_year_cos__sunset_hour_cos', 'poly_week_day_sin__sunrise_hour_sin', 'poly_week_day_sin__sunrise_hour_cos', 'poly_hour_day_sin__hour_day_cos'

In [10]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import ShuffleSplit

# Forward sequential selection of 25 features, scored by negative MAE on a
# single seeded 70/30 shuffle split. Reuses the forecaster defined in a
# previous cell.
# ==============================================================================
validation_split = ShuffleSplit(n_splits=1, test_size=0.3, random_state=951)
selector = SequentialFeatureSelector(
    estimator=forecaster.regressor,
    n_features_to_select=25,
    direction='forward',
    cv=validation_split,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
)

selected_lags, selected_exog = select_features(
    selector=selector,
    forecaster=forecaster,
    y=data['users'],
    exog=df_exogenous_features,
    select_only_exog=False,
    subsample=0.1,
    force_inclusion="^dummy_variable",
    verbose=True,
)

Recursive feature elimination
-----------------------------
Total number of features available: 93
Total number of records available: 17347
Total number of records used for feature selection: 1734
Number of features selected: 25
Selected lags: ['custom_predictor_0', 'custom_predictor_1', 'custom_predictor_2', 'custom_predictor_3', 'custom_predictor_4']
Selected exog : 
 ['hour_day_cos', 'sunrise_hour_sin', 'poly_month_cos__week_of_year_sin', 'poly_month_cos__hour_day_cos', 'poly_month_cos__sunrise_hour_cos', 'poly_week_of_year_sin__week_of_year_cos', 'poly_week_of_year_sin__sunrise_hour_cos', 'poly_week_of_year_cos__week_day_cos', 'poly_week_of_year_cos__hour_day_cos', 'poly_week_day_sin__week_day_cos', 'poly_week_day_cos__hour_day_sin', 'poly_week_day_cos__sunrise_hour_cos', 'poly_hour_day_sin__sunrise_hour_sin', 'poly_hour_day_sin__sunset_hour_cos', 'poly_hour_day_cos__sunrise_hour_sin', 'poly_hour_day_cos__sunset_hour_cos', 'poly_sunrise_hour_cos__sunset_hour_cos', 'temp_roll_min_7_