In [1]:
%load_ext autoreload
%autoreload 2
import sys
from pathlib import Path
sys.path.insert(1, str(Path.cwd().parent))
str(Path.cwd().parent)

'c:\\Users\\Joaquín Amat\\Documents\\GitHub\\skforecast'

In [2]:
# Libraries
# ==============================================================================
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
plt.rcParams['lines.linewidth'] = 1.5

from lightgbm import LGBMRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector
from sklearn.pipeline import make_pipeline

from skforecast.ForecasterAutoreg import ForecasterAutoreg
from skforecast.ForecasterAutoregDirect import ForecasterAutoregDirect

In [3]:
# Downloading data
# ==============================================================================
url = ('https://raw.githubusercontent.com/JoaquinAmatRodrigo/Estadistica-machine-'
       'learning-python/master/data/bike_sharing_dataset_clean.csv')
data = pd.read_csv(url)

# Preprocess data
# ==============================================================================
data['date_time'] = pd.to_datetime(data['date_time'], format='%Y-%m-%d %H:%M:%S')
data = data.set_index('date_time')
data = data.asfreq('H')
data = data.sort_index()
data['holiday'] = data['holiday'].astype(int)
data = data[['holiday', 'weather', 'temp', 'hum', 'users']]
data[['holiday', 'weather']] = data[['holiday', 'weather']].astype(str)
print(data.dtypes)
data.head(3)

holiday     object
weather     object
temp       float64
hum        float64
users      float64
dtype: object


Unnamed: 0_level_0,holiday,weather,temp,hum,users
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2011-01-01 00:00:00,0,clear,9.84,81.0,16.0
2011-01-01 01:00:00,0,clear,9.02,80.0,40.0
2011-01-01 02:00:00,0,clear,9.02,80.0,32.0


In [4]:
# Split train-test
# ==============================================================================
start_train = '2012-06-01 00:00:00'
end_train = '2012-07-31 23:59:00'
end_test = '2012-08-15 23:59:00'
data_train = data.loc[start_train:end_train, :]
data_test  = data.loc[end_train:end_test, :]

print(f"Dates train : {data_train.index.min()} --- {data_train.index.max()}  (n={len(data_train)})")
print(f"Dates test  : {data_test.index.min()} --- {data_test.index.max()}  (n={len(data_test)})")

Dates train : 2012-06-01 00:00:00 --- 2012-07-31 23:00:00  (n=1464)
Dates test  : 2012-08-01 00:00:00 --- 2012-08-15 23:00:00  (n=360)


In [5]:
categorical_features = data.select_dtypes(exclude=[np.number]).columns.tolist()
exog_features = ['holiday', 'weather', 'temp', 'hum']
transformer_exog = make_column_transformer(
                        (
                            OrdinalEncoder(
                                dtype=int,
                                handle_unknown="use_encoded_value",
                                unknown_value=-1,
                                encoded_missing_value=-1
                            ),
                            categorical_features
                        ),
                        remainder="passthrough",
                        verbose_feature_names_out=False,
                   ).set_output(transform="pandas")

forecaster = ForecasterAutoregDirect(
                regressor = CatBoostRegressor(
                                cat_features=categorical_features,
                                n_estimators=5,
                                random_state=963,
                                silent=True,
                                allow_writing_files=False
                            ),
                lags = 5,
                steps = 3,
                transformer_exog = transformer_exog
             )

forecaster.fit(
    y = data.loc[:end_train, 'users'],
    exog = data.loc[:end_train, exog_features]
)

X_train, y_train = forecaster.create_train_X_y(
                        y = data.loc[:end_train, 'users'],
                        exog = data.loc[:end_train, exog_features]
                   )
X_train.head()

aaaaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaaaaaaa


Unnamed: 0_level_0,lag_1,lag_2,lag_3,lag_4,lag_5,holiday_step_1,holiday_step_2,holiday_step_3,weather_step_1,weather_step_2,weather_step_3,temp_step_1,temp_step_2,temp_step_3,hum_step_1,hum_step_2,hum_step_3
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2011-01-01 07:00:00,1.0,13.0,32.0,40.0,16.0,0,0,0,1,0,0,9.84,9.02,8.2,75.0,80.0,86.0
2011-01-01 08:00:00,1.0,1.0,13.0,32.0,40.0,0,0,0,0,0,0,9.02,8.2,9.84,80.0,86.0,75.0
2011-01-01 09:00:00,2.0,1.0,1.0,13.0,32.0,0,0,0,0,0,0,8.2,9.84,13.12,86.0,75.0,76.0
2011-01-01 10:00:00,3.0,2.0,1.0,1.0,13.0,0,0,0,0,0,0,9.84,13.12,15.58,75.0,76.0,76.0
2011-01-01 11:00:00,8.0,3.0,2.0,1.0,1.0,0,0,0,0,0,0,13.12,15.58,14.76,76.0,76.0,81.0


In [6]:
forecaster.get_feature_importances(step=1)

Unnamed: 0,feature,importance
0,lag_1,70.15117
1,lag_2,6.682183
2,lag_3,4.560044
3,lag_4,9.515948
4,lag_5,5.134563
5,holiday,0.177256
6,weather,0.0
7,temp,2.235377
8,hum,1.543457
