In [1]:
import numpy as np
import pandas as pd
import warnings
# Install a blanket "ignore" filter: no category, message or module is given,
# so every warning raised in this kernel is suppressed unless a later filter
# overrides it.
# NOTE(review): this also hides deprecation and numerical warnings from the
# modelling libraries used below; consider narrowing it to specific categories.
warnings.filterwarnings(action='ignore')

# Prepare dataset and parameters

In [2]:
from sklearn.metrics import mean_squared_error

def rmse(y_true, y_pred):
    """Root-mean-squared error between targets and predictions.

    Thin wrapper that takes the square root of scikit-learn's
    ``mean_squared_error``.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted values to compare against ``y_true``.

    Returns:
        ``np.sqrt(mean_squared_error(y_true, y_pred))``.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

In [3]:
# --- Metric -----------------------------------------------------------------
# Callable handed to the feature selector, the tuning helper and the evaluator.
scoring = rmse

# --- Column names -----------------------------------------------------------
timestamp_col = 'date'
target_col = 'count'

# --- File locations (relative to the notebook's working directory) ----------
parquet_file_path = 'data/all_features.parquet'
output_dir = 'fs_results'

# --- Split / cross-validation settings --------------------------------------
# All five are forwarded unchanged to prepare_train_val_test_data().
# NOTE(review): prediction_length is presumably the forecast horizon in rows
# (days for this dataset), and test_size=None presumably selects the helper's
# default — confirm both in utils.
prediction_length = 14
test_size = None
val_ratio = 0.25
cv_fold = 5
add_lag_col = True

# --- Hyper-parameter search budget ------------------------------------------
# Passed to both feature_selector.fit() and fine_tune_model().
optuna_n_trials = 2

# --- Raw data ----------------------------------------------------------------
data = pd.read_csv('data/bike_sharing_day.csv')

In [4]:
# Cast the timestamp column to str, sort the rows by it and renumber the index.
# The sort is lexicographic; the ISO `YYYY-MM-DD` values shown in the output
# therefore come out in chronological order.
data = (
    data
    .assign(**{timestamp_col: data[timestamp_col].astype(str)})
    .sort_values(timestamp_col)
    .reset_index(drop=True)
)
data

Unnamed: 0,date,season,year,month,holiday,weekday,workingday,weather,temp,humidity,windspeed,count
0,2011-01-01,1,0,1,0,6,0,2,0.344167,0.805833,0.160446,985
1,2011-01-02,1,0,1,0,0,0,2,0.363478,0.696087,0.248539,801
2,2011-01-03,1,0,1,0,1,1,1,0.196364,0.437273,0.248309,1349
3,2011-01-04,1,0,1,0,2,1,1,0.200000,0.590435,0.160296,1562
4,2011-01-05,1,0,1,0,3,1,1,0.226957,0.436957,0.186900,1600
...,...,...,...,...,...,...,...,...,...,...,...,...
726,2012-12-27,1,1,12,0,4,1,2,0.254167,0.652917,0.350133,2114
727,2012-12-28,1,1,12,0,5,1,2,0.253333,0.590000,0.155471,3095
728,2012-12-29,1,1,12,0,6,0,2,0.253333,0.752917,0.124383,1341
729,2012-12-30,1,1,12,0,0,0,1,0.255833,0.483333,0.350754,1796


In [5]:
from utils import prepare_train_val_test_data

In [6]:
# Gather the split settings once, then hand them to the project helper.
# The helper returns the training frame, the test frame and `cvs`
# (presumably the cross-validation splits — confirm in utils).
split_args = dict(
    data=data,
    target_col=target_col,
    timestamp_col=timestamp_col,
    test_size=test_size,
    val_ratio=val_ratio,
    cv_fold=cv_fold,
    prediction_length=prediction_length,
    add_lag_col=add_lag_col,
)
train_data, test_data, cvs = prepare_train_val_test_data(**split_args)

# Feature selection

In [7]:
from feature_selection import FeatureSelector

# Create the selector with its default constructor arguments; the run
# settings are supplied later through fit().
feature_selector = FeatureSelector()

In [8]:
# Run the feature search on the training split.
# NOTE(review): the saved output below shows this cell being stopped by a
# KeyboardInterrupt at 330/2987 iterations after about 2.5 minutes, so a full
# run is slow; later cells rely on it having finished.
fit_args = dict(
    train_data=train_data,
    cvs=cvs,
    timestamp_col=timestamp_col,
    target_col=target_col,
    prediction_length=prediction_length,
    parquet_file_path=parquet_file_path,
    output_dir=output_dir,
    scoring=scoring,
    optuna_n_trials=optuna_n_trials,
)
feature_selector.fit(**fit_args)

  0%|          | 0/18 [00:00<?, ?it/s]

  from pandas import MultiIndex, Int64Index
 11%|█         | 330/2987 [02:32<20:25,  2.17it/s]
Traceback (most recent call last):
  File "calculate_feature_score.py", line 43, in <module>
    output = calculate_feature_score(**input_dict)
  File "calculate_feature_score.py", line 22, in calculate_feature_score
    losses = run_cv(model, train_data_exo_small, target_col,
  File "/notional_data/phuc_workspace/notional-ts-examples/utils.py", line 197, in run_cv
    model.fit(X_train, y_train)
  File "/opt/conda/envs/notional-ts/lib/python3.8/site-packages/xgboost/core.py", line 506, in inner_f
    return f(**kwargs)
  File "/opt/conda/envs/notional-ts/lib/python3.8/site-packages/xgboost/sklearn.py", line 789, in fit
    self._Booster = train(
  File "/opt/conda/envs/notional-ts/lib/python3.8/site-packages/xgboost/training.py", line 188, in train
    bst = _train_internal(params, dtrain,
  File "/opt/conda/envs/notional-ts/lib/python3.8/site-packages/xgboost/training.py", line 81, in _trai

KeyboardInterrupt: 

In [None]:
# Ask the selector for its best features and display them; the result is
# consumed by add_exo_features() further down.
# NOTE(review): the saved fit() run above ended in a KeyboardInterrupt and this
# cell was never executed; presumably get_best_features() needs a completed
# fit — confirm before relying on it.
final_selected_features = feature_selector.get_best_features()
final_selected_features

# Evaluate performance of new features

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from utils import ARIMAModel
import xgboost as xgb

In [None]:
from utils import fine_tune_model, evaluate_models

In [None]:
# Baseline run: tune and score every model on the original training frame.
fine_tune_model_args = dict(
    train_data=train_data,
    target_col=target_col,
    cvs=cvs,
    scoring=scoring,
    timestamp_col=timestamp_col,
    optuna_n_trials=optuna_n_trials,
)

# Models used with their default settings (no tuning).
lr_model = LinearRegression()
arima_model = ARIMAModel()

# Tuned models. The generator is consumed left to right, so the tuning calls
# happen in the order lasso -> xgboost -> random_forest.
lasso_model, xgb_model, rf_model = (
    fine_tune_model(model_name, **fine_tune_model_args)
    for model_name in ('lasso', 'xgboost', 'random_forest')
)

# NOTE(review): no random seed is set anywhere in this notebook; if the tuning
# helper or the models are stochastic, scores will vary between runs.
models = [lr_model, arima_model, lasso_model, xgb_model, rf_model]
evaluate_models(models, train_data, test_data, target_col, timestamp_col, scoring)

In [None]:
from utils import add_exo_features

In [None]:
# Pass the train split, then the test split, through add_exo_features() with
# exactly the same remaining arguments.
train_data_final, test_data_final = (
    add_exo_features(
        split_frame,
        timestamp_col,
        final_selected_features,
        parquet_file_path,
        prediction_length,
    )
    for split_frame in (train_data, test_data)
)

In [None]:
# Inspect the training frame returned by add_exo_features().
train_data_final

In [None]:
# Second run: same models and settings as the baseline, but tuned and scored
# on the frames returned by add_exo_features().
# NOTE(review): `cvs` was produced from the original train_data; this assumes
# add_exo_features() keeps the same rows in the same order — confirm in utils.
fine_tune_model_args = dict(
    train_data=train_data_final,
    target_col=target_col,
    cvs=cvs,
    scoring=scoring,
    timestamp_col=timestamp_col,
    optuna_n_trials=optuna_n_trials,
)

# Models used with their default settings (no tuning).
lr_model = LinearRegression()
arima_model = ARIMAModel()

# Tuned models. The generator is consumed left to right, so the tuning calls
# happen in the order lasso -> xgboost -> random_forest.
lasso_model, xgb_model, rf_model = (
    fine_tune_model(model_name, **fine_tune_model_args)
    for model_name in ('lasso', 'xgboost', 'random_forest')
)

models = [lr_model, arima_model, lasso_model, xgb_model, rf_model]
evaluate_models(models, train_data_final, test_data_final, target_col, timestamp_col, scoring)