In [1]:
from pathlib import Path
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn

In [2]:
df_train = pd.read_parquet(Path("data") / "train.parquet")

In [3]:
def _encode_dates(X):

    X = X.copy()  # modify a copy of X
    #  Encode the date information from the DateOfDeparture columns
    X["year"] = X["date"].dt.year
    X["month"] = X["date"].dt.month
    X["day"] = X["date"].dt.day
    X["weekday"] = X["date"].dt.weekday
    X["hour"] = X["date"].dt.hour
    X['week'] = X["date"].dt.isocalendar().week

    return X.drop(columns=["date"])

In [4]:
def _merge_external_data(X):

    file_path = Path('data') / "external_data.csv"
    df_ext = pd.read_csv(file_path, parse_dates=["date"])

    X['date'] = pd.to_datetime(X['date']).astype('datetime64[ns]')
    df_ext['date'] = pd.to_datetime(df_ext['date']).astype('datetime64[ns]')

    X = X.copy()

    #  When using merge_asof left frame need to be sorted
    X["orig_index"] = np.arange(X.shape[0])
    X = pd.merge_asof(
        X.sort_values("date"),
        df_ext[["date", "t", 'u', "pres",
                "pmer", 'raf10',]].sort_values("date"),
        on="date"
    )

    #  Sort back to the original order
    X = X.sort_values("orig_index")
    del X["orig_index"]
    return X

In [5]:
df_train = _merge_external_data(df_train)

In [6]:
X_dates_encoding = _encode_dates(df_train[["date"]])
df_train = pd.concat([df_train, X_dates_encoding], axis=1)

In [7]:
y = df_train['log_bike_count']
X = df_train.drop('log_bike_count', axis=1)

In [8]:
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, random_state=2408
)

In [9]:
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
import pandas as pd
from xgboost import XGBRegressor


categorical_encoder = OneHotEncoder(handle_unknown="ignore")
categorical_cols = ["counter_name"]
passthrough_cols = ['latitude', 't', 'u',
                    'month', 'weekday', 'hour', 'day']

preprocessor = ColumnTransformer(
    [
        ("cat", categorical_encoder, categorical_cols),
        ('passthrough', 'passthrough', passthrough_cols)
    ]
)

regressor = XGBRegressor(
    learning_rate=0.1, max_depth=9, n_estimators=980, gamma=0,
    min_child_weight=7, reg_alpha=0.2, colsample_bytree=0.85,
    reg_lambda=6, max_delta_step=1
)

pipe = make_pipeline(preprocessor, regressor)
pipe.fit(X_train, y_train)

In [10]:
print(
    f"Train set, RMSE={mean_squared_error(y_train, pipe.predict(X_train), squared=False):.2f}"
)

print(
    f"Test set, RMSE={mean_squared_error(y_valid, pipe.predict(X_valid), squared=False):.2f}"
)

Train set, RMSE=0.28
Test set, RMSE=0.34


In [11]:
print("The hyper-parameters are for the full-pipeline are:")
for param_name in pipe.get_params().keys():
    print(param_name)

The hyper-parameters are for the full-pipeline are:
memory
steps
verbose
columntransformer
xgbregressor
columntransformer__n_jobs
columntransformer__remainder
columntransformer__sparse_threshold
columntransformer__transformer_weights
columntransformer__transformers
columntransformer__verbose
columntransformer__verbose_feature_names_out
columntransformer__cat
columntransformer__passthrough
columntransformer__cat__categories
columntransformer__cat__drop
columntransformer__cat__dtype
columntransformer__cat__feature_name_combiner
columntransformer__cat__handle_unknown
columntransformer__cat__max_categories
columntransformer__cat__min_frequency
columntransformer__cat__sparse
columntransformer__cat__sparse_output
xgbregressor__objective
xgbregressor__base_score
xgbregressor__booster
xgbregressor__callbacks
xgbregressor__colsample_bylevel
xgbregressor__colsample_bynode
xgbregressor__colsample_bytree
xgbregressor__device
xgbregressor__early_stopping_rounds
xgbregressor__enable_categorical
xgbr

In [12]:
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
import time
from sklearn.model_selection import GridSearchCV

#  Define the parameter grid
param_grid = {
    'xgbregressor__n_estimators': [900, 980, 1000],
    'xgbregressor__learning_rate': [0.1, 0.11, 0.12],
    'xgbregressor__max_depth': [8, 9, 10],
    'xgbregressor__gamma': [0],
    'xgbregressor__min_child_weight': [7],
    'xgbregressor__colsample_bytree': [0.85],
    'xgbregressor__subsample': [1],
    'xgbregressor__reg_alpha': [0.2],
    'xgbregressor__reg_lambda': [6],
    'xgbregressor__max_delta_step': [1],
}

#  Perform grid search with 5-fold cross-validation
start = time.time()
model_grid_search = GridSearchCV(
    pipe, param_grid, scoring='neg_mean_squared_error',
    cv=TimeSeriesSplit(n_splits=5), n_jobs=-1, verbose=2
)

model_grid_search.fit(X_train, y_train)
elapsed_time = time.time() - start

#  Best parameters and best score
print("Best parameters:", model_grid_search.best_params_)
print("Best score:", model_grid_search.best_score_)
print("Execution time:", elapsed_time / 60)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
Best parameters: {'xgbregressor__colsample_bytree': 0.85, 'xgbregressor__gamma': 0, 'xgbregressor__learning_rate': 0.1, 'xgbregressor__max_delta_step': 1, 'xgbregressor__max_depth': 10, 'xgbregressor__min_child_weight': 7, 'xgbregressor__n_estimators': 980, 'xgbregressor__reg_alpha': 0.2, 'xgbregressor__reg_lambda': 6, 'xgbregressor__subsample': 1}
Best score: -0.13268903563034698
Execution time: 15.329617540041605
