In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import catboost as cat
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from pandas.tseries.holiday import Holiday, AbstractHolidayCalendar
from dateutil.easter import easter
from datetime import timedelta
import lightgbm as lgb
import optuna
import warnings
from sklearn.model_selection import TimeSeriesSplit
from sklearn.feature_selection import RFECV

warnings.filterwarnings("ignore")

In [2]:
data = pd.read_parquet(Path("data") / "train.parquet")
test_data = pd.read_parquet(Path("data") / "test.parquet")
data["date"] = pd.to_datetime(data["date"])
data = data.set_index("date")
test_data["date"] = pd.to_datetime(test_data["date"])
test_data = test_data.set_index("date")

In [3]:
prediction_data = pd.read_parquet(Path("data") / "final_test.parquet")

In [4]:
prediction_data["date"] = pd.to_datetime(prediction_data["date"])
prediction_data = prediction_data.set_index("date")

In [5]:
class FrenchHolidayCalendar(AbstractHolidayCalendar):
    rules = [
        Holiday("New Year's Day", month=1, day=1),
        Holiday("Labour Day", month=5, day=1),
        Holiday("Victory in Europe Day", month=5, day=8),
        Holiday("Bastille Day", month=7, day=14),
        Holiday("Assumption of Mary", month=8, day=15),
        Holiday("All Saints' Day", month=11, day=1),
        Holiday("Armistice Day", month=11, day=11),
        Holiday("Christmas Day", month=12, day=25),
    ]

    @staticmethod
    def easter_related_holidays(year):
        easter_sunday = easter(year)
        return [
            (easter_sunday + timedelta(days=1), "Easter Monday"),
            (easter_sunday + timedelta(days=39), "Ascension Day"),
        ]

In [6]:
def cyclical_encode(df, column, max_value):
    df[column + "_sin"] = np.sin(2 * np.pi * df[column] / max_value)
    df[column + "_cos"] = np.cos(2 * np.pi * df[column] / max_value)
    return df

In [7]:
def create_features(df):
    df = df.copy()
    df["hour"] = df.index.hour
    df["dayofweek"] = df.index.dayofweek
    df["quarter"] = df.index.quarter
    df["month"] = df.index.month
    df["dayofyear"] = df.index.dayofyear

    # Boolean for weekends
    df["is_weekend"] = df["dayofweek"].isin([5, 6])

    # cyclical
    df = cyclical_encode(df, "hour", 24)
    df = cyclical_encode(df, "dayofweek", 7)

    # Boolean for holidays
    cal = FrenchHolidayCalendar()
    holidays = cal.holidays(start=df.index.min(), end=df.index.max())
    easter_holidays = []
    for year in range(df.index.year.min(), df.index.year.max() + 1):
        for date, _ in FrenchHolidayCalendar.easter_related_holidays(year):
            easter_holidays.append(date)
    holidays = holidays.union(pd.to_datetime(easter_holidays))
    df["is_holiday"] = df.index.isin(holidays)

    # Lockdown periods
    lockdowns = {
        "lockdown_1": ("2020-03-17", "2020-05-10"),
        "lockdown_2": ("2020-10-28", "2020-12-01"),
        # with curfew from 7 PM to 6 AM
        "lockdown_3_1": ("2021-04-03 19:00:00", "2021-05-18 06:00:00"),
        # with curfew from 9 PM to 6 AM
        "lockdown_3_2": ("2021-05-19 21:00:00", "2021-06-08 06:00:00"),
        # with curfew from 11 PM to 6 AM
        "lockdown_3_3": ("2021-06-09 23:00:00", "2021-06-29 06:00:00"),
    }
    for lockdown, (start_date, end_date) in lockdowns.items():
        mask = (df.index >= start_date) & (df.index <= end_date)
        df[lockdown] = mask

    return df

In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 455163 entries, 2020-09-01 02:00:00 to 2021-08-09 17:00:00
Data columns (total 10 columns):
 #   Column                     Non-Null Count   Dtype         
---  ------                     --------------   -----         
 0   counter_id                 455163 non-null  category      
 1   counter_name               455163 non-null  category      
 2   site_id                    455163 non-null  int64         
 3   site_name                  455163 non-null  category      
 4   bike_count                 455163 non-null  float64       
 5   counter_installation_date  455163 non-null  datetime64[us]
 6   counter_technical_id       455163 non-null  category      
 7   latitude                   455163 non-null  float64       
 8   longitude                  455163 non-null  float64       
 9   log_bike_count             455163 non-null  float64       
dtypes: category(4), datetime64[us](1), float64(4), int64(1)
memory usage: 26.1 MB


In [9]:
train = data[["counter_name", "log_bike_count"]]
test = test_data[["counter_name", "log_bike_count"]]
prediction = prediction_data[["counter_name"]]

In [10]:
train = create_features(train)

In [11]:
test = create_features(test)

In [12]:
prediction = create_features(prediction)

In [13]:
train.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 455163 entries, 2020-09-01 02:00:00 to 2021-08-09 17:00:00
Data columns (total 18 columns):
 #   Column          Non-Null Count   Dtype   
---  ------          --------------   -----   
 0   counter_name    455163 non-null  category
 1   log_bike_count  455163 non-null  float64 
 2   hour            455163 non-null  int32   
 3   dayofweek       455163 non-null  int32   
 4   quarter         455163 non-null  int32   
 5   month           455163 non-null  int32   
 6   dayofyear       455163 non-null  int32   
 7   is_weekend      455163 non-null  bool    
 8   hour_sin        455163 non-null  float64 
 9   hour_cos        455163 non-null  float64 
 10  dayofweek_sin   455163 non-null  float64 
 11  dayofweek_cos   455163 non-null  float64 
 12  is_holiday      455163 non-null  bool    
 13  lockdown_1      455163 non-null  bool    
 14  lockdown_2      455163 non-null  bool    
 15  lockdown_3_1    455163 non-null  bool    
 16  lock

In [None]:
train.columns

In [None]:
features = [
    "counter_name",
    "hour",
    "dayofweek",
    "quarter",
    "month",
    "dayofyear",
    "is_weekend",
    "hour_sin",
    "hour_cos",
    "dayofweek_sin",
    "dayofweek_cos",
    "is_holiday",
    "lockdown_1",
    "lockdown_2",
    "lockdown_3_1",
    "lockdown_3_2",
    "lockdown_3_3",
]
target = ["log_bike_count"]
cat_feature = ["counter_name"]

In [None]:
X_train = train[features]
y_train = train[target]
X_test = test[features]
y_test = test[target]

In [None]:
reg = xgb.XGBRegressor(n_estimators=1000)

In [None]:
preprocessor = ColumnTransformer(
    transformers=[("cat", OneHotEncoder(
        handle_unknown="ignore"), cat_feature)],
    remainder="passthrough",
)

In [None]:
pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("model", reg)])

In [None]:
pipeline.fit(X_train, y_train)

In [None]:
y_pred_xgb = pipeline.predict(X_test)
rmse_xgb_test = mean_squared_error(y_test, y_pred_xgb, squared=False)
print("XGB Test:", rmse_xgb_test)

In [None]:
xgb_model = pipeline.named_steps["model"]

xgb_model.feature_importances_

In [None]:
X_train_encoded = pd.get_dummies(
    X_train,
    columns=["counter_name", "counter_technical_id"],
    prefix=["encoded_counter", "encoded_tech"],
)
X_test_encoded = pd.get_dummies(
    X_test,
    columns=["counter_name", "counter_technical_id"],
    prefix=["encoded_counter", "encoded_tech"],
)

In [None]:
reg = xgb.XGBRegressor(n_estimators=1000)

In [None]:
reg.fit(
    X_train_encoded,
    y_train,
    eval_set=[(X_train_encoded, y_train), (X_test_encoded, y_test)],
    early_stopping_rounds=10,
    verbose=10,
)

In [None]:
y_hat_xgb = reg.predict(X_train_encoded)
rmse_xgb_train = mean_squared_error(y_train, y_hat_xgb, squared=False)
print("XGB Train:", rmse_xgb_train)

In [None]:
y_pred_xgb = reg.predict(X_test_encoded)
rmse_xgb_test = mean_squared_error(y_test, y_pred_xgb, squared=False)
print("XGB Test:", rmse_xgb_test)

In [None]:
fi = pd.DataFrame(
    data=reg.feature_importances_,
    index=reg.feature_names_in_,
    columns=["Importance"],
)

In [None]:
fi.sort_values(by="Importance").plot(
    kind="barh", title="Feature Importances", figsize=(16, 25)
)

In [None]:
train.info()

In [None]:
boolean_columns = [
    "is_weekend",
    "is_holiday",
    "lockdown_1",
    "lockdown_2",
    "lockdown_3_1",
    "lockdown_3_2",
    "lockdown_3_3",
]

for column in boolean_columns:
    train[column] = train[column].astype(int)


for column in boolean_columns:
    test[column] = test[column].astype(int)

In [None]:
train.info()

In [None]:
X_train = train[features]
y_train = train[target]
X_test = test[features]
y_test = test[target]

In [None]:
reg = xgb.XGBRegressor(
    tree_method="hist", n_estimators=1000, enable_categorical=True)

In [None]:
reg.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    early_stopping_rounds=10,
    verbose=10,
)

In [None]:
y_hat_xgb = reg.predict(X_train)
rmse_xgb_train = mean_squared_error(y_train, y_hat_xgb, squared=False)
print("XGB Train:", rmse_xgb_train)

y_pred_xgb = reg.predict(X_test)
rmse_xgb_test = mean_squared_error(y_test, y_pred_xgb, squared=False)
print("XGB Test:", rmse_xgb_test)

In [None]:
fi = pd.DataFrame(
    data=reg.feature_importances_, index=reg.feature_names_in_, columns=[
        "Importance"]
)

fi.sort_values(by="Importance").plot(kind="barh", title="Feature Importances")

In [None]:
fi.sort_values(by="Importance", ascending=False)

In [None]:
features = [
    "counter_name",
    "hour",
    "dayofweek",
    "quarter",
    "month",
    "dayofyear",
    "hour_sin",
    "hour_cos",
    "is_holiday",
    "lockdown_2",
    "lockdown_3_1",
    "lockdown_3_2",
    "lockdown_3_3",
]
target = ["log_bike_count"]
cat_feature = ["counter_name"]

In [None]:
X_train = train[features]
y_train = train[target]
X_test = test[features]
y_test = test[target]

In [None]:
reg = xgb.XGBRegressor(
    tree_method="hist", n_estimators=1000, enable_categorical=True)



reg.fit(
    X_train,
    y_train,

    eval_set=[(X_train, y_train), (X_test, y_test)],

    early_stopping_rounds=10,

    verbose=10,
)

In [None]:
y_hat_xgb = reg.predict(X_train)
rmse_xgb_train = mean_squared_error(y_train, y_hat_xgb, squared=False)
print("XGB Train:", rmse_xgb_train)

y_pred_xgb = reg.predict(X_test)
rmse_xgb_test = mean_squared_error(y_test, y_pred_xgb, squared=False)
print("XGB Test:", rmse_xgb_test)

In [None]:
fi = pd.DataFrame(
    data=reg.feature_importances_, index=reg.feature_names_in_, columns=[
        "Importance"]
)

fi.sort_values(by="Importance").plot(kind="barh", title="Feature Importances")

In [None]:
fi.sort_values(by="Importance", ascending=False)

In [None]:
cat_reg = cat.CatBoostRegressor(n_estimators=1000, cat_features=cat_feature)

In [None]:
cat_reg.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    early_stopping_rounds=10,
    verbose=10,
)

In [None]:
y_hat_cat = cat_reg.predict(X_train)
rmse_cat_train = mean_squared_error(y_train, y_hat_cat, squared=False)
print("CAT Train:", rmse_cat_train)

y_test_cat = cat_reg.predict(X_test)
rmse_cat_test = mean_squared_error(y_test, y_test_cat, squared=False)
print("CAT Test:", rmse_cat_test)

In [None]:
fi = pd.DataFrame(
    data=cat_reg.feature_importances_,
    index=features,
    columns=["Importance"],
)

fi.sort_values(by="Importance").plot(kind="barh", title="Feature Importances")

In [None]:
weather_data = pd.read_csv("data/external_data_cleaned.csv")

In [None]:
weather_data_hourly.head()

In [None]:
weather_data["date"] = pd.to_datetime(weather_data["date"])
weather_data.set_index("date", inplace=True)

In [None]:
weather_data_hourly = weather_data.resample("H").ffill()

In [None]:
weather_data_hourly.info()

In [None]:
weather_data_hourly = weather_data_hourly.drop(columns=["week", "day"])

In [None]:
combined_train.columns

In [None]:
# Merge train and weather_data_hourly on their DateTimeIndex
combined_train = train.merge(
    weather_data_hourly, left_index=True, right_index=True, how="left"
)
combined_test = test.merge(
    weather_data_hourly, left_index=True, right_index=True, how="left"
)

In [None]:
features = [
    "counter_name",
    "hour",
    "dayofweek",
    "quarter",
    "month",
    "dayofyear",
    "hour_sin",
    "hour_cos",
    "is_holiday",
    "lockdown_2",
    "lockdown_3_1",
    "lockdown_3_2",
    "lockdown_3_3",
    "t",
    "rr1",
    "rr3",
    "rr6",
    "ff",
    "raf10",
    "rafper",
    "u",
    "vv",
    "n",
    "cl",
    "cm",
    "ch",
    "precipitation",
    "cloudy_day",
]
target = ["log_bike_count"]
cat_feature = ["counter_name"]

In [None]:
X_train = combined_train[features]
y_train = combined_train[target]
X_test = combined_test[features]
y_test = combined_test[target]

In [None]:
reg = xgb.XGBRegressor(
    tree_method="hist", n_estimators=1000, enable_categorical=True)



reg.fit(
    X_train,
    y_train,

    eval_set=[(X_train, y_train), (X_test, y_test)],

    early_stopping_rounds=10,

    verbose=10,
)

In [None]:
y_hat_xgb = reg.predict(X_train)
rmse_xgb_train = mean_squared_error(y_train, y_hat_xgb, squared=False)
print("XGB Train:", rmse_xgb_train)

y_pred_xgb = reg.predict(X_test)
rmse_xgb_test = mean_squared_error(y_test, y_pred_xgb, squared=False)
print("XGB Test:", rmse_xgb_test)

In [None]:
fi = pd.DataFrame(
    data=reg.feature_importances_, index=reg.feature_names_in_, columns=[
        "Importance"]
)

fi.sort_values(by="Importance").plot(kind="barh", title="Feature Importances")

In [None]:
cat_reg = cat.CatBoostRegressor(n_estimators=1000, cat_features=cat_feature)
cat_reg.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    early_stopping_rounds=10,
    verbose=10,
)

In [None]:
y_hat_cat = cat_reg.predict(X_train)
rmse_cat_train = mean_squared_error(y_train, y_hat_cat, squared=False)
print("CAT Train:", rmse_cat_train)

y_test_cat = cat_reg.predict(X_test)
rmse_cat_test = mean_squared_error(y_test, y_test_cat, squared=False)
print("CAT Test:", rmse_cat_test)

In [None]:
features = [
    "counter_name",
    "hour",
    "dayofweek",
    "quarter",
    "month",
    "dayofyear",
    "hour_sin",
    "hour_cos",
    "is_holiday",
    "lockdown_2",
    "lockdown_3_1",
    "lockdown_3_2",
    "lockdown_3_3",
    "t",
    "rr1",
    "rr3",
    "rr6",
    "ff",
    "raf10",
    "rafper",
    "u",
    "vv",
    "n",
    "cl",
    "cm",
    "ch",
]
target = ["log_bike_count"]
cat_feature = ["counter_name"]

In [None]:
X_train = combined_train[features]
y_train = combined_train[target]
X_test = combined_test[features]
y_test = combined_test[target]

In [None]:
reg = xgb.XGBRegressor(
    tree_method="hist", n_estimators=1000, enable_categorical=True)



reg.fit(
    X_train,
    y_train,

    eval_set=[(X_train, y_train), (X_test, y_test)],

    early_stopping_rounds=10,

    verbose=10,
)

In [None]:
y_hat_xgb = reg.predict(X_train)
rmse_xgb_train = mean_squared_error(y_train, y_hat_xgb, squared=False)
print("XGB Train:", rmse_xgb_train)

y_pred_xgb = reg.predict(X_test)
rmse_xgb_test = mean_squared_error(y_test, y_pred_xgb, squared=False)
print("XGB Test:", rmse_xgb_test)

In [None]:
fi = pd.DataFrame(
    data=reg.feature_importances_, index=reg.feature_names_in_, columns=[
        "Importance"]
)

fi.sort_values(by="Importance").plot(kind="barh", title="Feature Importances")

In [None]:
fi.sort_values(by="Importance", ascending=False)

In [None]:
cat_reg = cat.CatBoostRegressor(n_estimators=1000, cat_features=cat_feature)
cat_reg.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    early_stopping_rounds=10,
    verbose=10,
)

In [None]:
y_hat_cat = cat_reg.predict(X_train)
rmse_cat_train = mean_squared_error(y_train, y_hat_cat, squared=False)
print("CAT Train:", rmse_cat_train)

y_test_cat = cat_reg.predict(X_test)
rmse_cat_test = mean_squared_error(y_test, y_test_cat, squared=False)
print("CAT Test:", rmse_cat_test)

In [None]:
features = [
    "counter_name",
    "hour",
    "dayofweek",
    "month",
    "dayofyear",
    "hour_sin",
    "hour_cos",
    "is_holiday",
    "lockdown_2",
    "lockdown_3_1",
    "lockdown_3_2",
    "lockdown_3_3",
    "t",
    "rr1",
    "rr3",
    "rr6",
    "ff",
    "raf10",
    "u",
    "n",
    "cm",
]
target = ["log_bike_count"]
cat_feature = ["counter_name"]

In [None]:
X_train = combined_train[features]
y_train = combined_train[target]
X_test = combined_test[features]
y_test = combined_test[target]

In [None]:
reg = xgb.XGBRegressor(
    tree_method="hist", n_estimators=1000, enable_categorical=True)



reg.fit(
    X_train,
    y_train,

    eval_set=[(X_train, y_train), (X_test, y_test)],

    early_stopping_rounds=10,

    verbose=10,
)

In [None]:
y_hat_xgb = reg.predict(X_train)
rmse_xgb_train = mean_squared_error(y_train, y_hat_xgb, squared=False)
print("XGB Train:", rmse_xgb_train)

y_pred_xgb = reg.predict(X_test)
rmse_xgb_test = mean_squared_error(y_test, y_pred_xgb, squared=False)
print("XGB Test:", rmse_xgb_test)

In [None]:
fi = pd.DataFrame(
    data=reg.feature_importances_, index=reg.feature_names_in_, columns=[
        "Importance"]
)

fi.sort_values(by="Importance").plot(
    kind="barh", title="Feature Importances XGBoost")

In [None]:
fi.sort_values(by="Importance", ascending=False)

In [None]:
cat_reg = cat.CatBoostRegressor(n_estimators=1000, cat_features=cat_feature)
cat_reg.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    early_stopping_rounds=10,
    verbose=10,
)

In [None]:
y_hat_cat = cat_reg.predict(X_train)
rmse_cat_train = mean_squared_error(y_train, y_hat_cat, squared=False)
print("CAT Train:", rmse_cat_train)

y_test_cat = cat_reg.predict(X_test)
rmse_cat_test = mean_squared_error(y_test, y_test_cat, squared=False)
print("CAT Test:", rmse_cat_test)

In [None]:
fi = pd.DataFrame(
    data=cat_reg.feature_importances_,
    index=features,
    columns=["Importance"],
)

fi.sort_values(by="Importance").plot(
    kind="barh", title="Feature Importances CATBoost")

In [None]:
features = [
    "counter_name",
    "hour",
    "dayofweek",
    "month",
    "dayofyear",
    "hour_sin",
    "hour_cos",
    "lockdown_2",
    "lockdown_3_1",
    "lockdown_3_2",
    "lockdown_3_3",
    "t",
    "rr1",
    "rr3",
    "rr6",
    "raf10",
    "u",
    "n",
    "cm",
]
target = ["log_bike_count"]
cat_feature = ["counter_name"]

In [None]:
combined_train["t"].head(20)

In [None]:
combined_train.head()

# Updated Weather

In [14]:
weather_data = pd.read_csv("data/external_data_cleaned_updated.csv")

In [15]:
weather_data["date"] = pd.to_datetime(weather_data["date"])
weather_data.set_index("date", inplace=True)

In [16]:
weather_data.head()

Unnamed: 0_level_0,pmer,tend,cod_tend,dd,ff,t,td,u,vv,ww,...,etat_sol_lag6h,etat_sol_lag9h,etat_sol_lag12h,etat_sol_lag24h,etat_sol_lag168h,rr12_lag6h,rr12_lag9h,rr12_lag12h,rr12_lag24h,rr12_lag168h
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-09-01 00:00:00,102050,-10,8,340,1.6,285.75,282.55,81,30000,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-09-01 01:00:00,102050,-10,8,340,1.6,285.75,282.55,81,30000,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-09-01 02:00:00,102050,-10,8,340,1.6,285.75,282.55,81,30000,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-09-01 03:00:00,101990,-60,6,290,1.1,283.95,282.05,88,25000,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-09-01 04:00:00,101990,-60,6,290,1.1,283.95,282.05,88,25000,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
weather_data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 9973 entries, 2020-09-01 00:00:00 to 2021-10-21 12:00:00
Data columns (total 74 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   pmer                       9973 non-null   int64  
 1   tend                       9973 non-null   int64  
 2   cod_tend                   9973 non-null   int64  
 3   dd                         9973 non-null   int64  
 4   ff                         9973 non-null   float64
 5   t                          9973 non-null   float64
 6   td                         9973 non-null   float64
 7   u                          9973 non-null   int64  
 8   vv                         9973 non-null   int64  
 9   ww                         9973 non-null   int64  
 10  w1                         9973 non-null   float64
 11  w2                         9973 non-null   float64
 12  n                          9973 non-null   float64
 13  nbas        

In [18]:
boolean_columns = [
    "is_weekend",
    "is_holiday",
    "lockdown_1",
    "lockdown_2",
    "lockdown_3_1",
    "lockdown_3_2",
    "lockdown_3_3",
]

for column in boolean_columns:
    train[column] = train[column].astype(int)


for column in boolean_columns:
    test[column] = test[column].astype(int)

for column in boolean_columns:
    prediction[column] = prediction[column].astype(int)

In [19]:
combined_train = train.merge(
    weather_data, left_index=True, right_index=True, how="left"
)
combined_test = test.merge(
    weather_data, left_index=True, right_index=True, how="left")
combined_prediction = prediction.merge(
    weather_data, left_index=True, right_index=True, how="left"
)

In [24]:
combined_prediction.head()

Unnamed: 0_level_0,counter_name,hour,dayofweek,quarter,month,dayofyear,is_weekend,hour_sin,hour_cos,dayofweek_sin,...,etat_sol_lag6h,etat_sol_lag9h,etat_sol_lag12h,etat_sol_lag24h,etat_sol_lag168h,rr12_lag6h,rr12_lag9h,rr12_lag12h,rr12_lag24h,rr12_lag168h
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-09-10 01:00:00,28 boulevard Diderot E-O,1,4,3,9,253,0,0.258819,0.965926,-0.433884,...,1.0,0.0,0.0,1.0,0.0,0.6,0.0,-0.1,1.2,0.0
2021-09-10 01:00:00,28 boulevard Diderot O-E,1,4,3,9,253,0,0.258819,0.965926,-0.433884,...,1.0,0.0,0.0,1.0,0.0,0.6,0.0,-0.1,1.2,0.0
2021-09-10 01:00:00,39 quai François Mauriac NO-SE,1,4,3,9,253,0,0.258819,0.965926,-0.433884,...,1.0,0.0,0.0,1.0,0.0,0.6,0.0,-0.1,1.2,0.0
2021-09-10 01:00:00,39 quai François Mauriac SE-NO,1,4,3,9,253,0,0.258819,0.965926,-0.433884,...,1.0,0.0,0.0,1.0,0.0,0.6,0.0,-0.1,1.2,0.0
2021-09-10 01:00:00,18 quai de l'Hôtel de Ville NO-SE,1,4,3,9,253,0,0.258819,0.965926,-0.433884,...,1.0,0.0,0.0,1.0,0.0,0.6,0.0,-0.1,1.2,0.0


In [25]:
combined_prediction.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 51440 entries, 2021-09-10 01:00:00 to 2021-10-18 21:00:00
Data columns (total 91 columns):
 #   Column                     Non-Null Count  Dtype   
---  ------                     --------------  -----   
 0   counter_name               51440 non-null  category
 1   hour                       51440 non-null  int32   
 2   dayofweek                  51440 non-null  int32   
 3   quarter                    51440 non-null  int32   
 4   month                      51440 non-null  int32   
 5   dayofyear                  51440 non-null  int32   
 6   is_weekend                 51440 non-null  int32   
 7   hour_sin                   51440 non-null  float64 
 8   hour_cos                   51440 non-null  float64 
 9   dayofweek_sin              51440 non-null  float64 
 10  dayofweek_cos              51440 non-null  float64 
 11  is_holiday                 51440 non-null  int32   
 12  lockdown_1                 51440 non-null  int32   
 

In [26]:
combined_train.columns

Index(['counter_name', 'log_bike_count', 'hour', 'dayofweek', 'quarter',
       'month', 'dayofyear', 'is_weekend', 'hour_sin', 'hour_cos',
       'dayofweek_sin', 'dayofweek_cos', 'is_holiday', 'lockdown_1',
       'lockdown_2', 'lockdown_3_1', 'lockdown_3_2', 'lockdown_3_3', 'pmer',
       'tend', 'cod_tend', 'dd', 'ff', 't', 'td', 'u', 'vv', 'ww', 'w1', 'w2',
       'n', 'nbas', 'hbas', 'cl', 'cm', 'ch', 'pres', 'tend24', 'raf10',
       'rafper', 'etat_sol', 'ht_neige', 'ssfrai', 'rr1', 'rr3', 'rr6', 'rr12',
       'rr24', 'nnuage1', 'ctype1', 'hnuage1', 'temp_humidity_interaction',
       't_lag6h', 't_lag9h', 't_lag12h', 't_lag24h', 't_lag168h', 'td_lag6h',
       'td_lag9h', 'td_lag12h', 'td_lag24h', 'td_lag168h', 'u_lag6h',
       'u_lag9h', 'u_lag12h', 'u_lag24h', 'u_lag168h', 'ww_lag6h', 'ww_lag9h',
       'ww_lag12h', 'ww_lag24h', 'ww_lag168h', 'n_lag6h', 'n_lag9h',
       'n_lag12h', 'n_lag24h', 'n_lag168h', 'tend24_lag6h', 'tend24_lag9h',
       'tend24_lag12h', 'tend24_la

In [None]:
features = [
    "counter_name",
    "hour",
    "dayofweek",
    "quarter",
    "month",
    "dayofyear",
    "is_weekend",
    "hour_sin",
    "hour_cos",
    "dayofweek_sin",
    "dayofweek_cos",
    "is_holiday",
    "lockdown_1",
    "lockdown_2",
    "lockdown_3_1",
    "lockdown_3_2",
    "lockdown_3_3",
    "pmer",
    "tend",
    "cod_tend",
    "dd",
    "ff",
    "t",
    "td",
    "u",
    "vv",
    "ww",
    "w1",
    "w2",
    "n",
    "nbas",
    "hbas",
    "cl",
    "cm",
    "ch",
    "pres",
    "tend24",
    "raf10",
    "rafper",
    "etat_sol",
    "ht_neige",
    "ssfrai",
    "rr1",
    "rr3",
    "rr6",
    "rr12",
    "rr24",
    "nnuage1",
    "ctype1",
    "hnuage1",
]
target = ["log_bike_count"]
cat_feature = ["counter_name"]

In [None]:
X_train = combined_train[features]
y_train = combined_train[target]
X_test = combined_test[features]
y_test = combined_test[target]

In [None]:
reg = xgb.XGBRegressor(tree_method="hist", n_estimators=1000, enable_categorical=True)

In [None]:
reg.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    early_stopping_rounds=10,
    verbose=10,
)

In [None]:
y_hat_xgb = reg.predict(X_train)
rmse_xgb_train = mean_squared_error(y_train, y_hat_xgb, squared=False)
print("XGB Train:", rmse_xgb_train)

y_pred_xgb = reg.predict(X_test)
rmse_xgb_test = mean_squared_error(y_test, y_pred_xgb, squared=False)
print("XGB Test:", rmse_xgb_test)

In [None]:
fi = pd.DataFrame(
    data=reg.feature_importances_, index=reg.feature_names_in_, columns=[
        "Importance"]
)

fi.sort_values(by="Importance").plot(
    kind="barh", title="Feature Importances XGBoost", figsize=(10, 16)
)

In [None]:
fi.sort_values(by="Importance", ascending=False)

In [None]:
cat_reg = cat.CatBoostRegressor(n_estimators=1000, cat_features=cat_feature)
cat_reg.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    early_stopping_rounds=10,
    verbose=10,
)

In [None]:
y_hat_cat = cat_reg.predict(X_train)
rmse_cat_train = mean_squared_error(y_train, y_hat_cat, squared=False)
print("CAT Train:", rmse_cat_train)

y_test_cat = cat_reg.predict(X_test)
rmse_cat_test = mean_squared_error(y_test, y_test_cat, squared=False)
print("CAT Test:", rmse_cat_test)

In [None]:
features = [
    "counter_name",
    "hour",
    "dayofweek",
    "quarter",
    "month",
    "dayofyear",
    "hour_sin",
    "hour_cos",
    "dayofweek_sin",
    "dayofweek_cos",
    "lockdown_2",
    "lockdown_3_1",
    "lockdown_3_2",
    "lockdown_3_3",
    "pmer",
    "tend",
    "cod_tend",
    "dd",
    "ff",
    "t",
    "td",
    "u",
    "vv",
    "ww",
    "w1",
    "n",
    "nbas",
    "hbas",
    "cl",
    "cm",
    "ch",
    "pres",
    "tend24",
    "raf10",
    "rafper",
    "etat_sol",
    "ht_neige",
    "rr1",
    "rr3",
    "rr6",
    "rr12",
    "rr24",
    "nnuage1",
    "ctype1",
    "hnuage1",
]
target = ["log_bike_count"]
cat_feature = ["counter_name"]

In [None]:
X_train = combined_train[features]
y_train = combined_train[target]
X_test = combined_test[features]
y_test = combined_test[target]

In [None]:
combined_train.info()

In [None]:
reg = xgb.XGBRegressor(
    tree_method="hist", n_estimators=1000, enable_categorical=True)



reg.fit(
    X_train,
    y_train,

    eval_set=[(X_train, y_train), (X_test, y_test)],

    early_stopping_rounds=10,

    verbose=10,
)

In [None]:
y_hat_xgb = reg.predict(X_train)
rmse_xgb_train = mean_squared_error(y_train, y_hat_xgb, squared=False)
print("XGB Train:", rmse_xgb_train)

y_pred_xgb = reg.predict(X_test)
rmse_xgb_test = mean_squared_error(y_test, y_pred_xgb, squared=False)
print("XGB Test:", rmse_xgb_test)

In [None]:
fi = pd.DataFrame(
    data=reg.feature_importances_, index=reg.feature_names_in_, columns=[
        "Importance"]
)

fi.sort_values(by="Importance").plot(
    kind="barh", title="Feature Importances XGBoost", figsize=(10, 16)
)

In [None]:
fi.sort_values(by="Importance", ascending=False)

In [None]:
cat_reg = cat.CatBoostRegressor(n_estimators=1000, cat_features=cat_feature)
cat_reg.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    early_stopping_rounds=10,
    verbose=10,
)

In [None]:
y_hat_cat = cat_reg.predict(X_train)
rmse_cat_train = mean_squared_error(y_train, y_hat_cat, squared=False)
print("CAT Train:", rmse_cat_train)

y_test_cat = cat_reg.predict(X_test)
rmse_cat_test = mean_squared_error(y_test, y_test_cat, squared=False)
print("CAT Test:", rmse_cat_test)

In [None]:
X_train.shape

In [None]:
d_train = lgb.Dataset(X_train, label=y_train, categorical_feature=cat_feature)

In [None]:
params = {
    "boosting_type": "gbdt",  # Gradient boosting
    "objective": "regression",  # Task is regression
    "metric": {"l2", "l1"},  # Evaluation metrics
    "num_leaves": 31,  # Number of leaves in full tree
    "learning_rate": 0.05,  # Learning rate
    "feature_fraction": 0.9,  # Fraction of features to be used in each iteration
    "bagging_fraction": 0.8,  # Fraction of data to be used in each iteration
    "bagging_freq": 5,  # Frequency for bagging
    "verbose": 10,  # Verbosity of output
}

In [None]:
lgbm = lgb.train(params, d_train, num_boost_round=100)

In [None]:
y_pred_lgbm = lgbm.predict(X_test)


rmse_lgbm = mean_squared_error(y_test, y_pred_lgbm, squared=False)
print("LightGBM Test:", rmse_lgbm)

In [None]:
def objective_xgb(trial):
    # Hyperparameter search space
    params = {
        "tree_method": "hist",
        "lambda": trial.suggest_loguniform("lambda", 1e-8, 10.0),
        "alpha": trial.suggest_loguniform("alpha", 1e-8, 10.0),
        "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.3, 1.0),
        "subsample": trial.suggest_uniform("subsample", 0.6, 1.0),
        "learning_rate": trial.suggest_loguniform("learning_rate", 1e-4, 0.1),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 300),
    }

    # Time Series Cross-Validation
    tscv = TimeSeriesSplit(n_splits=5)

    rmse_scores = []  # Store RMSE scores for each fold

    for train_index, val_index in tscv.split(X_train):
        X_train_cv, X_val = X_train.iloc[train_index], X_train.iloc[val_index]
        y_train_cv, y_val = y_train.iloc[train_index], y_train.iloc[val_index]

        model = xgb.XGBRegressor(**params, enable_categorical=True)
        model.fit(
            X_train_cv,
            y_train_cv,
            eval_set=[(X_val, y_val)],
            early_stopping_rounds=10,
            verbose=False,
        )

        y_pred = model.predict(X_val)
        rmse = mean_squared_error(y_val, y_pred, squared=False)
        rmse_scores.append(rmse)

    # Calculate the average RMSE over all folds
    avg_rmse = np.mean(rmse_scores)

    return avg_rmse

In [None]:
study_xgb = optuna.create_study(direction="minimize")
study_xgb.optimize(objective_xgb, n_trials=100)

In [None]:
best_params = study_xgb.best_params
print("Best Hyperparameters:", best_params)

In [None]:
best_params = {
    "lambda": 5.5163638658795495,
    "alpha": 5.722067076221477e-06,
    "colsample_bytree": 0.5588623654042607,
    "subsample": 0.7939924932498187,
    "learning_rate": 0.08132063380230384,
    "max_depth": 10,
    "min_child_weight": 132,
}

In [None]:
final_model = xgb.XGBRegressor(
    tree_method="hist", **best_params, enable_categorical=True
)
final_model.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    early_stopping_rounds=10,
    verbose=10,
)

In [None]:
y_pred_test = final_model.predict(X_test)
rmse_test = mean_squared_error(y_test, y_pred_test, squared=False)
print("Final Model Test RMSE:", rmse_test)

In [None]:
y_pred_train = final_model.predict(X_train)
rmse_train = mean_squared_error(y_train, y_pred_train, squared=False)
print("Final Model  Train RMSE:", rmse_train)

In [None]:
correlation = combined_train[features + target].corr()
plt.figure(figsize=(50, 50))
sns.heatmap(correlation, annot=True)

In [None]:
rfecv = RFECV(estimator=final_model, step=1, cv=5, scoring="neg_mean_squared_error")

In [None]:
rfecv.fit(X_train, y_train)

In [None]:
X_train_encoded = pd.get_dummies(X_train, columns=cat_feature)
X_test_encoded = pd.get_dummies(X_test, columns=cat_feature)

In [None]:
final_model = xgb.XGBRegressor(
    tree_method="hist", **best_params, enable_categorical=True
)

In [None]:
rfecv = RFECV(estimator=final_model, step=1, cv=5, scoring="neg_mean_squared_error")

In [None]:
rfecv.fit(X_train_encoded, y_train)

In [None]:
optimal_features = X_train_encoded.columns[rfecv.support_]

In [None]:
optimal_features

In [None]:
features = [
    "counter_name",
    "hour",
    "dayofweek",
    "quarter",
    "month",
    "dayofyear",
    "hour_sin",
    "hour_cos",
    "dayofweek_sin",
    "dayofweek_cos",
    "lockdown_2",
    "lockdown_3_1",
    "lockdown_3_2",
    "lockdown_3_3",
    "tend",
    "t",
    "td",
    "u",
    "ww",
    "n",
    "tend24",
    "etat_sol",
    "rr12",
    "ctype1",
    "hnuage1",
]
target = ["log_bike_count"]
cat_feature = ["counter_name"]

In [None]:
X_train = combined_train[features]
y_train = combined_train[target]
X_test = combined_test[features]
y_test = combined_test[target]

In [None]:
final_model = xgb.XGBRegressor(
    tree_method="hist", **best_params, enable_categorical=True
)
final_model.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    early_stopping_rounds=10,
    verbose=10,
)

In [None]:
y_pred_train = final_model.predict(X_train)
rmse_train = mean_squared_error(y_train, y_pred_train, squared=False)
print("Final Model  Train RMSE:", rmse_train)

y_pred_test = final_model.predict(X_test)
rmse_test = mean_squared_error(y_test, y_pred_test, squared=False)
print("Final Model Test RMSE:", rmse_test)

# Interactions

In [20]:
combined_train["temp_hour_interaction"] = combined_train["t"] * \
    combined_train["hour"]



combined_train["humidity_day_interaction"] = (


    combined_train["u"] * combined_train["dayofweek"]


)

In [21]:
combined_test["temp_hour_interaction"] = combined_test["t"] * \
    combined_test["hour"]



combined_test["humidity_day_interaction"] = (


    combined_test["u"] * combined_test["dayofweek"]


)

In [22]:
combined_prediction["temp_hour_interaction"] = (
    combined_prediction["t"] * combined_prediction["hour"]
)


combined_prediction["humidity_day_interaction"] = (
    combined_prediction["u"] * combined_prediction["dayofweek"]
)

In [23]:
combined_train.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 455163 entries, 2020-09-01 01:00:00 to 2021-08-09 23:00:00
Data columns (total 94 columns):
 #   Column                     Non-Null Count   Dtype   
---  ------                     --------------   -----   
 0   counter_name               455163 non-null  category
 1   log_bike_count             455163 non-null  float64 
 2   hour                       455163 non-null  int32   
 3   dayofweek                  455163 non-null  int32   
 4   quarter                    455163 non-null  int32   
 5   month                      455163 non-null  int32   
 6   dayofyear                  455163 non-null  int32   
 7   is_weekend                 455163 non-null  int32   
 8   hour_sin                   455163 non-null  float64 
 9   hour_cos                   455163 non-null  float64 
 10  dayofweek_sin              455163 non-null  float64 
 11  dayofweek_cos              455163 non-null  float64 
 12  is_holiday                 455163 non-

In [24]:
combined_train.head()

Unnamed: 0_level_0,counter_name,log_bike_count,hour,dayofweek,quarter,month,dayofyear,is_weekend,hour_sin,hour_cos,...,etat_sol_lag12h,etat_sol_lag24h,etat_sol_lag168h,rr12_lag6h,rr12_lag9h,rr12_lag12h,rr12_lag24h,rr12_lag168h,temp_hour_interaction,humidity_day_interaction
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-09-01 01:00:00,28 boulevard Diderot E-O,0.0,1,1,3,9,245,0,0.258819,0.965926,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,285.75,81
2020-09-01 01:00:00,28 boulevard Diderot O-E,0.693147,1,1,3,9,245,0,0.258819,0.965926,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,285.75,81
2020-09-01 01:00:00,39 quai François Mauriac NO-SE,1.098612,1,1,3,9,245,0,0.258819,0.965926,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,285.75,81
2020-09-01 01:00:00,39 quai François Mauriac SE-NO,0.693147,1,1,3,9,245,0,0.258819,0.965926,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,285.75,81
2020-09-01 01:00:00,18 quai de l'Hôtel de Ville NO-SE,0.0,1,1,3,9,245,0,0.258819,0.965926,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,285.75,81


In [None]:
features = [
    "counter_name",
    "hour",
    "dayofweek",
    "quarter",
    "month",
    "dayofyear",
    "hour_sin",
    "hour_cos",
    "dayofweek_sin",
    "dayofweek_cos",
    "lockdown_2",
    "lockdown_3_1",
    "lockdown_3_2",
    "lockdown_3_3",
    "tend",
    "t",
    "td",
    "u",
    "ww",
    "n",
    "tend24",
    "etat_sol",
    "rr12",
    "ctype1",
    "hnuage1",
    "temp_hour_interaction",
    "humidity_day_interaction",
    "temp_humidity_interaction",
    "t_lag6h",
    "t_lag9h",
    "t_lag24h",
    "td_lag6h",
    "td_lag9h",
    "td_lag24h",
    "u_lag6h",
    "u_lag9h",
    "u_lag24h",
    "ww_lag6h",
    "ww_lag9h",
    "ww_lag24h",
    "n_lag6h",
    "n_lag9h",
    "n_lag24h",
    "tend24_lag6h",
    "tend24_lag9h",
    "tend24_lag24h",
    "etat_sol_lag6h",
    "etat_sol_lag9h",
    "etat_sol_lag24h",
    "rr12_lag6h",
    "rr12_lag9h",
    "rr12_lag24h",
    "t_lag12h",
    "td_lag12h",
    "u_lag12h",
    "ww_lag12h",
    "n_lag12h",
    "tend24_lag12h",
    "etat_sol_lag12h",
    "rr12_lag12h",
]
target = ["log_bike_count"]
cat_feature = ["counter_name"]

In [None]:
X_train = combined_train[features]
y_train = combined_train[target]
X_test = combined_test[features]
y_test = combined_test[target]

In [None]:
final_model = xgb.XGBRegressor(
    tree_method="hist", **best_params, enable_categorical=True
)
final_model.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    early_stopping_rounds=10,
    verbose=10,
)

In [None]:
y_pred_train = final_model.predict(X_train)
rmse_train = mean_squared_error(y_train, y_pred_train, squared=False)
print("Final Model  Train RMSE:", rmse_train)

y_pred_test = final_model.predict(X_test)
rmse_test = mean_squared_error(y_test, y_pred_test, squared=False)
print("Final Model Test RMSE:", rmse_test)

In [None]:
fi = pd.DataFrame(
    data=final_model.feature_importances_,
    index=final_model.feature_names_in_,
    columns=["Importance"],
)

fi.sort_values(by="Importance").plot(
    kind="barh", title="Feature Importances XGBoost", figsize=(10, 16)
)

In [None]:
fi.sort_values(by="Importance", ascending=False)

In [None]:
from sklearn.utils import resample

In [None]:
num_bootstraps = 100
feature_importances = np.zeros((num_bootstraps, len(X_train.columns)))

In [None]:
for i in range(num_bootstraps):
    # Create a bootstrap sample
    X_sample, y_sample = resample(X_train, y_train)

    # Fit the model
    model = xgb.XGBRegressor(tree_method="hist", enable_categorical=True)
    model.fit(X_sample, y_sample)

    # Store the feature importances
    feature_importances[i, :] = model.feature_importances_

In [None]:
mean_importances = np.mean(feature_importances, axis=0)
std_importances = np.std(feature_importances, axis=0)

In [None]:
mean_importances

In [None]:
std_importances

In [None]:
model.feature_names_in_

In [None]:
features = [
    "counter_name",
    "hour",
    "dayofweek",
    "month",
    "quarter",
    "dayofyear",
    "hour_sin",
    "hour_cos",
    "dayofweek_sin",
    "dayofweek_cos",
    "lockdown_2",
    "lockdown_3_1",
    "lockdown_3_2",
    "lockdown_3_3",
    "t",
    "u",
    "ww",
    "n",
    "etat_sol",
    "rr12",
    "temp_hour_interaction",
    "humidity_day_interaction",
    "t_lag6h",
    "t_lag9h",
    "t_lag24h",
    "td_lag24h",
    "u_lag24h",
    "ww_lag24h",
    "n_lag24h",
    "etat_sol_lag24h",
    "rr12_lag24h",
]
target = ["log_bike_count"]
cat_feature = ["counter_name"]

In [None]:
X_train = combined_train[features]
y_train = combined_train[target]
X_test = combined_test[features]
y_test = combined_test[target]

In [None]:
def objective(trial):
    param = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 1000),
        "max_depth": trial.suggest_int("max_depth", 3, 20),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "gamma": trial.suggest_float("gamma", 0.1, 1, step=0.1),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "subsample": trial.suggest_float("subsample", 0.5, 1),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 1.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.0, 1.0),
    }

    model = xgb.XGBRegressor(tree_method="hist", **
                             param, enable_categorical=True)

    tscv = TimeSeriesSplit(n_splits=5)
    scores = []

    for train_index, test_index in tscv.split(X_train):
        X_train_fold, X_test_fold = X_train.iloc[train_index], X_train.iloc[test_index]
        y_train_fold, y_test_fold = y_train.iloc[train_index], y_train.iloc[test_index]

        model.fit(
            X_train_fold,
            y_train_fold,
            eval_set=[(X_test_fold, y_test_fold)],
            early_stopping_rounds=50,
            verbose=False,
        )
        predictions = model.predict(X_test_fold)
        score = mean_squared_error(
            y_test_fold, predictions, squared=False)  # RMSE
        scores.append(score)

    return np.mean(scores)

In [None]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=100)

In [None]:
print("Number of finished trials:", len(study.trials))
print("Best trial:", study.best_trial.params)

In [None]:
best_params = {
    "n_estimators": 633,
    "max_depth": 11,
    "min_child_weight": 2,
    "gamma": 0.5,
    "learning_rate": 0.01745767642563374,
    "subsample": 0.6852898171340072,
    "colsample_bytree": 0.5752583768824626,
    "reg_alpha": 0.6174748033948815,
    "reg_lambda": 0.37071451261939165,
}

In [None]:
final_model = xgb.XGBRegressor(
    tree_method="hist", **best_params, enable_categorical=True
)
final_model.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    early_stopping_rounds=10,
    verbose=10,
)

In [None]:
y_pred_train = final_model.predict(X_train)
rmse_train = mean_squared_error(y_train, y_pred_train, squared=False)
print("Final Model  Train RMSE:", rmse_train)

y_pred_test = final_model.predict(X_test)
rmse_test = mean_squared_error(y_test, y_pred_test, squared=False)
print("Final Model Test RMSE:", rmse_test)

In [None]:
prediction_data = pd.read_parquet("data\final_test.parquet")

In [None]:
combined_train.columns

In [None]:
features = [
    "counter_name",
    "hour",
    "dayofweek",
    "quarter",
    "month",
    "dayofyear",
    "is_weekend",
    "hour_sin",
    "hour_cos",
    "dayofweek_sin",
    "dayofweek_cos",
    "is_holiday",
    "lockdown_1",
    "lockdown_2",
    "lockdown_3_1",
    "lockdown_3_2",
    "lockdown_3_3",
    "pmer",
    "tend",
    "cod_tend",
    "dd",
    "ff",
    "t",
    "td",
    "u",
    "vv",
    "ww",
    "w1",
    "w2",
    "n",
    "nbas",
    "hbas",
    "cl",
    "cm",
    "ch",
    "pres",
    "tend24",
    "raf10",
    "rafper",
    "etat_sol",
    "ht_neige",
    "ssfrai",
    "rr1",
    "rr3",
    "rr6",
    "rr12",
    "rr24",
    "nnuage1",
    "ctype1",
    "hnuage1",
    "temp_humidity_interaction",
    "t_lag6h",
    "t_lag9h",
    "t_lag12h",
    "t_lag24h",
    "t_lag168h",
    "td_lag6h",
    "td_lag9h",
    "td_lag12h",
    "td_lag24h",
    "td_lag168h",
    "u_lag6h",
    "u_lag9h",
    "u_lag12h",
    "u_lag24h",
    "u_lag168h",
    "ww_lag6h",
    "ww_lag9h",
    "ww_lag12h",
    "ww_lag24h",
    "ww_lag168h",
    "n_lag6h",
    "n_lag9h",
    "n_lag12h",
    "n_lag24h",
    "n_lag168h",
    "tend24_lag6h",
    "tend24_lag9h",
    "tend24_lag12h",
    "tend24_lag24h",
    "tend24_lag168h",
    "etat_sol_lag6h",
    "etat_sol_lag9h",
    "etat_sol_lag12h",
    "etat_sol_lag24h",
    "etat_sol_lag168h",
    "rr12_lag6h",
    "rr12_lag9h",
    "rr12_lag12h",
    "rr12_lag24h",
    "rr12_lag168h",
    "temp_hour_interaction",
    "humidity_day_interaction",
]
target = ["log_bike_count"]
cat_feature = ["counter_name"]

In [None]:
X_train = combined_train[features]
y_train = combined_train[target]
X_test = combined_test[features]
y_test = combined_test[target]
X_pred = combined_prediction[features]

In [None]:
best_params = {
    "n_estimators": 633,
    "max_depth": 11,
    "min_child_weight": 2,
    "gamma": 0.5,
    "learning_rate": 0.01745767642563374,
    "subsample": 0.6852898171340072,
    "colsample_bytree": 0.5752583768824626,
    "reg_alpha": 0.6174748033948815,
    "reg_lambda": 0.37071451261939165,
}

final_model = xgb.XGBRegressor(
    tree_method="hist", **best_params, enable_categorical=True
)
final_model.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    early_stopping_rounds=10,
    verbose=10,
)

In [None]:
y_pred_train = final_model.predict(X_train)
rmse_train = mean_squared_error(y_train, y_pred_train, squared=False)
print("Final Model  Train RMSE:", rmse_train)

y_pred_test = final_model.predict(X_test)
rmse_test = mean_squared_error(y_test, y_pred_test, squared=False)
print("Final Model Test RMSE:", rmse_test)

In [None]:
fi = pd.DataFrame(
    data=final_model.feature_importances_,
    index=final_model.feature_names_in_,
    columns=["Importance"],
)

fi.sort_values(by="Importance").plot(
    kind="barh", title="Feature Importances XGBoost", figsize=(10, 16)
)

In [None]:
fi.sort_values(by="Importance", ascending=False)

In [None]:
predictions = final_model.predict(X_pred)

In [None]:
predictions

In [None]:
fi.to_csv("feature_importances_xgb_weather.csv")

In [None]:
predictions

In [None]:
prediction_data.head()

In [None]:
X_train_encoded = pd.get_dummies(X_train, columns=cat_feature)
X_test_encoded = pd.get_dummies(X_test, columns=cat_feature)

In [None]:
model = xgb.XGBRegressor()

In [None]:
tscv = TimeSeriesSplit(n_splits=5)

In [None]:
rfecv = RFECV(
    estimator=model,
    step=1,
    cv=tscv,
    scoring="neg_mean_squared_error",
    verbose=10,
    n_jobs=-1,
)

In [None]:
rfecv.fit(X_train_encoded, y_train)

In [None]:
optimal_features = X_train_encoded.columns[rfecv.support_]

In [None]:
features = [
    "counter_name",
    "hour",
    "dayofweek",
    "quarter",
    "month",
    "dayofyear",
    "is_weekend",
    "hour_sin",
    "hour_cos",
    "dayofweek_sin",
    "dayofweek_cos",
    "is_holiday",
    "lockdown_1",
    "lockdown_2",
    "lockdown_3_1",
    "lockdown_3_2",
    "lockdown_3_3",
    "pmer",
    "tend",
    "cod_tend",
    "dd",
    "ff",
    "t",
    "td",
    "u",
    "vv",
    "ww",
    "w1",
    "w2",
    "n",
    "nbas",
    "hbas",
    "cl",
    "cm",
    "ch",
    "pres",
    "tend24",
    "raf10",
    "rafper",
    "etat_sol",
    "ht_neige",
    "ssfrai",
    "rr1",
    "rr3",
    "rr6",
    "rr12",
    "rr24",
    "nnuage1",
    "ctype1",
    "hnuage1",
    "temp_humidity_interaction",
    "t_lag6h",
    "t_lag9h",
    "t_lag12h",
    "t_lag24h",
    "t_lag168h",
    "td_lag6h",
    "td_lag9h",
    "td_lag12h",
    "td_lag24h",
    "td_lag168h",
    "u_lag6h",
    "u_lag9h",
    "u_lag12h",
    "u_lag24h",
    "u_lag168h",
    "ww_lag6h",
    "ww_lag9h",
    "ww_lag12h",
    "ww_lag24h",
    "ww_lag168h",
    "n_lag6h",
    "n_lag9h",
    "n_lag12h",
    "n_lag24h",
    "n_lag168h",
    "tend24_lag6h",
    "tend24_lag9h",
    "tend24_lag12h",
    "tend24_lag24h",
    "tend24_lag168h",
    "etat_sol_lag6h",
    "etat_sol_lag9h",
    "etat_sol_lag12h",
    "etat_sol_lag24h",
    "etat_sol_lag168h",
    "rr12_lag6h",
    "rr12_lag9h",
    "rr12_lag12h",
    "rr12_lag24h",
    "rr12_lag168h",
    "temp_hour_interaction",
    "humidity_day_interaction",
]
target = ["log_bike_count"]
cat_feature = ["counter_name"]

In [None]:
X_train = combined_train[features]
y_train = combined_train[target]
X_test = combined_test[features]
y_test = combined_test[target]

In [None]:
reg = xgb.XGBRegressor(
    tree_method="hist", n_estimators=1000, enable_categorical=True)


reg.fit(
    X_train,
    y_train,

    eval_set=[(X_train, y_train), (X_test, y_test)],

    early_stopping_rounds=10,

    verbose=10,
)

In [None]:
y_hat_xgb = reg.predict(X_train)
rmse_xgb_train = mean_squared_error(y_train, y_hat_xgb, squared=False)
print("XGB Train:", rmse_xgb_train)

y_pred_xgb = reg.predict(X_test)
rmse_xgb_test = mean_squared_error(y_test, y_pred_xgb, squared=False)
print("XGB Test:", rmse_xgb_test)

In [None]:
feature_importances = reg.feature_importances_
feature_names = X_train.columns
features_ranked = sorted(
    zip(feature_names, feature_importances), key=lambda x: x[1])

In [None]:
features_ranked

In [None]:
performance_history = []

In [None]:
for i in range(len(features_ranked)):
    # Remove the least important feature
    features_to_use = [f[0] for f in features_ranked[i:]]
    X_train_reduced = X_train[features_to_use]
    X_test_reduced = X_test[features_to_use]

    # Retrain the model
    reg.fit(
        X_train_reduced,
        y_train,
        eval_set=[(X_test_reduced, y_test)],
        early_stopping_rounds=10,
    )

    # Evaluate performance
    predictions = reg.predict(X_test_reduced)
    current_performance = mean_squared_error(
        y_test, predictions, squared=False)

    # Store the performance
    performance_history.append((len(features_to_use), current_performance))

In [None]:
performance_df = pd.DataFrame(
    performance_history, columns=["Num_Features", "Performance"]
)

In [None]:
plt.plot(performance_df["Num_Features"], performance_df["Performance"])
plt.xlabel("Number of Features")
plt.ylabel("Model Performance (RMSE)")
plt.title("Performance vs. Number of Features")
plt.show()

In [None]:
performance_df.to_csv("feature performance.csv")

In [None]:
features = [
    "counter_name",
    "hour",
    "dayofweek",
    "quarter",
    "month",
    "dayofyear",
    "hour_sin",
    "hour_cos",
    "dayofweek_sin",
    "dayofweek_cos",
    "is_holiday",
    "lockdown_2",
    "lockdown_3_1",
    "lockdown_3_2",
    "lockdown_3_3",
    "pmer",
    "tend",
    "cod_tend",
    "dd",
    "ff",
    "t",
    "td",
    "u",
    "vv",
    "ww",
    "w1",
    "w2",
    "n",
    "nbas",
    "hbas",
    "cl",
    "cm",
    "ch",
    "pres",
    "tend24",
    "raf10",
    "rafper",
    "etat_sol",
    "ht_neige",
    "ssfrai",
    "rr1",
    "rr3",
    "rr6",
    "rr12",
    "rr24",
    "nnuage1",
    "ctype1",
    "hnuage1",
    "temp_humidity_interaction",
    "t_lag6h",
    "t_lag9h",
    "t_lag12h",
    "t_lag24h",
    "t_lag168h",
    "td_lag6h",
    "td_lag9h",
    "td_lag12h",
    "td_lag24h",
    "td_lag168h",
    "u_lag6h",
    "u_lag9h",
    "u_lag12h",
    "u_lag24h",
    "u_lag168h",
    "ww_lag6h",
    "ww_lag9h",
    "ww_lag12h",
    "ww_lag24h",
    "ww_lag168h",
    "n_lag6h",
    "n_lag9h",
    "n_lag12h",
    "n_lag24h",
    "n_lag168h",
    "tend24_lag6h",
    "tend24_lag9h",
    "tend24_lag12h",
    "tend24_lag24h",
    "tend24_lag168h",
    "etat_sol_lag6h",
    "etat_sol_lag9h",
    "etat_sol_lag12h",
    "etat_sol_lag24h",
    "etat_sol_lag168h",
    "rr12_lag6h",
    "rr12_lag9h",
    "rr12_lag12h",
    "rr12_lag24h",
    "rr12_lag168h",
    "temp_hour_interaction",
    "humidity_day_interaction",
]

target = ["log_bike_count"]
cat_feature = ["counter_name"]

In [None]:
X_train = combined_train[features]
y_train = combined_train[target]
X_test = combined_test[features]
y_test = combined_test[target]

In [None]:
reg = xgb.XGBRegressor(
    tree_method="hist", n_estimators=1500, enable_categorical=True)

reg.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    early_stopping_rounds=10,
    verbose=10,
)

In [None]:
y_hat_xgb = reg.predict(X_train)
rmse_xgb_train = mean_squared_error(y_train, y_hat_xgb, squared=False)
print("XGB Train:", rmse_xgb_train)

y_pred_xgb = reg.predict(X_test)
rmse_xgb_test = mean_squared_error(y_test, y_pred_xgb, squared=False)
print("XGB Test:", rmse_xgb_test)

In [None]:
fi = pd.DataFrame(
    data=reg.feature_importances_, index=reg.feature_names_in_, columns=[
        "Importance"]
)
fi.sort_values(by="Importance", ascending=False)

In [None]:
features = [
    "hour_cos",
    "temp_hour_interaction",
    "humidity_day_interaction",
    "etat_sol",
    "counter_name",
    "dayofyear",
    "hour_sin",
    "t_lag168h",
    "etat_sol_lag6h",
    "t",
    "t_lag12h",
    "t_lag9h",
    "w1",
    "dayofweek",
    "n",
    "month",
    "t_lag24h",
    "lockdown_2",
    "ww",
    "u",
    "lockdown_3_3",
    "ht_neige",
    "t_lag6h",
    "dayofweek_cos",
    "lockdown_3_1",
    "lockdown_3_2",
    "td_lag12h",
    "hour",
    "rr3",
    "rr24",
    "raf10",
    "dayofweek_sin",
    "n_lag6h",
    "td_lag24h",
    "td_lag168h",
    "etat_sol_lag12h",
    "tend24_lag6h",
    "u_lag9h",
    "etat_sol_lag168h",
    "ww_lag6h",
    "u_lag24h",
    "tend24_lag9h",
    "rr12_lag12h",
    "dd",
    "cl",
    "pres",
    "u_lag12h",
    "u_lag6h",
    "etat_sol_lag9h",
    "rr12_lag168h",
    "tend24",
    "temp_humidity_interaction",
    "rr12_lag24h",
    "td",
    "cm",
]
target = ["log_bike_count"]
cat_feature = ["counter_name"]

In [None]:
X_train = combined_train[features]
y_train = combined_train[target]
X_test = combined_test[features]
y_test = combined_test[target]
X_pred = combined_prediction[features]

In [None]:
best_params = {
    "n_estimators": 633,
    "max_depth": 11,
    "min_child_weight": 2,
    "gamma": 0.5,
    "learning_rate": 0.01745767642563374,
    "subsample": 0.6852898171340072,
    "colsample_bytree": 0.5752583768824626,
    "reg_alpha": 0.6174748033948815,
    "reg_lambda": 0.37071451261939165,
}

final_model = xgb.XGBRegressor(
    tree_method="hist", **best_params, enable_categorical=True
)
final_model.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    early_stopping_rounds=10,
    verbose=10,
)

In [None]:
y_pred_train = final_model.predict(X_train)
rmse_train = mean_squared_error(y_train, y_pred_train, squared=False)
print("Final Model  Train RMSE:", rmse_train)

y_pred_test = final_model.predict(X_test)
rmse_test = mean_squared_error(y_test, y_pred_test, squared=False)
print("Final Model Test RMSE:", rmse_test)

In [None]:
fi = pd.DataFrame(
    data=final_model.feature_importances_,
    index=final_model.feature_names_in_,
    columns=["Importance"],
)

In [None]:
fi.sort_values(by="Importance", ascending=False)

In [42]:
features = [
    "hour_cos",
    "hour",
    "temp_hour_interaction",
    "counter_name",
    "hour_sin",
    "lockdown_3_3",
    "t",
    "u_lag9h",
    "t_lag24h",
    "u",
    "lockdown_3_2",
    "month",
    "etat_sol",
    "dayofyear",
    "humidity_day_interaction",
    "dayofweek_sin",
    "t_lag9h",
    "temp_humidity_interaction",
    "dayofweek",
    "lockdown_3_1",
    "t_lag12h",
    "t_lag6h",
]
target = ['log_bike_count']
cat_feature = ['counter_name']

In [28]:
X_train = combined_train[features]
y_train = combined_train[target]
X_test = combined_test[features]
y_test = combined_test[target]

In [30]:
best_params = {
    "n_estimators": 633,
    "max_depth": 11,
    "min_child_weight": 2,
    "gamma": 0.5,
    "learning_rate": 0.01745767642563374,
    "subsample": 0.6852898171340072,
    "colsample_bytree": 0.5752583768824626,
    "reg_alpha": 0.6174748033948815,
    "reg_lambda": 0.37071451261939165,
}
final_model = xgb.XGBRegressor(




    tree_method="hist", **best_params, enable_categorical=True




)


final_model.fit(




    X_train,




    y_train,




    eval_set=[(X_test, y_test)],




    early_stopping_rounds=10,




    verbose=10,




)

[0]	validation_0-rmse:3.20416
[10]	validation_0-rmse:2.71568
[20]	validation_0-rmse:2.29121
[30]	validation_0-rmse:1.93657
[40]	validation_0-rmse:1.64050
[50]	validation_0-rmse:1.39892
[60]	validation_0-rmse:1.21431
[70]	validation_0-rmse:1.04909
[80]	validation_0-rmse:0.91746
[90]	validation_0-rmse:0.81270
[100]	validation_0-rmse:0.73281
[110]	validation_0-rmse:0.66214
[120]	validation_0-rmse:0.61079
[130]	validation_0-rmse:0.56254
[140]	validation_0-rmse:0.53135
[150]	validation_0-rmse:0.51218
[160]	validation_0-rmse:0.49234
[170]	validation_0-rmse:0.48007
[180]	validation_0-rmse:0.46737
[190]	validation_0-rmse:0.45839
[200]	validation_0-rmse:0.44956
[210]	validation_0-rmse:0.44324
[220]	validation_0-rmse:0.43975
[230]	validation_0-rmse:0.43450
[240]	validation_0-rmse:0.43042
[250]	validation_0-rmse:0.42605
[260]	validation_0-rmse:0.42303
[270]	validation_0-rmse:0.42125
[280]	validation_0-rmse:0.41832
[290]	validation_0-rmse:0.41539
[300]	validation_0-rmse:0.41415
[310]	validation_0-

In [31]:
y_pred_train = final_model.predict(X_train)
rmse_train = mean_squared_error(y_train, y_pred_train, squared=False)
print("Final Model  Train RMSE:", rmse_train)

y_pred_test = final_model.predict(X_test)
rmse_test = mean_squared_error(y_test, y_pred_test, squared=False)
print("Final Model Test RMSE:", rmse_test)

Final Model  Train RMSE: 0.3325216647379481
Final Model Test RMSE: 0.40347115850532306


In [34]:
fi = pd.DataFrame(
    data=final_model.feature_importances_,
    index=final_model.feature_names_in_,
    columns=["Importance"],
)
fi.sort_values(by="Importance", ascending=False)

Unnamed: 0,Importance
hour_cos,0.291146
hour,0.181208
temp_hour_interaction,0.118382
counter_name,0.086044
hour_sin,0.077321
dayofyear,0.021467
dayofweek_sin,0.020475
month,0.018448
u,0.018364
humidity_day_interaction,0.017329


In [86]:
from sklearn.ensemble import BaggingRegressor
from sklearn.model_selection import RandomizedSearchCV
import optuna

In [57]:
X_train_encoded = pd.get_dummies(X_train, columns=cat_feature, dtype=int)
X_test_encoded = pd.get_dummies(X_test, columns=cat_feature, dtype=int)

In [75]:
base_model = xgb.XGBRegressor(random_state=42)

In [79]:
bagging_model = BaggingRegressor(
    base_model, n_estimators=10, random_state=42, verbose=10, oob_score=True
)

In [77]:
param_grid = {
    "base_estimator__n_estimators": [
        50,
        100,
        200,
    ],  # Number of trees in each base model
    "base_estimator__max_depth": [3, 4, 5],  # Depth of each tree
    # Add other hyperparameters here
}

In [84]:
random_search = RandomizedSearchCV(
    bagging_model,
    param_distributions=param_grid,
    # Number of random parameter combinations to try (can be adjusted)
    n_iter=10,
    cv=3,  # Number of cross-validation folds
    scoring="neg_mean_squared_error",  # Use mean squared error for regression
    verbose=10,
    n_jobs=-1,  # Use all available CPU cores
    random_state=42,
)

In [90]:
def objective(trial):
    # Define the base XGBoost model
    base_model = xgb.XGBRegressor(
        n_estimators=trial.suggest_int("n_estimators", 50, 200),
        max_depth=trial.suggest_int("max_depth", 3, 5),
        random_state=42,
    )

    # Create a BaggingRegressor with XGBoost as base models
    bagging_model = BaggingRegressor(
        base_model,
        n_estimators=10,
        random_state=42,
        verbose=10,
        oob_score=True,  # Enable OOB scoring
    )

    # Fit the bagging model to the training data
    bagging_model.fit(X_train_encoded, y_train)

    # Calculate the out-of-bag (OOB) score
    oob_score = -bagging_model.oob_score_  # Negate to get MSE
    return oob_score

In [91]:
study = optuna.create_study(direction="minimize")

[I 2023-12-10 21:36:33,452] A new study created in memory with name: no-name-77d98874-5fd9-4245-8e24-a5fe8cecb084


In [92]:
study.optimize(objective, n_trials=10)

Building estimator 1 of 10 for this parallel run (total 10)...
Building estimator 2 of 10 for this parallel run (total 10)...
Building estimator 3 of 10 for this parallel run (total 10)...
Building estimator 4 of 10 for this parallel run (total 10)...
Building estimator 5 of 10 for this parallel run (total 10)...
Building estimator 6 of 10 for this parallel run (total 10)...
Building estimator 7 of 10 for this parallel run (total 10)...
Building estimator 8 of 10 for this parallel run (total 10)...
Building estimator 9 of 10 for this parallel run (total 10)...
Building estimator 10 of 10 for this parallel run (total 10)...


[Parallel(n_jobs=1)]: Done   1 tasks      | elapsed:   38.1s
[Parallel(n_jobs=1)]: Done   1 tasks      | elapsed:   38.1s
[I 2023-12-10 21:37:14,441] Trial 0 finished with value: -0.8043596353572102 and parameters: {'n_estimators': 73, 'max_depth': 3}. Best is trial 0 with value: -0.8043596353572102.


Building estimator 1 of 10 for this parallel run (total 10)...
Building estimator 2 of 10 for this parallel run (total 10)...
Building estimator 3 of 10 for this parallel run (total 10)...
Building estimator 4 of 10 for this parallel run (total 10)...
Building estimator 5 of 10 for this parallel run (total 10)...
Building estimator 6 of 10 for this parallel run (total 10)...
Building estimator 7 of 10 for this parallel run (total 10)...
Building estimator 8 of 10 for this parallel run (total 10)...
Building estimator 9 of 10 for this parallel run (total 10)...
Building estimator 10 of 10 for this parallel run (total 10)...


[Parallel(n_jobs=1)]: Done   1 tasks      | elapsed:   56.6s
[Parallel(n_jobs=1)]: Done   1 tasks      | elapsed:   56.6s
[I 2023-12-10 21:38:13,194] Trial 1 finished with value: -0.8513402953520884 and parameters: {'n_estimators': 88, 'max_depth': 4}. Best is trial 1 with value: -0.8513402953520884.


Building estimator 1 of 10 for this parallel run (total 10)...
Building estimator 2 of 10 for this parallel run (total 10)...
Building estimator 3 of 10 for this parallel run (total 10)...
Building estimator 4 of 10 for this parallel run (total 10)...
Building estimator 5 of 10 for this parallel run (total 10)...
Building estimator 6 of 10 for this parallel run (total 10)...
Building estimator 7 of 10 for this parallel run (total 10)...
Building estimator 8 of 10 for this parallel run (total 10)...
Building estimator 9 of 10 for this parallel run (total 10)...
Building estimator 10 of 10 for this parallel run (total 10)...


[Parallel(n_jobs=1)]: Done   1 tasks      | elapsed:   48.6s
[Parallel(n_jobs=1)]: Done   1 tasks      | elapsed:   48.6s
[I 2023-12-10 21:39:03,853] Trial 2 finished with value: -0.8560931819465505 and parameters: {'n_estimators': 59, 'max_depth': 5}. Best is trial 2 with value: -0.8560931819465505.


Building estimator 1 of 10 for this parallel run (total 10)...
Building estimator 2 of 10 for this parallel run (total 10)...
Building estimator 3 of 10 for this parallel run (total 10)...
Building estimator 4 of 10 for this parallel run (total 10)...
Building estimator 5 of 10 for this parallel run (total 10)...
Building estimator 6 of 10 for this parallel run (total 10)...
Building estimator 7 of 10 for this parallel run (total 10)...
Building estimator 8 of 10 for this parallel run (total 10)...
Building estimator 9 of 10 for this parallel run (total 10)...
Building estimator 10 of 10 for this parallel run (total 10)...


[Parallel(n_jobs=1)]: Done   1 tasks      | elapsed:  1.1min
[Parallel(n_jobs=1)]: Done   1 tasks      | elapsed:  1.1min
[I 2023-12-10 21:40:14,564] Trial 3 finished with value: -0.8732921894589202 and parameters: {'n_estimators': 85, 'max_depth': 5}. Best is trial 3 with value: -0.8732921894589202.


Building estimator 1 of 10 for this parallel run (total 10)...
Building estimator 2 of 10 for this parallel run (total 10)...
Building estimator 3 of 10 for this parallel run (total 10)...
Building estimator 4 of 10 for this parallel run (total 10)...
Building estimator 5 of 10 for this parallel run (total 10)...
Building estimator 6 of 10 for this parallel run (total 10)...
Building estimator 7 of 10 for this parallel run (total 10)...
Building estimator 8 of 10 for this parallel run (total 10)...
Building estimator 9 of 10 for this parallel run (total 10)...
Building estimator 10 of 10 for this parallel run (total 10)...


[Parallel(n_jobs=1)]: Done   1 tasks      | elapsed:   51.8s
[Parallel(n_jobs=1)]: Done   1 tasks      | elapsed:   51.8s
[I 2023-12-10 21:41:08,447] Trial 4 finished with value: -0.8423141217056241 and parameters: {'n_estimators': 75, 'max_depth': 4}. Best is trial 3 with value: -0.8732921894589202.


Building estimator 1 of 10 for this parallel run (total 10)...
Building estimator 2 of 10 for this parallel run (total 10)...
Building estimator 3 of 10 for this parallel run (total 10)...
Building estimator 4 of 10 for this parallel run (total 10)...
Building estimator 5 of 10 for this parallel run (total 10)...
Building estimator 6 of 10 for this parallel run (total 10)...
Building estimator 7 of 10 for this parallel run (total 10)...
Building estimator 8 of 10 for this parallel run (total 10)...
Building estimator 9 of 10 for this parallel run (total 10)...
Building estimator 10 of 10 for this parallel run (total 10)...


[Parallel(n_jobs=1)]: Done   1 tasks      | elapsed:  1.9min
[Parallel(n_jobs=1)]: Done   1 tasks      | elapsed:  1.9min
[I 2023-12-10 21:43:05,060] Trial 5 finished with value: -0.8793834816206412 and parameters: {'n_estimators': 176, 'max_depth': 4}. Best is trial 5 with value: -0.8793834816206412.


Building estimator 1 of 10 for this parallel run (total 10)...
Building estimator 2 of 10 for this parallel run (total 10)...
Building estimator 3 of 10 for this parallel run (total 10)...
Building estimator 4 of 10 for this parallel run (total 10)...
Building estimator 5 of 10 for this parallel run (total 10)...
Building estimator 6 of 10 for this parallel run (total 10)...
Building estimator 7 of 10 for this parallel run (total 10)...
Building estimator 8 of 10 for this parallel run (total 10)...
Building estimator 9 of 10 for this parallel run (total 10)...
Building estimator 10 of 10 for this parallel run (total 10)...


[Parallel(n_jobs=1)]: Done   1 tasks      | elapsed:  1.4min
[Parallel(n_jobs=1)]: Done   1 tasks      | elapsed:  1.4min
[I 2023-12-10 21:44:32,634] Trial 6 finished with value: -0.8704884699838676 and parameters: {'n_estimators': 134, 'max_depth': 4}. Best is trial 5 with value: -0.8793834816206412.


Building estimator 1 of 10 for this parallel run (total 10)...
Building estimator 2 of 10 for this parallel run (total 10)...
Building estimator 3 of 10 for this parallel run (total 10)...
Building estimator 4 of 10 for this parallel run (total 10)...
Building estimator 5 of 10 for this parallel run (total 10)...
Building estimator 6 of 10 for this parallel run (total 10)...
Building estimator 7 of 10 for this parallel run (total 10)...
Building estimator 8 of 10 for this parallel run (total 10)...
Building estimator 9 of 10 for this parallel run (total 10)...
Building estimator 10 of 10 for this parallel run (total 10)...


[Parallel(n_jobs=1)]: Done   1 tasks      | elapsed:   59.6s
[Parallel(n_jobs=1)]: Done   1 tasks      | elapsed:   59.6s
[I 2023-12-10 21:45:34,285] Trial 7 finished with value: -0.8542092223170503 and parameters: {'n_estimators': 92, 'max_depth': 4}. Best is trial 5 with value: -0.8793834816206412.


Building estimator 1 of 10 for this parallel run (total 10)...
Building estimator 2 of 10 for this parallel run (total 10)...
Building estimator 3 of 10 for this parallel run (total 10)...
Building estimator 4 of 10 for this parallel run (total 10)...
Building estimator 5 of 10 for this parallel run (total 10)...
Building estimator 6 of 10 for this parallel run (total 10)...
Building estimator 7 of 10 for this parallel run (total 10)...
Building estimator 8 of 10 for this parallel run (total 10)...
Building estimator 9 of 10 for this parallel run (total 10)...
Building estimator 10 of 10 for this parallel run (total 10)...


[Parallel(n_jobs=1)]: Done   1 tasks      | elapsed:  2.1min
[Parallel(n_jobs=1)]: Done   1 tasks      | elapsed:  2.1min
[I 2023-12-10 21:47:42,288] Trial 8 finished with value: -0.8826620903160378 and parameters: {'n_estimators': 199, 'max_depth': 4}. Best is trial 8 with value: -0.8826620903160378.


Building estimator 1 of 10 for this parallel run (total 10)...
Building estimator 2 of 10 for this parallel run (total 10)...
Building estimator 3 of 10 for this parallel run (total 10)...
Building estimator 4 of 10 for this parallel run (total 10)...
Building estimator 5 of 10 for this parallel run (total 10)...
Building estimator 6 of 10 for this parallel run (total 10)...
Building estimator 7 of 10 for this parallel run (total 10)...
Building estimator 8 of 10 for this parallel run (total 10)...
Building estimator 9 of 10 for this parallel run (total 10)...
Building estimator 10 of 10 for this parallel run (total 10)...


[Parallel(n_jobs=1)]: Done   1 tasks      | elapsed:   57.9s
[Parallel(n_jobs=1)]: Done   1 tasks      | elapsed:   57.9s
[I 2023-12-10 21:48:42,540] Trial 9 finished with value: -0.8471807856274294 and parameters: {'n_estimators': 82, 'max_depth': 4}. Best is trial 8 with value: -0.8826620903160378.


In [93]:
best_params = study.best_params
print("Best Hyperparameters:", best_params)

Best Hyperparameters: {'n_estimators': 199, 'max_depth': 4}


In [95]:
base_model = xgb.XGBRegressor(**best_params, random_state=42)
bagging_model = BaggingRegressor(
    base_model,
    n_estimators=20,  # Number of base models (can be tuned)
    random_state=42,
    verbose=10,
    oob_score=True,  # Enable OOB scoring
)
bagging_model.fit(X_train_encoded, y_train)

Building estimator 1 of 20 for this parallel run (total 20)...
Building estimator 2 of 20 for this parallel run (total 20)...
Building estimator 3 of 20 for this parallel run (total 20)...
Building estimator 4 of 20 for this parallel run (total 20)...
Building estimator 5 of 20 for this parallel run (total 20)...
Building estimator 6 of 20 for this parallel run (total 20)...
Building estimator 7 of 20 for this parallel run (total 20)...
Building estimator 8 of 20 for this parallel run (total 20)...
Building estimator 9 of 20 for this parallel run (total 20)...
Building estimator 10 of 20 for this parallel run (total 20)...
Building estimator 11 of 20 for this parallel run (total 20)...
Building estimator 12 of 20 for this parallel run (total 20)...
Building estimator 13 of 20 for this parallel run (total 20)...
Building estimator 14 of 20 for this parallel run (total 20)...
Building estimator 15 of 20 for this parallel run (total 20)...
Building estimator 16 of 20 for this parallel run

[Parallel(n_jobs=1)]: Done   1 tasks      | elapsed:  4.4min
[Parallel(n_jobs=1)]: Done   1 tasks      | elapsed:  4.4min


In [96]:
# Make predictions
y_pred = bagging_model.predict(X_test_encoded)

# Evaluate the performance
rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f"Root Mean Squared Error: {rmse}")

Root Mean Squared Error: 0.44845590436811616


[Parallel(n_jobs=1)]: Done   1 tasks      | elapsed:    0.4s
[Parallel(n_jobs=1)]: Done   1 tasks      | elapsed:    0.4s


In [97]:
oob_score = -bagging_model.oob_score_
print("OOB Score (Mean Squared Error):", oob_score)

OOB Score (Mean Squared Error): -0.9282841113907048


In [98]:
features = [
    "hour_cos",
    "hour",
    "temp_hour_interaction",
    "counter_name",
    "hour_sin",
    "t",
    "u_lag9h",
    "t_lag24h",
    "u",
    "month",
    "etat_sol",
    "dayofyear",
    "humidity_day_interaction",
    "dayofweek_sin",
    "t_lag9h",
    "dayofweek",
    "lockdown_3_1",
    "t_lag12h",
    "t_lag6h",
]

In [None]:
X_train = combined_train[features]
y_train = combined_train[target]
X_test = combined_test[features]
y_test = combined_test[target]
X_pred = combined_prediction[features]

In [99]:
base_model = xgb.XGBRegressor(**best_params, random_state=42)
bagging_model = BaggingRegressor(
    base_model,
    n_estimators=20,  # Number of base models (can be tuned)
    random_state=42,
    verbose=10,
    oob_score=True,  # Enable OOB scoring
)
bagging_model.fit(X_train_encoded, y_train)

Building estimator 1 of 20 for this parallel run (total 20)...
Building estimator 2 of 20 for this parallel run (total 20)...
Building estimator 3 of 20 for this parallel run (total 20)...
Building estimator 4 of 20 for this parallel run (total 20)...
Building estimator 5 of 20 for this parallel run (total 20)...
Building estimator 6 of 20 for this parallel run (total 20)...
Building estimator 7 of 20 for this parallel run (total 20)...
Building estimator 8 of 20 for this parallel run (total 20)...
Building estimator 9 of 20 for this parallel run (total 20)...
Building estimator 10 of 20 for this parallel run (total 20)...
Building estimator 11 of 20 for this parallel run (total 20)...
Building estimator 12 of 20 for this parallel run (total 20)...
Building estimator 13 of 20 for this parallel run (total 20)...
Building estimator 14 of 20 for this parallel run (total 20)...
Building estimator 15 of 20 for this parallel run (total 20)...
Building estimator 16 of 20 for this parallel run

[Parallel(n_jobs=1)]: Done   1 tasks      | elapsed:  4.7min
[Parallel(n_jobs=1)]: Done   1 tasks      | elapsed:  4.7min


In [None]:
features = [
    "hour_cos",
    "hour",
    "temp_hour_interaction",
    "counter_name",
    "hour_sin",
    "t",
    "u_lag9h",
    "t_lag24h",
    "u",
    "month",
    "etat_sol",
    "dayofyear",
    "humidity_day_interaction",
    "dayofweek_sin",
    "t_lag9h",
    "dayofweek",
    "lockdown_3_1",
    "t_lag12h",
    "t_lag6h",
]

In [None]:
X_train = combined_train[features]
y_train = combined_train[target]
X_test = combined_test[features]
y_test = combined_test[target]
X_pred = combined_prediction[features]

In [None]:
final_model = xgb.XGBRegressor(
    tree_method="hist", **best_params, enable_categorical=True
)
final_model.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    early_stopping_rounds=10,
    verbose=10,
)

In [None]:
y_pred_train = final_model.predict(X_train)
rmse_train = mean_squared_error(y_train, y_pred_train, squared=False)
print("Final Model  Train RMSE:", rmse_train)

y_pred_test = final_model.predict(X_test)
rmse_test = mean_squared_error(y_test, y_pred_test, squared=False)
print("Final Model Test RMSE:", rmse_test)

In [None]:
fi = pd.DataFrame(
    data=final_model.feature_importances_,
    index=final_model.feature_names_in_,
    columns=["Importance"],
)
fi.sort_values(by="Importance", ascending=False)

In [None]:
features = [
    "hour_cos",
    "hour",
    "temp_hour_interaction",
    "counter_name",
    "hour_sin",
    "t",
    "t_lag24h",
    "u",
    "month",
    "etat_sol",
    "dayofyear",
    "humidity_day_interaction",
    "dayofweek_sin",
    "t_lag9h",
    "dayofweek",
    "lockdown_3_1",
    "t_lag12h",
    "t_lag6h",
]

In [None]:
X_train = combined_train[features]
y_train = combined_train[target]
X_test = combined_test[features]
y_test = combined_test[target]
X_pred = combined_prediction[features]

In [None]:
final_model = xgb.XGBRegressor(
    tree_method="hist", **best_params, enable_categorical=True
)
final_model.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    early_stopping_rounds=10,
    verbose=10,
)

In [None]:
y_pred_train = final_model.predict(X_train)
rmse_train = mean_squared_error(y_train, y_pred_train, squared=False)
print("Final Model  Train RMSE:", rmse_train)

y_pred_test = final_model.predict(X_test)
rmse_test = mean_squared_error(y_test, y_pred_test, squared=False)
print("Final Model Test RMSE:", rmse_test)

In [None]:
fi = pd.DataFrame(
    data=final_model.feature_importances_,
    index=final_model.feature_names_in_,
    columns=["Importance"],
)


fi.sort_values(by="Importance", ascending=False)

In [100]:
features = [
    "hour_cos",
    "hour",
    "temp_hour_interaction",
    "counter_name",
    "hour_sin",
    "t",
    "t_lag24h",
    "month",
    "etat_sol",
    "dayofyear",
    "humidity_day_interaction",
    "dayofweek_sin",
    "dayofweek",
    "lockdown_3_1",
    "t_lag6h",
]
target = ["log_bike_count"]

In [101]:
X_train = combined_train[features]
y_train = combined_train[target]
X_test = combined_test[features]
y_test = combined_test[target]
X_pred = combined_prediction[features]

In [102]:
best_params = {
    "n_estimators": 633,
    "max_depth": 11,
    "min_child_weight": 2,
    "gamma": 0.5,
    "learning_rate": 0.01745767642563374,
    "subsample": 0.6852898171340072,
    "colsample_bytree": 0.5752583768824626,
    "reg_alpha": 0.6174748033948815,
    "reg_lambda": 0.37071451261939165,
}

In [103]:
final_model = xgb.XGBRegressor(
    tree_method="hist", **best_params, enable_categorical=True
)
final_model.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    early_stopping_rounds=10,
    verbose=10,
)

[0]	validation_0-rmse:3.20441
[10]	validation_0-rmse:2.71263
[20]	validation_0-rmse:2.29165
[30]	validation_0-rmse:1.93980
[40]	validation_0-rmse:1.64524
[50]	validation_0-rmse:1.39728
[60]	validation_0-rmse:1.19562
[70]	validation_0-rmse:1.03813
[80]	validation_0-rmse:0.90902
[90]	validation_0-rmse:0.80907
[100]	validation_0-rmse:0.71949
[110]	validation_0-rmse:0.65609
[120]	validation_0-rmse:0.60286
[130]	validation_0-rmse:0.56435
[140]	validation_0-rmse:0.53450
[150]	validation_0-rmse:0.51235
[160]	validation_0-rmse:0.48996
[170]	validation_0-rmse:0.47367
[180]	validation_0-rmse:0.45781
[190]	validation_0-rmse:0.44724
[200]	validation_0-rmse:0.43873
[210]	validation_0-rmse:0.43271
[220]	validation_0-rmse:0.42571
[230]	validation_0-rmse:0.42082
[240]	validation_0-rmse:0.41867
[250]	validation_0-rmse:0.41703
[260]	validation_0-rmse:0.41482
[270]	validation_0-rmse:0.41324
[280]	validation_0-rmse:0.41232
[290]	validation_0-rmse:0.41033
[300]	validation_0-rmse:0.40845
[310]	validation_0-

In [104]:
y_pred_train = final_model.predict(X_train)
rmse_train = mean_squared_error(y_train, y_pred_train, squared=False)
print("Final Model  Train RMSE:", rmse_train)

y_pred_test = final_model.predict(X_test)
rmse_test = mean_squared_error(y_test, y_pred_test, squared=False)
print("Final Model Test RMSE:", rmse_test)

Final Model  Train RMSE: 0.3272936890684361
Final Model Test RMSE: 0.3995836693921537


In [108]:
def objective(trial):
    # Define the hyperparameters to search over
    params = {
        "iterations": trial.suggest_int(
            "iterations", 100, 1000
        ),  # Number of iterations
        "learning_rate": trial.suggest_float(
            "learning_rate", 0.001, 0.1
        ),  # Learning rate
        "depth": trial.suggest_int("depth", 4, 10),  # Depth of the tree
        "l2_leaf_reg": trial.suggest_float(
            "l2_leaf_reg", 1e-5, 100
        ),  # L2 regularization
        "random_strength": trial.suggest_float(
            "random_strength", 0, 1
        ),  # Random strength
        "bagging_temperature": trial.suggest_float(
            "bagging_temperature", 0, 1
        ),  # Bagging temperature
    }

    # Create the CatBoost model
    model = cat.CatBoostRegressor(**params, verbose=10, cat_features=cat_feature)

    # Fit the model to the training data
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_test, y_test)],
        early_stopping_rounds=10,
    )

    # Calculate the validation RMSE (you can use other metrics as well)
    val_pred = model.predict(X_test)
    val_rmse = np.sqrt(mean_squared_error(y_test, val_pred))

    # Return the validation RMSE as the objective value to minimize
    return val_rmse

In [109]:
study = optuna.create_study(direction="minimize")

[I 2023-12-10 22:14:16,351] A new study created in memory with name: no-name-67df545d-73c3-4bb8-97d9-ae8c8911399b


In [110]:
study.optimize(objective, n_trials=10)

0:	learn: 1.5802455	test: 1.4037767	best: 1.4037767 (0)	total: 230ms	remaining: 37.7s
10:	learn: 1.0787870	test: 0.9795331	best: 0.9795331 (10)	total: 998ms	remaining: 14s
20:	learn: 0.8023502	test: 0.6901341	best: 0.6901341 (20)	total: 1.62s	remaining: 11.1s
30:	learn: 0.7261838	test: 0.6157188	best: 0.6157188 (30)	total: 2.23s	remaining: 9.66s
40:	learn: 0.6966682	test: 0.5893661	best: 0.5893661 (40)	total: 2.85s	remaining: 8.61s
50:	learn: 0.6800956	test: 0.5782165	best: 0.5782165 (50)	total: 3.47s	remaining: 7.75s
60:	learn: 0.6682401	test: 0.5700130	best: 0.5700130 (60)	total: 4.08s	remaining: 6.95s
70:	learn: 0.6590114	test: 0.5607407	best: 0.5607407 (70)	total: 4.68s	remaining: 6.2s
80:	learn: 0.6523921	test: 0.5529502	best: 0.5529502 (80)	total: 5.29s	remaining: 5.48s
90:	learn: 0.6461300	test: 0.5489759	best: 0.5486619 (88)	total: 5.9s	remaining: 4.79s
100:	learn: 0.6420136	test: 0.5427650	best: 0.5427637 (99)	total: 6.5s	remaining: 4.12s
110:	learn: 0.6382057	test: 0.5381022	

[I 2023-12-10 22:14:28,205] Trial 0 finished with value: 0.5269272576867213 and parameters: {'iterations': 165, 'learning_rate': 0.09439256744665762, 'depth': 8, 'l2_leaf_reg': 81.16421667393797, 'random_strength': 0.4280994859006493, 'bagging_temperature': 0.5449245497198891}. Best is trial 0 with value: 0.5269272576867213.


164:	learn: 0.6235238	test: 0.5269273	best: 0.5269273 (164)	total: 10.4s	remaining: 0us

bestTest = 0.526927257
bestIteration = 164

0:	learn: 1.5923922	test: 1.4086507	best: 1.4086507 (0)	total: 74.6ms	remaining: 55.3s
10:	learn: 1.0843761	test: 0.9250459	best: 0.9250459 (10)	total: 609ms	remaining: 40.5s
20:	learn: 0.8916973	test: 0.7561058	best: 0.7561058 (20)	total: 1.13s	remaining: 38.9s
30:	learn: 0.8133993	test: 0.6991208	best: 0.6991208 (30)	total: 1.7s	remaining: 38.9s
40:	learn: 0.7701556	test: 0.6638623	best: 0.6638623 (40)	total: 2.29s	remaining: 39.2s
50:	learn: 0.7464590	test: 0.6416774	best: 0.6416774 (50)	total: 2.88s	remaining: 39.1s
60:	learn: 0.7303738	test: 0.6252763	best: 0.6252763 (60)	total: 3.49s	remaining: 38.9s
70:	learn: 0.7153465	test: 0.6158384	best: 0.6158384 (70)	total: 4.1s	remaining: 38.7s
80:	learn: 0.7046898	test: 0.6081218	best: 0.6081218 (80)	total: 4.71s	remaining: 38.5s
90:	learn: 0.6938701	test: 0.5959198	best: 0.5959198 (90)	total: 5.35s	remaini

[I 2023-12-10 22:15:19,501] Trial 1 finished with value: 0.4589130421866642 and parameters: {'iterations': 742, 'learning_rate': 0.07343853182367419, 'depth': 5, 'l2_leaf_reg': 76.31877800998082, 'random_strength': 0.7041084359182732, 'bagging_temperature': 0.4539232758358517}. Best is trial 1 with value: 0.4589130421866642.


0:	learn: 1.6366604	test: 1.4550879	best: 1.4550879 (0)	total: 67.2ms	remaining: 12.8s
10:	learn: 1.3506610	test: 1.2092052	best: 1.2092052 (10)	total: 655ms	remaining: 10.7s
20:	learn: 1.1902194	test: 1.0818839	best: 1.0818839 (20)	total: 1.24s	remaining: 10s
30:	learn: 1.0244158	test: 0.9174758	best: 0.9174758 (30)	total: 1.83s	remaining: 9.44s
40:	learn: 0.9074399	test: 0.7951326	best: 0.7951326 (40)	total: 2.41s	remaining: 8.82s
50:	learn: 0.8404539	test: 0.7276204	best: 0.7276204 (50)	total: 2.99s	remaining: 8.2s
60:	learn: 0.7995553	test: 0.6852033	best: 0.6852033 (60)	total: 3.56s	remaining: 7.59s
70:	learn: 0.7718819	test: 0.6598459	best: 0.6598459 (70)	total: 4.15s	remaining: 7.01s
80:	learn: 0.7526207	test: 0.6415503	best: 0.6415503 (80)	total: 4.75s	remaining: 6.45s
90:	learn: 0.7392384	test: 0.6284828	best: 0.6284828 (90)	total: 5.32s	remaining: 5.85s
100:	learn: 0.7279828	test: 0.6192005	best: 0.6192005 (100)	total: 5.91s	remaining: 5.27s
110:	learn: 0.7184535	test: 0.6107

[I 2023-12-10 22:15:31,038] Trial 2 finished with value: 0.5812601390650325 and parameters: {'iterations': 191, 'learning_rate': 0.03896944237593699, 'depth': 6, 'l2_leaf_reg': 42.69314499212935, 'random_strength': 0.956025546222303, 'bagging_temperature': 0.4642656759310554}. Best is trial 1 with value: 0.4589130421866642.


190:	learn: 0.6806107	test: 0.5812601	best: 0.5812601 (190)	total: 11.2s	remaining: 0us

bestTest = 0.5812601382
bestIteration = 190

0:	learn: 1.5884603	test: 1.4047071	best: 1.4047071 (0)	total: 98.7ms	remaining: 1m 12s
10:	learn: 1.0685853	test: 0.9129955	best: 0.9129955 (10)	total: 781ms	remaining: 51.3s
20:	learn: 0.8802094	test: 0.7473271	best: 0.7473271 (20)	total: 1.45s	remaining: 49.2s
30:	learn: 0.8039388	test: 0.6892363	best: 0.6892363 (30)	total: 2.18s	remaining: 49.5s
40:	learn: 0.7619505	test: 0.6556158	best: 0.6556158 (40)	total: 2.96s	remaining: 50s
50:	learn: 0.7395786	test: 0.6353257	best: 0.6353257 (50)	total: 3.72s	remaining: 49.8s
60:	learn: 0.7217911	test: 0.6200975	best: 0.6200975 (60)	total: 4.56s	remaining: 50.4s
70:	learn: 0.7084970	test: 0.6113179	best: 0.6113179 (70)	total: 5.42s	remaining: 50.6s
80:	learn: 0.6999781	test: 0.6047690	best: 0.6047690 (80)	total: 6.28s	remaining: 50.6s
90:	learn: 0.6904431	test: 0.5974369	best: 0.5974369 (90)	total: 7.13s	remai

[I 2023-12-10 22:16:31,346] Trial 3 finished with value: 0.45711483277100345 and parameters: {'iterations': 734, 'learning_rate': 0.0767229703518425, 'depth': 5, 'l2_leaf_reg': 26.643033530485667, 'random_strength': 0.5555767834324928, 'bagging_temperature': 0.6554050755600216}. Best is trial 3 with value: 0.45711483277100345.


0:	learn: 1.6645952	test: 1.4776086	best: 1.4776086 (0)	total: 111ms	remaining: 35.2s
10:	learn: 1.5657799	test: 1.3828049	best: 1.3828049 (10)	total: 1.03s	remaining: 28.6s
20:	learn: 1.4770620	test: 1.2981179	best: 1.2981179 (20)	total: 1.93s	remaining: 27.3s
30:	learn: 1.3976212	test: 1.2224775	best: 1.2224775 (30)	total: 2.79s	remaining: 25.7s
40:	learn: 1.3263833	test: 1.1539769	best: 1.1539769 (40)	total: 3.68s	remaining: 24.8s
50:	learn: 1.2625612	test: 1.0924768	best: 1.0924768 (50)	total: 4.58s	remaining: 23.9s
60:	learn: 1.2046119	test: 1.0367083	best: 1.0367083 (60)	total: 5.54s	remaining: 23.3s
70:	learn: 1.1530394	test: 0.9873184	best: 0.9873184 (70)	total: 6.45s	remaining: 22.3s
80:	learn: 1.1073597	test: 0.9443173	best: 0.9443173 (80)	total: 7.42s	remaining: 21.6s
90:	learn: 1.0657642	test: 0.9043611	best: 0.9043611 (90)	total: 8.31s	remaining: 20.7s
100:	learn: 1.0291930	test: 0.8709270	best: 0.8709270 (100)	total: 9.21s	remaining: 19.7s
110:	learn: 0.9962817	test: 0.84

[I 2023-12-10 22:17:02,215] Trial 4 finished with value: 0.6273334247776383 and parameters: {'iterations': 317, 'learning_rate': 0.008684626617892731, 'depth': 7, 'l2_leaf_reg': 11.439924847373302, 'random_strength': 0.6560397070010395, 'bagging_temperature': 0.5251910111136892}. Best is trial 3 with value: 0.45711483277100345.


316:	learn: 0.7387478	test: 0.6273334	best: 0.6273334 (316)	total: 30.4s	remaining: 0us

bestTest = 0.6273334263
bestIteration = 316

0:	learn: 1.6478696	test: 1.4618165	best: 1.4618165 (0)	total: 133ms	remaining: 2m 9s
10:	learn: 1.4135017	test: 1.2378218	best: 1.2378218 (10)	total: 1.26s	remaining: 1m 50s
20:	learn: 1.2376324	test: 1.0693547	best: 1.0693547 (20)	total: 2.35s	remaining: 1m 46s
30:	learn: 1.1043971	test: 0.9428715	best: 0.9428715 (30)	total: 3.44s	remaining: 1m 44s
40:	learn: 1.0039047	test: 0.8497433	best: 0.8497433 (40)	total: 4.54s	remaining: 1m 43s
50:	learn: 0.9288972	test: 0.7821108	best: 0.7821108 (50)	total: 5.64s	remaining: 1m 41s
60:	learn: 0.8713006	test: 0.7290492	best: 0.7290492 (60)	total: 6.79s	remaining: 1m 41s
70:	learn: 0.8291023	test: 0.6926873	best: 0.6926873 (70)	total: 7.95s	remaining: 1m 41s
80:	learn: 0.7973750	test: 0.6665315	best: 0.6665315 (80)	total: 9.1s	remaining: 1m 40s
90:	learn: 0.7732981	test: 0.6451939	best: 0.6451939 (90)	total: 10.3

[I 2023-12-10 22:19:07,400] Trial 5 finished with value: 0.46141822907461727 and parameters: {'iterations': 973, 'learning_rate': 0.02215881765218313, 'depth': 8, 'l2_leaf_reg': 35.36028480880766, 'random_strength': 0.756742579100867, 'bagging_temperature': 0.2720957534987236}. Best is trial 3 with value: 0.45711483277100345.


0:	learn: 1.5758280	test: 1.4010276	best: 1.4010276 (0)	total: 113ms	remaining: 21.3s
10:	learn: 1.0271551	test: 0.9247652	best: 0.9247652 (10)	total: 1.21s	remaining: 19.6s
20:	learn: 0.7650471	test: 0.6532061	best: 0.6532061 (20)	total: 2.27s	remaining: 18.2s
30:	learn: 0.6945015	test: 0.5898790	best: 0.5898790 (30)	total: 3.33s	remaining: 17s
40:	learn: 0.6686440	test: 0.5687596	best: 0.5687596 (40)	total: 4.39s	remaining: 15.9s
50:	learn: 0.6531157	test: 0.5572001	best: 0.5572001 (50)	total: 5.44s	remaining: 14.7s
60:	learn: 0.6440676	test: 0.5476819	best: 0.5476819 (60)	total: 6.48s	remaining: 13.6s
70:	learn: 0.6368789	test: 0.5443325	best: 0.5441033 (69)	total: 7.52s	remaining: 12.5s
80:	learn: 0.6310251	test: 0.5392803	best: 0.5392803 (80)	total: 8.57s	remaining: 11.4s
90:	learn: 0.6265277	test: 0.5322939	best: 0.5322939 (90)	total: 9.62s	remaining: 10.4s
100:	learn: 0.6228839	test: 0.5278745	best: 0.5278745 (100)	total: 10.7s	remaining: 9.3s
110:	learn: 0.6202859	test: 0.52274

[I 2023-12-10 22:19:27,788] Trial 6 finished with value: 0.5108173624393574 and parameters: {'iterations': 189, 'learning_rate': 0.09804375251795373, 'depth': 10, 'l2_leaf_reg': 90.63142675095345, 'random_strength': 0.2903490140278303, 'bagging_temperature': 0.5494225858882874}. Best is trial 3 with value: 0.45711483277100345.


188:	learn: 0.6054894	test: 0.5108310	best: 0.5108174 (186)	total: 20s	remaining: 0us

bestTest = 0.5108173623
bestIteration = 186

Shrink model to first 187 iterations.
0:	learn: 1.6342366	test: 1.4486455	best: 1.4486455 (0)	total: 110ms	remaining: 1m 15s
10:	learn: 1.3154813	test: 1.1411406	best: 1.1411406 (10)	total: 718ms	remaining: 43.9s
20:	learn: 1.1111603	test: 0.9515337	best: 0.9515337 (20)	total: 1.39s	remaining: 44s
30:	learn: 0.9835342	test: 0.8378625	best: 0.8378625 (30)	total: 2.05s	remaining: 43.1s
40:	learn: 0.9040394	test: 0.7681834	best: 0.7681834 (40)	total: 2.72s	remaining: 42.6s
50:	learn: 0.8537292	test: 0.7284558	best: 0.7284558 (50)	total: 3.41s	remaining: 42.3s
60:	learn: 0.8187132	test: 0.7014002	best: 0.7014002 (60)	total: 4.15s	remaining: 42.4s
70:	learn: 0.7933463	test: 0.6814811	best: 0.6814811 (70)	total: 4.91s	remaining: 42.4s
80:	learn: 0.7738572	test: 0.6670663	best: 0.6670663 (80)	total: 5.71s	remaining: 42.5s
90:	learn: 0.7585540	test: 0.6543894	best

[I 2023-12-10 22:19:50,286] Trial 7 finished with value: 0.5724611081867986 and parameters: {'iterations': 684, 'learning_rate': 0.036069268264529455, 'depth': 5, 'l2_leaf_reg': 48.733621453768336, 'random_strength': 0.9097331954644571, 'bagging_temperature': 0.002866613903413362}. Best is trial 3 with value: 0.45711483277100345.


Stopped by overfitting detector  (10 iterations wait)

bestTest = 0.5724611071
bestIteration = 268

Shrink model to first 269 iterations.
0:	learn: 1.6443711	test: 1.4592820	best: 1.4592820 (0)	total: 94ms	remaining: 1m 19s
10:	learn: 1.3894776	test: 1.2151263	best: 1.2151263 (10)	total: 640ms	remaining: 48.7s
20:	learn: 1.2120730	test: 1.0511761	best: 1.0511761 (20)	total: 1.21s	remaining: 47.6s
30:	learn: 1.0875305	test: 0.9417053	best: 0.9417053 (30)	total: 1.73s	remaining: 45.6s
40:	learn: 1.0003012	test: 0.8661876	best: 0.8661876 (40)	total: 2.28s	remaining: 44.8s
50:	learn: 0.9396088	test: 0.8137262	best: 0.8137262 (50)	total: 2.81s	remaining: 43.9s
60:	learn: 0.8963686	test: 0.7760331	best: 0.7760331 (60)	total: 3.33s	remaining: 42.9s
70:	learn: 0.8650668	test: 0.7489940	best: 0.7489940 (70)	total: 3.9s	remaining: 42.7s
80:	learn: 0.8396475	test: 0.7300530	best: 0.7300530 (80)	total: 4.53s	remaining: 42.9s
90:	learn: 0.8199233	test: 0.7148840	best: 0.7148840 (90)	total: 5.1s	rem

[I 2023-12-10 22:20:41,570] Trial 8 finished with value: 0.5341666741216935 and parameters: {'iterations': 848, 'learning_rate': 0.028541900663317573, 'depth': 4, 'l2_leaf_reg': 57.867517704486694, 'random_strength': 0.7526239722579084, 'bagging_temperature': 0.45483490582198927}. Best is trial 3 with value: 0.45711483277100345.


Stopped by overfitting detector  (10 iterations wait)

bestTest = 0.534166673
bestIteration = 803

Shrink model to first 804 iterations.
0:	learn: 1.6407478	test: 1.4550145	best: 1.4550145 (0)	total: 126ms	remaining: 43.1s
10:	learn: 1.3563922	test: 1.1817785	best: 1.1817785 (10)	total: 1.24s	remaining: 37.6s
20:	learn: 1.1583489	test: 0.9933720	best: 0.9933720 (20)	total: 2.38s	remaining: 36.6s
30:	learn: 1.0195987	test: 0.8638049	best: 0.8638049 (30)	total: 3.47s	remaining: 35s
40:	learn: 0.9225834	test: 0.7761246	best: 0.7761246 (40)	total: 4.59s	remaining: 33.9s
50:	learn: 0.8554015	test: 0.7164853	best: 0.7164853 (50)	total: 5.72s	remaining: 32.9s
60:	learn: 0.8091263	test: 0.6787569	best: 0.6787569 (60)	total: 6.93s	remaining: 32.2s
70:	learn: 0.7754792	test: 0.6512159	best: 0.6512159 (70)	total: 8.13s	remaining: 31.2s
80:	learn: 0.7519492	test: 0.6337715	best: 0.6337715 (80)	total: 9.34s	remaining: 30.3s
90:	learn: 0.7354304	test: 0.6217407	best: 0.6217407 (90)	total: 10.5s	rema

[I 2023-12-10 22:21:24,060] Trial 9 finished with value: 0.5154128476308684 and parameters: {'iterations': 344, 'learning_rate': 0.027872143031210172, 'depth': 8, 'l2_leaf_reg': 17.961836542483496, 'random_strength': 0.675856762509769, 'bagging_temperature': 0.7298084275195226}. Best is trial 3 with value: 0.45711483277100345.


343:	learn: 0.5588292	test: 0.5154128	best: 0.5154128 (343)	total: 41.9s	remaining: 0us

bestTest = 0.5154128466
bestIteration = 343



In [111]:
best_params_catboost = study.best_params

In [117]:
best_params_catboost

{'iterations': 734,
 'learning_rate': 0.0767229703518425,
 'depth': 5,
 'l2_leaf_reg': 26.643033530485667,
 'random_strength': 0.5555767834324928,
 'bagging_temperature': 0.6554050755600216}

In [112]:
final_catboost_model = cat.CatBoostRegressor(
    **best_params_catboost,
    verbose=10,
)

In [114]:
final_catboost_model.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    early_stopping_rounds=10,
    cat_features=cat_feature,
)

0:	learn: 1.5884603	test: 1.4047071	best: 1.4047071 (0)	total: 74ms	remaining: 54.2s
10:	learn: 1.0685853	test: 0.9129955	best: 0.9129955 (10)	total: 672ms	remaining: 44.2s
20:	learn: 0.8802094	test: 0.7473271	best: 0.7473271 (20)	total: 1.26s	remaining: 42.7s
30:	learn: 0.8039388	test: 0.6892363	best: 0.6892363 (30)	total: 1.81s	remaining: 41.1s
40:	learn: 0.7619505	test: 0.6556158	best: 0.6556158 (40)	total: 2.39s	remaining: 40.4s
50:	learn: 0.7395786	test: 0.6353257	best: 0.6353257 (50)	total: 2.96s	remaining: 39.6s
60:	learn: 0.7217911	test: 0.6200975	best: 0.6200975 (60)	total: 3.59s	remaining: 39.6s
70:	learn: 0.7084970	test: 0.6113179	best: 0.6113179 (70)	total: 4.21s	remaining: 39.3s
80:	learn: 0.6999781	test: 0.6047690	best: 0.6047690 (80)	total: 4.81s	remaining: 38.8s
90:	learn: 0.6904431	test: 0.5974369	best: 0.5974369 (90)	total: 5.43s	remaining: 38.4s
100:	learn: 0.6819692	test: 0.5910887	best: 0.5910887 (100)	total: 6.05s	remaining: 38s
110:	learn: 0.6694927	test: 0.57920

<catboost.core.CatBoostRegressor at 0x2a5b5bf7cd0>

In [115]:
y_pred = final_catboost_model.predict(X_test)
rmse_cat = mean_squared_error(y_test, y_pred, squared=False)
print(rmse_cat)

0.45711483277100345
