In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, FunctionTransformer
from sklearn.model_selection import cross_val_score



train = pd.read_parquet("..\data\\train.parquet")
test = pd.read_parquet("..\data\\final_test.parquet")

# We drop bike count, as it can't be used for training
train.drop(columns=["bike_count"], inplace=True)

X = train.drop(columns=["log_bike_count"])
y = train["log_bike_count"]


def train_test_split_temporal(X, y, delta_threshold="30 days"):
    
    cutoff_date = X["date"].max() - pd.Timedelta(delta_threshold)
    mask = (X["date"] <= cutoff_date)
    X_train, X_valid = X.loc[mask], X.loc[~mask]
    y_train, y_valid = y[mask], y[~mask]

    return X_train, y_train, X_valid, y_valid

X_train, y_train, X_valid, y_valid = train_test_split_temporal(X, y)

def kaggle_prediction(model, file_path, test_set=test):
    """
    Return a .csv file for kaggle submission (predictions of the test dataset)
    Parameters:
        - model : a fitted sklearn model object
        - test_set : the dataset to predict log_bike_count on 
    Output:
        - file.csv : a .csv file to submit to kaggle 
    """

    y_pred = model.predict(test_set)
    y_pred_df = pd.DataFrame(y_pred, columns=["log_bike_count"])
    y_pred_df.index.name = "Id"

    y_pred_df.to_csv(file_path)

    return None

columns_to_drop =  ["counter_name", "counter_id", "site_name", "site_id", "coordinates", "counter_technical_id", "counter_installation_date"]

def drop_columns(X, columns_to_drop=columns_to_drop):
    return X.drop(columns=columns_to_drop)

# column_dropper = FunctionTransformer(lambda X: drop_columns(X, columns_to_drop=columns_to_drop), validate=False)
column_dropper = FunctionTransformer(drop_columns, validate=False)


def encode_dates(X):
    X = X.copy()  # modify a copy of X
    # Encode the date information from the "date" columns
    X["year"] = X["date"].dt.year
    X["month"] = X["date"].dt.month
    X["day"] = X["date"].dt.day
    X["weekday"] = X["date"].dt.weekday
    X["hour"] = X["date"].dt.hour

    # Finally we can drop the original columns from the dataframe
    return X.drop(columns=["date"])

date_encoder = FunctionTransformer(encode_dates, validate=False)
date_cols = encode_dates(X[["date"]]).columns.tolist()


vacation_paris_2020 = [
    ("2020-10-17", "2020-11-01"),  
    ("2020-12-19", "2021-01-03"),  
    ("2021-02-13", "2021-02-28"),  
    ("2021-04-17", "2021-05-02"), 
    ("2021-07-06", "2021-08-31"), 
]

vacation_paris_2020 = [(pd.to_datetime(start), pd.to_datetime(end)) for start, end in vacation_paris_2020]

def encode_vacation(X, vacation=vacation_paris_2020):
    X = X.copy()
    X["is_vacation"] = 0
    for start, end in vacation:
        X["is_vacation"] |= (X["date"] >= start) & (X["date"] <= end)
    X["is_vacation"] = X["is_vacation"].astype(int)

    return X

vacation_encoder = FunctionTransformer(encode_vacation, validate=False)


holidays = [
    "2020-11-01",  
    "2020-11-11", 
    "2020-12-25",  
    "2021-01-01",  
    "2021-04-05",  
    "2021-05-01",  
    "2021-05-08",  
    "2021-05-13", 
    "2021-05-24",  
    "2021-07-14",  
    "2021-08-15", 
]
holidays = pd.to_datetime(holidays)


def encode_holidays(X, holidays=holidays):
    X = X.copy()
    X["jour_ferie"] = X["date"].apply(lambda x: 1 if x in holidays else 0)
    return X

holidays_encoder = FunctionTransformer(encode_holidays, validate=False)

categorical_columns = drop_columns(X).select_dtypes(include="category").columns.to_list()
categorical_columns.append("site_id")
numerical_columns = drop_columns(X).select_dtypes(include="float").columns.to_list()

from sklearn.metrics import root_mean_squared_error

def compute_score(estimator, X_train, X_valid, y_train, y_valid):
    estimator.fit(X_train, y_train)
    score = root_mean_squared_error(y_valid, estimator.predict(X_valid))
    return score


weather = pd.read_csv("../data/H_75_previous-2020-2022.csv.gz", sep=";",
                      parse_dates=["AAAAMMJJHH"],
                      date_format="%Y%m%d%H",
                      ).rename(columns={"AAAAMMJJHH" : "date"})


weather_columns = ["U", "FF", "RR1", "T"]
columns_to_merge = weather_columns + ["date"]

def merge_weather(X, weather=weather, columns_to_merge=columns_to_merge):
    weather = weather.copy()
    weather = weather[columns_to_merge].sort_values("date")
    weather["date"] = weather["date"].astype("datetime64[us]")

    start = pd.Timestamp(min(X["date"]))
    end = pd.Timestamp(max(X["date"]))

    weather = weather[(weather["date"] >= start) & (weather["date"] <= end)]

    grouped_weather = weather.groupby("date").mean()

    X = X.copy()
    initial_index = X.index  # Save original index
    X = X.sort_values("date")

    # Perform the merge_asof
    merged_df = pd.merge_asof(X, grouped_weather, on="date", direction="nearest")

    # Reindex to ensure alignment with X
    merged_df.index = initial_index  # Reassign original index
    merged_df = merged_df.reindex(initial_index)  # Align rows to original order

    return merged_df


weather_merger = FunctionTransformer(merge_weather, validate=False)


from xgboost import XGBRegressor


preprocessor = ColumnTransformer(
    [
        ("date", OneHotEncoder(handle_unknown="ignore"), date_cols),
        # ("weather numerical", StandardScaler(), numerical_weather_columns),
        # ("categorical", OneHotEncoder(handle_unknown="ignore", sparse_output=False), categorical_columns),
    ],
    remainder="passthrough"
)

pipeline = Pipeline(
    [
        ("dropper", column_dropper),
        ("vacation", vacation_encoder),
        ("date", date_encoder),
        ("preprocessor", preprocessor),
        ("linear_regressor", XGBRegressor(n_estimators=350, 
                                          learning_rate=0.1, 
                                          max_depth=20, 
                                          random_state=42, 
                                          objective='reg:squarederror'))
    ]
)

compute_score(pipeline, X_train, X_valid, y_train, y_valid)

np.float64(0.5559983705040591)

In [2]:
def kaggle_prediction(model, file_path, test_set=test):
    """
    Return a .csv file for kaggle submission (predictions of the test dataset)
    Parameters:
        - model : a fitted sklearn model object
        - test_set : the dataset to predict log_bike_count on 
    Output:
        - file.csv : a .csv file to submit to kaggle 
    """

    y_pred = model.predict(test_set)
    y_pred_df = pd.DataFrame(y_pred, columns=["log_bike_count"])
    y_pred_df.index.name = "Id"

    y_pred_df.to_csv(file_path)

    return None

In [None]:
kaggle_prediction(pipeline.fit(X, y), "../prediction_csvs/test_kaggle.csv")

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, FunctionTransformer
from sklearn.model_selection import cross_val_score



train = pd.read_parquet("..\data\\train.parquet")
test = pd.read_parquet("..\data\\final_test.parquet")

# We drop bike count, as it can't be used for training
train.drop(columns=["bike_count"], inplace=True)

X = train.drop(columns=["log_bike_count"])
y = train["log_bike_count"]


def train_test_split_temporal(X, y, delta_threshold="30 days"):
    
    cutoff_date = X["date"].max() - pd.Timedelta(delta_threshold)
    mask = (X["date"] <= cutoff_date)
    X_train, X_valid = X.loc[mask], X.loc[~mask]
    y_train, y_valid = y[mask], y[~mask]

    return X_train, y_train, X_valid, y_valid

X_train, y_train, X_valid, y_valid = train_test_split_temporal(X, y)

def kaggle_prediction(model, file_path, test_set=test):
    """
    Return a .csv file for kaggle submission (predictions of the test dataset)
    Parameters:
        - model : a fitted sklearn model object
        - test_set : the dataset to predict log_bike_count on 
    Output:
        - file.csv : a .csv file to submit to kaggle 
    """

    y_pred = model.predict(test_set)
    y_pred_df = pd.DataFrame(y_pred, columns=["log_bike_count"])
    y_pred_df.index.name = "Id"

    y_pred_df.to_csv(file_path)

    return None

columns_to_drop =  ["counter_name", "counter_id", "site_name", "site_id", "coordinates", "counter_technical_id", "counter_installation_date"]

def drop_columns(X, columns_to_drop=columns_to_drop):
    return X.drop(columns=columns_to_drop)

# column_dropper = FunctionTransformer(lambda X: drop_columns(X, columns_to_drop=columns_to_drop), validate=False)
column_dropper = FunctionTransformer(drop_columns, validate=False)


def encode_dates(X):
    X = X.copy()  # modify a copy of X
    # Encode the date information from the "date" columns
    X["year"] = X["date"].dt.year
    X["month"] = X["date"].dt.month
    X["day"] = X["date"].dt.day
    X["weekday"] = X["date"].dt.weekday
    X["hour"] = X["date"].dt.hour

    # Finally we can drop the original columns from the dataframe
    return X.drop(columns=["date"])

date_encoder = FunctionTransformer(encode_dates, validate=False)
date_cols = encode_dates(X[["date"]]).columns.tolist()


vacation_paris_2020 = [
    ("2020-10-17", "2020-11-01"),  
    ("2020-12-19", "2021-01-03"),  
    ("2021-02-13", "2021-02-28"),  
    ("2021-04-17", "2021-05-02"), 
    ("2021-07-06", "2021-08-31"), 
]

vacation_paris_2020 = [(pd.to_datetime(start), pd.to_datetime(end)) for start, end in vacation_paris_2020]

def encode_vacation(X, vacation=vacation_paris_2020):
    X = X.copy()
    X["is_vacation"] = 0
    for start, end in vacation:
        X["is_vacation"] |= (X["date"] >= start) & (X["date"] <= end)
    X["is_vacation"] = X["is_vacation"].astype(int)

    return X

vacation_encoder = FunctionTransformer(encode_vacation, validate=False)


holidays = [
    "2020-11-01",  
    "2020-11-11", 
    "2020-12-25",  
    "2021-01-01",  
    "2021-04-05",  
    "2021-05-01",  
    "2021-05-08",  
    "2021-05-13", 
    "2021-05-24",  
    "2021-07-14",  
    "2021-08-15", 
]
holidays = pd.to_datetime(holidays)


lockdown = [
    ("2020-10-30", "2020-12-15", "lockdown"),
    ("2021-03-20", "2021-04-17", "lockdown"), 
]

curfew = [
    ("2020-10-17", "2020-12-14", "21:00", "06:00", "curfew"),
    ("2020-12-15", "2021-01-15", "20:00", "06:00", "curfew"),
    ("2021-01-16", "2021-03-19", "18:00", "06:00", "curfew"),
    ("2021-03-20", "2021-06-08", "19:00", "06:00", "curfew"),
    ("2021-06-09", "2021-06-19", "23:00", "06:00", "curfew"),
]

lockdown = [(pd.to_datetime(start), pd.to_datetime(end), label) for start, end, label in lockdown]
curfew = [(pd.to_datetime(start), pd.to_datetime(end), start_hour, end_hour, label) 
          for start, end, start_hour, end_hour, label in curfew]

def encode_lockdown_curfew(X, lockdown=lockdown, curfew=curfew):
    X = X.copy()
    X["is_lockdown"] = 0
    X["is_curfew"] = 0

    for start, end, label in lockdown:
        X["is_lockdown"] |= (X["date"] >= start) & (X["date"] <= end)

    for start, end, start_hour, end_hour, label in curfew:
        X["is_curfew"] |= ((X["date"] >= start) & (X["date"] <= end) & 
                           ((X["date"].dt.time >= pd.to_datetime(start_hour).time()) | 
                            (X["date"].dt.time <= pd.to_datetime(end_hour).time())))
    return X

lockdown_curfew_encoder = FunctionTransformer(encode_lockdown_curfew, validate=False)

def encode_holidays(X, holidays=holidays):
    X = X.copy()
    X["jour_ferie"] = X["date"].apply(lambda x: 1 if x in holidays else 0)
    return X

holidays_encoder = FunctionTransformer(encode_holidays, validate=False)

categorical_columns = drop_columns(X).select_dtypes(include="category").columns.to_list()
categorical_columns.append("site_id")
numerical_columns = drop_columns(X).select_dtypes(include="float").columns.to_list()

from sklearn.metrics import root_mean_squared_error

def compute_score(estimator, X_train, X_valid, y_train, y_valid):
    estimator.fit(X_train, y_train)
    score = root_mean_squared_error(y_valid, estimator.predict(X_valid))
    return score


weather = pd.read_csv("../external_data/external_data.csv")


weather_columns = ["u", "ff", "rr1", "t"]
columns_to_merge = weather_columns + ["date"]


def merge_weather(X, weather=weather, columns_to_merge=columns_to_merge):
    weather = weather.copy()
    weather = weather[columns_to_merge]
    weather["date"] = weather["date"].astype("datetime64[us]")
    
    grouped_weather = weather.groupby("date").mean()

    X = X.copy()
    
    # Perform the merge while keeping the left index
    merged_df = pd.merge(X, grouped_weather, on="date", how="left")

    return merged_df




weather_merger = FunctionTransformer(merge_weather, validate=False)


from xgboost import XGBRegressor


preprocessor = ColumnTransformer(
    [
        ("date", OneHotEncoder(handle_unknown="ignore"), date_cols),
        # ("weather numerical", StandardScaler(), numerical_weather_columns),
        # ("categorical", OneHotEncoder(handle_unknown="ignore", sparse_output=False), categorical_columns),
    ],
    remainder="passthrough"
)

pipeline = Pipeline(
    [
        ("dropper", column_dropper),
        ("vacation", vacation_encoder),
        ("holiday", holidays_encoder),
        ("lockdown curfew", lockdown_curfew_encoder),
        ("weather merge", weather_merger),
        ("date", date_encoder),
        ("preprocessor", preprocessor),
        ("linear_regressor", XGBRegressor(n_estimators=350, 
                                          learning_rate=0.1, 
                                          max_depth=20, 
                                          random_state=42, 
                                          objective='reg:squarederror'))
    ]
)
compute_score(pipeline, X_train, X_valid, y_train, y_valid)

np.float64(0.5432520928059277)

In [None]:
import optuna
from sklearn.metrics import mean_squared_error
from math import sqrt

def objective(trial):
    """
    Objective function for Optuna study to optimize XGBRegressor hyperparameters.
    """
    n_estimators = trial.suggest_int("n_estimators", 100, 250)
    learning_rate = trial.suggest_float("learning_rate", 0.01, 0.5, log=True)
    max_depth = trial.suggest_int("max_depth", 3, 20)
    subsample = trial.suggest_float("subsample", 0.6, 1.0)

    pipeline = Pipeline(
        [
            ("dropper", column_dropper),
            ("vacation", vacation_encoder),
            ("holiday", holidays_encoder),
            ("lockdown curfew", lockdown_curfew_encoder),
            ("weather merge", weather_merger),
            ("date", date_encoder),
            ("preprocessor", preprocessor),
            ("xgb", XGBRegressor(
                n_estimators=n_estimators,
                learning_rate=learning_rate,
                max_depth=max_depth,
                subsample=subsample,
                objective='reg:squarederror'
            ))
        ]
    )

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_valid)
    rmse = sqrt(mean_squared_error(y_valid, y_pred))

    return rmse

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=50)

print("Best hyperparameters:", study.best_params)
print("Best RMSE:", study.best_value)


[I 2024-12-11 22:22:54,867] A new study created in memory with name: no-name-5854b32a-f35d-4f38-9bd0-95fd1db73356
[I 2024-12-11 22:23:43,428] Trial 0 finished with value: 0.5970131368965089 and parameters: {'n_estimators': 218, 'learning_rate': 0.27581786863679475, 'max_depth': 12, 'subsample': 0.7539848818038062}. Best is trial 0 with value: 0.5970131368965089.
[I 2024-12-11 22:25:37,323] Trial 1 finished with value: 0.5579666452834916 and parameters: {'n_estimators': 154, 'learning_rate': 0.13241939173207412, 'max_depth': 19, 'subsample': 0.7215656469779265}. Best is trial 1 with value: 0.5579666452834916.
[I 2024-12-11 22:26:15,487] Trial 2 finished with value: 0.5877736499074718 and parameters: {'n_estimators': 126, 'learning_rate': 0.45839134999175607, 'max_depth': 8, 'subsample': 0.6918605172191894}. Best is trial 1 with value: 0.5579666452834916.
[I 2024-12-11 22:27:05,119] Trial 3 finished with value: 0.568361267272805 and parameters: {'n_estimators': 242, 'learning_rate': 0.18

Best hyperparameters: {'n_estimators': 203, 'learning_rate': 0.07814257788803206, 'max_depth': 11, 'subsample': 0.8420136321973922}
Best RMSE: 0.5374323737361055
