In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, FunctionTransformer
from sklearn.model_selection import cross_val_score



train = pd.read_parquet("..\data\\train.parquet")
test = pd.read_parquet("..\data\\final_test.parquet")

# We drop bike count, as it can't be used for training
train.drop(columns=["bike_count"], inplace=True)

X = train.drop(columns=["log_bike_count"])
y = train["log_bike_count"]


def train_test_split_temporal(X, y, delta_threshold="30 days"):
    
    cutoff_date = X["date"].max() - pd.Timedelta(delta_threshold)
    mask = (X["date"] <= cutoff_date)
    X_train, X_valid = X.loc[mask], X.loc[~mask]
    y_train, y_valid = y[mask], y[~mask]

    return X_train, y_train, X_valid, y_valid

X_train, y_train, X_valid, y_valid = train_test_split_temporal(X, y)

def kaggle_prediction(model, file_path, test_set=test):
    """
    Return a .csv file for kaggle submission (predictions of the test dataset)
    Parameters:
        - model : a fitted sklearn model object
        - test_set : the dataset to predict log_bike_count on 
    Output:
        - file.csv : a .csv file to submit to kaggle 
    """

    y_pred = model.predict(test_set)
    y_pred_df = pd.DataFrame(y_pred, columns=["log_bike_count"])
    y_pred_df.index.name = "Id"

    y_pred_df.to_csv(file_path)

    return None

columns_to_drop =  ["counter_name", "counter_id", "site_name", "site_id", "coordinates", "counter_technical_id", "counter_installation_date"]

def drop_columns(X, columns_to_drop=columns_to_drop):
    return X.drop(columns=columns_to_drop)

# column_dropper = FunctionTransformer(lambda X: drop_columns(X, columns_to_drop=columns_to_drop), validate=False)
column_dropper = FunctionTransformer(drop_columns, validate=False)


def encode_dates(X):
    X = X.copy()  # modify a copy of X
    # Encode the date information from the "date" columns
    X["year"] = X["date"].dt.year
    X["month"] = X["date"].dt.month
    X["day"] = X["date"].dt.day
    X["weekday"] = X["date"].dt.weekday
    X["hour"] = X["date"].dt.hour

    # Finally we can drop the original columns from the dataframe
    return X.drop(columns=["date"])

date_encoder = FunctionTransformer(encode_dates, validate=False)
date_cols = encode_dates(X[["date"]]).columns.tolist()


vacation_paris_2020 = [
    ("2020-10-17", "2020-11-01"),  
    ("2020-12-19", "2021-01-03"),  
    ("2021-02-13", "2021-02-28"),  
    ("2021-04-17", "2021-05-02"), 
    ("2021-07-06", "2021-08-31"), 
]

vacation_paris_2020 = [(pd.to_datetime(start), pd.to_datetime(end)) for start, end in vacation_paris_2020]

def encode_vacation(X, vacation=vacation_paris_2020):
    X = X.copy()
    X["is_vacation"] = 0
    for start, end in vacation:
        X["is_vacation"] |= (X["date"] >= start) & (X["date"] <= end)
    X["is_vacation"] = X["is_vacation"].astype(int)

    return X

vacation_encoder = FunctionTransformer(encode_vacation, validate=False)


holidays = [
    "2020-11-01",  
    "2020-11-11", 
    "2020-12-25",  
    "2021-01-01",  
    "2021-04-05",  
    "2021-05-01",  
    "2021-05-08",  
    "2021-05-13", 
    "2021-05-24",  
    "2021-07-14",  
    "2021-08-15", 
]
holidays = pd.to_datetime(holidays)


def encode_holidays(X, holidays=holidays):
    X = X.copy()
    X["jour_ferie"] = X["date"].apply(lambda x: 1 if x in holidays else 0)
    return X

holidays_encoder = FunctionTransformer(encode_holidays, validate=False)

categorical_columns = drop_columns(X).select_dtypes(include="category").columns.to_list()
categorical_columns.append("site_id")
numerical_columns = drop_columns(X).select_dtypes(include="float").columns.to_list()

from sklearn.metrics import root_mean_squared_error

def compute_score(estimator, X_train, X_valid, y_train, y_valid):
    estimator.fit(X_train, y_train)
    score = root_mean_squared_error(y_valid, estimator.predict(X_valid))
    return score


weather = pd.read_csv("../data/H_75_previous-2020-2022.csv.gz", sep=";",
                      parse_dates=["AAAAMMJJHH"],
                      date_format="%Y%m%d%H",
                      ).rename(columns={"AAAAMMJJHH" : "date"})


weather_columns = ["U", "FF", "RR1", "T"]
columns_to_merge = weather_columns + ["date"]

def merge_weather(X, weather=weather, columns_to_merge=columns_to_merge):
    weather = weather.copy()
    weather = weather[columns_to_merge].sort_values("date")
    weather["date"] = weather["date"].astype("datetime64[us]")

    start = pd.Timestamp(min(X["date"]))
    end = pd.Timestamp(max(X["date"]))

    weather = weather[(weather["date"] >= start) & (weather["date"] <= end)]

    grouped_weather = weather.groupby("date").mean()

    X = X.copy()
    initial_index = X.index  # Save original index
    X = X.sort_values("date")

    # Perform the merge_asof
    merged_df = pd.merge_asof(X, grouped_weather, on="date", direction="nearest")

    # Reindex to ensure alignment with X
    merged_df.index = initial_index  # Reassign original index
    merged_df = merged_df.reindex(initial_index)  # Align rows to original order

    return merged_df


weather_merger = FunctionTransformer(merge_weather, validate=False)


from xgboost import XGBRegressor


preprocessor = ColumnTransformer(
    [
        ("date", OneHotEncoder(handle_unknown="ignore"), date_cols),
        # ("weather numerical", StandardScaler(), numerical_weather_columns),
        # ("categorical", OneHotEncoder(handle_unknown="ignore", sparse_output=False), categorical_columns),
    ],
    remainder="passthrough"
)

pipeline = Pipeline(
    [
        ("dropper", column_dropper),
        ("vacation", vacation_encoder),
        ("date", date_encoder),
        ("preprocessor", preprocessor),
        ("linear_regressor", XGBRegressor(n_estimators=350, 
                                          learning_rate=0.1, 
                                          max_depth=20, 
                                          random_state=42, 
                                          objective='reg:squarederror'))
    ]
)

compute_score(pipeline, X_train, X_valid, y_train, y_valid)

np.float64(0.5559983705040591)

In [2]:
def kaggle_prediction(model, file_path, test_set=test):
    """
    Return a .csv file for kaggle submission (predictions of the test dataset)
    Parameters:
        - model : a fitted sklearn model object
        - test_set : the dataset to predict log_bike_count on 
    Output:
        - file.csv : a .csv file to submit to kaggle 
    """

    y_pred = model.predict(test_set)
    y_pred_df = pd.DataFrame(y_pred, columns=["log_bike_count"])
    y_pred_df.index.name = "Id"

    y_pred_df.to_csv(file_path)

    return None

In [None]:
kaggle_prediction(pipeline.fit(X, y), "../prediction_csvs/test_kaggle.csv")