In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error as MSE
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
import xgboost as xgb
from xgboost import plot_importance, XGBRegressor
from skrub import TableVectorizer
import matplotlib.pyplot as plt
import holidays

In [2]:
df = pd.read_parquet('data/train.parquet')

In [3]:
def df_manipulation(df):
    df['date'] = pd.to_datetime(df['date'])
    # extract useful information from dates
    df['hour'] = df['date'].dt.hour
    df['weekday'] = df['date'].dt.dayofweek
    df['month'] = df['date'].dt.month
    df['year'] = df['date'].dt.year

    df['weekend_day'] = np.where(df['weekday'].isin([5, 6]), 1, 0)

    season_mapping = {
        3: 'spring', 4: 'spring', 5: 'spring',
        6: 'summer', 7: 'summer', 8: 'summer',
        9: 'autumn', 10: 'autumn', 11: 'autumn',
        12: 'winter', 1: 'winter', 2: 'winter'
    }

    # Map the 'month' column to seasons
    df['season'] = df['month'].map(season_mapping)

    # consider holidays in France
    holid = holidays.France(years=df['year'].unique())
    df['holidays'] = np.where(df['date'].isin(holid), 1, 0)

    #consider lockdowns
    lockdown_1_start = pd.Timestamp('2020-10-30')
    lockdown_1_end = pd.Timestamp('2020-12-15')

    lockdown_2_start = pd.Timestamp('2021-04-03')
    lockdown_2_end = pd.Timestamp('2021-05-03')

    # Create a lockdown flag using conditions
    df['lockdown'] = 0  # Initialize column with 0
    df.loc[(df['date'] >= lockdown_1_start) & (df['date'] <= lockdown_1_end), 'lockdown'] = 1
    df.loc[(df['date'] >= lockdown_2_start) & (df['date'] <= lockdown_2_end), 'lockdown'] = 1

    return df


In [9]:
df = df_manipulation(df)

In [10]:
# create X and y
y = df['log_bike_count']
X = df.drop(columns=['bike_count', 'log_bike_count', 'counter_id',
                            'site_id', 'coordinates', 'counter_technical_id',
                            'site_name', 'date'])

X

Unnamed: 0,counter_name,counter_installation_date,latitude,longitude,hour,weekday,month,year,weekend_day,season,holidays,lockdown
48321,28 boulevard Diderot E-O,2013-01-18,48.846028,2.375429,2,1,9,2020,0,autumn,0,0
48324,28 boulevard Diderot E-O,2013-01-18,48.846028,2.375429,3,1,9,2020,0,autumn,0,0
48327,28 boulevard Diderot E-O,2013-01-18,48.846028,2.375429,4,1,9,2020,0,autumn,0,0
48330,28 boulevard Diderot E-O,2013-01-18,48.846028,2.375429,15,1,9,2020,0,autumn,0,0
48333,28 boulevard Diderot E-O,2013-01-18,48.846028,2.375429,18,1,9,2020,0,autumn,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
929175,254 rue de Vaugirard SO-NE,2020-11-29,48.839770,2.301980,6,3,9,2021,0,autumn,0,0
929178,254 rue de Vaugirard SO-NE,2020-11-29,48.839770,2.301980,10,3,9,2021,0,autumn,0,0
929181,254 rue de Vaugirard SO-NE,2020-11-29,48.839770,2.301980,15,3,9,2021,0,autumn,0,0
929184,254 rue de Vaugirard SO-NE,2020-11-29,48.839770,2.301980,22,3,9,2021,0,autumn,0,0


In [11]:
# Divide in train and test
df_train, df_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [12]:
df_train.head(1)

Unnamed: 0,counter_name,counter_installation_date,latitude,longitude,hour,weekday,month,year,weekend_day,season,holidays,lockdown
413467,152 boulevard du Montparnasse O-E,2018-12-07,48.840801,2.333233,7,4,11,2020,0,autumn,0,1


In [13]:
# Model
variables = ['counter_name', 'year', 'month', 'weekend_day', 'weekday', 'hour', 
             'counter_installation_date', 'season']

preprocess = ColumnTransformer(
    transformers=[
        ('encoder', OneHotEncoder(handle_unknown="ignore"), variables)
    ],
    remainder='passthrough'
)

xgb = XGBRegressor(random_state=42)

pipe = Pipeline(steps=[
                ('preprocess', preprocess),
                ('regressor', xgb)
])

In [14]:
param_grid = {
    'regressor__max_depth': [8, 12,15],
    'regressor__n_estimators': [600, 700, 800],
    'regressor__learning_rate': [0.1, 0.01]
}

grid_search = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5, verbose=1, n_jobs=-1)

grid_search_results = grid_search.fit(df_train, y_train)
print("The best parameters are ",grid_search.best_params_)  

Fitting 5 folds for each of 18 candidates, totalling 90 fits
The best parameters are  {'regressor__learning_rate': 0.1, 'regressor__max_depth': 8, 'regressor__n_estimators': 700}


In [15]:
xgb = grid_search.best_estimator_
xgb.fit(df_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [16]:
y_test_pred = xgb.predict(df_test)
rmse = MSE(y_test, y_test_pred)
print(f"The RMSE is: {np.sqrt(rmse)}")

The RMSE is: 0.4858539086967299


In [17]:
X_test = pd.read_parquet('data/final_test.parquet')
X_test['date'] = pd.to_datetime(X_test['date'])

In [None]:
X_test = df_manipulation(X_test)

In [None]:
y_pred = xgb.predict(X_test)
y_pred = np.where(y_pred < 0, 0, y_pred)

In [None]:
y_pred = xgb.predict(X_test)
results = pd.DataFrame(
    dict(
        Id=np.arange(y_pred.shape[0]),
        log_bike_count=y_pred,
    )
)
results.to_csv("submission.csv", index=False)