In [3]:
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import catboost as cat
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from pandas.tseries.holiday import Holiday, AbstractHolidayCalendar
from dateutil.easter import easter
from datetime import timedelta

color_pal = sns.color_palette()

In [93]:
data = pd.read_parquet(Path("data") / "train.parquet")
test_data = pd.read_parquet(Path("data") / "final_test.parquet")
data["date"] = pd.to_datetime(data["date"])
data = data.set_index("date")
test_data["date"] = pd.to_datetime(test_data["date"])
test_data = test_data.set_index("date")

In [94]:
train = data[["counter_name", "log_bike_count"]]
test = test_data[["counter_name"]]

In [95]:
weather_data = pd.read_csv("data/external_data_cleaned.csv")

In [96]:
weather_data["date"] = pd.to_datetime(weather_data["date"])
weather_data.set_index("date", inplace=True)
weather_data_hourly = weather_data.resample("H").ffill()
weather_data_hourly = weather_data_hourly.drop(columns=["week", "day"])

In [97]:
class FrenchHolidayCalendar(AbstractHolidayCalendar):
    rules = [
        Holiday("New Year's Day", month=1, day=1),
        Holiday("Labour Day", month=5, day=1),
        Holiday("Victory in Europe Day", month=5, day=8),
        Holiday("Bastille Day", month=7, day=14),
        Holiday("Assumption of Mary", month=8, day=15),
        Holiday("All Saints' Day", month=11, day=1),
        Holiday("Armistice Day", month=11, day=11),
        Holiday("Christmas Day", month=12, day=25),
    ]

    @staticmethod
    def easter_related_holidays(year):
        easter_sunday = easter(year)
        return [
            (easter_sunday + timedelta(days=1), "Easter Monday"),
            (easter_sunday + timedelta(days=39), "Ascension Day"),
        ]

In [98]:
def cyclical_encode(df, column, max_value):
    df[column + "_sin"] = np.sin(2 * np.pi * df[column] / max_value)
    df[column + "_cos"] = np.cos(2 * np.pi * df[column] / max_value)
    return df

In [99]:
def create_features(df):
    df = df.copy()
    df["hour"] = df.index.hour
    df["dayofweek"] = df.index.dayofweek
    df["quarter"] = df.index.quarter
    df["month"] = df.index.month
    df["dayofyear"] = df.index.dayofyear

    # Boolean for weekends
    df["is_weekend"] = df["dayofweek"].isin([5, 6])

    # cyclical
    df = cyclical_encode(df, "hour", 24)
    df = cyclical_encode(df, "dayofweek", 7)

    # Boolean for holidays
    cal = FrenchHolidayCalendar()
    holidays = cal.holidays(start=df.index.min(), end=df.index.max())
    easter_holidays = []
    for year in range(df.index.year.min(), df.index.year.max() + 1):
        for date, _ in FrenchHolidayCalendar.easter_related_holidays(year):
            easter_holidays.append(date)
    holidays = holidays.union(pd.to_datetime(easter_holidays))
    df["is_holiday"] = df.index.isin(holidays)

    # Lockdown periods
    lockdowns = {
        "lockdown_1": ("2020-03-17", "2020-05-10"),
        "lockdown_2": ("2020-10-28", "2020-12-01"),
        # with curfew from 7 PM to 6 AM
        "lockdown_3_1": ("2021-04-03", "2021-05-18"),
        # with curfew from 9 PM to 6 AM
        "lockdown_3_2": ("2021-05-19", "2021-06-08"),
        # with curfew from 11 PM to 6 AM
        "lockdown_3_3": ("2021-06-09", "2021-06-29"),
    }
    for lockdown, (start_date, end_date) in lockdowns.items():
        mask = (df.index >= start_date) & (df.index <= end_date)
        df[lockdown] = mask

    return df

In [100]:
train = create_features(train)

In [101]:
train.head()

Unnamed: 0_level_0,counter_name,log_bike_count,hour,dayofweek,quarter,month,dayofyear,is_weekend,hour_sin,hour_cos,dayofweek_sin,dayofweek_cos,is_holiday,lockdown_1,lockdown_2,lockdown_3_1,lockdown_3_2,lockdown_3_3
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2020-09-01 02:00:00,28 boulevard Diderot E-O,0.0,2,1,3,9,245,False,0.5,0.8660254,0.781831,0.62349,False,False,False,False,False,False
2020-09-01 03:00:00,28 boulevard Diderot E-O,0.693147,3,1,3,9,245,False,0.707107,0.7071068,0.781831,0.62349,False,False,False,False,False,False
2020-09-01 04:00:00,28 boulevard Diderot E-O,0.0,4,1,3,9,245,False,0.866025,0.5,0.781831,0.62349,False,False,False,False,False,False
2020-09-01 15:00:00,28 boulevard Diderot E-O,1.609438,15,1,3,9,245,False,-0.707107,-0.7071068,0.781831,0.62349,False,False,False,False,False,False
2020-09-01 18:00:00,28 boulevard Diderot E-O,2.302585,18,1,3,9,245,False,-1.0,-1.83697e-16,0.781831,0.62349,False,False,False,False,False,False


In [102]:
test = create_features(test)

In [104]:
test.head()

Unnamed: 0_level_0,counter_name,hour,dayofweek,quarter,month,dayofyear,is_weekend,hour_sin,hour_cos,dayofweek_sin,dayofweek_cos,is_holiday,lockdown_1,lockdown_2,lockdown_3_1,lockdown_3_2,lockdown_3_3
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2021-09-10 01:00:00,28 boulevard Diderot E-O,1,4,3,9,253,False,0.258819,0.965926,-0.433884,-0.900969,False,False,False,False,False,False
2021-09-10 13:00:00,28 boulevard Diderot E-O,13,4,3,9,253,False,-0.258819,-0.965926,-0.433884,-0.900969,False,False,False,False,False,False
2021-09-10 17:00:00,28 boulevard Diderot E-O,17,4,3,9,253,False,-0.965926,-0.258819,-0.433884,-0.900969,False,False,False,False,False,False
2021-09-10 19:00:00,28 boulevard Diderot E-O,19,4,3,9,253,False,-0.965926,0.258819,-0.433884,-0.900969,False,False,False,False,False,False
2021-09-10 22:00:00,28 boulevard Diderot E-O,22,4,3,9,253,False,-0.5,0.866025,-0.433884,-0.900969,False,False,False,False,False,False


In [105]:
boolean_columns = [
    "is_weekend",
    "is_holiday",
    "lockdown_1",
    "lockdown_2",
    "lockdown_3_1",
    "lockdown_3_2",
    "lockdown_3_3",
]

for column in boolean_columns:
    train[column] = train[column].astype(int)


for column in boolean_columns:
    test[column] = test[column].astype(int)

In [106]:
combined_train = train.merge(
    weather_data_hourly, left_index=True, right_index=True, how="left"
)
combined_test = test.merge(
    weather_data_hourly, left_index=True, right_index=True, how="left"
)

In [107]:
combined_test.head()

Unnamed: 0_level_0,counter_name,hour,dayofweek,quarter,month,dayofyear,is_weekend,hour_sin,hour_cos,dayofweek_sin,...,raf10,rafper,u,vv,n,cl,cm,ch,precipitation,cloudy_day
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-09-10 01:00:00,28 boulevard Diderot E-O,1,4,3,9,253,0,0.258819,0.965926,-0.433884,...,3.4,3.4,94,34270,100.0,36.857143,20.75,10.0,0,0
2021-09-10 01:00:00,28 boulevard Diderot O-E,1,4,3,9,253,0,0.258819,0.965926,-0.433884,...,3.4,3.4,94,34270,100.0,36.857143,20.75,10.0,0,0
2021-09-10 01:00:00,39 quai François Mauriac NO-SE,1,4,3,9,253,0,0.258819,0.965926,-0.433884,...,3.4,3.4,94,34270,100.0,36.857143,20.75,10.0,0,0
2021-09-10 01:00:00,39 quai François Mauriac SE-NO,1,4,3,9,253,0,0.258819,0.965926,-0.433884,...,3.4,3.4,94,34270,100.0,36.857143,20.75,10.0,0,0
2021-09-10 01:00:00,18 quai de l'Hôtel de Ville NO-SE,1,4,3,9,253,0,0.258819,0.965926,-0.433884,...,3.4,3.4,94,34270,100.0,36.857143,20.75,10.0,0,0


In [108]:
features = [
    "counter_name",
    "hour",
    "dayofweek",
    "quarter",
    "month",
    "dayofyear",
    "hour_sin",
    "hour_cos",
    "is_holiday",
    "lockdown_2",
    "lockdown_3_1",
    "lockdown_3_2",
    "lockdown_3_3",
    "t",
    "rr1",
    "rr3",
    "rr6",
    "ff",
    "raf10",
    "rafper",
    "u",
    "vv",
    "n",
    "cl",
    "cm",
    "ch",
    "precipitation",
    "cloudy_day",
]
target = ["log_bike_count"]
cat_feature = ["counter_name"]

In [110]:
X_train = combined_train[features]
y_train = combined_train[target]
X_test = combined_test[features]

In [111]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 455163 entries, 2020-09-01 01:00:00 to 2021-08-09 23:00:00
Data columns (total 28 columns):
 #   Column         Non-Null Count   Dtype   
---  ------         --------------   -----   
 0   counter_name   455163 non-null  category
 1   hour           455163 non-null  int64   
 2   dayofweek      455163 non-null  int64   
 3   quarter        455163 non-null  int64   
 4   month          455163 non-null  int64   
 5   dayofyear      455163 non-null  int64   
 6   hour_sin       455163 non-null  float64 
 7   hour_cos       455163 non-null  float64 
 8   is_holiday     455163 non-null  int32   
 9   lockdown_2     455163 non-null  int32   
 10  lockdown_3_1   455163 non-null  int32   
 11  lockdown_3_2   455163 non-null  int32   
 12  lockdown_3_3   455163 non-null  int32   
 13  t              455163 non-null  float64 
 14  rr1            455163 non-null  float64 
 15  rr3            455163 non-null  float64 
 16  rr6            455163 

In [117]:
reg = xgb.XGBRegressor(
    tree_method="hist", n_estimators=75, enable_categorical=True)

In [118]:
reg.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train)],
    early_stopping_rounds=10,
    verbose=10,
)

[0]	validation_0-rmse:2.21259




[10]	validation_0-rmse:0.57838
[20]	validation_0-rmse:0.48361
[30]	validation_0-rmse:0.44812
[40]	validation_0-rmse:0.43198
[50]	validation_0-rmse:0.41551
[60]	validation_0-rmse:0.40733
[70]	validation_0-rmse:0.40196
[74]	validation_0-rmse:0.39897


In [119]:
predictions = reg.predict(X_test)

In [120]:
predictions

array([0.38610488, 1.7425479 , 1.7188671 , ..., 2.822218  , 1.9940313 ,
       1.7751577 ], dtype=float32)

In [121]:
predictions_df = pd.DataFrame({"log_bike_count": predictions})
predictions_df = predictions_df.reset_index(drop=True)
predictions_df.to_csv("submissions.csv", index=True, index_label="Id")