In [1]:
import pandas as pd

In [2]:
# CSV export (";"-delimited, labelled columns) of the French school-calendar dataset.
path_data = "https://data.education.gouv.fr/api/explore/v2.1/catalog/datasets/fr-en-calendrier-scolaire/exports/csv?lang=fr&timezone=Europe%2FParis&use_labels=true&delimiter=%3B"

# Build the cleaned calendar as a single chain so no intermediate frame is mutated in place.
df_holidays = (
    pd.read_csv(path_data, sep=";")
    .sort_values(by="Date de début")
    # Keep only the three zones of metropolitan France
    .loc[lambda calendar: calendar["Zones"].isin(["Zone A", "Zone B", "Zone C"])]
    # Keep a single row per (zone, start date); the first one in sorted order wins
    .drop_duplicates(subset=["Zones", "Date de début"])
    .rename(columns={"Date de début": "date_begin", "Date de fin": "date_end"})
    # Only the first 10 characters (YYYY-MM-DD) of each raw string are parsed
    .assign(
        date_begin=lambda calendar: pd.to_datetime(calendar["date_begin"].str[:10], format="%Y-%m-%d"),
        date_end=lambda calendar: pd.to_datetime(calendar["date_end"].str[:10], format="%Y-%m-%d"),
    )
)

In [3]:
# Preview the cleaned holiday calendar (bare last expression -> rich display).
df_holidays

Unnamed: 0,Description,Population,date_begin,date_end,Académies,Zones,annee_scolaire
1704,Vacances de la Toussaint,-,2017-10-21,2017-11-06,Limoges,Zone A,2017-2018
1507,Vacances de la Toussaint,-,2017-10-21,2017-11-06,Versailles,Zone C,2017-2018
1810,Vacances de la Toussaint,-,2017-10-21,2017-11-06,Strasbourg,Zone B,2017-2018
824,Vacances de Noël,-,2017-12-23,2018-01-08,Poitiers,Zone A,2017-2018
923,Vacances de Noël,-,2017-12-23,2018-01-08,Orléans-Tours,Zone B,2017-2018
...,...,...,...,...,...,...,...
1500,Pont de l'Ascension,-,2026-05-14,2026-05-18,Normandie,Zone B,2025-2026
688,Pont de l'Ascension,-,2026-05-14,2026-05-18,Toulouse,Zone C,2025-2026
216,Début des Vacances d'Été,-,2026-07-04,2026-07-04,Nice,Zone B,2025-2026
1566,Début des Vacances d'Été,-,2026-07-04,2026-07-04,Toulouse,Zone C,2025-2026


In [4]:
# Sanity check: date_begin / date_end should be datetime64 after the parsing step.
df_holidays.dtypes

Description               object
Population                object
date_begin        datetime64[ns]
date_end          datetime64[ns]
Académies                 object
Zones                     object
annee_scolaire            object
dtype: object

In [5]:
# Load the engineered dataset stored in the 02_intermediate data layer.
df = pd.read_parquet("../data/02_intermediate/df_data_engineered.parquet")

In [6]:
# Peek at two random rows; no random_state is set, so the rows differ on every run.
df.sample(2)

Unnamed: 0,stationcode,is_installed,capacity,numdocksavailable,numbikesavailable,mechanical,ebike,is_renting,is_returning,duedate,code_insee_commune,date
139,13001,True,18.0,15,2,1,1,True,True,2024-08-19 09:04:46,75056,2024-08-19
433,11041,True,43.0,31,11,9,2,True,True,2024-08-19 09:04:42,75056,2024-08-19


In [7]:
# Inspect column dtypes; in the saved output `date` is object while `duedate` is datetime64[ms].
df.dtypes

stationcode                   object
is_installed                    bool
capacity                     float64
numdocksavailable              int64
numbikesavailable              int64
mechanical                     int64
ebike                          int64
is_renting                      bool
is_returning                    bool
duedate               datetime64[ms]
code_insee_commune            object
date                          object
dtype: object

In [8]:
# Load the test set stored in the 03_primary data layer
# (presumably already enriched with date features, going by the file name).
df_test = pd.read_parquet("../data/03_primary/df_test_w_date_feat.parquet")

In [9]:
# Peek at two random rows; no random_state is set, so the rows differ on every run.
df_test.sample(2)

Unnamed: 0,stationcode,is_installed,capacity,numdocksavailable,numbikesavailable,mechanical,ebike,is_renting,is_returning,duedate,code_insee_commune,date,duedate_year,duedate_month,duedate_day,duedate_weekday,duedate_weekend
969,4103,True,25.0,2,22,10,12,True,True,2024-08-19 09:02:58,75056,2024-08-19,2024,8,19,0,0
363,20040,True,26.0,24,2,1,1,True,True,2024-08-19 09:03:15,75056,2024-08-19,2024,8,19,0,0


In [10]:
# Make the feature_engineering pipeline directory importable so that its `nodes`
# module can be loaded as a top-level module from this notebook.
# The path is relative, so this only resolves when the notebook's working
# directory is one level below the project root.
import sys
sys.path.append("../src/velib_prediction/pipelines/feature_engineering")
from nodes import add_holidays_period

In [11]:
# Try the project helper on the test set, matching on the `date` column for Zone A.
# The result is only displayed, not assigned; in the saved output it carries an
# extra `Description_ZoneA` column.
# NOTE(review): the helper is defined outside this notebook — confirm whether it
# also mutates df_test in place.
add_holidays_period(df_test, df_holidays, feat_date="date", zone="Zone A")

Unnamed: 0,stationcode,is_installed,capacity,numdocksavailable,numbikesavailable,mechanical,ebike,is_renting,is_returning,duedate,code_insee_commune,date,duedate_year,duedate_month,duedate_day,duedate_weekday,duedate_weekend,Description_ZoneA
0,44015,True,20.0,5,15,3,12,True,True,2024-08-19 09:03:07,94081,2024-08-19,2024,8,19,0,0,Vacances d'Été
979,21026,True,30.0,5,25,10,15,True,True,2024-08-19 09:04:15,92012,2024-08-19,2024,8,19,0,0,Vacances d'Été
978,13005,True,46.0,24,21,19,2,True,True,2024-08-19 09:04:37,75056,2024-08-19,2024,8,19,0,0,Vacances d'Été
977,2015,True,33.0,1,31,16,15,True,True,2024-08-19 09:04:23,75056,2024-08-19,2024,8,19,0,0,Vacances d'Été
976,18140,True,40.0,37,3,1,2,True,True,2024-08-19 08:58:08,75056,2024-08-19,2024,8,19,0,0,Vacances d'Été
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
485,9101,True,19.0,16,3,1,2,True,True,2024-08-19 08:58:29,75056,2024-08-19,2024,8,19,0,0,Vacances d'Été
484,16123,True,26.0,16,10,4,6,True,True,2024-08-19 08:59:02,75056,2024-08-19,2024,8,19,0,0,Vacances d'Été
483,6017,True,33.0,4,27,21,6,True,True,2024-08-19 09:04:17,75056,2024-08-19,2024,8,19,0,0,Vacances d'Été
481,17022,True,46.0,41,5,3,2,True,True,2024-08-19 09:02:01,75056,2024-08-19,2024,8,19,0,0,Vacances d'Été
