https://www.data.gouv.fr/en/datasets/le-holidays_calendar-scolaire/

In [5]:
import pandas as pd
import datetime

In [6]:
holidays_calendar = pd.read_csv("fr-en-calendrier-scolaire.csv", sep=";")
holidays_calendar.head()

Unnamed: 0,description,population,start_date,end_date,location,zones,annee_scolaire
0,Vacances d'Été,Élèves,2010-07-08T00:00:00+02:00,2010-09-09T00:00:00+02:00,Corse,Corse,2009-2010
1,Vacances de Printemps,-,2011-04-18T00:00:00+02:00,2011-05-02T00:00:00+02:00,Corse,Corse,2010-2011
2,Vacances de Noël,-,2011-12-19T00:00:00+01:00,2012-01-03T00:00:00+01:00,Corse,Corse,2011-2012
3,Vacances d'Hiver,-,2013-02-18T00:00:00+01:00,2013-03-04T00:00:00+01:00,Corse,Corse,2012-2013
4,Vacances d'Hiver,-,2014-02-24T00:00:00+01:00,2014-03-10T00:00:00+01:00,Corse,Corse,2013-2014


## Cleaning

#### Clean "population"

In [7]:
holidays_calendar["population"].value_counts().index.to_list()

['-',
 'Enseignants',
 'Élèves',
 'Élèves du premier degré',
 'Enseignants du premier degré',
 'Élèves du second degré',
 'Guadeloupe & Saint-Martin',
 'Saint-Barthélémy',
 'Saint-Martin',
 'Premier degré et collèges',
 'Enseignants du second degré',
 'Guadeloupe sauf Saint-Martin',
 'Élèves des lycées',
 'Enseignants des lycées',
 'Enseignants des collèges',
 'Élèves des collèges',
 'Lycées',
 'Premier degré',
 'Collèges']

In [8]:
pop_to_drop = ['Enseignants', 'Enseignants du premier degré', 'Guadeloupe & Saint-Martin', 'Saint-Barthélémy', 'Saint-Martin', 'Enseignants du second degré', 'Guadeloupe sauf Saint-Martin', 'Enseignants des lycées', 'Enseignants des collèges']
holidays_calendar = holidays_calendar[~holidays_calendar["population"].isin(pop_to_drop)]
holidays_calendar.value_counts("population")

population
-                            1427
Élèves                        223
Élèves du premier degré         6
Élèves du second degré          4
Premier degré et collèges       3
Élèves des collèges             2
Élèves des lycées               2
Collèges                        1
Lycées                          1
Premier degré                   1
Name: count, dtype: int64

#### Clean zones

In [9]:
holidays_calendar["zones"].value_counts().index.to_list()
zones_to_keep = ['Zone A', 'Zone B', 'Zone C']
holidays_calendar = holidays_calendar[holidays_calendar["zones"].isin(zones_to_keep)]

#### Clean locations

In [10]:
holidays_calendar.drop(columns=["location", "population"], inplace=True)
holidays_calendar.drop_duplicates(inplace=True)

In [11]:
holiday_df = pd.read_csv("scolar_holidays.csv")
holiday_df["start_date"] = holiday_df["start_date"].apply(lambda x : datetime.datetime.strptime(x[:10], '%Y-%m-%d'))
holiday_df["end_date"] = holiday_df["end_date"].apply(lambda x : datetime.datetime.strptime(x[:10], '%Y-%m-%d'))
holiday_df.sort_values(by=["start_date"], inplace=True)
holiday_df

Unnamed: 0,description,start_date,end_date,zones,annee_scolaire
0,Vacances de la Toussaint,2017-10-21,2017-11-06,Zone A,2017-2018
123,Vacances de la Toussaint,2017-10-21,2017-11-06,Zone C,2017-2018
1,Vacances de la Toussaint,2017-10-21,2017-11-06,Zone B,2017-2018
2,Vacances de Noël,2017-12-23,2018-01-08,Zone A,2017-2018
3,Vacances de Noël,2017-12-23,2018-01-08,Zone B,2017-2018
...,...,...,...,...,...
157,Pont de l'Ascension,2026-05-15,2026-05-16,Zone C,2025-2026
120,Pont de l'Ascension,2026-05-15,2026-05-16,Zone A,2025-2026
149,Début des Vacances d'Été,2026-07-04,2026-07-04,Zone A,2025-2026
122,Début des Vacances d'Été,2026-07-04,2026-07-04,Zone C,2025-2026


## Formating

Le départ en vacances a lieu après la classe des jours indiqués.

La reprise des cours s'effectue le matin des jours indiqués.

We're going to creat five columns : "holidays_departure", "first_day_holidays", "last_dayholidays", "during_holidays" (exclue les jours de départ et de retour de vacances)

#### holiday_departure
holiday_departure : pd.Serie avec le nom des zones le jour de leur départ en vacances (lendemain de la dernière journée de cours)

In [12]:
holiday_departure = holiday_df[["start_date","zones"]]
holiday_departure.loc[:,"start_date"] = holiday_departure.loc[:,"start_date"].apply(lambda x : x.strftime('%Y-%m-%d'))
holiday_departure = holiday_departure.pivot_table(index=["start_date"], columns=["zones"], aggfunc=len, fill_value=0).astype(bool).reset_index()
holiday_departure.columns.name = None
holiday_departure = holiday_departure.rename(columns={"start_date":"date", "Zone A":"holiday_departure_zone_A", "Zone B":"holiday_departure_zone_B", "Zone C":"holiday_departure_zone_C"})
holiday_departure.head()

Unnamed: 0,date,holiday_departure_zone_A,holiday_departure_zone_B,holiday_departure_zone_C
0,2017-10-21,True,True,True
1,2017-12-23,True,True,True
2,2018-02-10,True,False,False
3,2018-02-17,False,False,True
4,2018-02-24,False,True,False


#### first_day_holidays

First day without classes

In [13]:
first_day_holidays = holiday_df[["start_date","zones"]]
first_day_holidays.loc[:,"start_date"] = first_day_holidays["start_date"] + datetime.timedelta(days=1)
first_day_holidays.loc[:,"start_date"] = first_day_holidays["start_date"].apply(lambda x : x.strftime('%Y-%m-%d'))
first_day_holidays.reset_index(drop=True, inplace=True)
first_day_holidays = first_day_holidays.pivot_table(index=["start_date"], columns=["zones"], aggfunc=len, fill_value=0).astype(bool).reset_index()
first_day_holidays.columns.name = None
first_day_holidays.rename(columns={"start_date":"date", "Zone A":"first_day_holidays_zone_A", "Zone B":"first_day_holidays_zone_B", "Zone C":"first_day_holidays_zone_C"}, inplace=True)
first_day_holidays.head()

Unnamed: 0,date,first_day_holidays_zone_A,first_day_holidays_zone_B,first_day_holidays_zone_C
0,2017-10-22,True,True,True
1,2017-12-24,True,True,True
2,2018-02-11,True,False,False
3,2018-02-18,False,False,True
4,2018-02-25,False,True,False


#### last_day_holidays

Day before first day of work after holidays

In [14]:
last_day_holidays = holiday_df[["end_date","zones"]]
last_day_holidays.loc[:,"end_date"] = last_day_holidays.loc[:,"end_date"] - datetime.timedelta(days=1)
last_day_holidays.loc[:,"end_date"] = last_day_holidays.loc[:,"end_date"].apply(lambda x : x.strftime('%Y-%m-%d'))
last_day_holidays.reset_index(drop=True, inplace=True)
last_day_holidays = last_day_holidays.pivot_table(index=["end_date"], columns=["zones"], aggfunc=len, fill_value=0).astype(bool).reset_index()
last_day_holidays.columns.name = None
last_day_holidays.rename(columns={"end_date":"date", "Zone A":"last_day_holidays_zone_A", "Zone B":"last_day_holidays_zone_B", "Zone C":"last_day_holidays_zone_C"}, inplace=True)
last_day_holidays.head()

Unnamed: 0,date,last_day_holidays_zone_A,last_day_holidays_zone_B,last_day_holidays_zone_C
0,2017-11-05,True,True,True
1,2018-01-07,True,True,True
2,2018-02-25,True,False,False
3,2018-03-04,False,False,True
4,2018-03-11,False,True,False


#### during_holidays

In [15]:
during_holidays = holiday_df.loc[:,["start_date","end_date","zones"]]
during_holidays.loc[:,"end_date"] = during_holidays.loc[:,"end_date"] - datetime.timedelta(days=2)
during_holidays.loc[:,"start_date"] = during_holidays.loc[:,"start_date"] + datetime.timedelta(days=2)
during_holidays.head()

during_holidays

Unnamed: 0,start_date,end_date,zones
0,2017-10-23,2017-11-04,Zone A
123,2017-10-23,2017-11-04,Zone C
1,2017-10-23,2017-11-04,Zone B
2,2017-12-25,2018-01-06,Zone A
3,2017-12-25,2018-01-06,Zone B
...,...,...,...
157,2026-05-17,2026-05-14,Zone C
120,2026-05-17,2026-05-14,Zone A
149,2026-07-06,2026-07-02,Zone A
122,2026-07-06,2026-07-02,Zone C


In [16]:
during_holidays_zone_A = during_holidays[during_holidays["zones"] == "Zone A"].reset_index()
during_holidays_zone_B = during_holidays[during_holidays["zones"] == "Zone B"].reset_index()
during_holidays_zone_C = during_holidays[during_holidays["zones"] == "Zone C"].reset_index()

In [17]:
holidays_days_zone_A= pd.DataFrame()

for row in range(during_holidays_zone_A.shape[0]) : 
    start_date, end_date = during_holidays_zone_A.loc[row,["start_date","end_date"]].apply(lambda x : x.strftime('%Y-%m-%d'))
    curr_holidays_days_zone_A = pd.date_range(start_date, end_date)
    curr_holidays_days_zone_A = pd.DataFrame(curr_holidays_days_zone_A, columns=["date"])
    holidays_days_zone_A = pd.concat([holidays_days_zone_A, curr_holidays_days_zone_A], ignore_index=True)


holidays_days_zone_A["holiday_day_zone_A"] = True

In [18]:
holidays_days_zone_B= pd.DataFrame()

for row in range(during_holidays_zone_B.shape[0]) : 
    start_date, end_date = during_holidays_zone_B.loc[row,["start_date","end_date"]].apply(lambda x : x.strftime('%Y-%m-%d'))
    curr_holidays_days_zone_B = pd.date_range(start_date, end_date)
    curr_holidays_days_zone_B = pd.DataFrame(curr_holidays_days_zone_B, columns=["date"])
    holidays_days_zone_B = pd.concat([holidays_days_zone_B, curr_holidays_days_zone_B], ignore_index=True)

holidays_days_zone_B["holiday_day_zone_B"] = True

In [19]:
holidays_days_zone_C= pd.DataFrame()

for row in range(during_holidays_zone_C.shape[0]) : 
    start_date, end_date = during_holidays_zone_C.loc[row,["start_date","end_date"]].apply(lambda x : x.strftime('%Y-%m-%d'))
    curr_holidays_days_zone_C = pd.date_range(start_date, end_date)
    curr_holidays_days_zone_C = pd.DataFrame(curr_holidays_days_zone_C, columns=["date"])
    holidays_days_zone_C = pd.concat([holidays_days_zone_C, curr_holidays_days_zone_C], ignore_index=True)

holidays_days_zone_C["holiday_day_zone_C"] = True

In [20]:
during_holidays = holidays_days_zone_A.merge(holidays_days_zone_B, on='date', how='outer').merge(holidays_days_zone_C, on='date', how='outer').fillna(False)

during_holidays

Unnamed: 0,date,holiday_day_zone_A,holiday_day_zone_B,holiday_day_zone_C
0,2017-10-23,True,True,True
1,2017-10-24,True,True,True
2,2017-10-25,True,True,True
3,2017-10-26,True,True,True
4,2017-10-27,True,True,True
...,...,...,...,...
1148,2026-04-28,False,False,True
1149,2026-04-29,False,False,True
1150,2026-04-30,False,False,True
1151,2026-05-01,False,False,True


In [21]:
during_holidays["holiday_day_zone_A"].isna().sum()

0

In [22]:
during_holidays

Unnamed: 0,date,holiday_day_zone_A,holiday_day_zone_B,holiday_day_zone_C
0,2017-10-23,True,True,True
1,2017-10-24,True,True,True
2,2017-10-25,True,True,True
3,2017-10-26,True,True,True
4,2017-10-27,True,True,True
...,...,...,...,...
1148,2026-04-28,False,False,True
1149,2026-04-29,False,False,True
1150,2026-04-30,False,False,True
1151,2026-05-01,False,False,True


## Merging and saving

Merge : holiday_departure, first_day_holidays, last_day_holidays, during_holidays

In [23]:
holidays_calendar = holiday_departure.merge(first_day_holidays, on='date', how='outer').merge(during_holidays, on='date', how='outer').merge(last_day_holidays, on='date', how='outer').fillna(False)
holidays_calendar.sort_values(by=["date"], inplace=True)

In [26]:
holidays_calendar.to_csv("school_holiday_calendar.csv", index=False)