# Import librairies

In [None]:
import pandas as pd
import requests

# Récupérer les vacances scolaires

In [213]:
# URL de l'API des vacances scolaires
url_vacances = "https://data.education.gouv.fr/api/records/1.0/search/?dataset=fr-en-calendrier-scolaire&q=&rows=1000&sort=start_date&facet=zone&facet=description&facet=start_date&facet=end_date"

# Charger les données
response = requests.get(url_vacances)
if response.status_code == 200 :
    data = response.json()
    records = data['records']

# Convertir en DataFrame
df = pd.json_normalize(records)

df.columns = df.columns.str.replace('^fields.', '', regex=True)

# Filtrer la Zone B uniquement
df_zone_b_nantes = df[(df['zones'] == 'Zone B') & (df['location'] == 'Nantes')].copy()

# Filtrer années 2025 et 2026
df_year_filtered = df_zone_b_nantes[(df_zone_b_nantes['annee_scolaire'] == '2024-2025') | (df_zone_b_nantes['annee_scolaire'] == '2025-2026')].copy()

# Garder l'essentiel
df_vacances = df_year_filtered[['start_date', 'end_date', 'location', 'annee_scolaire', 'description']].copy()
df_vacances['end_date'] = pd.to_datetime(df_vacances['end_date']).dt.date
df_vacances['start_date'] = pd.to_datetime(df_vacances['start_date']).dt.date


In [214]:
df_vacances.head()

Unnamed: 0,start_date,end_date,location,annee_scolaire,description
28,2026-07-03,2026-07-03,Nantes,2025-2026,Début des Vacances d'Été
58,2026-05-13,2026-05-17,Nantes,2025-2026,Pont de l'Ascension
79,2026-04-10,2026-04-26,Nantes,2025-2026,Vacances de Printemps
111,2026-02-13,2026-03-01,Nantes,2025-2026,Vacances d'Hiver
150,2025-12-19,2026-01-04,Nantes,2025-2026,Vacances de Noël


Faire une ligne par jour de vacances

In [215]:
# On génère une colonne avec les plages de dates
df_vacances['dates'] = df_vacances.apply(
    lambda row: pd.date_range(start=row['start_date'], end=row['end_date']).tolist(),
    axis=1
)

df_vacances_day_by_day = df_vacances.explode('dates')

df_vacances_final = df_vacances_day_by_day[['dates', 'description']].reset_index(drop=True)


In [216]:
df_vacances_final.head()

Unnamed: 0,dates,description
0,2026-07-03,Début des Vacances d'Été
1,2026-05-13,Pont de l'Ascension
2,2026-05-14,Pont de l'Ascension
3,2026-05-15,Pont de l'Ascension
4,2026-05-16,Pont de l'Ascension


In [218]:
df_vacances_final['is_holiday'] = 1

# Récupérer les jours feriés

In [219]:
# Récupération des jours fériés
fr_holidays = holidays.France(years=[2024,2025,2026])
jours_feries_data = [{'dates': date, 'description': name} for date, name in sorted(fr_holidays.items())]
df_feries = pd.DataFrame(jours_feries_data)

In [220]:
df_feries.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33 entries, 0 to 32
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   dates        33 non-null     object
 1   description  33 non-null     object
dtypes: object(2)
memory usage: 656.0+ bytes


In [221]:
df_feries['is_public_holiday'] = 1

# Récupérer les jours de week-end

In [222]:
# Définir la période
start_date = '2025-02-15'
end_date = '2026-12-31'

# Générer toutes les dates
dates = pd.date_range(start=start_date, end=end_date, freq='D')

# Filtrer uniquement les week-ends (samedi=5, dimanche=6)
weekends = dates[dates.weekday >= 5]

# Mettre dans un DataFrame
df_weekends = pd.DataFrame({'dates': weekends})

# Ajouter la colonne 'description' => "Samedi" ou "Dimanche"
df_weekends['description'] = df_weekends['dates'].dt.weekday.map({5: 'Samedi', 6: 'Dimanche'})



In [223]:
df_weekends['Saturday'] = df_weekends.apply(lambda x : 1 if x['description'] == 'Samedi' else 0, axis = 1)
df_weekends['Sunday'] = df_weekends.apply(lambda x : 1 if x['description'] == 'Dimanche' else 0, axis = 1)

In [224]:
df_weekends.head()

Unnamed: 0,dates,description,Saturday,Sunday
0,2025-02-15,Samedi,1,0
1,2025-02-16,Dimanche,0,1
2,2025-02-22,Samedi,1,0
3,2025-02-23,Dimanche,0,1
4,2025-03-01,Samedi,1,0


# Grouper les dataset

In [225]:

# Construire le calendrier complet
start_date = '2025-02-15'
end_date = '2026-12-31'
calendrier = pd.DataFrame({'dates': pd.date_range(start=start_date, end=end_date, freq='D')})

# Ajouter les indicateurs
calendrier['is_vacances'] = calendrier['dates'].isin(df_vacances_final['dates']).astype(int)
calendrier['is_ferie'] = calendrier['dates'].isin(df_feries['dates']).astype(int)
calendrier['is_samedi'] = (calendrier['dates'].dt.weekday == 5).astype(int)
calendrier['is_dimanche'] = (calendrier['dates'].dt.weekday == 6).astype(int)


  calendrier['is_ferie'] = calendrier['dates'].isin(df_feries['dates']).astype(int)


In [226]:
calendrier['dates'] = pd.to_datetime(calendrier['dates'])

In [227]:
calendrier.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 685 entries, 0 to 684
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   dates        685 non-null    datetime64[ns]
 1   is_vacances  685 non-null    int64         
 2   is_ferie     685 non-null    int64         
 3   is_samedi    685 non-null    int64         
 4   is_dimanche  685 non-null    int64         
dtypes: datetime64[ns](1), int64(4)
memory usage: 26.9 KB


In [228]:
#calendrier.to_csv('data/preprocessed/weekend_holiday_public_holidays.csv', index=False)s