In [7]:
# ===============================================
# üå¶Ô∏è Donn√©es externes Paris (M√©t√©o + F√©ri√©s + Vacances + √âv√©nements)
# ===============================================

import pandas as pd
import requests
from datetime import datetime
import os

# ------------------------------------------------------------
# üìÅ Dossier de sortie
# ------------------------------------------------------------
output_dir = "external_data"
os.makedirs(output_dir, exist_ok=True)

# ------------------------------------------------------------
# 1Ô∏è‚É£ Grille temporelle
# ------------------------------------------------------------
start = datetime(2024, 10, 1)
end = datetime(2025, 11, 12, 23)
hours = pd.date_range(start=start, end=end, freq="H")
features = pd.DataFrame({"time": hours})
features["date"] = features["time"].dt.date
features["hour"] = features["time"].dt.hour

# ------------------------------------------------------------
# 2Ô∏è‚É£ M√©t√©o : d√©coupe archive + forecast
# ------------------------------------------------------------
print("T√©l√©chargement des donn√©es m√©t√©o...")

def fetch_meteo(base_url, start_date, end_date):
    """T√©l√©charge la m√©t√©o entre start_date et end_date"""
    params = {
        "latitude": 48.8566,
        "longitude": 2.3522,
        "start_date": start_date.strftime("%Y-%m-%d"),
        "end_date": end_date.strftime("%Y-%m-%d"),
        "hourly": "temperature_2m,precipitation,cloud_cover,wind_speed_10m",
        "timezone": "Europe/Paris"
    }
    r = requests.get(base_url, params=params)
    r.raise_for_status()
    data = r.json()
    if "hourly" not in data:
        return pd.DataFrame()
    meteo = pd.DataFrame(data["hourly"])
    meteo["time"] = pd.to_datetime(meteo["time"])
    return meteo

today = datetime.utcnow().date()
try:
    # 1Ô∏è‚É£ Archive (jusqu‚Äô√† aujourd‚Äôhui)
    meteo_archive = fetch_meteo("https://archive-api.open-meteo.com/v1/archive",
                                datetime(2024, 10, 1),
                                datetime(today.year, today.month, today.day))
    # 2Ô∏è‚É£ Pr√©vision (de maintenant √† 12 novembre)
    meteo_forecast = fetch_meteo("https://api.open-meteo.com/v1/forecast",
                                 datetime(today.year, today.month, today.day),
                                 datetime(2025, 11, 12))

    meteo = pd.concat([meteo_archive, meteo_forecast], ignore_index=True)
    meteo.to_csv(os.path.join(output_dir, "meteo_raw.csv"), index=False)
    features = features.merge(meteo, on="time", how="left")
except Exception as e:
    print("‚ö†Ô∏è Erreur t√©l√©chargement m√©t√©o :", e)

# ------------------------------------------------------------
# 3Ô∏è‚É£ Jours f√©ri√©s
# ------------------------------------------------------------
print("T√©l√©chargement des jours f√©ri√©s...")
try:
    feries = pd.read_csv("https://etalab.github.io/jours-feries-france-data/csv/jours_feries_metropole.csv")
    feries["date"] = pd.to_datetime(feries["date"]).dt.date
    feries["is_holiday"] = 1
    features = features.merge(feries[["date", "is_holiday"]], on="date", how="left").fillna({"is_holiday": 0})
except Exception as e:
    print("‚ö†Ô∏è Erreur jours f√©ri√©s :", e)
    features["is_holiday"] = 0

# ------------------------------------------------------------
# 4Ô∏è‚É£ Vacances scolaires (Zone C)
# ------------------------------------------------------------
print("T√©l√©chargement des vacances scolaires...")
try:
    vac_url = "https://data.education.gouv.fr/explore/dataset/fr-en-calendrier-scolaire/download/?format=csv"
    vac = pd.read_csv(vac_url, sep=";", engine="python", on_bad_lines="skip")
    vac = vac[vac["zones"].str.contains("Zone C", na=False)]
    start_col = "date_debut" if "date_debut" in vac.columns else "start_date"
    end_col = "date_fin" if "date_fin" in vac.columns else "end_date"
    vac["start_date"] = pd.to_datetime(vac[start_col], errors="coerce")
    vac["end_date"] = pd.to_datetime(vac[end_col], errors="coerce")
    vac = vac.dropna(subset=["start_date", "end_date"])
    vac.to_csv(os.path.join(output_dir, "vacances_zoneC.csv"), index=False)

    def in_vacation(date):
        return any((row.start_date.date() <= date <= row.end_date.date()) for row in vac.itertuples())

    features["is_vacation"] = features["date"].apply(in_vacation)
except Exception as e:
    print("‚ö†Ô∏è Erreur vacances scolaires :", e)
    features["is_vacation"] = 0

# ------------------------------------------------------------
# 5Ô∏è‚É£ √âv√©nements (optionnel, g√©r√© avec tol√©rance)
# ------------------------------------------------------------
print("T√©l√©chargement des √©v√©nements...")
try:
    events_url = "https://opendata.paris.fr/explore/dataset/que-faire-a-paris-/download/?format=csv"
    events = pd.read_csv(events_url, sep=";", engine="python", on_bad_lines="skip")
    events["date_start"] = pd.to_datetime(events["date_start"], errors="coerce")
    events["date_end"] = pd.to_datetime(events["date_end"], errors="coerce")
    events = events.dropna(subset=["date_start", "date_end"])

    def has_event(date):
        return any((row.date_start.date() <= date <= row.date_end.date()) for row in events.itertuples())

    features["has_event"] = features["date"].apply(has_event)
except Exception as e:
    print("‚ö†Ô∏è Erreur √©v√©nements :", e)
    features["has_event"] = 0

# ------------------------------------------------------------
# 6Ô∏è‚É£ Export final
# ------------------------------------------------------------
output_path = os.path.join(output_dir, "external_features_paris.csv")
features.to_csv(output_path, index=False)
print(f"\n‚úÖ Donn√©es externes fusionn√©es sauvegard√©es dans : {output_path}\n")
print(features.head())


T√©l√©chargement des donn√©es m√©t√©o...
T√©l√©chargement des jours f√©ri√©s...
T√©l√©chargement des vacances scolaires...
T√©l√©chargement des √©v√©nements...
‚ö†Ô∏è Erreur √©v√©nements : HTTP Error 500: Internal Server Error

‚úÖ Donn√©es externes fusionn√©es sauvegard√©es dans : external_data\external_features_paris.csv

                 time        date  hour  temperature_2m  precipitation  \
0 2024-10-01 00:00:00  2024-10-01     0            14.0            0.0   
1 2024-10-01 01:00:00  2024-10-01     1            13.8            0.0   
2 2024-10-01 02:00:00  2024-10-01     2            13.7            0.0   
3 2024-10-01 03:00:00  2024-10-01     3            13.5            0.0   
4 2024-10-01 04:00:00  2024-10-01     4            12.6            0.0   

   cloud_cover  wind_speed_10m  is_holiday  is_vacation  has_event  
0          100            19.2         0.0        False          0  
1          100            18.7         0.0        False          0  
2          100        

In [15]:
import pandas as pd
from datetime import datetime
import os

# ------------------------------------------------------------
# 1Ô∏è‚É£ Lecture du CSV local
# ------------------------------------------------------------
csv_path = "./que_faire_a_paris.csv"  # fichier t√©l√©charg√©
output_dir = "external_data"
os.makedirs(output_dir, exist_ok=True)

print("Lecture du fichier des √©v√©nements locaux...")

# Lecture tol√©rante avec encodage fran√ßais
events = pd.read_csv(
    csv_path, sep=";", engine="python", on_bad_lines="skip", encoding="utf-8"
)

# ------------------------------------------------------------
# 2Ô∏è‚É£ Conversion robuste des dates
# ------------------------------------------------------------
def safe_to_datetime(series):
    """Convertit une colonne en datetime, g√®re formats fran√ßais et fuseaux horaires."""
    s = pd.to_datetime(series, errors="coerce", dayfirst=True, utc=True)
    # Supprime le fuseau horaire (convertit en heure locale sans offset)
    return s.dt.tz_convert("Europe/Paris").dt.tz_localize(None)

# Conversion forc√©e des deux colonnes principales
for col in ["Date de d√©but", "Date de fin"]:
    if col in events.columns:
        events[col] = safe_to_datetime(events[col])
    else:
        print(f"‚ö†Ô∏è Colonne manquante : {col}")

# Suppression des lignes sans dates valides
events = events.dropna(subset=["Date de d√©but", "Date de fin"])
print(f"‚úÖ {len(events)} lignes avec dates valides.")

# ------------------------------------------------------------
# 3Ô∏è‚É£ Filtrage temporel (2024-10-01 ‚Üí 2025-11-12)
# ------------------------------------------------------------
start = datetime(2024, 10, 1)
end = datetime(2025, 11, 12)

mask = (events["Date de fin"] >= start) & (events["Date de d√©but"] <= end)
events = events.loc[mask].copy()

print(f"‚úÖ {len(events)} √©v√©nements conserv√©s entre {start.date()} et {end.date()}")

# ------------------------------------------------------------
# 4Ô∏è‚É£ S√©lection des colonnes pertinentes
# ------------------------------------------------------------
cols_to_keep = [
    "Titre",
    "Date de d√©but",
    "Date de fin",
    "Nom du lieu",
    "Adresse du lieu",
    "Code postal",
    "Ville",
    "Type d'acc√®s",
    "Type de prix",
    "D√©tail du prix",
    "Transport",
]
cols_to_keep = [c for c in cols_to_keep if c in events.columns]
events = events[cols_to_keep]

# Nettoyage basique des cha√Ænes
for col in events.select_dtypes(include="object").columns:
    events[col] = events[col].astype(str).str.strip()

# ------------------------------------------------------------
# 5Ô∏è‚É£ Sauvegarde du fichier nettoy√©
# ------------------------------------------------------------
output_path = os.path.join(output_dir, "evenements_filtr√©s.csv")
events.to_csv(output_path, index=False, encoding="utf-8")
print(f"‚úÖ Fichier nettoy√© sauvegard√© dans : {output_path}")

# ------------------------------------------------------------
# 6Ô∏è‚É£ (Optionnel) V√©rification
# ------------------------------------------------------------
print("\nAper√ßu des premiers √©v√©nements :")
print(events.head(10))


Lecture du fichier des √©v√©nements locaux...
‚úÖ 971 lignes avec dates valides.
‚úÖ 749 √©v√©nements conserv√©s entre 2024-10-01 et 2025-11-12
‚úÖ Fichier nettoy√© sauvegard√© dans : external_data\evenements_filtr√©s.csv

Aper√ßu des premiers √©v√©nements :
                                                Titre       Date de d√©but  \
0                Balade au c≈ìur des Passages Couverts 2024-01-10 01:00:00   
8                                  La Cage aux Folles 2025-05-12 02:00:00   
11               D√©couverte des petits jardins du 20e 2025-05-07 18:00:00   
13            L'amour universel envahit le Lucernaire 2025-04-09 00:00:00   
19           Pierre Bertrand et La Caja Negra Quartet 2025-03-12 21:00:00   
20                         Thomas Galliano Organ Trio 2025-06-11 22:00:00   
21             Conf√©rence sur les Catacombes de Paris 2025-05-11 21:00:00   
26  #BrasilianJam La jam du dimanche de Isa√Øa Alve... 2025-02-11 21:30:00   
29  Les gens de Paris, 1926-1936 ¬∑ Dans le

In [16]:
import pandas as pd
from datetime import datetime, timedelta
import os

# ------------------------------------------------------------
# 1Ô∏è‚É£ Lecture du fichier d'√©v√©nements nettoy√©
# ------------------------------------------------------------
events_path = "external_data/evenements_filtr√©s.csv"
events = pd.read_csv(events_path, encoding="utf-8")

# Conversion en datetime
events["Date de d√©but"] = pd.to_datetime(events["Date de d√©but"], errors="coerce")
events["Date de fin"] = pd.to_datetime(events["Date de fin"], errors="coerce")
events = events.dropna(subset=["Date de d√©but", "Date de fin"])

# ------------------------------------------------------------
# 2Ô∏è‚É£ Cr√©ation de la plage temporelle d‚Äôint√©r√™t
# ------------------------------------------------------------
start = datetime(2024, 10, 1)
end = datetime(2025, 11, 12)
dates = pd.date_range(start=start, end=end, freq="D")
features = pd.DataFrame({"date": dates})

# ------------------------------------------------------------
# 3Ô∏è‚É£ Fonction pour savoir si un √©v√©nement est actif un jour donn√©
# ------------------------------------------------------------
def has_event(date, subset=None):
    df = events if subset is None else subset
    return ((df["Date de d√©but"] <= date) & (df["Date de fin"] >= date)).any()

# ------------------------------------------------------------
# 4Ô∏è‚É£ Indicateur global
# ------------------------------------------------------------
features["has_event"] = features["date"].apply(has_event)

# ------------------------------------------------------------
# 5Ô∏è‚É£ Indicateurs locaux (par zone)
# ------------------------------------------------------------
def filter_zone(keyword):
    mask = events["Adresse du lieu"].fillna("").str.contains(keyword, case=False, regex=True) | \
           events["Code postal"].fillna("").astype(str).str.contains(keyword)
    return events[mask].copy()

zones = {
    "champs": r"elys(√©es)?|75008",
    "stsperes": r"saints?-p(√®|e)res|75006",
    "convention": r"convention|75015",
}

for zone_name, pattern in zones.items():
    subset = filter_zone(pattern)
    features[f"has_event_{zone_name}"] = features["date"].apply(lambda d: has_event(d, subset))

# ------------------------------------------------------------
# 6Ô∏è‚É£ Conversion en 0/1 et sauvegarde
# ------------------------------------------------------------
for c in features.columns:
    if c != "date":
        features[c] = features[c].astype(int)

output_path = "external_data/events_features.csv"
features.to_csv(output_path, index=False)
print(f"‚úÖ Fichier d'√©v√©nements temporels sauvegard√© : {output_path}")

print("\nAper√ßu :")
print(features.head(10))


  mask = events["Adresse du lieu"].fillna("").str.contains(keyword, case=False, regex=True) | \
  events["Code postal"].fillna("").astype(str).str.contains(keyword)
  mask = events["Adresse du lieu"].fillna("").str.contains(keyword, case=False, regex=True) | \
  events["Code postal"].fillna("").astype(str).str.contains(keyword)


‚úÖ Fichier d'√©v√©nements temporels sauvegard√© : external_data/events_features.csv

Aper√ßu :
        date  has_event  has_event_champs  has_event_stsperes  \
0 2024-10-01          1                 0                   1   
1 2024-10-02          1                 0                   1   
2 2024-10-03          1                 0                   1   
3 2024-10-04          1                 0                   1   
4 2024-10-05          1                 0                   1   
5 2024-10-06          1                 0                   1   
6 2024-10-07          1                 0                   1   
7 2024-10-08          1                 0                   1   
8 2024-10-09          1                 0                   1   
9 2024-10-10          1                 0                   1   

   has_event_convention  
0                     1  
1                     1  
2                     1  
3                     1  
4                     1  
5                     1  
6     

Fusion finale

In [19]:
import pandas as pd

# ------------------------------------------------------------
# 1Ô∏è‚É£ Chargement des deux fichiers
# ------------------------------------------------------------
external = pd.read_csv("external_data/external_features_paris.csv", parse_dates=["date"])
events = pd.read_csv("external_data/events_features.csv", parse_dates=["date"])

# ------------------------------------------------------------
# 2Ô∏è‚É£ Fusion sur la colonne 'date'
# ------------------------------------------------------------
merged = external.merge(
    events,
    on="date",
    how="left",
    suffixes=("", "_event")  # √©vite les doublons si 'has_event' existe d√©j√†
)

# ------------------------------------------------------------
# 3Ô∏è‚É£ Nettoyage : suppression du doublon global 'has_event'
# ------------------------------------------------------------
if "has_event_event" in merged.columns:
    merged.drop(columns=["has_event_event"], inplace=True)

# ------------------------------------------------------------
# 4Ô∏è‚É£ Valeurs manquantes ‚Üí 0 pour les indicateurs binaires
# ------------------------------------------------------------
for col in ["has_event_champs", "has_event_stsperes", "has_event_convention"]:
    if col in merged.columns:
        merged[col] = merged[col].fillna(0).astype(int)

# ------------------------------------------------------------
# 5Ô∏è‚É£ Sauvegarde finale
# ------------------------------------------------------------
output_path = "external_data/external_features_final.csv"
merged.to_csv(output_path, index=False, encoding="utf-8")

print(f"‚úÖ Donn√©es finales fusionn√©es sauvegard√©es dans : {output_path}")
print(merged.head(10))


‚úÖ Donn√©es finales fusionn√©es sauvegard√©es dans : external_data/external_features_final.csv
                  time       date  hour  temperature_2m  precipitation  \
0  2024-10-01 00:00:00 2024-10-01     0            14.0            0.0   
1  2024-10-01 01:00:00 2024-10-01     1            13.8            0.0   
2  2024-10-01 02:00:00 2024-10-01     2            13.7            0.0   
3  2024-10-01 03:00:00 2024-10-01     3            13.5            0.0   
4  2024-10-01 04:00:00 2024-10-01     4            12.6            0.0   
5  2024-10-01 05:00:00 2024-10-01     5            12.1            0.0   
6  2024-10-01 06:00:00 2024-10-01     6            11.8            0.0   
7  2024-10-01 07:00:00 2024-10-01     7            11.5            0.0   
8  2024-10-01 08:00:00 2024-10-01     8            12.0            0.0   
9  2024-10-01 09:00:00 2024-10-01     9            13.1            0.0   

   cloud_cover  wind_speed_10m  is_holiday  is_vacation  has_event  \
0          100     