In [1]:
# %% [markdown]
# 04_meteo_integration — VERSION FINALE ULTRA-STABLE (PRO)
# --------------------------------------------------------
# - Intégration météo propre
# - Merge robuste (aucune perte de colonnes)
# - Normalisation colonnes : carrier / airport / names / year / month
# - Nettoyage météo intermédiaire
# - Contrôle : toutes les features de get_feature_columns() doivent être présentes
# - Sauvegarde train_weather / val_weather / test_weather

# %%
import sys
from pathlib import Path
import warnings
# Silence every warning for the remainder of the session.
warnings.filterwarnings("ignore")

# The project root is taken to be the parent of the current working directory.
# It is placed at the front of sys.path (once) so that the `src.*` imports
# further down can be resolved.
ROOT = Path.cwd().resolve().parent
if all(entry != str(ROOT) for entry in sys.path):
    sys.path.insert(0, str(ROOT))

print("[INFO] ROOT =", ROOT)

# %%
import pandas as pd

from src.config import print_config_summary, DATA_PROCESSED_DIR
from src.fetch_faa_airports import load_airports_with_coords
from src.weather_api import build_weather_dataset
from src.features import get_feature_columns

# Show the project configuration summary (helper imported from src.config).
print_config_summary()

# %% [markdown]
# 1. Charger train / val / test

# %%
# Read the three processed splits from DATA_PROCESSED_DIR, in train/val/test order.
train, val, test = (
    pd.read_csv(DATA_PROCESSED_DIR / csv_name)
    for csv_name in ("train.csv", "val.csv", "test.csv")
)

print("[INFO] train/val/test :", *(frame.shape for frame in (train, val, test)))

# %% [markdown]
# 2. Charger aéroports

# %%
# Airport reference table; it is handed to build_weather_dataset further down.
# NOTE(review): presumably holds per-airport coordinates (helper name) — confirm in src.fetch_faa_airports.
airports_df = load_airports_with_coords()
print("[INFO] airports_df :", airports_df.shape)

# %% [markdown]
# 3. Concat + split

# %%
# Tag each partition with its origin so the combined frame can be split again
# after the weather step; assign() returns a new frame and leaves the source untouched.
train_ = train.assign(split="train")
val_ = val.assign(split="val")
test_ = test.assign(split="test")

# Stack the three tagged partitions with a fresh 0..n-1 index.
flights_all = pd.concat([train_, val_, test_], ignore_index=True)
print("[INFO] flights_all :", flights_all.shape)

# %% [markdown]
# 4. Générer dataset météo COMPLET (12 mois d’un coup par airport/year)

# %%
# Build the weather-enriched table from the combined flights and the airport table.
# NOTE(review): the join itself lives in src.weather_api and is not visible here;
# the row count is never compared with flights_all — confirm no rows are duplicated or lost.
flights_weather = build_weather_dataset(flights_all, airports_df)
print("[INFO] flights_weather (brut) :", flights_weather.shape)

# %% [markdown]
# 5. Normalisation colonnes — VERSION ROBUSTE

# %%
# Work on a copy so the raw output of build_weather_dataset stays available.
dfw = flights_weather.copy()

# -----------------------------
# 5.1 Restore the _x/_y suffixed columns
# -----------------------------
# The *_x variants get their plain name back. rename() skips keys that are
# not present in the frame, so no pre-filtering is needed.
renames = {
    "carrier_x": "carrier",
    "carrier_name_x": "carrier_name",
    "airport_x": "airport",
    "airport_name_x": "airport_name",
    "year_x": "year",
    "month_x": "month",
}
dfw = dfw.rename(columns=renames)

# The *_y variants are discarded; labels that do not exist are ignored.
cols_to_drop = [
    "carrier_y", "carrier_name_y",
    "airport_y", "airport_name_y",
    "year_y", "month_y",
]
dfw = dfw.drop(columns=cols_to_drop, errors="ignore")

# -----------------------------
# 5.2 The month column must exist from here on
# -----------------------------
if "month" not in dfw.columns:
    raise ValueError("❌ ERREUR FATALE : month absente après normalisation météo")

# -----------------------------
# 5.3 Remove the intermediate weather columns
# -----------------------------
unused_meteo = [
    "temp_mean", "temp_min", "temp_max",
    "precip_sum", "windspeed_max",
]
dfw = dfw.drop(columns=unused_meteo, errors="ignore")

# -----------------------------
# 5.4 Remove the technical helper columns
# -----------------------------
technical_cols = ["faa_code", "lat", "lon", "key"]
dfw = dfw.drop(columns=technical_cols, errors="ignore")

print("[INFO] Colonnes après normalisation :", dfw.shape[1])

# %% [markdown]
# 6. Vérification : aucune feature de get_feature_columns() ne doit manquer

# %%
# Compare the feature names returned by get_feature_columns() with the columns
# actually present in the normalised frame.
expected = set(get_feature_columns())
cols_dfw = set(dfw.columns)

missing = expected - cols_dfw
extra   = cols_dfw - expected

# A missing feature is fatal. Names are sorted so the message is identical
# from one run to the next (raw set ordering is not stable across sessions).
if missing:
    raise ValueError(
        f"❌ Features manquantes : {sorted(map(str, missing))}\n"
        f"▶ Vérifie build_weather_dataset ou normalisation"
    )

print("[CHECK] Toutes les features attendues sont présentes ✔")

# Columns outside the feature list are tolerated (the 'split' marker is one of
# them and is needed by the re-split step), but they are listed so that any
# leftover from the weather merge is visible instead of silently ignored.
print("[INFO] Colonnes hors features :", sorted(map(str, extra)))

# %% [markdown]
# 7. Re-split + save

# %%
# The marker column added before the weather step is required to split again.
if "split" not in dfw.columns:
    raise ValueError("❌ Colonne 'split' absente !")

# Slice the frame back into its three partitions and remove the marker column.
train_weather, val_weather, test_weather = (
    dfw.loc[dfw["split"] == label].drop(columns="split")
    for label in ("train", "val", "test")
)

print("[INFO] train_weather :", train_weather.shape)
print("[INFO] val_weather   :", val_weather.shape)
print("[INFO] test_weather  :", test_weather.shape)

# Persist each partition next to the input CSVs, without the index column.
train_weather.to_csv(DATA_PROCESSED_DIR / "train_weather.csv", index=False)
val_weather.to_csv(DATA_PROCESSED_DIR / "val_weather.csv", index=False)
test_weather.to_csv(DATA_PROCESSED_DIR / "test_weather.csv", index=False)

print("✔ 04_meteo_integration — VERSION FINALE ULTRA-STABLE")

[INFO] ROOT = C:\Users\Balerion\Desktop\us-flights-delay
[INFO] ROOT = C:\Users\Balerion\Desktop\us-flights-delay
[INFO] src exists?  True
[INFO] config.yml chargé depuis : C:\Users\Balerion\Desktop\us-flights-delay\config.yml
[INFO] DATA_RAW_DIR      = C:\Users\Balerion\Desktop\us-flights-delay\data\raw
[INFO] DATA_PROCESSED_DIR= C:\Users\Balerion\Desktop\us-flights-delay\data\processed
[INFO] WEATHER_DIR       = C:\Users\Balerion\Desktop\us-flights-delay\data\weather
[INFO] MODELS_DIR        = C:\Users\Balerion\Desktop\us-flights-delay\models
[INFO] REPORTS_DIR       = C:\Users\Balerion\Desktop\us-flights-delay\reports
[INFO] MLRUNS_DIR        = C:\Users\Balerion\Desktop\us-flights-delay\mlruns
[INFO] MLFLOW_TRACKING_URI (cfg) = file:///C:/Users/Balerion/Desktop/us-flights-delay/mlruns
[INFO] MLflow experiment name    = us_flights_delay
[INFO] HIGH_DELAY_RATE_THRESHOLD = 0.25
[INFO] WEATHER_CACHE_DIR         = C:\Users\Balerion\Desktop\us-flights-delay\data\weather
[INFO] WEATHER_CAC