In [28]:
from pathlib import Path
import pandas as pd
import numpy as np
import joblib
import json, time
import matplotlib.pyplot as plt
import pyarrow.parquet as pq
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV, StratifiedShuffleSplit
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import (
    accuracy_score, balanced_accuracy_score, f1_score,
    classification_report, confusion_matrix, ConfusionMatrixDisplay
)
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import os, mlflow
import mlflow.sklearn
from mlflow.models.signature import infer_signature
from evidently import Report, Dataset, DataDefinition, MulticlassClassification
from evidently.presets import ClassificationPreset

In [2]:
print("📥 Chargement du fichier xrs_clean.parquet...")

parquet_path = Path(r"C:\Users\gate\Documents\Jedha\Projet\4\mlops-solar-flares\data\xrs_clean.parquet")
df = parquet_path
df = pd.read_parquet(parquet_path, engine="pyarrow")
print(f"✅ Données chargées : {df.shape[0]} lignes, {df.shape[1]} colonnes")

📥 Chargement du fichier xrs_clean.parquet...
✅ Données chargées : 147639 lignes, 11 colonnes


In [3]:
def quickpeek(df, topn=10):

    print("# head:", df.head)
    print("\n# dtypes:\n", df.dtypes)
    print("\n# describe:", df.describe)

    # missing %
    print("\n# missing (%):")
    miss = (df.isna().mean()*100).round(2).sort_values(ascending=False)
    print(miss.head(topn).to_string())

    if "time" in df:
        # conversion robuste: tente direct, sinon passe par string
        try:
            t = pd.to_datetime(df["time"], utc=True, errors="coerce")
        except Exception:
            t = pd.to_datetime(df["time"].astype(str), utc=True, errors="coerce")

        print("\n# time range:", t.min(), "->", t.max())

        t_valid = t.dropna()
        print("# time monotonic:", t_valid.is_monotonic_increasing)

        # comptage par jour sans dépendre du backend Arrow
        try:
            per_day = t.dt.floor("D").value_counts().sort_index()
        except Exception:
            # fallback: utiliser la colonne 'date' si dispo
            if "date" in df.columns:
                per_day = pd.to_datetime(df["date"], errors="coerce").value_counts().sort_index()
            else:
                per_day = pd.Series(dtype="int64")

        if len(per_day):
            print("\n# last days (rows/day):\n", per_day.tail(10).to_string())


quickpeek(df)

# head: <bound method NDFrame.head of                             time  flux_long_wm2  flux_short_wm2 satellite  \
0      2025-05-01 00:00:00+00:00   7.021782e-07    1.000000e-09      <NA>   
1      2025-05-01 00:01:00+00:00   6.994713e-07    1.000000e-09      <NA>   
2      2025-05-01 00:02:00+00:00   7.052154e-07    1.000000e-09      <NA>   
3      2025-05-01 00:03:00+00:00   7.015647e-07    1.000000e-09      <NA>   
4      2025-05-01 00:04:00+00:00   6.966016e-07    1.000000e-09      <NA>   
...                          ...            ...             ...       ...   
147634 2025-08-11 12:34:00+00:00   5.304605e-06    5.942913e-07   GOES-18   
147635 2025-08-11 12:35:00+00:00   5.992305e-06    7.550105e-07   GOES-18   
147636 2025-08-11 12:36:00+00:00   6.500120e-06    8.529071e-07   GOES-18   
147637 2025-08-11 12:37:00+00:00   6.883591e-06    8.953128e-07   GOES-18   
147638 2025-08-11 12:38:00+00:00   7.047310e-06    8.543802e-07   GOES-18   

       energy_long energy_short      

In [4]:
TARGET_NAME  = "flare_class"
ALL_CLASSES  = np.array(["A", "B", "C", "M", "X"], dtype=object)
print("🛠 Création de la variable cible 'flare_class'...")

def rule_predict(flux):
    """
    Classe une éruption selon le pic de flux X (W/m², 1-8 Å) 
    en utilisant les seuils NOAA officiels, avec A inclus.
    """
    if pd.isna(flux):
        return None
    elif flux < 1e-7:       # A : < 10⁻⁷ W/m²
        return "A"
    elif flux < 1e-6:       # B : 10⁻⁷ ≤ flux < 10⁻⁶
        return "B"
    elif flux < 1e-5:       # C : 10⁻⁶ ≤ flux < 10⁻⁵
        return "C"
    elif flux < 1e-4:       # M : 10⁻⁵ ≤ flux < 10⁻⁴
        return "M"
    else:                   # X : ≥ 10⁻⁴
        return "X"

df["flare_class"] = df["flux_long_wm2"].apply(rule_predict)

print("✅ Variable cible ajoutée.")

🛠 Création de la variable cible 'flare_class'...
✅ Variable cible ajoutée.


In [5]:
print("📅 Conversion + features temporelles (safe)…")

# -- 0) Temps propre + tri --
if "time" in df.columns:
    t = pd.to_datetime(df["time"].astype(str), utc=True, errors="coerce")
elif isinstance(df.index, pd.DatetimeIndex):
    t = pd.to_datetime(df.index, utc=True, errors="coerce")
elif "date" in df.columns:
    t = pd.to_datetime(df["date"].astype(str), utc=True, errors="coerce")
else:
    raise KeyError("Pas de colonne/indice temps ('time' ou 'date').")

df = df.assign(time=t).sort_values("time").reset_index(drop=True)

# -- 1) Colonnes temporelles dérivées --
df["hour"]           = df["time"].dt.hour.astype("int16")
df["minute_of_day"]  = (df["time"].dt.hour * 60 + df["time"].dt.minute).astype("int16")
df["dow"]            = df["time"].dt.dayofweek.astype("int8")          # 0=lundi
df["day_of_year"]    = df["time"].dt.dayofyear.astype("int16")
rad_doy              = 2 * np.pi * (df["day_of_year"] - 1) / 365.25
df["sin_doy"]        = np.sin(rad_doy)
df["cos_doy"]        = np.cos(rad_doy)
# Option: indicateur jour/nuit
df["is_daytime"]     = ((df["hour"] >= 6) & (df["hour"] <= 18)).astype("int8")

# -- 2) Features flux_short (passé uniquement) --
s = pd.to_numeric(df["flux_short_wm2"], errors="coerce")

lag1 = s.shift(1)

# rolling calculé sur la série décalée (pas de fuite)
roll_1h = lag1.rolling(window=12, min_periods=1)
roll_3h = lag1.rolling(window=36, min_periods=1)

df["flux_short_lag1"]      = lag1
df["flux_short_mean_1h"]   = roll_1h.mean()
df["flux_short_std_1h"]    = roll_1h.std()
df["flux_short_max_1h"]    = roll_1h.max()
df["flux_short_mean_3h"]   = roll_3h.mean()
df["flux_short_max_3h"]    = roll_3h.max()
df["log_flux_short_lag1"]  = np.log10(lag1.clip(lower=1e-9))

# -- 3) Au lieu d'un dropna global, on coupe seulement l'historique minimum --
HISTORY_CUTOFF = 36  # 3h si données par minute; ajuste si besoin
if len(df) > HISTORY_CUTOFF:
    df = df.iloc[HISTORY_CUTOFF:].reset_index(drop=True)

# -- 4) Diag NaN (pour vérif) --
na_rate = (df[[
    "flux_short_wm2","flux_short_lag1","flux_short_mean_1h","flux_short_std_1h",
    "flux_short_max_1h","flux_short_mean_3h","flux_short_max_3h","log_flux_short_lag1",
    "hour","minute_of_day","dow","sin_doy","cos_doy","is_daytime"
]].isna().mean()*100).round(2).sort_values(ascending=False)

print("✅ Features créées. NaN % (top 10):")
print(na_rate.head(10).to_string())


📅 Conversion + features temporelles (safe)…
✅ Features créées. NaN % (top 10):
flux_short_wm2         1.11
flux_short_lag1        1.11
log_flux_short_lag1    1.11
flux_short_mean_1h     1.07
flux_short_std_1h      1.07
flux_short_max_1h      1.07
flux_short_mean_3h     1.01
flux_short_max_3h      1.01
hour                   0.00
minute_of_day          0.00


In [6]:
print("🧹 Nettoyage des colonnes inutiles...")
# 1) Suppression de colonnes inutiles
colonnes_a_supprimer = []
if "satellite" in df.columns:
    colonnes_a_supprimer.append("satellite")

df = df.drop(columns=colonnes_a_supprimer, errors="ignore")
print(f"✅ Colonnes supprimées : {colonnes_a_supprimer if colonnes_a_supprimer else 'Aucune'}")

# 2) Harmonisation des types (basé sur ton nouveau set de features)
numeric_features = [
    "flux_short_wm2", "hour", "minute_of_day", "dow", "sin_doy",
    "flux_short_lag1", "flux_short_mean_1h", "flux_short_std_1h",
    "flux_short_max_1h", "flux_short_mean_3h", "flux_short_max_3h",
    "log_flux_short_lag1"
]
categorical_features = ["source", "energy_long", "energy_short"]

for col in numeric_features:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors="coerce").astype("float64")

for col in categorical_features:
    if col in df.columns:
        df[col] = df[col].astype("string")

print("✅ Types harmonisés.")

🧹 Nettoyage des colonnes inutiles...
✅ Colonnes supprimées : ['satellite']
✅ Types harmonisés.


In [7]:
output_path = parquet_path.parent / "xrs_clean_ml.parquet"
df.to_parquet(output_path, engine="pyarrow", index=False)
print(f"💾 Fichier sauvegardé : {output_path}")

💾 Fichier sauvegardé : C:\Users\gate\Documents\Jedha\Projet\4\mlops-solar-flares\data\xrs_clean_ml.parquet


In [8]:
def quickpeek(df, topn=10):

    print("# head:", df.head())
    print("\n# dtypes:\n", df.dtypes)
    print("\n# describe:\n", df.describe())

    # missing %
    print("\n# missing (%):")
    miss = (df.isna().mean() * 100).round(2).sort_values(ascending=False)
    print(miss.head(topn).to_string())

    # 🔹 Suppression des colonnes entièrement vides
    colonnes_vides = df.columns[df.isna().all()].tolist()
    if colonnes_vides:
        print(f"\n🗑 Suppression de {len(colonnes_vides)} colonne(s) vide(s) : {colonnes_vides}")
        df.drop(columns=colonnes_vides, inplace=True)
    else:
        print("\n✅ Aucune colonne entièrement vide trouvée.")

    if "time" in df:
        # conversion robuste: tente direct, sinon passe par string
        try:
            t = pd.to_datetime(df["time"], utc=True, errors="coerce")
        except Exception:
            t = pd.to_datetime(df["time"].astype(str), utc=True, errors="coerce")

        print("\n# time range:", t.min(), "->", t.max())
        t_valid = t.dropna()
        print("# time monotonic:", t_valid.is_monotonic_increasing)

        # comptage par jour
        try:
            per_day = t.dt.floor("D").value_counts().sort_index()
        except Exception:
            if "date" in df.columns:
                per_day = pd.to_datetime(df["date"], errors="coerce").value_counts().sort_index()
            else:
                per_day = pd.Series(dtype="int64")

        if len(per_day):
            print("\n# last days (rows/day):\n", per_day.tail(10).to_string())

    return df  # On retourne le DataFrame propre
quickpeek(df)

# head:                        time  flux_long_wm2  flux_short_wm2 energy_long  \
0 2025-05-01 00:36:00+00:00   6.979719e-07    1.000000e-09  0.1-0.8 nm   
1 2025-05-01 00:37:00+00:00   6.952811e-07    1.000000e-09  0.1-0.8 nm   
2 2025-05-01 00:38:00+00:00   6.964259e-07    1.000000e-09  0.1-0.8 nm   
3 2025-05-01 00:39:00+00:00   7.000616e-07    1.000000e-09  0.1-0.8 nm   
4 2025-05-01 00:40:00+00:00   6.999362e-07    1.000000e-09  0.1-0.8 nm   

  energy_short      source        date  hour  minute_of_day  dow  ...  \
0  0.05-0.4 nm  NCEI-SunPy  2025-05-01   0.0           36.0  3.0  ...   
1  0.05-0.4 nm  NCEI-SunPy  2025-05-01   0.0           37.0  3.0  ...   
2  0.05-0.4 nm  NCEI-SunPy  2025-05-01   0.0           38.0  3.0  ...   
3  0.05-0.4 nm  NCEI-SunPy  2025-05-01   0.0           39.0  3.0  ...   
4  0.05-0.4 nm  NCEI-SunPy  2025-05-01   0.0           40.0  3.0  ...   

    sin_doy   cos_doy  is_daytime  flux_short_lag1  flux_short_mean_1h  \
0  0.880683 -0.473706           0 

Unnamed: 0,time,flux_long_wm2,flux_short_wm2,energy_long,energy_short,source,date,hour,minute_of_day,dow,...,sin_doy,cos_doy,is_daytime,flux_short_lag1,flux_short_mean_1h,flux_short_std_1h,flux_short_max_1h,flux_short_mean_3h,flux_short_max_3h,log_flux_short_lag1
0,2025-05-01 00:36:00+00:00,6.979719e-07,1.000000e-09,0.1-0.8 nm,0.05-0.4 nm,NCEI-SunPy,2025-05-01,0.0,36.0,3.0,...,0.880683,-0.473706,0,1.000000e-09,1.000000e-09,0.000000e+00,1.000000e-09,1.016178e-09,1.582413e-09,-9.000000
1,2025-05-01 00:37:00+00:00,6.952811e-07,1.000000e-09,0.1-0.8 nm,0.05-0.4 nm,NCEI-SunPy,2025-05-01,0.0,37.0,3.0,...,0.880683,-0.473706,0,1.000000e-09,1.000000e-09,0.000000e+00,1.000000e-09,1.016178e-09,1.582413e-09,-9.000000
2,2025-05-01 00:38:00+00:00,6.964259e-07,1.000000e-09,0.1-0.8 nm,0.05-0.4 nm,NCEI-SunPy,2025-05-01,0.0,38.0,3.0,...,0.880683,-0.473706,0,1.000000e-09,1.000000e-09,0.000000e+00,1.000000e-09,1.016178e-09,1.582413e-09,-9.000000
3,2025-05-01 00:39:00+00:00,7.000616e-07,1.000000e-09,0.1-0.8 nm,0.05-0.4 nm,NCEI-SunPy,2025-05-01,0.0,39.0,3.0,...,0.880683,-0.473706,0,1.000000e-09,1.000000e-09,0.000000e+00,1.000000e-09,1.016178e-09,1.582413e-09,-9.000000
4,2025-05-01 00:40:00+00:00,6.999362e-07,1.000000e-09,0.1-0.8 nm,0.05-0.4 nm,NCEI-SunPy,2025-05-01,0.0,40.0,3.0,...,0.880683,-0.473706,0,1.000000e-09,1.000000e-09,0.000000e+00,1.000000e-09,1.016178e-09,1.582413e-09,-9.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
147598,2025-08-11 12:34:00+00:00,5.304605e-06,5.942913e-07,0.1-0.8 nm,0.05-0.4 nm,,2025-08-11,12.0,754.0,0.0,...,-0.626727,-0.779239,1,4.549486e-07,1.373538e-07,1.332366e-07,4.549486e-07,8.135605e-08,4.549486e-07,-6.342038
147599,2025-08-11 12:35:00+00:00,5.992305e-06,7.550105e-07,0.1-0.8 nm,0.05-0.4 nm,,2025-08-11,12.0,755.0,0.0,...,-0.626727,-0.779239,1,5.942913e-07,1.826791e-07,1.838596e-07,5.942913e-07,9.633450e-08,5.942913e-07,-6.226001
147600,2025-08-11 12:36:00+00:00,6.500120e-06,8.529071e-07,0.1-0.8 nm,0.05-0.4 nm,,2025-08-11,12.0,756.0,0.0,...,-0.626727,-0.779239,1,7.550105e-07,2.414877e-07,2.412320e-07,7.550105e-07,1.157367e-07,7.550105e-07,-6.122047
147601,2025-08-11 12:37:00+00:00,6.883591e-06,8.953128e-07,0.1-0.8 nm,0.05-0.4 nm,,2025-08-11,12.0,757.0,0.0,...,-0.626727,-0.779239,1,8.529071e-07,3.084449e-07,2.897125e-07,8.529071e-07,1.378470e-07,8.529071e-07,-6.069098


In [9]:
df["time"] = pd.to_datetime(df["time"], utc=True, errors="coerce")

# Dernières lignes triées par temps
print("\n📄 Dernières lignes du fichier :")
print(df.sort_values("time").tail(10).to_string(index=False))

# Premières lignes triées par temps
print("\n📄 Premières lignes du fichier trié par 'time' :")
print(df.sort_values("time").head(10).reset_index(drop=True).to_string(index=True))


📄 Dernières lignes du fichier :
                     time  flux_long_wm2  flux_short_wm2 energy_long energy_short source       date  hour  minute_of_day  dow flare_class  day_of_year   sin_doy   cos_doy  is_daytime  flux_short_lag1  flux_short_mean_1h  flux_short_std_1h  flux_short_max_1h  flux_short_mean_3h  flux_short_max_3h  log_flux_short_lag1
2025-08-11 12:29:00+00:00       0.000002    8.580523e-08  0.1-0.8 nm  0.05-0.4 nm   <NA> 2025-08-11  12.0          749.0  0.0           C          223 -0.626727 -0.779239           1     8.022680e-08        6.054497e-08       1.180990e-08       8.116984e-08        5.594103e-08       8.116984e-08            -7.095681
2025-08-11 12:30:00+00:00       0.000002    1.043969e-07  0.1-0.8 nm  0.05-0.4 nm   <NA> 2025-08-11  12.0          750.0  0.0           C          223 -0.626727 -0.779239           1     8.580523e-08        6.257249e-08       1.388949e-08       8.580523e-08        5.660375e-08       8.580523e-08            -7.066486
2025-08-11 12

In [10]:
# ============================
# Cible 
# ============================
def classify_flare(flux):
    if pd.isna(flux): return None
    elif flux < 1e-7: return "A"
    elif flux < 1e-6: return "B"
    elif flux < 1e-5: return "C"
    elif flux < 1e-4: return "M"
    else: return "X"

if TARGET_NAME not in df.columns:
    if "flux_long_wm2" not in df.columns:
        raise KeyError("Colonne 'flux_long_wm2' manquante : impossible de construire la cible.")
    print("🛠 Création de la variable cible 'flare_class' à partir de flux_long_wm2...")
    df[TARGET_NAME] = df["flux_long_wm2"].apply(classify_flare)
print("✅ Cible prête.")
# ============================
# Split temporel
# ============================
"""
print("✂️ Split train/test (80/20, ordre temporel conservé)...")
Y = df[TARGET_NAME].astype("string")
X = df.drop(columns=[TARGET_NAME])

X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=0, shuffle=False
)
print(f"  - Train : {len(X_train)}")
print(f"  - Test  : {len(X_test)}")
"""
# ============================
# Split temporel + contrainte sur A + padding X (report only)
# ============================
assert "time" in df.columns, "La colonne 'time' doit exister et être de type datetime."
df = df.sort_values("time").reset_index(drop=True)

TARGET_COL = TARGET_NAME           # ex: "flare_class"
TEST_FRAC  = 0.20                  # cible 80/20 si possible
MIN_IN_TRAIN = {"A": 20}           # au moins 20 'A' dans le train (adapte si besoin)
MIN_TEST_FRAC = 0.05               # garde au moins 5% pour le test si 80/20 impossible

y_all = df[TARGET_COL].astype(str)
n = len(df)
idx_80 = int(round(n * (1 - TEST_FRAC)))
idx_test_min = int(round(n * MIN_TEST_FRAC))

# cumul par classe pour trouver la 1ère position où on atteint les seuils demandés
dummies = pd.get_dummies(y_all)
for c, k in MIN_IN_TRAIN.items():
    if c not in dummies.columns:
        dummies[c] = 0
cum = dummies.cumsum()

ok = pd.Series(True, index=cum.index)
for c, k in MIN_IN_TRAIN.items():
    ok &= (cum[c] >= int(k))

if ok.any():
    first_ok_pos = int(np.argmax(ok.values))   # 1er index où la contrainte est satisfaite
else:
    first_ok_pos = 0  # jamais atteint -> on laissera le split 80/20 par défaut

# choix du cutoff: 80/20 si possible, sinon on décale pour respecter le mini A
cutoff_idx = max(idx_80, first_ok_pos)

# ne pas dépasser la fin (laisser au moins MIN_TEST_FRAC en test)
cutoff_idx = min(cutoff_idx, n - max(1, idx_test_min))
cutoff_idx = max(1, min(cutoff_idx, n - 1))  # bornes de sécurité

cutoff_time = df.loc[cutoff_idx, "time"]

# applique le split
X_train = df.iloc[:cutoff_idx].drop(columns=[TARGET_COL])
Y_train = df.iloc[:cutoff_idx][TARGET_COL].astype("string")
X_test  = df.iloc[cutoff_idx:].drop(columns=[TARGET_COL])
Y_test  = df.iloc[cutoff_idx:][TARGET_COL].astype("string")

print(f"✂️ Coupure au temps {cutoff_time} | train={len(X_train):,} | test={len(X_test):,}")
print("  Train counts:", Y_train.value_counts().to_dict())
print("  Test counts :", Y_test.value_counts().to_dict())

# ============================
# Padding X pour le REPORTING UNIQUEMENT (n'impacte pas train/test)
# ============================
# 👉 Renseigne ici des timestamps externes réels si tu en as (NOAA/SWPC).
# Par défaut on génère 2 timestamps factices juste après la fin du test.
N_PAD_X = 2  # mets 0 si tu ne veux pas de padding
if N_PAD_X > 0:
    start_pad = pd.to_datetime(X_test["time"].max()) + pd.Timedelta(minutes=1)
    pad_times = pd.date_range(start=start_pad, periods=N_PAD_X, freq="H", tz="UTC")

    REPORT_PAD_X = pd.DataFrame({
        "when_utc": pad_times,
        "target": ["X"] * N_PAD_X,        # vérité terrain (pour visuels/rapports)
        "prediction": ["X"] * N_PAD_X,    # ⚠️ pour le REPORT UNIQUEMENT
        "_external": True
    })
else:
    REPORT_PAD_X = pd.DataFrame(columns=["when_utc", "target", "prediction", "_external"])

print(f"🧩 Padding X (report only) prêt: {len(REPORT_PAD_X)} ligne(s).")

def apply_report_padding(cur_df, pad_df=REPORT_PAD_X):
    """
    À appeler APRÈS avoir construit cur_df = DataFrame({'target': y_test_txt, 'prediction': yhat_txt})
    Retourne cur_df enrichi des lignes pad X pour le reporting (Evidently / HTML).
    """
    if pad_df is None or len(pad_df) == 0:
        return cur_df.copy()
    cols = [c for c in ["target", "prediction", "when_utc", "_external"] if c in pad_df.columns]
    return pd.concat([cur_df, pad_df[cols]], ignore_index=True)

# ============================
# Définition des features (⚠️ sans flux_long_wm2 pour éviter la fuite)
# ============================
# Candidats habituels :
numeric_features_all      = ["flux_short_wm2", "hour", "minute_of_day", "dow"]
categorical_features_all  = ["source", "energy_long", "energy_short"]

# Garder seulement celles qui existent réellement
numeric_features     = [c for c in numeric_features_all if c in X_train.columns]
categorical_features = [c for c in categorical_features_all if c in X_train.columns]

print("✅ Features sélectionnées (sans fuite) :")
print("  Num :", numeric_features)
print("  Cat :", categorical_features)

# ============================
# Nettoyage manuel des valeurs manquantes AVANT preprocessing
# ============================
print("🧹 Nettoyage des valeurs manquantes...")

def clean_missing_values(X_train, X_test, numeric_cols, categorical_cols):
    """Nettoie manuellement les valeurs manquantes pour éviter les bugs SimpleImputer"""
    X_train_clean = X_train.copy()
    X_test_clean = X_test.copy()
    
    # Pour les features numériques : remplacer par la médiane du train
    for col in numeric_cols:
        if col in X_train_clean.columns:
            # Conversion en float64 propre
            X_train_clean[col] = pd.to_numeric(X_train_clean[col], errors="coerce")
            X_test_clean[col] = pd.to_numeric(X_test_clean[col], errors="coerce")
            
            # Calculer la médiane sur le train
            median_val = X_train_clean[col].median()
            if pd.isna(median_val):
                median_val = 0.0  # fallback si tout est NaN
            
            # Remplacer les NaN
            X_train_clean[col] = X_train_clean[col].fillna(median_val)
            X_test_clean[col] = X_test_clean[col].fillna(median_val)
            
            print(f"  {col}: médiane={median_val:.6f}")
    
    # Pour les features catégorielles : remplacer par le mode du train
    for col in categorical_cols:
        if col in X_train_clean.columns:
            # Conversion en object propre
            X_train_clean[col] = X_train_clean[col].astype(str)
            X_test_clean[col] = X_test_clean[col].astype(str)
            
            # Calculer le mode sur le train (ignorer les 'nan' string)
            mode_candidates = X_train_clean[col][X_train_clean[col] != 'nan'].mode()
            if len(mode_candidates) > 0:
                mode_val = mode_candidates.iloc[0]
            else:
                mode_val = "unknown"  # fallback
            
            # Remplacer les NaN (maintenant string 'nan')
            X_train_clean[col] = X_train_clean[col].replace('nan', mode_val)
            X_test_clean[col] = X_test_clean[col].replace('nan', mode_val)
            
            print(f"  {col}: mode='{mode_val}'")
    
    return X_train_clean, X_test_clean

# Appliquer le nettoyage
X_train_clean, X_test_clean = clean_missing_values(
    X_train, X_test, numeric_features, categorical_features
)

# Restreindre aux colonnes utiles (ordre fixe)
X_train_final = X_train_clean[numeric_features + categorical_features].copy()
X_test_final = X_test_clean[numeric_features + categorical_features].copy()

print("✅ Données nettoyées")

# ============================
# Préprocesseur simplifié (sans SimpleImputer)
# ============================
print("⚙️ Création du preprocessor simplifié...")

numeric_transformer = StandardScaler()  # Plus de SimpleImputer
categorical_transformer = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ],
    remainder="drop"
)

# ============================
# Transformation
# ============================
print("🔄 Transformation des données...")
try:
    X_train_t = preprocessor.fit_transform(X_train_final)
    X_test_t  = preprocessor.transform(X_test_final)
    print("✅ Transformation terminée. Shapes :", X_train_t.shape, X_test_t.shape)
except Exception as e:
    print(f"❌ Erreur transformation: {e}")
    print("Debug - Vérification des données:")
    print("X_train dtypes:", X_train_final.dtypes.to_dict())
    print("X_test dtypes:", X_test_final.dtypes.to_dict())
    
    # Vérifier s'il y a encore des NaN
    for col in X_train_final.columns:
        nan_count_train = X_train_final[col].isna().sum()
        nan_count_test = X_test_final[col].isna().sum()
        if nan_count_train > 0 or nan_count_test > 0:
            print(f"  {col}: {nan_count_train} NaN train, {nan_count_test} NaN test")
    raise

# ============================
# Préparation cibles & encodage labels
# ============================
print("🎯 Préparation des cibles...")
mask_train = Y_train.notna()
mask_test  = Y_test.notna()

Xtr = X_train_t[mask_train.values]
Xte = X_test_t[mask_test.values]
ytr = Y_train[mask_train].astype(str).values
yte = Y_test[mask_test].astype(str).values

le = LabelEncoder()
le.fit(ALL_CLASSES)                 # mapping figé A,B,C,M,X -> 0..4
ytr_enc = le.transform(ytr)
yte_enc = le.transform(yte)

print("✅ Encodage labels OK. Classes :", list(le.classes_))
print("   Répartition train :", pd.Series(ytr).value_counts().to_dict())
print("   Répartition test  :", pd.Series(yte).value_counts().to_dict())

✅ Cible prête.
✂️ Coupure au temps 2025-08-04 20:48:00+00:00 | train=138,012 | test=9,591
  Train counts: {'B': 71093, 'C': 64161, 'M': 1095, 'A': 19}
  Test counts : {'C': 9029, 'M': 256, 'B': 156, 'A': 150}
🧩 Padding X (report only) prêt: 2 ligne(s).
✅ Features sélectionnées (sans fuite) :
  Num : ['flux_short_wm2', 'hour', 'minute_of_day', 'dow']
  Cat : ['source', 'energy_long', 'energy_short']
🧹 Nettoyage des valeurs manquantes...
  flux_short_wm2: médiane=0.000000
  hour: médiane=11.000000
  minute_of_day: médiane=719.000000
  dow: médiane=3.000000
  source: mode='NCEI-SunPy'
  energy_long: mode='0.1-0.8 nm'
  energy_short: mode='0.05-0.4 nm'


  pad_times = pd.date_range(start=start_pad, periods=N_PAD_X, freq="H", tz="UTC")


✅ Données nettoyées
⚙️ Création du preprocessor simplifié...
🔄 Transformation des données...
✅ Transformation terminée. Shapes : (138012, 8) (9591, 8)
🎯 Préparation des cibles...
✅ Encodage labels OK. Classes : ['A', 'B', 'C', 'M', 'X']
   Répartition train : {'B': 71093, 'C': 64161, 'M': 1095, 'A': 19}
   Répartition test  : {'C': 9029, 'M': 256, 'B': 156, 'A': 150}


In [11]:
print("Train :", {c: sum(ytr_enc == i) for i, c in enumerate(ALL_CLASSES)})
print("Test  :", {c: sum(yte_enc == i) for i, c in enumerate(ALL_CLASSES)})

Train : {'A': 19, 'B': 71093, 'C': 64161, 'M': 1095, 'X': 0}
Test  : {'A': 150, 'B': 156, 'C': 9029, 'M': 256, 'X': 0}


In [12]:
# === On suppose que Xtr, Xte, ytr_enc, yte_enc, ALL_CLASSES existent déjà ===

# --- Helpers manquants ---
def make_sample_weight(weights_by_name, y_enc, all_classes):
    """Construit sample_weight à partir d'un dict de poids par label (noms)."""
    idx2name = {i: c for i, c in enumerate(all_classes)}
    weights_by_idx = {i: float(weights_by_name.get(idx2name[i], 1.0)) for i in range(len(all_classes))}
    return np.vectorize(weights_by_idx.get)(y_enc)

def predict_with_thresholds(clf, X, all_classes, class_thresholds=None):
    """
    Prédit avec seuils par classe (ex: {'X':0.05}). 
    Retourne (y_hat_indices_globaux, proba_full[K=nb classes globales]).
    """
    proba = clf.predict_proba(X)           # (n, k_present)
    present = clf.classes_                 # indices présents
    K = len(all_classes)
    proba_full = np.zeros((proba.shape[0], K), dtype=float)
    proba_full[:, present] = proba
    y_hat = np.argmax(proba_full, axis=1)

    if class_thresholds:
        for cname, thr in class_thresholds.items():
            if cname in list(all_classes):
                j = int(np.where(all_classes == cname)[0][0])
                mask = proba_full[:, j] >= float(thr)
                y_hat[mask] = j
    return y_hat, proba_full

def evaluate_with_custom_preds(name, ytr_true, ytr_hat, yte_true, yte_hat, ALL_CLASSES):
    """Évalue à partir de prédictions déjà calculées (utile avec des seuils)."""
    acc_tr  = accuracy_score(ytr_true, ytr_hat)
    bacc_tr = balanced_accuracy_score(ytr_true, ytr_hat)
    f1m_tr  = f1_score(ytr_true, ytr_hat, average="macro")
    f1w_tr  = f1_score(ytr_true, ytr_hat, average="weighted")

    acc_te  = accuracy_score(yte_true, yte_hat)
    bacc_te = balanced_accuracy_score(yte_true, yte_hat)
    f1m_te  = f1_score(yte_true, yte_hat, average="macro")
    f1w_te  = f1_score(yte_true, yte_hat, average="weighted")

    print(f"\n========== {name} ==========")
    print("📊 Train :", f"acc={acc_tr:.4f} | bacc={bacc_tr:.4f} | f1m={f1m_tr:.4f} | f1w={f1w_tr:.4f}")
    print("📊 Test  :",  f"acc={acc_te:.4f} | bacc={bacc_te:.4f} | f1m={f1m_te:.4f} | f1w={f1w_te:.4f}")

    print("\n🧾 Classification report (test)")
    print(classification_report(
        yte_true, yte_hat,
        labels=np.arange(len(ALL_CLASSES)),
        target_names=ALL_CLASSES,
        zero_division=0
    ))

    cm = confusion_matrix(yte_true, yte_hat, labels=np.arange(len(ALL_CLASSES)))
    print("\n🧩 Confusion matrix (counts)\n",
          pd.DataFrame(cm,
              index=[f"true_{c}" for c in ALL_CLASSES],
              columns=[f"pred_{c}" for c in ALL_CLASSES]).to_string())
    row_sums = cm.sum(axis=1, keepdims=True)
    cmn = np.divide(cm, row_sums, out=np.zeros_like(cm, dtype=float), where=row_sums!=0)
    print("\n🧩 Confusion matrix (per-class)\n",
          pd.DataFrame(cmn,
              index=[f"true_{c}" for c in ALL_CLASSES],
              columns=[f"pred_{c}" for c in ALL_CLASSES]).round(3).to_string())

    return {
        "model": name,
        "acc_train": acc_tr, "bacc_train": bacc_tr, "f1m_train": f1m_tr, "f1w_train": f1w_tr,
        "acc_test":  acc_te, "bacc_test":  bacc_te, "f1m_test":  f1m_te, "f1w_test":  f1w_te
    }

# --- Objets communs ---
sample_weight_tr = compute_sample_weight(class_weight="balanced", y=ytr_enc)
cv3 = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# --- Conteneurs : créer s'ils n'existent pas déjà (évite d'écraser après un premier run) ---
if "results_list" not in globals():
    results_list = []
if "fitted_pool" not in globals():
    fitted_pool = {}

def add_model_result(name, clf, present, to_original, res_dict, yhat):
    results_list.append({"model": name, **res_dict})
    fitted_pool[name] = (clf, to_original, present)

In [13]:
# --- GBM focalisé X : poids + seuil ---
weights_X_focus = {"A":1.0, "B":1.0, "C":1.0, "M":2.0, "X":1.0}
sw_xfocus = make_sample_weight(weights_X_focus, ytr_enc, ALL_CLASSES)

gbx = GradientBoostingClassifier(
    n_estimators=150, learning_rate=0.1, max_depth=3, random_state=0
)
gbx.fit(Xtr, ytr_enc, sample_weight=sw_xfocus)

thresholds = {"X": 0.05}  # ajuste selon FP/TP souhaités
ytr_hat_gbx, _ = predict_with_thresholds(gbx, Xtr, ALL_CLASSES, thresholds)
yte_hat_gbx, _ = predict_with_thresholds(gbx, Xte, ALL_CLASSES, thresholds)

res_gbx = evaluate_with_custom_preds(
    "GradientBoosting (X-focus + seuil X)", ytr_enc, ytr_hat_gbx, yte_enc, yte_hat_gbx, ALL_CLASSES
)

# mapping identitaire (labels déjà 0..len-1)
to_original_id = {i: i for i in range(len(ALL_CLASSES))}
add_model_result("GradientBoosting (X-focus + seuil X)", gbx, np.unique(ytr_enc), to_original_id, res_gbx, yte_hat_gbx)


📊 Train : acc=0.8374 | bacc=0.9062 | f1m=0.9004 | f1w=0.8371
📊 Test  : acc=0.9763 | bacc=0.7660 | f1m=0.8145 | f1w=0.9738

🧾 Classification report (test)
              precision    recall  f1-score   support

           A       1.00      1.00      1.00       150
           B       0.84      0.38      0.52       156
           C       0.98      0.99      0.99      9029
           M       0.82      0.69      0.75       256
           X       0.00      0.00      0.00         0

    accuracy                           0.98      9591
   macro avg       0.73      0.61      0.65      9591
weighted avg       0.97      0.98      0.97      9591


🧩 Confusion matrix (counts)
         pred_A  pred_B  pred_C  pred_M  pred_X
true_A     150       0       0       0       0
true_B       0      59      97       0       0
true_C       0      11    8978      40       0
true_M       0       0      79     177       0
true_X       0       0       0       0       0

🧩 Confusion matrix (per-class)
         pre

In [16]:
present_labels = np.unique(yte_enc)                 # classes réellement présentes en test
all_labels = np.arange(len(ALL_CLASSES))            # A,B,C,M,X indexés 0..4

f1_macro_present = f1_score(yte_enc, yte_hat_gbx, average="macro")
bacc_present     = balanced_accuracy_score(yte_enc, yte_hat_gbx)
f1_macro_all     = f1_score(yte_enc, yte_hat_gbx, average="macro",
                            labels=all_labels, zero_division=0)

print(f"🎯 Macro F1 (présentes={list(ALL_CLASSES[present_labels])}): {f1_macro_present:.3f}")
print(f"🎯 Macro F1 (toutes={list(ALL_CLASSES)}): {f1_macro_all:.3f}")
print(f"🎯 Balanced Acc (présentes): {bacc_present:.3f}")

print(classification_report(
    yte_enc, yte_hat_gbx,
    labels=all_labels,              # <-- on force le report sur toutes les classes
    target_names=ALL_CLASSES,
    zero_division=0
))

🎯 Macro F1 (présentes=['A', 'B', 'C', 'M']): 0.815
🎯 Macro F1 (toutes=['A', 'B', 'C', 'M', 'X']): 0.652
🎯 Balanced Acc (présentes): 0.766
              precision    recall  f1-score   support

           A       1.00      1.00      1.00       150
           B       0.84      0.38      0.52       156
           C       0.98      0.99      0.99      9029
           M       0.82      0.69      0.75       256
           X       0.00      0.00      0.00         0

    accuracy                           0.98      9591
   macro avg       0.73      0.61      0.65      9591
weighted avg       0.97      0.98      0.97      9591



In [14]:
# =========================
# helpers génériques
# =========================
H_NEXT = 718  # ~12h observables (ajuste à 720 si besoin)

def safe_to_datetime(s):
    return pd.to_datetime(s.astype(str), utc=True, errors="coerce")

def get_last_minutes_block(X_test, mask_test, Xte, minutes=H_NEXT):
    """
    Retourne (X_last, t_last) pour les 'minutes' dernières minutes réelles du test.
    Xte = features transformées correspondant à X_test[mask_test]
    """
    # timeline côté X_test
    if "time" in X_test.columns:
        t_all = safe_to_datetime(X_test["time"])
    elif isinstance(X_test.index, pd.DatetimeIndex):
        t_all = pd.to_datetime(X_test.index, utc=True, errors="coerce").to_series()
    elif "date" in X_test.columns:
        t_all = safe_to_datetime(X_test["date"])
    else:
        raise KeyError("Pas de colonne temps ('time' ou 'date') dans X_test.")

    # indices du test valides (après filtre) + tri par temps
    idx_test = X_test.index[mask_test]
    t_test_sorted = (
        pd.DataFrame({"time": t_all.loc[idx_test].values}, index=idx_test)
          .dropna()
          .sort_values("time")
    )

    # prendre les 'minutes' dernières
    last_idx = t_test_sorted.tail(minutes).index

    # positions dans Xte (Xte est l'ordre de X_test[mask_test])
    pos_map = pd.Series(range(len(idx_test)), index=idx_test)
    sel_pos = pos_map.loc[last_idx].sort_values()

    X_last = Xte[sel_pos.values]
    t_last = t_test_sorted.loc[last_idx, "time"].sort_values().reset_index(drop=True)
    return X_last, t_last

def softmax_from_decision(scores):
    scores = np.array(scores)
    if scores.ndim == 1:
        scores = np.column_stack([-scores, scores])
    m = scores.max(axis=1, keepdims=True)
    exp = np.exp(scores - m)
    return exp / exp.sum(axis=1, keepdims=True)

def safe_predict_proba(estimator, X):
    """
    Renvoie (proba, classes_idx_compacts).
    """
    if hasattr(estimator, "predict_proba"):
        p = estimator.predict_proba(X)
        return p, estimator.classes_
    elif hasattr(estimator, "decision_function"):
        p = softmax_from_decision(estimator.decision_function(X))
        classes_ = getattr(estimator, "classes_", np.arange(p.shape[1]))
        return p, classes_
    else:
        # fallback uniforme
        k = len(getattr(estimator, "classes_", [0, 1]))
        n = X.shape[0]
        return np.full((n, k), 1.0 / k), getattr(estimator, "classes_", np.arange(k))

def build_718_table_for_model(name, fitted_entry, X_last, t_last, ALL_CLASSES):
    """
    Construit le DataFrame minute->probas/classes pour 'name'.
    fitted_entry = (clf, to_original, present)
    """
    allc = np.array(ALL_CLASSES)
    clf, to_original, present = fitted_entry

    # proba sur classes COMPACTES (entraînement)
    proba_compact, compact_classes = safe_predict_proba(clf, X_last)  # (N, k_present)

    # mapping compact -> global index (0..len(ALL_CLASSES)-1)
    compact_to_global = np.vectorize(to_original.get)(compact_classes)

    # tableau proba sur toutes les classes globales
    dfp = pd.DataFrame(0.0, index=np.arange(len(t_last)), columns=allc.tolist())

    # injecter les proba aux bonnes colonnes
    for j, gidx in enumerate(compact_to_global):
        cname = allc[gidx]
        dfp[cname] = proba_compact[:, j]

    # time + classes dérivées
    dfp.insert(0, "time", t_last.values)
    dfp["pred_class"]  = allc[dfp[allc].values.argmax(axis=1)]
    dfp["pred_strong"] = dfp["pred_class"].isin(["M", "X"]).astype(int)

    # tri par temps (sécurité)
    dfp = dfp.dropna(subset=["time"]).copy()
    dfp["time"] = pd.to_datetime(dfp["time"], utc=True, errors="coerce")
    dfp = dfp.sort_values("time").reset_index(drop=True)

    # plages continues
    change = dfp["pred_class"].ne(dfp["pred_class"].shift(1))
    dfp["_grp"] = change.cumsum()
    spans = (
        dfp.groupby("_grp", as_index=False)
           .agg(start=("time", "first"),
                end=("time", "last"),
                **{"class": ("pred_class", "first")},
                minutes=("time", "size"))
           .drop(columns=["_grp"])
    )
    return dfp, spans

def describe_718(dfp, spans, name, ALL_CLASSES):
    print(f"\n================ {name} — 718 minutes ================")
    print("\n⏱️ Plages continues :")
    print(spans.to_string(index=False))

    print("\n📊 Comptes classes prédites (718 min) :")
    print(dfp["pred_class"].value_counts().to_string())

    print("\n📈 Probas moyennes (718 min) :")
    print(dfp[list(ALL_CLASSES)].mean().round(3).to_string())

    print("\n🏆 % minutes où chaque classe est 1ère proba :")
    for c in ALL_CLASSES:
        others = [x for x in ALL_CLASSES if x != c]
        share = (dfp[c] >= dfp[others].max(axis=1)).mean() * 100
        print(f" - {c}: {share:.2f}%")

# =========================
# extraire X_last & t_last une seule fois
# =========================
X12_t, t12 = get_last_minutes_block(X_test, mask_test, Xte, minutes=H_NEXT)

# =========================
# générer pour chaque modèle du pool
# =========================
pred_tables_718 = {}
spans_718 = {}

for name, fitted_entry in fitted_pool.items():
    df_12h, spans = build_718_table_for_model(name, fitted_entry, X12_t, t12, ALL_CLASSES)
    pred_tables_718[name] = df_12h
    spans_718[name] = spans
    # impression détaillée (commenter si trop verbeux)
    describe_718(df_12h, spans, name, ALL_CLASSES)

# =========================
# tableau comparatif des parts de classes (718 min)
# =========================
summary = []
for name, dfp in pred_tables_718.items():
    vc = dfp["pred_class"].value_counts(normalize=True).reindex(ALL_CLASSES, fill_value=0.0)
    summary.append({"model": name, **{f"p_{c}": float(vc.get(c, 0.0)) for c in ALL_CLASSES}})

if not summary:
    print("\n⚠️ Aucun modèle dans fitted_pool → pas de résumé.")
else:
    summary_df = (pd.DataFrame(summary)
                    .set_index("model")
                    .sort_index())
    print("\n🏁 Part des classes prédites sur 718 min (par modèle) :")
    print((summary_df * 100).round(2).to_string())




⏱️ Plages continues :
                    start                       end class  minutes
2025-08-11 00:41:00+00:00 2025-08-11 03:04:00+00:00     C      144
2025-08-11 03:05:00+00:00 2025-08-11 03:05:00+00:00     M        1
2025-08-11 03:06:00+00:00 2025-08-11 03:47:00+00:00     C       42
2025-08-11 03:48:00+00:00 2025-08-11 03:56:00+00:00     M        9
2025-08-11 03:57:00+00:00 2025-08-11 08:38:00+00:00     C      282
2025-08-11 08:39:00+00:00 2025-08-11 08:45:00+00:00     M        7
2025-08-11 08:46:00+00:00 2025-08-11 11:43:00+00:00     C      178
2025-08-11 11:44:00+00:00 2025-08-11 11:45:00+00:00     M        2
2025-08-11 11:46:00+00:00 2025-08-11 12:35:00+00:00     C       50
2025-08-11 12:36:00+00:00 2025-08-11 12:38:00+00:00     M        3

📊 Comptes classes prédites (718 min) :
pred_class
C    696
M     22

📈 Probas moyennes (718 min) :
A    0.000
B    0.002
C    0.969
M    0.030
X    0.000

🏆 % minutes où chaque classe est 1ère proba :
 - A: 0.00%
 - B: 0.00%
 - C: 96.94%


In [27]:
# ================== 0) Config ==================
mlflow.set_tracking_uri(os.getenv("MLFLOW_TRACKING_URI", "http://mlflow:5000"))
mlflow.set_experiment("solar-flares")

PROD_THRESHOLD = 0.90  # gate de promotion auto

# ================== 1) Métriques ==================
metrics = {
    "train_acc": accuracy_score(ytr_enc, ytr_hat_gbx),
    "train_bacc": balanced_accuracy_score(ytr_enc, ytr_hat_gbx),
    "train_f1_macro": f1_score(ytr_enc, ytr_hat_gbx, average="macro"),
    "train_f1_weighted": f1_score(ytr_enc, ytr_hat_gbx, average="weighted"),
    "test_acc": accuracy_score(yte_enc, yte_hat_gbx),
    "test_bacc": balanced_accuracy_score(yte_enc, yte_hat_gbx),
    "test_f1_macro": f1_score(yte_enc, yte_hat_gbx, average="macro"),
    "test_f1_weighted": f1_score(yte_enc, yte_hat_gbx, average="weighted"),
}
print(f"📊 Train: acc={metrics['train_acc']:.4f} | bacc={metrics['train_bacc']:.4f} | "
      f"f1_macro={metrics['train_f1_macro']:.4f} | f1_weighted={metrics['train_f1_weighted']:.4f}")
print(f"📊 Test : acc={metrics['test_acc']:.4f} | bacc={metrics['test_bacc']:.4f} | "
      f"f1_macro={metrics['test_f1_macro']:.4f} | f1_weighted={metrics['test_f1_weighted']:.4f}")

# ================== 2) Params & contexte ==================
params = {
    "algo": "GradientBoostingClassifier",
    "n_estimators": 150,
    "learning_rate": 0.1,
    "max_depth": 3,
    "random_state": 0,
    "thresholds": {"X": 0.05},
    "class_weights": {"A":1.0, "B":1.0, "C":1.0, "M":2.0, "X":1.0},
    "n_train": int(len(ytr_enc)),
    "n_test": int(len(yte_enc)),
}
context = {
    "ALL_CLASSES": list(ALL_CLASSES),
    "train_class_dist": pd.Series(ytr).value_counts().to_dict(),
    "test_class_dist": pd.Series(yte).value_counts().to_dict(),
}

# ================== 3) Artefacts locaux ==================
# Confusion matrix (test)
cm = confusion_matrix(yte_enc, yte_hat_gbx, labels=np.arange(len(ALL_CLASSES)))
fig, ax = plt.subplots()
im = ax.imshow(cm, interpolation="nearest")
ax.set_title("Confusion matrix (test)")
plt.colorbar(im, ax=ax)
ticks = np.arange(len(ALL_CLASSES))
ax.set_xticks(ticks); ax.set_xticklabels(ALL_CLASSES, rotation=45, ha="right")
ax.set_yticks(ticks); ax.set_yticklabels(ALL_CLASSES)
ax.set_xlabel("Predicted"); ax.set_ylabel("True")
plt.tight_layout()

# Classification report (test)
report_txt = classification_report(
    yte_enc, yte_hat_gbx,
    labels=np.arange(len(ALL_CLASSES)),
    target_names=ALL_CLASSES,
    zero_division=0
)

# ===== Evidently 0.7 — robuste aux classes manquantes et labels string =====
idx2name = {i: c for i, c in enumerate(ALL_CLASSES)}

# (1) DataFrames ref/test en TEXTE
ref_txt = pd.DataFrame({
    "target": ytr,
    "prediction": [idx2name[i] for i in ytr_hat_gbx]
})
cur_txt = pd.DataFrame({
    "target": yte,
    "prediction": [idx2name[i] for i in yte_hat_gbx]
})

# (2) Ensemble des labels réellement présents (ref ∪ cur)
labels_present = sorted(set(ref_txt["target"]) | set(ref_txt["prediction"]) |
                        set(cur_txt["target"]) | set(cur_txt["prediction"]))
name2id = {c: i for i, c in enumerate(labels_present)}   # 'A'->0, 'B'->1, ...
id2name = {i: c for c, i in name2id.items()}             # 0->'A', 1->'B', ...

# (3) Mapping vers IDs entiers (évite les KeyError 'A')
def to_ids(df):
    out = pd.DataFrame({
        "target": df["target"].map(name2id),
        "prediction": df["prediction"].map(name2id),
    })
    return out.dropna().astype(int)

ref_ids = to_ids(ref_txt)
cur_ids = to_ids(cur_txt)

if len(ref_ids) == 0 or len(cur_ids) == 0:
    print("⚠️ Après mapping, DataFrame vide pour Evidently. Vérifie labels_present:", labels_present)

# (4) Définition Evidently
data_def = DataDefinition(classification=[
    MulticlassClassification(
        target="target",
        prediction_labels="prediction",
        labels=list(range(len(labels_present)))  # ex: [0,1,2,3]
    )
])

ref_ds = Dataset.from_pandas(ref_ids, data_definition=data_def)
cur_ds = Dataset.from_pandas(cur_ids, data_definition=data_def)

# (5) Génération du rapport (fallback en "current only" si comparaison échoue)
ev = Report([ClassificationPreset()])
try:
    snap = ev.run(cur_ds, ref_ds)   # comparaison current vs reference
except Exception as e:
    print("⚠️ Evidently comparaison a échoué -> current only. Raison:", repr(e))
    snap = ev.run(cur_ds)

# (6) Sauvegardes (HTML + JSON fallback)
EVIDENTLY_HTML = "evidently_report.html"
EVIDENTLY_JSON = "evidently_report.json"

# HTML (OK en 0.7+)
snap.save_html(EVIDENTLY_HTML)

# JSON : tenter .json(), sinon payload “maison”
saved_json = False
try:
    if hasattr(snap, "json"):
        with open(EVIDENTLY_JSON, "w", encoding="utf-8") as f:
            f.write(snap.json())
        saved_json = True
except Exception:
    pass

if not saved_json:
    # --- Fallback JSON (pour le tracking MLflow) ---
    cm = confusion_matrix(yte_enc, yte_hat_gbx, labels=np.arange(len(ALL_CLASSES)))
    clf_dict = classification_report(
        yte_enc, yte_hat_gbx,
        labels=np.arange(len(ALL_CLASSES)),
        target_names=ALL_CLASSES,
        zero_division=0,
        output_dict=True
    )
    summary_payload = {
        "labels_present": labels_present,
        "n_reference": int(len(ref_ids)),
        "n_current": int(len(cur_ids)),
        "sklearn_report_test": clf_dict,
        "confusion_matrix_test": cm.tolist(),
        "metrics_logged": {
            "train_acc": float(metrics["train_acc"]),
            "train_bacc": float(metrics["train_bacc"]),
            "train_f1_macro": float(metrics["train_f1_macro"]),
            "test_acc": float(metrics["test_acc"]),
            "test_bacc": float(metrics["test_bacc"]),
            "test_f1_macro": float(metrics["test_f1_macro"]),
        },
    }
    import json
    with open(EVIDENTLY_JSON, "w", encoding="utf-8") as f:
        json.dump(summary_payload, f, ensure_ascii=False, indent=2)

# ================== 4) Sauvegarde locale du modèle ==================
MODEL_PATH = Path("./models/model.pkl")
MODEL_PATH.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(gbx, MODEL_PATH)

# ================== 5) Log MLflow + Registry avec gate ==================
# Fermer proprement un run resté ouvert (après un crash ou une exécution interrompue)
if mlflow.active_run() is not None:
    print("ℹ️ Fin de l'ancien run:", mlflow.active_run().info.run_id)
    mlflow.end_run()
run_name = f"GBM_X_focus_threshold_{time.strftime('%Y%m%d-%H%M%S')}"
with mlflow.start_run(run_name=run_name) as run:
    # params / métriques / contexte
    mlflow.log_params(params)
    mlflow.log_metrics(metrics)
    mlflow.log_dict(context, "context.json")

    # artefacts
    mlflow.log_text(report_txt, "classification_report_test.txt")
    mlflow.log_figure(fig, "confusion_matrix_test.png"); plt.close(fig)
    mlflow.log_artifact(EVIDENTLY_HTML, artifact_path="evidently")
    mlflow.log_artifact(EVIDENTLY_JSON, artifact_path="evidently")
    mlflow.log_artifact(str(MODEL_PATH))

    # signature + modèle versionné dans le run
    sig = infer_signature(pd.DataFrame(Xtr[:200]), gbx.predict(Xtr[:200]))
    mlflow.sklearn.log_model(gbx, artifact_path="model", signature=sig)

    # Enregistrement au Model Registry
    model_uri = f"runs:/{run.info.run_id}/model"
    reg = mlflow.register_model(model_uri, "solar-flares-classifier")

    # Gate de promo: prod si f1_macro_test >= PROD_THRESHOLD
    mlflow.set_tag("prod_threshold", PROD_THRESHOLD)
    promoted = metrics["test_f1_macro"] >= PROD_THRESHOLD

    from mlflow import MlflowClient
    client = MlflowClient()

    # tags sur la version
    client.set_model_version_tag("solar-flares-classifier", reg.version, "test_f1_macro", str(metrics["test_f1_macro"]))
    client.set_model_version_tag("solar-flares-classifier", reg.version, "promoted_to_production", str(promoted))

    # alias Staging toujours mis à jour
    client.set_registered_model_alias("solar-flares-classifier", "Staging", reg.version)

    if promoted:
        client.set_registered_model_alias("solar-flares-classifier", "Production", reg.version)
        print(f"🚀 Promu en Production (v{reg.version}) — test_f1_macro={metrics['test_f1_macro']:.4f} ≥ {PROD_THRESHOLD}")
    else:
        print(f"⏸️ Non promu (reste en Staging) — test_f1_macro={metrics['test_f1_macro']:.4f} < {PROD_THRESHOLD}")

print("✅ modèle sauvegardé & 📡 MLflow loggé (Evidently + CM + report) + Registry.")

📊 Train: acc=0.8374 | bacc=0.9062 | f1_macro=0.9004 | f1_weighted=0.8371
📊 Test : acc=0.9763 | bacc=0.7660 | f1_macro=0.8145 | f1_weighted=0.9738



Distutils was imported before Setuptools, but importing Setuptools also replaces the `distutils` module in `sys.modules`. This may lead to undesirable behaviors or errors. To avoid these issues, avoid using distutils directly, ensure that setuptools is installed in the traditional way (e.g. not an editable install), and/or make sure that setuptools is always imported before distutils.


Setuptools is replacing distutils. Support for replacing an already imported distutils is deprecated. In the future, this condition will fail. Register concerns at https://github.com/pypa/setuptools/issues/new?template=distutils-deprecation.yml

Registered model 'solar-flares-classifier' already exists. Creating a new version of this model...
2025/08/12 10:57:17 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: solar-flares-classifier, version 7


⏸️ Non promu (reste en Staging) — test_f1_macro=0.8145 < 0.9
✅ modèle sauvegardé & 📡 MLflow loggé (Evidently + CM + report) + Registry.


Created version '7' of model 'solar-flares-classifier'.
