In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pyarrow.parquet as pq
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import (
    accuracy_score, balanced_accuracy_score, f1_score,
    classification_report, confusion_matrix, ConfusionMatrixDisplay
)
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [2]:
print("📥 Chargement du fichier xrs_clean.parquet...")

parquet_path = Path(r"C:\Users\gate\Documents\Jedha\Projet\4\mlops-solar-flares\data\xrs_clean.parquet")
df = parquet_path
df = pd.read_parquet(parquet_path, engine="pyarrow")
print(f"✅ Données chargées : {df.shape[0]} lignes, {df.shape[1]} colonnes")

📥 Chargement du fichier xrs_clean.parquet...
✅ Données chargées : 147639 lignes, 11 colonnes


In [3]:
def quickpeek(df, topn=10):

    print("# head:", df.head)
    print("\n# dtypes:\n", df.dtypes)
    print("\n# describe:", df.describe)

    # missing %
    print("\n# missing (%):")
    miss = (df.isna().mean()*100).round(2).sort_values(ascending=False)
    print(miss.head(topn).to_string())

    if "time" in df:
        # conversion robuste: tente direct, sinon passe par string
        try:
            t = pd.to_datetime(df["time"], utc=True, errors="coerce")
        except Exception:
            t = pd.to_datetime(df["time"].astype(str), utc=True, errors="coerce")

        print("\n# time range:", t.min(), "->", t.max())

        t_valid = t.dropna()
        print("# time monotonic:", t_valid.is_monotonic_increasing)

        # comptage par jour sans dépendre du backend Arrow
        try:
            per_day = t.dt.floor("D").value_counts().sort_index()
        except Exception:
            # fallback: utiliser la colonne 'date' si dispo
            if "date" in df.columns:
                per_day = pd.to_datetime(df["date"], errors="coerce").value_counts().sort_index()
            else:
                per_day = pd.Series(dtype="int64")

        if len(per_day):
            print("\n# last days (rows/day):\n", per_day.tail(10).to_string())


quickpeek(df)

# head: <bound method NDFrame.head of                             time  flux_long_wm2  flux_short_wm2 satellite  \
0      2025-05-01 00:00:00+00:00   7.021782e-07    1.000000e-09      <NA>   
1      2025-05-01 00:01:00+00:00   6.994713e-07    1.000000e-09      <NA>   
2      2025-05-01 00:02:00+00:00   7.052154e-07    1.000000e-09      <NA>   
3      2025-05-01 00:03:00+00:00   7.015647e-07    1.000000e-09      <NA>   
4      2025-05-01 00:04:00+00:00   6.966016e-07    1.000000e-09      <NA>   
...                          ...            ...             ...       ...   
147634 2025-08-11 12:34:00+00:00   5.304605e-06    5.942913e-07   GOES-18   
147635 2025-08-11 12:35:00+00:00   5.992305e-06    7.550105e-07   GOES-18   
147636 2025-08-11 12:36:00+00:00   6.500120e-06    8.529071e-07   GOES-18   
147637 2025-08-11 12:37:00+00:00   6.883591e-06    8.953128e-07   GOES-18   
147638 2025-08-11 12:38:00+00:00   7.047310e-06    8.543802e-07   GOES-18   

       energy_long energy_short      

In [4]:
df["time"] = pd.to_datetime(df["time"], utc=True, errors="coerce")

# Dernières lignes triées par temps
print("\n📄 Dernières lignes du fichier :")
print(df.sort_values("time").tail(10).to_string(index=False))

# Premières lignes triées par temps
print("\n📄 Premières lignes du fichier trié par 'time' :")
print(df.sort_values("time").head(10).reset_index(drop=True).to_string(index=True))


📄 Dernières lignes du fichier :
                     time  flux_long_wm2  flux_short_wm2 satellite energy_long energy_short source       date  hour  minute_of_day  dow
2025-08-11 12:29:00+00:00       0.000002    8.580523e-08   GOES-18  0.1-0.8 nm  0.05-0.4 nm   <NA> 2025-08-11    12            749    0
2025-08-11 12:30:00+00:00       0.000002    1.043969e-07   GOES-18  0.1-0.8 nm  0.05-0.4 nm   <NA> 2025-08-11    12            750    0
2025-08-11 12:31:00+00:00       0.000003    2.087382e-07   GOES-18  0.1-0.8 nm  0.05-0.4 nm   <NA> 2025-08-11    12            751    0
2025-08-11 12:32:00+00:00       0.000004    3.514351e-07   GOES-18  0.1-0.8 nm  0.05-0.4 nm   <NA> 2025-08-11    12            752    0
2025-08-11 12:33:00+00:00       0.000004    4.549486e-07   GOES-18  0.1-0.8 nm  0.05-0.4 nm   <NA> 2025-08-11    12            753    0
2025-08-11 12:34:00+00:00       0.000005    5.942913e-07   GOES-18  0.1-0.8 nm  0.05-0.4 nm   <NA> 2025-08-11    12            754    0
2025-08-11 12:3

In [5]:
TARGET_NAME  = "flare_class"
ALL_CLASSES  = np.array(["A", "B", "C", "M", "X"], dtype=object)
print("🛠 Création de la variable cible 'flare_class'...")

def rule_predict(flux):
    """
    Classe une éruption selon le pic de flux X (W/m², 1-8 Å) 
    en utilisant les seuils NOAA officiels, avec A inclus.
    """
    if pd.isna(flux):
        return None
    elif flux < 1e-7:       # A : < 10⁻⁷ W/m²
        return "A"
    elif flux < 1e-6:       # B : 10⁻⁷ ≤ flux < 10⁻⁶
        return "B"
    elif flux < 1e-5:       # C : 10⁻⁶ ≤ flux < 10⁻⁵
        return "C"
    elif flux < 1e-4:       # M : 10⁻⁵ ≤ flux < 10⁻⁴
        return "M"
    else:                   # X : ≥ 10⁻⁴
        return "X"

df["flare_class"] = df["flux_long_wm2"].apply(rule_predict)

print("✅ Variable cible ajoutée.")

🛠 Création de la variable cible 'flare_class'...
✅ Variable cible ajoutée.


In [6]:
print("📅 Conversion et enrichissement des features temporelles...")
# -- S'assurer d'avoir un datetime --
if "time" in df.columns:
    t = pd.to_datetime(df["time"].astype(str), utc=True, errors="coerce")
elif isinstance(df.index, pd.DatetimeIndex):
    t = pd.to_datetime(df.index, utc=True, errors="coerce")
elif "date" in df.columns:
    t = pd.to_datetime(df["date"].astype(str), utc=True, errors="coerce")
else:
    raise KeyError("Impossible de trouver une colonne/indice temps ('time' ou 'date').")

# -- Colonnes temporelles dérivées --
df["day_of_year"] = t.dt.dayofyear.astype("int16")     # 1..365/366
df["hour"] = t.dt.hour.astype("int16") if "hour" not in df else df["hour"]

# Encodage cyclique du jour de l'année
rad_doy = 2 * np.pi * (df["day_of_year"] - 1) / 365.25
df["sin_doy"] = np.sin(rad_doy)
df["cos_doy"] = np.cos(rad_doy)

# Optionnel : indicateur jour/nuit (si utile)
df["is_daytime"] = ((df["hour"] >= 6) & (df["hour"] <= 18)).astype("int8")

df["flux_ratio_short_long"] = df["flux_short_wm2"] / df["flux_long_wm2"]
df["flux_diff_short_long"] = df["flux_short_wm2"] - df["flux_long_wm2"]
df["log_flux_long"] = np.log10(df["flux_long_wm2"].clip(lower=1e-9))
df["log_flux_short"] = np.log10(df["flux_short_wm2"].clip(lower=1e-9))
df["sin_doy"] = np.sin(2 * np.pi * df["day_of_year"] / 365.25)
df["cos_doy"] = np.cos(2 * np.pi * df["day_of_year"] / 365.25)
df["flux_long_rolling_mean_1h"] = df["flux_long_wm2"].rolling(window=12, min_periods=1).mean()
df["flux_long_rolling_std_1h"] = df["flux_long_wm2"].rolling(window=12, min_periods=1).std()
df["flux_long_delta"] = df["flux_long_wm2"].diff()
hist = np.log10(df["flux_long_wm2"].clip(lower=1e-9)).shift(1)  # t-1
df["max_60"]  = hist.rolling(60,  min_periods=3).max()
df["mean_60"] = hist.rolling(60,  min_periods=3).mean()
df["max_180"] = hist.rolling(180, min_periods=5).max()


📅 Conversion et enrichissement des features temporelles...


In [7]:
print("🧹 Nettoyage des colonnes inutiles...")
colonnes_a_supprimer = ["satellite"] if "satellite" in df.columns else []
df = df.drop(columns=colonnes_a_supprimer)
print(f"✅ Colonnes supprimées : {colonnes_a_supprimer}")

# Harmonisation des types
numeric_features = ["flux_long_wm2", "flux_short_wm2", "hour", "minute_of_day", "dow"]
categorical_features = ["source", "energy_long", "energy_short"]

for col in numeric_features:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors="coerce").astype("float64")

for col in categorical_features:
    if col in df.columns:
        df[col] = df[col].astype("string")

print("✅ Types harmonisés.")

🧹 Nettoyage des colonnes inutiles...
✅ Colonnes supprimées : ['satellite']
✅ Types harmonisés.


In [8]:
output_path = parquet_path.parent / "xrs_clean_ml.parquet"
df.to_parquet(output_path, engine="pyarrow", index=False)
print(f"💾 Fichier sauvegardé : {output_path}")

💾 Fichier sauvegardé : C:\Users\gate\Documents\Jedha\Projet\4\mlops-solar-flares\data\xrs_clean_ml.parquet


In [9]:
def quickpeek(df, topn=10):

    print("# head:", df.head())
    print("\n# dtypes:\n", df.dtypes)
    print("\n# describe:\n", df.describe())

    # missing %
    print("\n# missing (%):")
    miss = (df.isna().mean() * 100).round(2).sort_values(ascending=False)
    print(miss.head(topn).to_string())

    # 🔹 Suppression des colonnes entièrement vides
    colonnes_vides = df.columns[df.isna().all()].tolist()
    if colonnes_vides:
        print(f"\n🗑 Suppression de {len(colonnes_vides)} colonne(s) vide(s) : {colonnes_vides}")
        df.drop(columns=colonnes_vides, inplace=True)
    else:
        print("\n✅ Aucune colonne entièrement vide trouvée.")

    if "time" in df:
        # conversion robuste: tente direct, sinon passe par string
        try:
            t = pd.to_datetime(df["time"], utc=True, errors="coerce")
        except Exception:
            t = pd.to_datetime(df["time"].astype(str), utc=True, errors="coerce")

        print("\n# time range:", t.min(), "->", t.max())
        t_valid = t.dropna()
        print("# time monotonic:", t_valid.is_monotonic_increasing)

        # comptage par jour
        try:
            per_day = t.dt.floor("D").value_counts().sort_index()
        except Exception:
            if "date" in df.columns:
                per_day = pd.to_datetime(df["date"], errors="coerce").value_counts().sort_index()
            else:
                per_day = pd.Series(dtype="int64")

        if len(per_day):
            print("\n# last days (rows/day):\n", per_day.tail(10).to_string())

    return df  # On retourne le DataFrame propre
quickpeek(df)

# head:                        time  flux_long_wm2  flux_short_wm2 energy_long  \
0 2025-05-01 00:00:00+00:00   7.021782e-07    1.000000e-09  0.1-0.8 nm   
1 2025-05-01 00:01:00+00:00   6.994713e-07    1.000000e-09  0.1-0.8 nm   
2 2025-05-01 00:02:00+00:00   7.052154e-07    1.000000e-09  0.1-0.8 nm   
3 2025-05-01 00:03:00+00:00   7.015647e-07    1.000000e-09  0.1-0.8 nm   
4 2025-05-01 00:04:00+00:00   6.966016e-07    1.000000e-09  0.1-0.8 nm   

  energy_short      source        date  hour  minute_of_day  dow  ...  \
0  0.05-0.4 nm  NCEI-SunPy  2025-05-01   0.0            0.0  3.0  ...   
1  0.05-0.4 nm  NCEI-SunPy  2025-05-01   0.0            1.0  3.0  ...   
2  0.05-0.4 nm  NCEI-SunPy  2025-05-01   0.0            2.0  3.0  ...   
3  0.05-0.4 nm  NCEI-SunPy  2025-05-01   0.0            3.0  3.0  ...   
4  0.05-0.4 nm  NCEI-SunPy  2025-05-01   0.0            4.0  3.0  ...   

  flux_ratio_short_long  flux_diff_short_long  log_flux_long  log_flux_short  \
0              0.001424     

Unnamed: 0,time,flux_long_wm2,flux_short_wm2,energy_long,energy_short,source,date,hour,minute_of_day,dow,...,flux_ratio_short_long,flux_diff_short_long,log_flux_long,log_flux_short,flux_long_rolling_mean_1h,flux_long_rolling_std_1h,flux_long_delta,max_60,mean_60,max_180
0,2025-05-01 00:00:00+00:00,7.021782e-07,1.000000e-09,0.1-0.8 nm,0.05-0.4 nm,NCEI-SunPy,2025-05-01,0.0,0.0,3.0,...,0.001424,-7.011782e-07,-6.153553,-9.000000,7.021782e-07,,,,,
1,2025-05-01 00:01:00+00:00,6.994713e-07,1.000000e-09,0.1-0.8 nm,0.05-0.4 nm,NCEI-SunPy,2025-05-01,0.0,1.0,3.0,...,0.001430,-6.984714e-07,-6.155230,-9.000000,7.008248e-07,1.914016e-09,-2.706827e-09,,,
2,2025-05-01 00:02:00+00:00,7.052154e-07,1.000000e-09,0.1-0.8 nm,0.05-0.4 nm,NCEI-SunPy,2025-05-01,0.0,2.0,3.0,...,0.001418,-7.042154e-07,-6.151678,-9.000000,7.022883e-07,2.873626e-09,5.744084e-09,,,
3,2025-05-01 00:03:00+00:00,7.015647e-07,1.000000e-09,0.1-0.8 nm,0.05-0.4 nm,NCEI-SunPy,2025-05-01,0.0,3.0,3.0,...,0.001425,-7.005647e-07,-6.153932,-9.000000,7.021074e-07,2.374036e-09,-3.650712e-09,-6.151678,-6.153487,
4,2025-05-01 00:04:00+00:00,6.966016e-07,1.000000e-09,0.1-0.8 nm,0.05-0.4 nm,NCEI-SunPy,2025-05-01,0.0,4.0,3.0,...,0.001436,-6.956016e-07,-6.157016,-9.000000,7.010063e-07,3.207776e-09,-4.963113e-09,-6.151678,-6.153598,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
147634,2025-08-11 12:34:00+00:00,5.304605e-06,5.942913e-07,0.1-0.8 nm,0.05-0.4 nm,,2025-08-11,12.0,754.0,0.0,...,0.112033,-4.710314e-06,-5.275347,-6.226001,2.810613e-06,1.091601e-06,8.360598e-07,-5.097139,-5.630177,-5.097139
147635,2025-08-11 12:35:00+00:00,5.992305e-06,7.550105e-07,0.1-0.8 nm,0.05-0.4 nm,,2025-08-11,12.0,755.0,0.0,...,0.125997,-5.237294e-06,-5.222406,-6.122047,3.142742e-06,1.390251e-06,6.877003e-07,-5.097139,-5.624355,-5.097139
147636,2025-08-11 12:36:00+00:00,6.500120e-06,8.529071e-07,0.1-0.8 nm,0.05-0.4 nm,,2025-08-11,12.0,756.0,0.0,...,0.131214,-5.647213e-06,-5.187079,-6.069098,3.517855e-06,1.638613e-06,5.078155e-07,-5.097139,-5.617594,-5.097139
147637,2025-08-11 12:37:00+00:00,6.883591e-06,8.953128e-07,0.1-0.8 nm,0.05-0.4 nm,,2025-08-11,12.0,757.0,0.0,...,0.130065,-5.988278e-06,-5.162185,-6.048025,3.920874e-06,1.827872e-06,3.834707e-07,-5.097139,-5.610246,-5.097139


In [10]:
print("📥 Chargement du fichier xrs_clean_ml.parquet...")
parquet_path = Path(r"C:\Users\gate\Documents\Jedha\Projet\4\mlops-solar-flares\data\xrs_clean_ml.parquet")
df = pd.read_parquet(parquet_path, engine="pyarrow")
print(f"✅ Données chargées : {df.shape[0]} lignes, {df.shape[1]} colonnes")

📥 Chargement du fichier xrs_clean_ml.parquet...
✅ Données chargées : 147639 lignes, 25 colonnes


In [11]:
# ============================
# Cible 
# ============================
def classify_flare(flux):
    if pd.isna(flux): return None
    elif flux < 1e-7: return "A"
    elif flux < 1e-6: return "B"
    elif flux < 1e-5: return "C"
    elif flux < 1e-4: return "M"
    else: return "X"

if TARGET_NAME not in df.columns:
    if "flux_long_wm2" not in df.columns:
        raise KeyError("Colonne 'flux_long_wm2' manquante : impossible de construire la cible.")
    print("🛠 Création de la variable cible 'flare_class' à partir de flux_long_wm2...")
    df[TARGET_NAME] = df["flux_long_wm2"].apply(classify_flare)
print("✅ Cible prête.")

# ============================
# Split temporel
# ============================
print("✂️ Split train/test (80/20, ordre temporel conservé)...")
Y = df[TARGET_NAME].astype("string")
X = df.drop(columns=[TARGET_NAME])

X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=0, shuffle=False
)
print(f"  - Train : {len(X_train)}")
print(f"  - Test  : {len(X_test)}")

# ============================
# Définition des features (⚠️ sans flux_long_wm2 pour éviter la fuite)
# ============================
# Candidats habituels :
numeric_features_all      = ["flux_short_wm2", "hour", "minute_of_day", "dow"]
categorical_features_all  = ["source", "energy_long", "energy_short"]

# Garder seulement celles qui existent réellement
numeric_features     = [c for c in numeric_features_all if c in X_train.columns]
categorical_features = [c for c in categorical_features_all if c in X_train.columns]

print("✅ Features sélectionnées (sans fuite) :")
print("  Num :", numeric_features)
print("  Cat :", categorical_features)

# ============================
# Nettoyage manuel des valeurs manquantes AVANT preprocessing
# ============================
print("🧹 Nettoyage des valeurs manquantes...")

def clean_missing_values(X_train, X_test, numeric_cols, categorical_cols):
    """Nettoie manuellement les valeurs manquantes pour éviter les bugs SimpleImputer"""
    X_train_clean = X_train.copy()
    X_test_clean = X_test.copy()
    
    # Pour les features numériques : remplacer par la médiane du train
    for col in numeric_cols:
        if col in X_train_clean.columns:
            # Conversion en float64 propre
            X_train_clean[col] = pd.to_numeric(X_train_clean[col], errors="coerce")
            X_test_clean[col] = pd.to_numeric(X_test_clean[col], errors="coerce")
            
            # Calculer la médiane sur le train
            median_val = X_train_clean[col].median()
            if pd.isna(median_val):
                median_val = 0.0  # fallback si tout est NaN
            
            # Remplacer les NaN
            X_train_clean[col] = X_train_clean[col].fillna(median_val)
            X_test_clean[col] = X_test_clean[col].fillna(median_val)
            
            print(f"  {col}: médiane={median_val:.6f}")
    
    # Pour les features catégorielles : remplacer par le mode du train
    for col in categorical_cols:
        if col in X_train_clean.columns:
            # Conversion en object propre
            X_train_clean[col] = X_train_clean[col].astype(str)
            X_test_clean[col] = X_test_clean[col].astype(str)
            
            # Calculer le mode sur le train (ignorer les 'nan' string)
            mode_candidates = X_train_clean[col][X_train_clean[col] != 'nan'].mode()
            if len(mode_candidates) > 0:
                mode_val = mode_candidates.iloc[0]
            else:
                mode_val = "unknown"  # fallback
            
            # Remplacer les NaN (maintenant string 'nan')
            X_train_clean[col] = X_train_clean[col].replace('nan', mode_val)
            X_test_clean[col] = X_test_clean[col].replace('nan', mode_val)
            
            print(f"  {col}: mode='{mode_val}'")
    
    return X_train_clean, X_test_clean

# Appliquer le nettoyage
X_train_clean, X_test_clean = clean_missing_values(
    X_train, X_test, numeric_features, categorical_features
)

# Restreindre aux colonnes utiles (ordre fixe)
X_train_final = X_train_clean[numeric_features + categorical_features].copy()
X_test_final = X_test_clean[numeric_features + categorical_features].copy()

print("✅ Données nettoyées")

# ============================
# Préprocesseur simplifié (sans SimpleImputer)
# ============================
print("⚙️ Création du preprocessor simplifié...")

numeric_transformer = StandardScaler()  # Plus de SimpleImputer
categorical_transformer = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ],
    remainder="drop"
)

# ============================
# Transformation
# ============================
print("🔄 Transformation des données...")
try:
    X_train_t = preprocessor.fit_transform(X_train_final)
    X_test_t  = preprocessor.transform(X_test_final)
    print("✅ Transformation terminée. Shapes :", X_train_t.shape, X_test_t.shape)
except Exception as e:
    print(f"❌ Erreur transformation: {e}")
    print("Debug - Vérification des données:")
    print("X_train dtypes:", X_train_final.dtypes.to_dict())
    print("X_test dtypes:", X_test_final.dtypes.to_dict())
    
    # Vérifier s'il y a encore des NaN
    for col in X_train_final.columns:
        nan_count_train = X_train_final[col].isna().sum()
        nan_count_test = X_test_final[col].isna().sum()
        if nan_count_train > 0 or nan_count_test > 0:
            print(f"  {col}: {nan_count_train} NaN train, {nan_count_test} NaN test")
    raise

# ============================
# Préparation cibles & encodage labels
# ============================
print("🎯 Préparation des cibles...")
mask_train = Y_train.notna()
mask_test  = Y_test.notna()

Xtr = X_train_t[mask_train.values]
Xte = X_test_t[mask_test.values]
ytr = Y_train[mask_train].astype(str).values
yte = Y_test[mask_test].astype(str).values

le = LabelEncoder()
le.fit(ALL_CLASSES)                 # mapping figé A,B,C,M,X -> 0..4
ytr_enc = le.transform(ytr)
yte_enc = le.transform(yte)

print("✅ Encodage labels OK. Classes :", list(le.classes_))
print("   Répartition train :", pd.Series(ytr).value_counts().to_dict())
print("   Répartition test  :", pd.Series(yte).value_counts().to_dict())

✅ Cible prête.
✂️ Split train/test (80/20, ordre temporel conservé)...
  - Train : 118111
  - Test  : 29528
✅ Features sélectionnées (sans fuite) :
  Num : ['flux_short_wm2', 'hour', 'minute_of_day', 'dow']
  Cat : ['source', 'energy_long', 'energy_short']
🧹 Nettoyage des valeurs manquantes...
  flux_short_wm2: médiane=0.000000
  hour: médiane=11.000000
  minute_of_day: médiane=719.000000
  dow: médiane=3.000000
  source: mode='NCEI-SunPy'
  energy_long: mode='0.1-0.8 nm'
  energy_short: mode='0.05-0.4 nm'
✅ Données nettoyées
⚙️ Création du preprocessor simplifié...
🔄 Transformation des données...
✅ Transformation terminée. Shapes : (118111, 7) (29528, 7)
🎯 Préparation des cibles...
✅ Encodage labels OK. Classes : ['A', 'B', 'C', 'M', 'X']
   Répartition train : {'B': 64972, 'C': 50460, 'M': 1054}
   Répartition test  : {'C': 22730, 'B': 6313, 'M': 297, 'A': 169}


In [12]:
# === RECHARGER VOS DONNÉES / SPLITS ICI ===
# Xtr, Xte, ytr_enc, yte_enc, ALL_CLASSES = ...

# === Helpers (perdus au restart) ===
def compact_labels(y):
    present = np.unique(y)
    to_compact  = {c:i for i,c in enumerate(present)}
    to_original = {i:c for c,i in to_compact.items()}
    y_comp = np.vectorize(to_compact.get)(y)
    return y_comp, present, to_compact, to_original

def remap_back(y_hat_comp, to_original):
    return np.vectorize(to_original.get)(y_hat_comp)

def evaluate_and_print(name, clf, Xtr, ytr_enc, Xte, yte_enc, present, to_original, ALL_CLASSES):
    ytr_hat = remap_back(clf.predict(Xtr), to_original)
    yte_hat = remap_back(clf.predict(Xte), to_original)

    acc_tr  = accuracy_score(ytr_enc, ytr_hat)
    bacc_tr = balanced_accuracy_score(ytr_enc, ytr_hat)
    f1m_tr  = f1_score(ytr_enc, ytr_hat, average="macro")
    f1w_tr  = f1_score(ytr_enc, ytr_hat, average="weighted")

    acc_te  = accuracy_score(yte_enc, yte_hat)
    bacc_te = balanced_accuracy_score(yte_enc, yte_hat)
    f1m_te  = f1_score(yte_enc, yte_hat, average="macro")
    f1w_te  = f1_score(yte_enc, yte_hat, average="weighted")

    print(f"\n========== {name} ==========")
    print("📊 Train :", f"acc={acc_tr:.4f} | bacc={bacc_tr:.4f} | f1m={f1m_tr:.4f} | f1w={f1w_tr:.4f}")
    print("📊 Test  :", f"acc={acc_te:.4f} | bacc={bacc_te:.4f} | f1m={f1m_te:.4f} | f1w={f1w_te:.4f}")

    print("\n🧾 Classification report (test)")
    print(classification_report(yte_enc, yte_hat,
                                labels=np.arange(len(ALL_CLASSES)),
                                target_names=ALL_CLASSES,
                                zero_division=0))

    cm = confusion_matrix(yte_enc, yte_hat, labels=np.arange(len(ALL_CLASSES)))
    print("\n🧩 Confusion matrix (counts)\n",
          pd.DataFrame(cm, index=[f"true_{c}" for c in ALL_CLASSES],
                          columns=[f"pred_{c}" for c in ALL_CLASSES]).to_string())

    row_sums = cm.sum(axis=1, keepdims=True)
    cmn = np.divide(cm, row_sums, out=np.zeros_like(cm, dtype=float), where=row_sums!=0)
    print("\n🧩 Confusion matrix (per-class)\n",
          pd.DataFrame(cmn, index=[f"true_{c}" for c in ALL_CLASSES],
                            columns=[f"pred_{c}" for c in ALL_CLASSES]).round(3).to_string())

    return {"acc_train":acc_tr,"bacc_train":bacc_tr,"f1m_train":f1m_tr,"f1w_train":f1w_tr,
            "acc_test":acc_te,"bacc_test":bacc_te,"f1m_test":f1m_te,"f1w_test":f1w_te}, yte_hat

# === Objets communs recréés à chaque fois ===
sample_weight_tr = compute_sample_weight(class_weight="balanced", y=ytr_enc)
cv3 = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# === Conteneurs de résultats ===
results_list = []    # tableaux de scores
fitted_pool  = {}    # modèles entraînés + mapping

def add_model_result(name, clf, present, to_original, res_dict, yhat):
    results_list.append({"model": name, **res_dict})
    fitted_pool[name] = (clf, to_original, present)


In [13]:
# ==========================================================
# 0) Préparation conteneurs pour tous les modèles
# ==========================================================
results_list = []       # stocke les dicts avec les scores
fitted_pool = {}        # stocke les modèles entraînés + mapping

In [14]:
# ==========================================================
# 1) Fonction d'ajout d'un modèle dans les pools
# ==========================================================
def add_model_result(name, clf, present, to_original, res_dict, yte_hat):
    results_list.append(res_dict)
    fitted_pool[name] = (clf, to_original, present)  # on garde aussi "present" pour prédictions futures

_


In [16]:
# ==========================================================
# 2) Logistic Regression
# ==========================================================
ytr_comp, present_lr, to_compact_lr, to_original_lr = compact_labels(ytr_enc)
logreg = LogisticRegression(max_iter=1000, class_weight="balanced", solver="lbfgs")
logreg.fit(Xtr, ytr_comp)

res_lr, yhat_lr = evaluate_and_print("LogisticRegression", logreg, Xtr, ytr_enc, Xte, yte_enc,
                                     present_lr, to_original_lr, ALL_CLASSES)
add_model_result("LogisticRegression", logreg, present_lr, to_original_lr, res_lr, yhat_lr)



📊 Train : acc=0.8285 | bacc=0.8756 | f1m=0.7997 | f1w=0.8271
📊 Test  : acc=0.6399 | bacc=0.6214 | f1m=0.4321 | f1w=0.6664

🧾 Classification report (test)
              precision    recall  f1-score   support

           A       0.00      0.00      0.00       169
           B       0.38      0.92      0.54      6313
           C       0.96      0.56      0.71     22730
           M       0.32      1.00      0.48       297
           X       0.00      0.00      0.00         0

    accuracy                           0.64     29509
   macro avg       0.33      0.50      0.35     29509
weighted avg       0.83      0.64      0.67     29509


🧩 Confusion matrix (counts)
         pred_A  pred_B  pred_C  pred_M  pred_X
true_A       0     169       0       0       0
true_B       0    5839     474       0       0
true_C       0    9340   12747     643       0
true_M       0       0       0     297       0
true_X       0       0       0       0       0

🧩 Confusion matrix (per-class)
         pre

In [17]:
# ==========================================================
# 2) Logistic Regression (tuned)
# ==========================================================
# 1) Compactage des labels (gère classes absentes)
ytr_comp, present_lr, to_compact_lr, to_original_lr = compact_labels(ytr_enc)

# 2) Pondération des classes sur labels compactés
sample_weight_tr = compute_sample_weight(class_weight="balanced", y=ytr_comp)

# 3) Modèle de base (multinomial)
lr_base = LogisticRegression(
    solver="lbfgs",
    max_iter=1000,
    multi_class="auto",
    n_jobs=None,           # (lbfgs n'accepte pas n_jobs)
    random_state=42
)

# 4) CV 3-fold
cv3 = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# 5) Petite grille directionnelle (sobre)
grid_lr = {
    "C": [0.5, 1.0, 2.0],           # force/relâche la régularisation L2
    "class_weight": [None, "balanced"]
}

# 6) GridSearchCV
gs_lr = GridSearchCV(
    estimator=lr_base,
    param_grid=grid_lr,
    scoring="f1_macro",
    cv=cv3,
    n_jobs=-1,
    verbose=0
)

# 7) Fit avec sample_weight
gs_lr.fit(Xtr, ytr_comp, sample_weight=sample_weight_tr)

# 8) Résultats de la recherche
print("LR best params:", gs_lr.best_params_,
      "best CV f1_macro:", round(gs_lr.best_score_, 4))

# 9) Refit final + évaluation
lr_best = gs_lr.best_estimator_
res_lr, yhat_lr = evaluate_and_print(
    "LogisticRegression (tuned)",
    lr_best,
    Xtr, ytr_enc,
    Xte, yte_enc,
    present_lr, to_original_lr, ALL_CLASSES
)

# 10) Stockage pour comparatif global
add_model_result("LogisticRegression (tuned)",
                 lr_best, present_lr, to_original_lr,
                 res_lr, yhat_lr)



LR best params: {'C': 2.0, 'class_weight': None} best CV f1_macro: 0.7996

📊 Train : acc=0.8286 | bacc=0.8757 | f1m=0.7999 | f1w=0.8272
📊 Test  : acc=0.6407 | bacc=0.6216 | f1m=0.4322 | f1w=0.6672

🧾 Classification report (test)
              precision    recall  f1-score   support

           A       0.00      0.00      0.00       169
           B       0.38      0.92      0.54      6313
           C       0.96      0.56      0.71     22730
           M       0.31      1.00      0.48       297
           X       0.00      0.00      0.00         0

    accuracy                           0.64     29509
   macro avg       0.33      0.50      0.35     29509
weighted avg       0.83      0.64      0.67     29509


🧩 Confusion matrix (counts)
         pred_A  pred_B  pred_C  pred_M  pred_X
true_A       0     169       0       0       0
true_B       0    5835     478       0       0
true_C       0    9310   12774     646       0
true_M       0       0       0     297       0
true_X       0   

In [18]:
# ==========================================================
# 3) Decision Tree
# ==========================================================
ytr_comp, present_dt, to_compact_dt, to_original_dt = compact_labels(ytr_enc)
dt = DecisionTreeClassifier(class_weight="balanced", random_state=0)
dt.fit(Xtr, ytr_comp)

res_dt, yhat_dt = evaluate_and_print("DecisionTree", dt, Xtr, ytr_enc, Xte, yte_enc,
                                     present_dt, to_original_dt, ALL_CLASSES)
add_model_result("DecisionTree", dt, present_dt, to_original_dt, res_dt, yhat_dt)


📊 Train : acc=0.9843 | bacc=0.9880 | f1m=0.9892 | f1w=0.9842
📊 Test  : acc=0.6941 | bacc=0.5513 | f1m=0.5123 | f1w=0.7181

🧾 Classification report (test)
              precision    recall  f1-score   support

           A       0.00      0.00      0.00       169
           B       0.40      0.81      0.54      6313
           C       0.92      0.67      0.77     22730
           M       0.75      0.73      0.74       297
           X       0.00      0.00      0.00         0

    accuracy                           0.69     29509
   macro avg       0.42      0.44      0.41     29509
weighted avg       0.80      0.69      0.72     29509


🧩 Confusion matrix (counts)
         pred_A  pred_B  pred_C  pred_M  pred_X
true_A       0     169       0       0       0
true_B       0    5126    1187       0       0
true_C       0    7519   15140      71       0
true_M       0       0      81     216       0
true_X       0       0       0       0       0

🧩 Confusion matrix (per-class)
         pre

In [22]:
# ==========================================================
# 3) Decision Tree (tuned)
# ==========================================================
# Pondération des échantillons (équilibrage des classes)
# 1) Définir le modèle + grille AVANT de créer gs_dt
dt = DecisionTreeClassifier(class_weight="balanced", random_state=0)
grid_dt = {
    "max_depth": [None, 12, 8, 5],
    "min_samples_leaf": [1, 5, 20],
    "ccp_alpha": [0.0, 1e-4, 5e-4]
}

# 2) Créer gs_dt
gs_dt = GridSearchCV(
    estimator=dt,
    param_grid=grid_dt,
    scoring="f1_macro",
    cv=cv3,
    n_jobs=-1,
    verbose=0
)

# 3) Fit (maintenant gs_dt existe bien)
gs_dt.fit(Xtr, ytr_enc, sample_weight=sample_weight_tr)
print("DT best params:", gs_dt.best_params_, "best CV f1_macro:", round(gs_dt.best_score_,4))

# 4) Refit final + report
dt_best = gs_dt.best_estimator_
res_dt, yhat_dt = evaluate_and_print(
    "DecisionTree (tuned)", dt_best,
    Xtr, ytr_enc, Xte, yte_enc,
    present=np.unique(ytr_enc),
    to_original={i:i for i in range(len(ALL_CLASSES))},
    ALL_CLASSES=ALL_CLASSES
)
add_model_result("DecisionTree (tuned)", dt_best, np.unique(ytr_enc),
                 {i:i for i in range(len(ALL_CLASSES))}, res_dt, yhat_dt)

DT best params: {'ccp_alpha': 0.0, 'max_depth': None, 'min_samples_leaf': 1} best CV f1_macro: 0.8673

📊 Train : acc=0.9842 | bacc=0.9879 | f1m=0.9892 | f1w=0.9842
📊 Test  : acc=0.6933 | bacc=0.5535 | f1m=0.5142 | f1w=0.7174

🧾 Classification report (test)
              precision    recall  f1-score   support

           A       0.00      0.00      0.00       169
           B       0.40      0.82      0.54      6313
           C       0.92      0.66      0.77     22730
           M       0.76      0.73      0.75       297
           X       0.00      0.00      0.00         0

    accuracy                           0.69     29509
   macro avg       0.42      0.44      0.41     29509
weighted avg       0.80      0.69      0.72     29509


🧩 Confusion matrix (counts)
         pred_A  pred_B  pred_C  pred_M  pred_X
true_A       0     169       0       0       0
true_B       0    5152    1161       0       0
true_C       0    7573   15089      68       0
true_M       0       0      79     2

In [24]:
# ==========================================================
# 4) Random Forest 
# ==========================================================
ytr_comp, present_rf, to_compact_rf, to_original_rf = compact_labels(ytr_enc)
rf = RandomForestClassifier(n_estimators=200, class_weight="balanced", n_jobs=-1, random_state=0)
rf.fit(Xtr, ytr_comp)

res_rf, yhat_rf = evaluate_and_print("RandomForest", rf, Xtr, ytr_enc, Xte, yte_enc,
                                     present_rf, to_original_rf, ALL_CLASSES)
add_model_result("RandomForest", rf, present_rf, to_original_rf, res_rf, yhat_rf)


📊 Train : acc=0.9843 | bacc=0.9880 | f1m=0.9892 | f1w=0.9842
📊 Test  : acc=0.6993 | bacc=0.4442 | f1m=0.4285 | f1w=0.7212

🧾 Classification report (test)
              precision    recall  f1-score   support

           A       0.00      0.00      0.00       169
           B       0.41      0.83      0.55      6313
           C       0.92      0.67      0.78     22730
           M       0.68      0.27      0.39       297
           X       0.00      0.00      0.00         0

    accuracy                           0.70     29509
   macro avg       0.40      0.36      0.34     29509
weighted avg       0.81      0.70      0.72     29509


🧩 Confusion matrix (counts)
         pred_A  pred_B  pred_C  pred_M  pred_X
true_A       0     169       0       0       0
true_B       0    5270    1043       0       0
true_C       0    7405   15287      38       0
true_M       0       0     217      80       0
true_X       0       0       0       0       0

🧩 Confusion matrix (per-class)
         pre

In [26]:
# ==========================================================
# 4) Random Forest (tuned)
# ==========================================================
# Pondération des échantillons (équilibrage des classes)
sample_weight_tr = compute_sample_weight(class_weight="balanced", y=ytr_enc)

# Modèle de base
rf = RandomForestClassifier(
    n_estimators=200,
    class_weight="balanced",
    n_jobs=-1,
    random_state=42
)

# Cross-validation 3-fold stratifiée
cv3 = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Petite grille "directionnelle" (élargis si besoin)
grid_rf = {
    "n_estimators": [200,],
    "max_depth": [None, 5,],
    "min_samples_leaf": [1,2],
    "max_features": ["sqrt", "log2"]  # ou None si tu veux tester
}

# GridSearchCV
gs_rf = GridSearchCV(
    estimator=rf,
    param_grid=grid_rf,
    scoring="f1_macro",
    cv=cv3,
    n_jobs=-1,
    verbose=0
)

# Fit avec pondération des classes
gs_rf.fit(Xtr, ytr_enc, sample_weight=sample_weight_tr)

# Résultats de la recherche
print("RF best params:", gs_rf.best_params_,
      "best CV f1_macro:", round(gs_rf.best_score_, 4))

# Refit final + évaluation
rf_best = gs_rf.best_estimator_
res_rf, yhat_rf = evaluate_and_print(
    "RandomForest (tuned)",
    rf_best,
    Xtr, ytr_enc,
    Xte, yte_enc,
    present=np.unique(ytr_enc),
    to_original={i: i for i in range(len(ALL_CLASSES))},
    ALL_CLASSES=ALL_CLASSES
)

# Stockage pour comparaison ultérieure (si tu utilises ce helper)
add_model_result(
    "RandomForest (tuned)",
    rf_best,
    np.unique(ytr_enc),
    {i: i for i in range(len(ALL_CLASSES))},
    res_rf,
    yhat_rf
)

RF best params: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'n_estimators': 200} best CV f1_macro: 0.9032

📊 Train : acc=0.9620 | bacc=0.9735 | f1m=0.9664 | f1w=0.9619
📊 Test  : acc=0.7097 | bacc=0.4790 | f1m=0.4615 | f1w=0.7310

🧾 Classification report (test)
              precision    recall  f1-score   support

           A       0.00      0.00      0.00       169
           B       0.42      0.82      0.56      6313
           C       0.92      0.69      0.79     22730
           M       0.66      0.40      0.50       297
           X       0.00      0.00      0.00         0

    accuracy                           0.71     29509
   macro avg       0.40      0.38      0.37     29509
weighted avg       0.81      0.71      0.73     29509


🧩 Confusion matrix (counts)
         pred_A  pred_B  pred_C  pred_M  pred_X
true_A       0     169       0       0       0
true_B       0    5208    1105       0       0
true_C       0    7053   15616      61       0
true_M   

In [28]:
# ==========================================================
# 5) Gradient Boosting 
# ==========================================================
ytr_comp, present_gb, to_compact_gb, to_original_gb = compact_labels(ytr_enc)
gb = GradientBoostingClassifier(n_estimators=150, learning_rate=0.1, max_depth=3, random_state=0)
gb.fit(Xtr, ytr_comp)

res_gb, yhat_gb = evaluate_and_print("GradientBoosting", gb, Xtr, ytr_enc, Xte, yte_enc,
                                     present_gb, to_original_gb, ALL_CLASSES)
add_model_result("GradientBoosting", gb, present_gb, to_original_gb, res_gb, yhat_gb)


📊 Train : acc=0.8611 | bacc=0.8759 | f1m=0.8812 | f1w=0.8609
📊 Test  : acc=0.7382 | bacc=0.6036 | f1m=0.5546 | f1w=0.7583

🧾 Classification report (test)
              precision    recall  f1-score   support

           A       0.00      0.00      0.00       169
           B       0.45      0.89      0.60      6313
           C       0.96      0.70      0.81     22730
           M       0.81      0.82      0.81       297
           X       0.00      0.00      0.00         0

    accuracy                           0.74     29509
   macro avg       0.44      0.48      0.44     29509
weighted avg       0.84      0.74      0.76     29509


🧩 Confusion matrix (counts)
         pred_A  pred_B  pred_C  pred_M  pred_X
true_A       0     169       0       0       0
true_B       0    5641     672       0       0
true_C       0    6772   15899      59       0
true_M       0       0      53     244       0
true_X       0       0       0       0       0

🧩 Confusion matrix (per-class)
         pre

In [30]:
# ==========================================================
# 5) Gradient Boosting (tuned)
# ==========================================================
# Pondération des échantillons (équilibrage des classes)
sample_weight_tr = compute_sample_weight(class_weight="balanced", y=ytr_enc)

# Modèle de base (proche de ce qui marchait le mieux chez toi)
gb = GradientBoostingClassifier(
    n_estimators=150,
    learning_rate=0.1,
    max_depth=3,
    random_state=42
)

# Cross-validation 3-fold stratifiée
cv3 = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Petite grille "directionnelle" (élargis si besoin)
grid_gb = {
    "n_estimators":  [50, 100],
    "learning_rate": [0.2,0.4],
    "max_depth":     [4,6,8],
    # optionnel :
    # "subsample":     [1.0, 0.8],
    # "max_features":  ["sqrt", None],
}

# GridSearchCV
gs_gb = GridSearchCV(
    estimator=gb,
    param_grid=grid_gb,
    scoring="f1_macro",
    cv=cv3,
    n_jobs=-1,
    verbose=0
)

# Fit avec pondération des classes
gs_gb.fit(Xtr, ytr_enc, sample_weight=sample_weight_tr)

# Résultats de la recherche
print("GB best params:", gs_gb.best_params_,
      "best CV f1_macro:", round(gs_gb.best_score_, 4))

# Refit final + évaluation (mapping identité car on est déjà sur 0..4)
gb_best = gs_gb.best_estimator_
res_gb, yhat_gb = evaluate_and_print(
    "GradientBoosting (tuned)",
    gb_best,
    Xtr, ytr_enc,
    Xte, yte_enc,
    present=np.unique(ytr_enc),
    to_original={i: i for i in range(len(ALL_CLASSES))},
    ALL_CLASSES=ALL_CLASSES
)

# Stockage pour comparaison ultérieure
add_model_result("GradientBoosting (tuned)", gb_best,
                 np.unique(ytr_enc),
                 {i: i for i in range(len(ALL_CLASSES))},
                 res_gb, yhat_gb)

GB best params: {'learning_rate': 0.2, 'max_depth': 8, 'n_estimators': 100} best CV f1_macro: 0.8883

📊 Train : acc=0.9236 | bacc=0.9476 | f1m=0.9478 | f1w=0.9236
📊 Test  : acc=0.7244 | bacc=0.5745 | f1m=0.5265 | f1w=0.7450

🧾 Classification report (test)
              precision    recall  f1-score   support

           A       0.00      0.00      0.00       169
           B       0.43      0.84      0.57      6313
           C       0.93      0.70      0.80     22730
           M       0.71      0.76      0.74       297
           X       0.00      0.00      0.00         0

    accuracy                           0.72     29509
   macro avg       0.42      0.46      0.42     29509
weighted avg       0.82      0.72      0.75     29509


🧩 Confusion matrix (counts)
         pred_A  pred_B  pred_C  pred_M  pred_X
true_A       0     169       0       0       0
true_B       0    5274    1039       0       0
true_C       0    6764   15874      92       0
true_M       0       0      70     22

In [32]:
# ==========================================================
# 6) XGBoost 
# ==========================================================
ytr_comp, present_xgb, to_compact_xgb, to_original_xgb = compact_labels(ytr_enc)
k = len(present_xgb)

xgb = XGBClassifier(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="multi:softprob",
    eval_metric="mlogloss",
    tree_method="hist",
    num_class=k,
    n_jobs=-1,
    random_state=0
)

xgb.fit(Xtr, ytr_comp)

res_xgb, yhat_xgb = evaluate_and_print(
    "XGBoost", xgb, Xtr, ytr_enc, Xte, yte_enc,
    present_xgb, to_original_xgb, ALL_CLASSES
)

add_model_result("XGBoost", xgb, present_xgb, to_original_xgb, res_xgb, yhat_xgb)


📊 Train : acc=0.8799 | bacc=0.9068 | f1m=0.9063 | f1w=0.8795
📊 Test  : acc=0.7191 | bacc=0.5690 | f1m=0.5354 | f1w=0.7412

🧾 Classification report (test)
              precision    recall  f1-score   support

           A       0.00      0.00      0.00       169
           B       0.43      0.88      0.58      6313
           C       0.95      0.68      0.79     22730
           M       0.84      0.71      0.77       297
           X       0.00      0.00      0.00         0

    accuracy                           0.72     29509
   macro avg       0.44      0.46      0.43     29509
weighted avg       0.83      0.72      0.74     29509


🧩 Confusion matrix (counts)
         pred_A  pred_B  pred_C  pred_M  pred_X
true_A       0     169       0       0       0
true_B       0    5577     736       0       0
true_C       0    7258   15432      40       0
true_M       0       0      85     212       0
true_X       0       0       0       0       0

🧩 Confusion matrix (per-class)
         pre

In [34]:
# ==========================================================
# 6) XGBoost (si dispo)
# ==========================================================
# 1) Compactage des labels (gère classes absentes)
ytr_comp, present_xgb, to_compact_xgb, to_original_xgb = compact_labels(ytr_enc)
k = len(present_xgb)

# 2) Pondération des classes sur les labels compactés
sample_weight_tr = compute_sample_weight(class_weight="balanced", y=ytr_comp)

# 3) Modèle de base
xgb_base = XGBClassifier(
    objective="multi:softprob",
    eval_metric="mlogloss",
    tree_method="hist",
    num_class=k,
    n_jobs=-1,
    random_state=42
)

# 4) CV 3-fold
cv3 = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# 5) Petite grille directionnelle (rapide)
grid_xgb = {
    "n_estimators":      [150, 200],
    "max_depth":         [4, 6],
    "learning_rate":     [0.05, 0.1],
    "subsample":         [0.8, 1.0],
    "colsample_bytree":  [0.8, 1.0],
}

# 6) GridSearchCV
gs_xgb = GridSearchCV(
    estimator=xgb_base,
    param_grid=grid_xgb,
    scoring="f1_macro",
    cv=cv3,
    n_jobs=-1,
    verbose=0
)

# 7) Fit avec sample_weight
gs_xgb.fit(Xtr, ytr_comp, sample_weight=sample_weight_tr)

# 8) Résultats de la recherche
print("XGB best params:", gs_xgb.best_params_,
      "best CV f1_macro:", round(gs_xgb.best_score_, 4))

# 9) Refit final + évaluation
xgb_best = gs_xgb.best_estimator_
res_xgb, yhat_xgb = evaluate_and_print(
    "XGBoost (tuned)",
    xgb_best,
    Xtr, ytr_enc,
    Xte, yte_enc,
    present_xgb,
    to_original_xgb,
    ALL_CLASSES
)

# 10) Stockage pour comparatif global
add_model_result("XGBoost (tuned)",
                 xgb_best,
                 present_xgb,
                 to_original_xgb,
                 res_xgb,
                 yhat_xgb)

XGB best params: {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 200, 'subsample': 0.8} best CV f1_macro: 0.8927

📊 Train : acc=0.8805 | bacc=0.9187 | f1m=0.9059 | f1w=0.8805
📊 Test  : acc=0.7379 | bacc=0.6045 | f1m=0.5442 | f1w=0.7574

🧾 Classification report (test)
              precision    recall  f1-score   support

           A       0.00      0.00      0.00       169
           B       0.45      0.85      0.59      6313
           C       0.94      0.71      0.81     22730
           M       0.72      0.86      0.78       297
           X       0.00      0.00      0.00         0

    accuracy                           0.74     29509
   macro avg       0.42      0.48      0.44     29509
weighted avg       0.83      0.74      0.76     29509


🧩 Confusion matrix (counts)
         pred_A  pred_B  pred_C  pred_M  pred_X
true_A       0     169       0       0       0
true_B       0    5382     931       0       0
true_C       0    6490   16140     100  

In [38]:
# ==========================
# Tableau comparatif final
# ==========================
def build_leaderboard():
    # 1) Source prioritaire: results_table (contient déjà "model")
    if "results_table" in globals() and isinstance(results_table, list) and len(results_table) > 0:
        df = pd.DataFrame(results_table)
    # 2) Sinon, on tente de reconstruire avec fitted_pool + results_list
    elif ("results_list" in globals() and isinstance(results_list, list) and len(results_list) > 0
          and "fitted_pool" in globals() and isinstance(fitted_pool, dict) and len(fitted_pool) > 0):
        model_names = list(fitted_pool.keys())
        m = min(len(model_names), len(results_list))
        df = pd.DataFrame([{"model": model_names[i], **results_list[i]} for i in range(m)])
    else:
        print("⚠️ Aucun résultat à afficher (results_table/results_list vides).")
        return

    df = df.sort_values(["f1m_test", "bacc_test", "acc_test"], ascending=False).reset_index(drop=True)
    print("\n🏁 Tableau comparatif (tri par F1-macro test) :")
    print(df.to_string(index=False))

    if "model" in df.columns:
        best_name = df.iloc[0]["model"]
        print(f"\n⭐ Meilleur modèle du run : {best_name}")
    else:
        print("\n⚠️ Colonne 'model' absente : pense à utiliser add_model_result(...) dans chaque bloc.")

build_leaderboard()



🏁 Tableau comparatif (tri par F1-macro test) :
                     model  acc_train  bacc_train  f1m_train  f1w_train  acc_test  bacc_test  f1m_test  f1w_test
          GradientBoosting   0.861142    0.875911   0.881204   0.860869  0.738215   0.603643  0.554605  0.758304
           XGBoost (tuned)   0.880475    0.918658   0.905948   0.880514  0.737944   0.604455  0.544241  0.757364
                   XGBoost   0.879917    0.906753   0.906282   0.879519  0.719137   0.569037  0.535365  0.741151
  GradientBoosting (tuned)   0.923596    0.947636   0.947752   0.923580  0.724355   0.574525  0.526499  0.745048
      DecisionTree (tuned)   0.984196    0.987945   0.989154   0.984162  0.693314   0.553484  0.514238  0.717430
              DecisionTree   0.984281    0.987967   0.989211   0.984247  0.694093   0.551332  0.512349  0.718053
      RandomForest (tuned)   0.961961    0.973518   0.966360   0.961929  0.709750   0.479007  0.461487  0.731018
LogisticRegression (tuned)   0.828597    0.87572

In [40]:
# =========================
# 0) helpers génériques
# =========================
H_NEXT = 718  # 12h "observables" dans ton dataset (peut être 720 si complet)

def safe_to_datetime(s):
    return pd.to_datetime(s.astype(str), utc=True, errors="coerce")

def get_last_minutes_block(X_test, mask_test, Xte, minutes=H_NEXT):
    """Retourne X12_t (features transformées) et t12 (timestamps) pour les 'minutes' dernières minutes réelles du test."""
    # timeline côté X_test
    if "time" in X_test.columns:
        t_all = safe_to_datetime(X_test["time"])
    elif isinstance(X_test.index, pd.DatetimeIndex):
        t_all = pd.to_datetime(X_test.index, utc=True, errors="coerce").to_series()
    elif "date" in X_test.columns:
        t_all = safe_to_datetime(X_test["date"])
    else:
        raise KeyError("Pas de colonne temps ('time' ou 'date') dans X_test.")

    # indices du test (après filtre y non-NaN) et tri par temps
    idx_test = X_test.index[mask_test]
    t_test_sorted = (
        pd.DataFrame({"time": t_all.loc[idx_test].values}, index=idx_test)
        .dropna()
        .sort_values("time")
    )
    # prendre les 'minutes' dernières
    last_idx = t_test_sorted.tail(minutes).index
    # positions dans Xte (qui est X_test_t[mask_test])
    pos_map = pd.Series(range(len(idx_test)), index=idx_test)
    sel_pos = pos_map.loc[last_idx].sort_values()
    X_last = Xte[sel_pos.values]
    t_last = t_test_sorted.loc[last_idx, "time"].sort_values().reset_index(drop=True)
    return X_last, t_last

def softmax_from_decision(scores):
    scores = np.array(scores)
    if scores.ndim == 1:
        scores = np.column_stack([-scores, scores])
    m = scores.max(axis=1, keepdims=True)
    exp = np.exp(scores - m)
    return exp / exp.sum(axis=1, keepdims=True)

def safe_predict_proba(estimator, X):
    """Renvoie (proba, classes_idx) où classes_idx = estimator.classes_ (indices compacts)."""
    if hasattr(estimator, "predict_proba"):
        p = estimator.predict_proba(X)
        return p, estimator.classes_
    elif hasattr(estimator, "decision_function"):
        p = softmax_from_decision(estimator.decision_function(X))
        # si le modèle ne donne pas la même shape que len(classes_), on harmonise
        classes_ = getattr(estimator, "classes_", np.arange(p.shape[1]))
        return p, classes_
    else:
        # fallback uniform (à éviter en prod, mais utile pour garder le flux)
        k = len(getattr(estimator, "classes_", [0,1]))
        n = X.shape[0]
        return np.full((n, k), 1.0/k), getattr(estimator, "classes_", np.arange(k))

def build_718_table_for_model(name, fitted_entry, X_last, t_last, ALL_CLASSES):
    """
    Construit le DataFrame minute->probas/classes pour 'name' depuis fitted_pool[name].
    fitted_entry = (clf, to_original, present)
    """
    clf, to_original, present = fitted_entry

    # proba sur classes COMPACTES présentes pendant l'entraînement (ex: [0,1,2] => B,C,M)
    proba_compact, compact_classes = safe_predict_proba(clf, X_last)  # shape: (N, k_present)

    # mapping compact -> global index (0..len(ALL_CLASSES)-1)
    compact_to_global = np.vectorize(to_original.get)(compact_classes)  # ex: [1,2,3] (B,C,M)

    # construire un tableau proba sur TOUTES les classes globales A..X (même si absentes)
    dfp = pd.DataFrame(0.0, index=np.arange(len(t_last)), columns=ALL_CLASSES)
    # nom des colonnes pour les classes présentes
    present_names = ALL_CLASSES[compact_to_global]
    # injecter les proba au bon endroit
    for j, cname in enumerate(present_names):
        dfp[cname] = proba_compact[:, j]

    # ajouter time + classes dérivées
    dfp.insert(0, "time", t_last.values)
    dfp["pred_class"]  = ALL_CLASSES[dfp[ALL_CLASSES].values.argmax(axis=1)]
    dfp["pred_strong"] = dfp["pred_class"].isin(["M","X"]).astype(int)

    # tri par temps (sécurité)
    dfp = dfp.dropna(subset=["time"]).copy()
    dfp["time"] = pd.to_datetime(dfp["time"], utc=True, errors="coerce")
    dfp = dfp.sort_values("time").reset_index(drop=True)

    # plages continues
    change = dfp["pred_class"].ne(dfp["pred_class"].shift(1))
    dfp["_grp"] = change.cumsum()
    spans = (
        dfp.groupby("_grp", as_index=False)
           .agg(start=("time", "first"),
                end=("time", "last"),
                **{"class": ("pred_class", "first")},
                minutes=("time", "size"))
           .drop(columns=["_grp"])
    )
    return dfp, spans

def describe_718(dfp, spans, name, ALL_CLASSES):
    print(f"\n================ {name} — 718 minutes ================")
    print("\n⏱️ Plages continues :")
    print(spans.to_string(index=False))

    print("\n📊 Comptes classes prédites (718 min) :")
    print(dfp["pred_class"].value_counts().to_string())

    print("\n📈 Probas moyennes (718 min) :")
    print(dfp[ALL_CLASSES].mean().round(3).to_string())

    print("\n🏆 % minutes où chaque classe est 1ère proba :")
    for c in ALL_CLASSES:
        others = [x for x in ALL_CLASSES if x != c]
        share = (dfp[c] >= dfp[others].max(axis=1)).mean() * 100
        print(f" - {c}: {share:.2f}%")

# =========================
# 1) extraire X_last & t_last une seule fois
# =========================
X12_t, t12 = get_last_minutes_block(X_test, mask_test, Xte, minutes=H_NEXT)

# =========================
# 2) générer pour chaque modèle du pool
# =========================
pred_tables_718 = {}
spans_718 = {}

for name, fitted_entry in fitted_pool.items():
    df_12h, spans = build_718_table_for_model(name, fitted_entry, X12_t, t12, ALL_CLASSES)
    pred_tables_718[name] = df_12h
    spans_718[name] = spans
    # impression détaillée (tu peux commenter si trop verbeux)
    describe_718(df_12h, spans, name, ALL_CLASSES)

# =========================
# 3) tableau comparatif des parts de classes (718 min)
# =========================
summary = []
for name, dfp in pred_tables_718.items():
    vc = dfp["pred_class"].value_counts(normalize=True).reindex(ALL_CLASSES, fill_value=0.0)
    summary.append({"model": name, **{f"p_{c}": vc.get(c, 0.0) for c in ALL_CLASSES}})

summary_df = pd.DataFrame(summary).sort_values("model")
print("\n🏁 Part des classes prédites sur 718 min (par modèle) :")
print((summary_df.set_index("model") * 100).round(2).to_string())




⏱️ Plages continues :
                    start                       end class  minutes
2025-08-11 00:41:00+00:00 2025-08-11 01:11:00+00:00     M       31
2025-08-11 01:12:00+00:00 2025-08-11 01:16:00+00:00     C        5
2025-08-11 01:17:00+00:00 2025-08-11 01:27:00+00:00     M       11
2025-08-11 01:28:00+00:00 2025-08-11 03:03:00+00:00     C       96
2025-08-11 03:04:00+00:00 2025-08-11 03:09:00+00:00     M        6
2025-08-11 03:10:00+00:00 2025-08-11 03:46:00+00:00     C       37
2025-08-11 03:47:00+00:00 2025-08-11 04:03:00+00:00     M       17
2025-08-11 04:04:00+00:00 2025-08-11 04:55:00+00:00     C       52
2025-08-11 04:56:00+00:00 2025-08-11 05:01:00+00:00     M        6
2025-08-11 05:02:00+00:00 2025-08-11 05:30:00+00:00     C       29
2025-08-11 05:31:00+00:00 2025-08-11 05:42:00+00:00     M       12
2025-08-11 05:43:00+00:00 2025-08-11 06:02:00+00:00     C       20
2025-08-11 06:03:00+00:00 2025-08-11 06:14:00+00:00     M       12
2025-08-11 06:15:00+00:00 2025-08-11 0