In [1]:
import mlflow, os, sys

# Point MLflow at the project-level ``mlruns`` directory (one level above the
# current working directory) rather than wherever the notebook happens to run.
TRACKING_URI = "file:../mlruns"
EXPERIMENT_NAME = "BAAC"

mlflow.set_tracking_uri(TRACKING_URI)
print("Tracking URI:", mlflow.get_tracking_uri())

# Select the experiment under which the runs of this notebook are recorded.
mlflow.set_experiment(EXPERIMENT_NAME)
print("✅ MLflow prêt !")


Tracking URI: file:../mlruns
✅ MLflow prêt !


In [3]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import recall_score

In [5]:
# Load the final, already-prepared CSV.
DATA_PATH = '../data/baac_final.csv'
df2 = pd.read_csv(DATA_PATH, low_memory=False)

# Quick sanity report: row count, every column except the target, and the
# class balance of the target (as proportions rounded to 2 decimals).
n_rows = len(df2)
non_target_cols = df2.columns[df2.columns != 'grav_acc'].tolist()
target_share = df2['grav_acc'].value_counts(normalize=True).round(2)

print(f"✅ CSV chargé: {n_rows:,} lignes")
print("Features:", non_target_cols)
print("grav_acc:", target_share)

# Temporal split: every year before 2024 is used for training, and 2024 is
# held out as the test set.
is_train_year = df2["annee"] < 2024
is_test_year = df2["annee"] == 2024
df_train = df2.loc[is_train_year]
df_test = df2.loc[is_test_year]

# Columns removed from the model inputs: the target itself and ``num_acc``
# (presumably an accident identifier — TODO confirm).
NON_FEATURE_COLS = ["grav_acc", "num_acc"]

X_train, y_train = df_train.drop(columns=NON_FEATURE_COLS), df_train["grav_acc"]
X_test, y_test = df_test.drop(columns=NON_FEATURE_COLS), df_test["grav_acc"]

n_train, n_test = len(X_train), len(X_test)
print(f"✅ Train 2019-2023: {n_train:,} | Test 2024: {n_test:,}")


✅ CSV chargé: 654,784 lignes
Features: ['num_acc', 'annee', 'dep', 'weekend', 'saison', 'lum', 'atm', 'col', 'nb_usager', 'nb_hommes', 'nb_femmes', 'presence_pieton', 'presence_enfant', 'presence_senior', 'nb_df', 'presence_velo', 'presence_2rm', 'presence_3rm_quad', 'presence_vl_vu', 'presence_pl', 'presence_tc', 'presence_edp', 'vma', 'nbv', 'catr', 'circ', 'surf', 'presence_bande_cyclable', 'heure_sin', 'heure_cos', 'jour_sin', 'jour_cos']
grav_acc: grav_acc
0    0.66
1    0.34
Name: proportion, dtype: float64
✅ Train 2019-2023: 501,742 | Test 2024: 153,042


In [7]:
# Pipeline
# Columns are routed to a preprocessing branch according to their dtype.
# NOTE(review): a column whose dtype appears in neither list (e.g. bool or
# float32) is assigned to no branch; ColumnTransformer's default remainder
# policy then drops it from the model inputs.
num_vars = X_train.select_dtypes(include=["int32","int64", "float64"]).columns.tolist()
cat_vars = X_train.select_dtypes(include=["object", "category"]).columns.tolist()

# Numeric branch: median imputation followed by standardisation.
numeric_branch = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])
# Categorical branch: most-frequent imputation followed by dense one-hot
# encoding; categories unseen during fit are ignored at transform time.
categorical_branch = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])

preprocessor = ColumnTransformer([
    ("num", numeric_branch, num_vars),
    ("cat", categorical_branch, cat_vars),
])

# Quick sample: fit on a random subset to keep the search fast. The size is
# clamped so that a dataset smaller than N_SAMPLE does not make ``sample`` raise.
N_SAMPLE = 30000
RANDOM_STATE = 42
X_train_small = X_train.sample(n=min(N_SAMPLE, len(X_train)), random_state=RANDOM_STATE)
y_train_small = y_train.loc[X_train_small.index]

# Give the encoder uniformly-typed (string) values, but keep missing entries
# as NaN. A blanket ``astype(str)`` rewrites NaN as the literal string "nan":
# the imputer then never fires on the training sample and the encoder learns a
# spurious "nan" category, whereas X_test (which is not cast) has its NaN
# imputed with the most frequent value — an inconsistent train/test treatment.
# NOTE(review): non-missing values of X_test are still passed uncast; this
# assumes they are already strings — confirm.
for col in X_train_small.select_dtypes("object").columns:
    raw_values = X_train_small[col]
    X_train_small[col] = raw_values.astype(str).where(raw_values.notna())


# Full model: preprocessing + random forest, wrapped in a (single-candidate)
# grid search scored on recall with 3-fold cross-validation.
pipe_rf = Pipeline([("preprocessing", preprocessor),
                    ("classifier", RandomForestClassifier(random_state=RANDOM_STATE, n_jobs=-1))])

grid_rf = GridSearchCV(pipe_rf, {"classifier__n_estimators": [50]}, cv=3, scoring="recall")
grid_rf.fit(X_train_small, y_train_small)

print("✅ Pipeline OK !")


✅ Pipeline OK !


In [12]:
import tempfile
import warnings

import matplotlib.pyplot as plt
import seaborn as sns
import mlflow
import mlflow.sklearn
from mlflow.models.signature import infer_signature
from sklearn.metrics import (
    confusion_matrix, classification_report, roc_auc_score, recall_score, precision_score, f1_score
)

# Silence only the MLflow hint about integer columns in the inferred schema;
# every other warning stays visible.
warnings.filterwarnings(
    "ignore",
    message="Hint: Inferred schema contains integer column\\(s\\).*",
    category=UserWarning,
)

with mlflow.start_run(run_name="RF_BAAC_Final") as run:
    # ------------------------------------------------------------
    # 1) Predictions and headline metrics on the 2024 test set
    # ------------------------------------------------------------
    y_pred = grid_rf.predict(X_test)
    # Second probability column, used as the score for ROC-AUC.
    y_proba = grid_rf.predict_proba(X_test)[:, 1]

    recall = recall_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_proba)

    cm = confusion_matrix(y_test, y_pred)
    report_txt = classification_report(y_test, y_pred)

    # ------------------------------------------------------------
    # 2) Log hyper-parameters, metrics and dataset sizes
    # ------------------------------------------------------------
    mlflow.log_params(grid_rf.best_params_)

    test_metrics = {
        "recall_2024": recall,
        "precision_2024": precision,
        "f1_2024": f1,
        "roc_auc_2024": auc,
    }
    for metric_name, metric_value in test_metrics.items():
        mlflow.log_metric(metric_name, metric_value)

    size_params = {
        "n_features_input": X_train.shape[1],
        "n_train_total": len(X_train),
        "n_train_sample": len(X_train_small),
        "n_test_2024": len(X_test),
    }
    for param_name, param_value in size_params.items():
        mlflow.log_param(param_name, param_value)

    # ------------------------------------------------------------
    # 3) Artifacts: text report + confusion-matrix figure
    # ------------------------------------------------------------
    # Files are staged in a temporary directory and must be logged before the
    # ``with`` block exits, since the directory is removed afterwards.
    with tempfile.TemporaryDirectory() as tmp:
        # Plain-text classification report
        report_path = f"{tmp}/classification_report_2024.txt"
        with open(report_path, "w", encoding="utf-8") as report_file:
            report_file.write(report_txt)
        mlflow.log_artifact(report_path, artifact_path="reports")

        # Confusion matrix rendered as an annotated heatmap
        fig_path = f"{tmp}/confusion_matrix_2024.png"
        fig, ax = plt.subplots(figsize=(5,4))
        sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
        ax.set_title("Matrice de confusion - Test 2024")
        ax.set_xlabel("Prédictions")
        ax.set_ylabel("Réel")
        fig.tight_layout()
        fig.savefig(fig_path, dpi=150)
        plt.close(fig)
        mlflow.log_artifact(fig_path, artifact_path="figures")

    # ------------------------------------------------------------
    # 4) Log the model (full pipeline = preprocessing + random forest)
    # ------------------------------------------------------------
    best_model = grid_rf.best_estimator_
    # Signature inferred from the training sample and the model's predictions on it.
    signature = infer_signature(X_train_small, best_model.predict(X_train_small))

    mlflow.sklearn.log_model(
        sk_model=best_model,
        artifact_path="model",
        signature=signature,
        input_example=X_train_small.head(5)
    )

    # ------------------------------------------------------------
    # 5) Console summary
    # ------------------------------------------------------------
    print(f"✅ Recall 2024: {recall:.3f}")
    print(f"✅ Precision 2024: {precision:.3f} | F1 2024: {f1:.3f} | ROC-AUC 2024: {auc:.3f}")
    print(f"✅ Matrice de confusion: {cm.tolist()}")
    print(f"✅ Run: {run.info.run_id}")


✅ Recall 2024: 0.471
✅ Precision 2024: 0.669 | F1 2024: 0.553 | ROC-AUC 2024: 0.781
✅ Matrice de confusion: [[89730, 11980], [27157, 24175]]
✅ Run: d71817505c924c11ba88c46d46747250
