Objectif : fine-tuning + modèle final + explications

Méthodologie : StratifiedKFold, scoring = PR-AUC (average precision), avec justification de ce choix de métrique

GridSearch : LogReg + RandomForest

Évaluation test + choix seuil

Feature importance globale : permutation importance

SHAP global : beeswarm

SHAP local : waterfall sur 2–3 individus

Synthèse : causes potentielles + limites + next steps

In [None]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold, train_test_split

from technova_attrition.config import PATHS, SETTINGS
from technova_attrition.evaluation import (
    evaluate_classifier,
    find_threshold_for_recall,
    plot_precision_recall,
)
from technova_attrition.modeling import make_logreg, make_random_forest
from technova_attrition.preprocessing import FeatureGroups
from technova_attrition.tuning import run_grid_search, summarize_grid_search

# Load the engineered feature table from the processed-data folder.
features_path = PATHS.data_processed / "employees_features.parquet"
df = pd.read_parquet(features_path)

# Separate the target (cast to int) from the predictors.
TARGET = "a_quitte_l_entreprise"
y = df[TARGET].astype(int)
X = df.drop(columns=[TARGET])

# One shared seed drives both the hold-out split and the CV folds.
seed = SETTINGS.random_state

# 80/20 hold-out split, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=seed
)

# Shuffled 5-fold stratified CV, reused by the grid searches below.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

# Class proportions of the target, rounded to 3 decimals (cell output).
y.value_counts(normalize=True).round(3)

In [None]:
# Feature groups: one list of column names per group. Names that are absent
# from X are filtered out further down by keep_existing, so a misspelled name
# is dropped silently rather than raising.

# --- Numeric features ---
num_cont = [
    "age", "augmentation_salaire_precedente", "distance_domicile_travail",
    "proba_chgt_experience_par_an", "proba_chgt_experience_par_an_adulte",
    "ratio_experience_vie_adulte",
]

# NOTE(review): "annes_sous_responsable_actuel" is spelled differently from the
# other "annees_" columns; presumably it mirrors the dataset -- verify.
num_log = [
    "revenu_mensuel", "annee_experience_totale", "annees_dans_l_entreprise",
    "annees_dans_le_poste_actuel", "annes_sous_responsable_actuel",
    "annees_depuis_la_derniere_promotion",
]

# NOTE(review): "nombre_experiences_precedents" looks like a misspelled
# duplicate of "nombre_experiences_precedentes"; confirm against the dataset.
num_disc = [
    "nombre_participation_pee", "nb_formations_suivies",
    "nombre_employee_sous_responsabilite",
    "nombre_experiences_precedentes", "nombre_experiences_precedents",
]

# --- Binary features ---
bin_cols = ["genre", "heure_supplementaires", "changement_poste"]

# --- Nominal categorical features ---
cat_nom = ["statut_marital", "departement", "poste", "domaine_etude"]

# --- Ordinal categorical features ---
cat_ord = [
    "satisfaction_employee_environnement", "satisfaction_employee_nature_travail",
    "satisfaction_employee_equipe", "satisfaction_employee_equilibre_pro_perso",
    "note_evaluation_precedente", "note_evaluation_actuelle",
    "niveau_hierarchique_poste", "niveau_education",
    "frequence_deplacement", "evolution_note",
]


# Automatic clean-up: keep only the columns that are actually present.
def keep_existing(cols):
    """Return the entries of ``cols`` that are column names of the global ``X``.

    The input order is preserved. Names that are not columns of ``X`` are
    dropped silently (no warning, no error).
    """
    available = X.columns
    kept = []
    for name in cols:
        if name in available:
            kept.append(name)
    return kept


# Restrict every feature group to the columns actually present in X.
num_cont, num_log, num_disc, bin_cols, cat_nom, cat_ord = (
    keep_existing(group)
    for group in (num_cont, num_log, num_disc, bin_cols, cat_nom, cat_ord)
)

# Ordinal orders: by default the sorted observed values are used (this can be
# replaced by an explicit business order).
# NOTE(review): sorted order is not guaranteed to match the ordinal meaning,
# e.g. for text-valued categories -- confirm per column.
ord_categories = []
for col in cat_ord:
    observed = df[col].dropna().unique().tolist()
    ord_categories.append(sorted(observed))

# Bundle the groups into the structure passed to the model factories below.
groups = FeatureGroups(
    num_cont=num_cont,
    num_log=num_log,
    num_disc=num_disc,
    bin_cols=bin_cols,
    cat_nom=cat_nom,
    cat_ord=cat_ord,
    ord_categories=ord_categories,
)

# Cell output: show the resulting groups.
groups

In [None]:
# Logistic-regression pipeline built from the feature groups.
logreg_pipe = make_logreg(groups)

# `l1_ratio` only has an effect with the elastic-net penalty. A single dict
# would cross it with "l1" and "l2" as well, fitting each (penalty, C) pair
# three times for the same model (36 candidates instead of 20). A list of
# sub-grids searches each penalty only over the parameters it actually uses.
# NOTE(review): assumes run_grid_search forwards `param_grid` unchanged to
# sklearn's GridSearchCV, which accepts a list of dicts -- confirm.
c_values = [0.01, 0.1, 1.0, 10.0]
param_grid_logreg = [
    {
        "model__penalty": ["l2", "l1"],
        "model__C": c_values,
    },
    {
        "model__penalty": ["elasticnet"],
        "model__C": c_values,
        "model__l1_ratio": [0.2, 0.5, 0.8],
    },
]

# Cross-validated search scored with average precision (PR-AUC).
gs_logreg = run_grid_search(
    pipeline=logreg_pipe,
    param_grid=param_grid_logreg,
    X_train=X_train,
    y_train=y_train,
    cv=cv,
    scoring="average_precision",
    n_jobs=-1,
)

# Cell output: first 10 rows of the search summary.
summary_logreg = summarize_grid_search(gs_logreg)
summary_logreg.head(10)

In [None]:
# Random-forest pipeline built from the same feature groups.
rf_pipe = make_random_forest(groups)

# Search space: 2 x 3 x 3 x 2 = 36 candidate settings for the "model" step.
param_grid_rf = {
    "model__n_estimators": [300, 600],
    "model__max_depth": [None, 6, 10],
    "model__min_samples_leaf": [1, 5, 10],
    "model__max_features": ["sqrt", 0.5],
}

# Same CV splitter and metric as the logistic-regression search.
gs_rf = run_grid_search(
    pipeline=rf_pipe, param_grid=param_grid_rf,
    X_train=X_train, y_train=y_train,
    cv=cv, scoring="average_precision", n_jobs=-1,
)

# Cell output: first 10 rows of the search summary.
summary_rf = summarize_grid_search(gs_rf)
summary_rf.head(10)

In [None]:
# Best pipeline found by each grid search.
best_logreg, best_rf = gs_logreg.best_estimator_, gs_rf.best_estimator_

# Evaluate both candidates at the default 0.5 decision threshold
# (logistic regression first, then random forest).
res_logreg, res_rf = (
    evaluate_classifier(model, X_train, y_train, X_test, y_test, threshold=0.5)
    for model in (best_logreg, best_rf)
)

# Test average precision for both models, then both test reports.
print("LogReg AP test:", res_logreg["test_ap"])
print("RF AP test:", res_rf["test_ap"])
print("\nLogReg report:\n", res_logreg["test_report"])
print("\nRF report:\n", res_rf["test_report"])

In [None]:
from sklearn.model_selection import cross_val_predict

final_model = best_logreg  # or best_rf if it is clearly better and consistent

# Test-set probabilities of the positive class: used for reporting only.
p_test = final_model.predict_proba(X_test)[:, 1]
plot_precision_recall(y_test, p_test)

# Tune the decision threshold on out-of-fold predictions from the TRAINING set.
# Picking it on the test set and then scoring that same test set would bias
# the reported metrics optimistically.
# cross_val_predict clones and refits the pipeline on each fold, so every
# training row is scored by a model that did not see it.
p_train_oof = cross_val_predict(
    final_model,
    X_train,
    y_train,
    cv=cv,
    method="predict_proba",
    n_jobs=-1,
)[:, 1]

thr = find_threshold_for_recall(y_train, p_train_oof, target_recall=0.80)
thr

In [None]:
# Evaluate the final model again, this time at the tuned threshold.
res_thr = evaluate_classifier(
    final_model, X_train, y_train, X_test, y_test, threshold=thr
)

# Print the test report, then the test confusion-matrix entry.
for key in ("test_report", "test_cm"):
    print(res_thr[key])

In [None]:
from technova_attrition.explainability import permutation_importance_df

# Global feature importance: permutation importance of the final model on the
# test split, scored with average precision and 20 repeats.
perm_df = permutation_importance_df(
    final_model,
    X_test,
    y_test,
    scoring="average_precision",
    n_repeats=20,
)

# Cell output: first 20 rows.
perm_df.head(20)

In [None]:
# Export the permutation-importance table as CSV, without the index column.
perm_path = PATHS.reports / "permutation_importance.csv"

# Create the reports folder if needed: to_csv does not create missing parent
# directories and would fail on a fresh checkout.
# NOTE(review): assumes PATHS.reports is a pathlib.Path -- confirm.
perm_path.parent.mkdir(parents=True, exist_ok=True)

perm_df.to_csv(perm_path, index=False)
perm_path

In [None]:
import shap

from technova_attrition.explainability import shap_explain_linear_model

# Background = small sample for stability; the explained set is capped the
# same way. Both draw at most 200 rows with the project-wide seed.
n_background = min(200, len(X_train))
n_explained = min(200, len(X_test))
X_bg = X_train.sample(n_background, random_state=SETTINGS.random_state)
X_exp = X_test.sample(n_explained, random_state=SETTINGS.random_state)

# The helper returns the explainer, the SHAP values, the transformed explained
# rows and the matching feature names, in that order.
explained = shap_explain_linear_model(final_model, X_bg, X_exp)
explainer, shap_values, X_exp_t, feature_names = explained

# Global beeswarm plot.
shap.summary_plot(shap_values, X_exp_t, feature_names=feature_names)

In [None]:
# Positive-class probabilities on the test split; pick the two extreme rows.
probas = final_model.predict_proba(X_test)[:, 1]
idx_high, idx_low = probas.argmax(), probas.argmin()

# Double brackets keep each selection as a one-row DataFrame.
X_one_high = X_test.iloc[[idx_high]]
X_one_low = X_test.iloc[[idx_low]]

# Transform + SHAP values for a single point, against the same background
# sample as the global explanation.
Xb = X_bg
explainer, shap_vals_high, Xh_t, fn = shap_explain_linear_model(final_model, Xb, X_one_high)
explainer, shap_vals_low, Xl_t, fn = shap_explain_linear_model(final_model, Xb, X_one_low)

# One waterfall plot per individual: highest predicted risk first, then lowest.
# Both plots take their base value from the explainer of the second call.
for point_shap, point_data in ((shap_vals_high, Xh_t), (shap_vals_low, Xl_t)):
    explanation = shap.Explanation(
        values=point_shap[0],
        base_values=explainer.expected_value,
        data=point_data[0],
        feature_names=fn,
    )
    shap.plots.waterfall(explanation)

In [None]:
import joblib

# Persist the selected pipeline with joblib.
model_path = PATHS.models / "final_model.joblib"

# Create the models folder if needed: joblib.dump does not create missing
# parent directories and would fail on a fresh checkout.
# NOTE(review): assumes PATHS.models is a pathlib.Path -- confirm.
model_path.parent.mkdir(parents=True, exist_ok=True)

joblib.dump(final_model, model_path)
model_path

In [None]:
import json

# Headline numbers of the final model, cast to plain floats so that json can
# serialise them.
metrics = {
    "threshold": float(thr),
    "test_ap": float(res_thr["test_ap"]),
    "test_roc_auc": float(res_thr["test_roc_auc"]),
}

metrics_path = PATHS.reports / "final_metrics.json"

# Create the reports folder if needed: write_text does not create missing
# parent directories.
metrics_path.parent.mkdir(parents=True, exist_ok=True)

# Explicit encoding so the file does not depend on the platform default.
metrics_path.write_text(json.dumps(metrics, indent=2), encoding="utf-8")
metrics_path