In [8]:
# %% [markdown]
# # Robustesse & stabilité des clusters (UMAP — Agglomerative & K-Means)

# %%
import os, re, glob, warnings
import numpy as np
import pandas as pd

from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import (
    adjusted_rand_score as ARI,
    normalized_mutual_info_score as NMI,
    silhouette_score, calinski_harabasz_score, davies_bouldin_score
)
from sklearn.utils import resample

# Blanket filter: every warning is silenced for the rest of the session.
warnings.filterwarnings("ignore")

# Directory layout: annotated cluster CSVs are read from IN_DIR, robustness
# reports are written to OUT_DIR (created here if it does not exist yet).
BASE_DIR = "../../cluster_results"
IN_DIR, OUT_DIR = (
    os.path.join(BASE_DIR, leaf) for leaf in ("clusters_annotes", "robustness")
)
os.makedirs(OUT_DIR, exist_ok=True)

# Expected input files, with some tolerance on naming. Candidates are tried
# in order and the first path that exists on disk is used.
FILE_AGGLO_CANDIDATES = [
    os.path.join(IN_DIR, "clusters_annotes_clusters_umap_agglomerative.csv"),
    os.path.join(IN_DIR, "clusters_annotes_umap_agglomerative.csv"),
]
# The original list repeated the same path three times, so no alternative
# name was ever tried. The second entry mirrors the short naming variant
# accepted for the agglomerative file.
# NOTE(review): confirm this is the alternative name that was intended.
FILE_KMEANS_CANDIDATES = [
    os.path.join(IN_DIR, "clusters_annotes_clusters_umap_kmeans_5.csv"),  # name as provided
    os.path.join(IN_DIR, "clusters_annotes_umap_kmeans_5.csv"),
]

def first_existing(paths):
    """Return the first entry of ``paths`` that exists on disk, or None if none does."""
    return next((candidate for candidate in paths if os.path.exists(candidate)), None)

# Resolve each candidate list to the first path present on disk.
FILE_AGGLO, FILE_KMEANS = map(
    first_existing, (FILE_AGGLO_CANDIDATES, FILE_KMEANS_CANDIDATES)
)

# Fail early, listing every candidate, if either input file is missing.
if any(resolved is None for resolved in (FILE_AGGLO, FILE_KMEANS)):
    missing_msg = (
        f"Introuvable :\n"
        f"  AGGLO candidates : {FILE_AGGLO_CANDIDATES}\n"
        f"  KMEANS candidates : {FILE_KMEANS_CANDIDATES}"
    )
    raise FileNotFoundError(missing_msg)

print("✅ AGGLO  :", FILE_AGGLO)
print("✅ KMEANS :", FILE_KMEANS)


✅ AGGLO  : ../../cluster_results/clusters_annotes/clusters_annotes_clusters_umap_agglomerative.csv
✅ KMEANS : ../../cluster_results/clusters_annotes/clusters_annotes_clusters_umap_kmeans_5.csv


In [9]:
# %% [markdown]
# ## Chargement & détection des colonnes d'embedding

# %%



# Metadata / label columns that are never treated as coordinates: they are
# excluded from the numeric fallback in detect_coord_columns.
META_COLS = ["player_name", "positions", "equipe", "team", "tag", "cluster"]

def load_df(path):
    """Read the CSV at ``path`` and return it as a DataFrame.

    Raises:
        ValueError: if the file has no ``cluster`` column.
    """
    frame = pd.read_csv(path)
    if "cluster" in frame.columns:
        return frame
    raise ValueError(f"'cluster' manquant dans {path}")

def detect_coord_columns(df: pd.DataFrame):
    """Pick the columns to use as clustering coordinates.

    Numeric columns whose name contains a token starting with ``umap``,
    ``tsne``, ``pca`` or ``isomap`` are preferred (``umap_1``, ``UMAP2``,
    ``x_pca_1`` ...). Tokens are the runs of letters/digits in the lowercased
    name, so a plain substring hit such as ``upcast_rate`` (which contains
    "pca") is no longer mistaken for an embedding column.

    If fewer than two such columns are found, the function silently falls
    back to every numeric column that is not listed in ``META_COLS``.

    Args:
        df: frame to inspect; column labels may be of any type.

    Returns:
        List of column labels, in the frame's column order.
    """
    prefixes = ("umap", "tsne", "pca", "isomap")

    def _looks_like_embedding(label) -> bool:
        # str() keeps non-string labels (e.g. integer headers) from raising.
        tokens = re.split(r"[^0-9a-z]+", str(label).lower())
        return any(token.startswith(prefixes) for token in tokens)

    coord_cols = [
        c for c in df.columns
        if _looks_like_embedding(c) and pd.api.types.is_numeric_dtype(df[c])
    ]
    if len(coord_cols) >= 2:
        return coord_cols

    # Fallback: all numeric columns except metadata / labels.
    meta_set = set(META_COLS)
    return [c for c in df.select_dtypes(include=[np.number]).columns if c not in meta_set]

def _count_clusters(labels):
    """Number of distinct labels, leaving out the noise label -1 when it is present."""
    if -1 in labels:
        return np.unique(labels[labels != -1]).size
    return np.unique(labels).size


# Load both annotated partitions.
df_agglo, df_kmeans = load_df(FILE_AGGLO), load_df(FILE_KMEANS)

# Coordinate columns detected independently for each file.
cols_agglo, cols_kmeans = detect_coord_columns(df_agglo), detect_coord_columns(df_kmeans)

# Feature matrices (float) and reference labels.
X_agglo = df_agglo[cols_agglo].to_numpy(dtype=float)
y_agglo = df_agglo["cluster"].to_numpy()
X_kmeans = df_kmeans[cols_kmeans].to_numpy(dtype=float)
y_kmeans = df_kmeans["cluster"].to_numpy()

# Number of clusters in each reference partition.
k_agglo, k_kmeans = _count_clusters(y_agglo), _count_clusters(y_kmeans)

print(f"AGGLO  → X:{X_agglo.shape}, k={k_agglo}, coords={cols_agglo[:3]}{'...' if len(cols_agglo)>3 else ''}")
print(f"KMEANS → X:{X_kmeans.shape}, k={k_kmeans}, coords={cols_kmeans[:3]}{'...' if len(cols_kmeans)>3 else ''}")


AGGLO  → X:(341, 8), k=5, coords=['per90_gls', 'per90_ast', 'carries_prog']...
KMEANS → X:(341, 8), k=5, coords=['per90_gls', 'per90_ast', 'carries_prog']...


In [10]:
# %% [markdown]
# ## Fonctions : stabilité (bootstrap), robustesse (bruit), indices internes

# %%
# Base seed: bootstrap iterations and noise levels derive their seeds from it by offset.
RANDOM_STATE   = 42
# Number of bootstrap resamples used by the stability test.
N_BOOTSTRAPS   = 20
NOISE_LEVELS   = [0.01, 0.03, 0.05]  # 1%, 3%, 5% of each feature's standard deviation

def recluster(X, method: str, k: int, seed: int):
    """Fit a fresh ``k``-cluster model on ``X`` and return its labels.

    ``method`` is either ``"kmeans"`` (KMeans, n_init=10, seeded with ``seed``)
    or ``"agglomerative"`` (Ward linkage; ``seed`` is not used).

    Raises:
        ValueError: for any other ``method``.
    """
    if method not in ("kmeans", "agglomerative"):
        raise ValueError("Méthode non supportée")
    if method == "kmeans":
        model = KMeans(n_clusters=k, n_init=10, random_state=seed)
    else:
        model = AgglomerativeClustering(n_clusters=k, linkage="ward")
    return model.fit_predict(X)

def stability_bootstrap(X: np.ndarray, y_base: np.ndarray, method: str, k: int, n_iter: int):
    """Bootstrap stability of a partition.

    For each of ``n_iter`` iterations, rows are resampled with replacement,
    the resample is re-clustered, and the new labels are compared (ARI, NMI)
    with the reference labels of the drawn rows.

    Returns:
        Tuple ``(ARI mean, ARI std, NMI mean, NMI std)`` over the iterations.
    """
    np.random.seed(RANDOM_STATE)
    row_ids = np.arange(len(y_base))
    scores = []
    for step in range(n_iter):
        step_seed = RANDOM_STATE + step
        X_boot, drawn = resample(X, row_ids, replace=True, random_state=step_seed)
        labels_boot = recluster(X_boot, method, k, seed=step_seed)
        reference = y_base[drawn]
        scores.append((ARI(reference, labels_boot), NMI(reference, labels_boot)))
    ari_vals = [pair[0] for pair in scores]
    nmi_vals = [pair[1] for pair in scores]
    return np.mean(ari_vals), np.std(ari_vals), np.mean(nmi_vals), np.std(nmi_vals)

def robustness_noise(X: np.ndarray, y_base: np.ndarray, method: str, k: int, levels):
    """Sensitivity of a partition to Gaussian noise on the features.

    For every level in ``levels``, zero-mean Gaussian noise with a per-feature
    standard deviation of ``level * std(feature)`` is added to ``X``; the noisy
    matrix is re-clustered and the labels are compared with ``y_base`` on all rows.

    Args:
        X: feature matrix (rows aligned with ``y_base``).
        y_base: reference labels.
        method: clustering method name forwarded to ``recluster``.
        k: number of clusters.
        levels: iterable of noise levels expressed as fractions (0.01 = 1%).

    Returns:
        DataFrame with one row per level and columns ``noise_pct``, ``ARI``, ``NMI``.
    """
    rng = np.random.default_rng(RANDOM_STATE)
    # Small epsilon keeps the scale strictly positive for constant features.
    sigma = X.std(axis=0) + 1e-12
    rows = []
    for lvl in levels:
        # Round before converting: plain int() truncates float products such
        # as 0.29 * 100 == 28.999999999999996 down to 28.
        pct = int(round(lvl * 100))
        noise = rng.normal(0, sigma * lvl, size=X.shape)
        y_new = recluster(X + noise, method, k, seed=RANDOM_STATE + pct)
        rows.append({
            "noise_pct": pct,
            "ARI": ARI(y_base, y_new),
            "NMI": NMI(y_base, y_new),
        })
    return pd.DataFrame(rows)

def internal_indices(X: np.ndarray, y: np.ndarray):
    """Internal validity indices (silhouette, Calinski-Harabasz, Davies-Bouldin).

    Points labelled ``-1`` (noise) are left out of both the cluster count and
    the scores. Previously they were excluded from the count only, so the
    scores treated noise as one extra cluster. Without a ``-1`` label the
    inputs are scored unchanged.

    Returns:
        Dict with keys ``silhouette``, ``calinski``, ``davies``; all NaN when
        fewer than two (non-noise) clusters are present.
    """
    y = np.asarray(y)
    noise_mask = np.asarray(y == -1)
    if noise_mask.any():
        keep = ~noise_mask
        X_eval, y_eval = np.asarray(X)[keep], y[keep]
    else:
        X_eval, y_eval = X, y

    if np.unique(y_eval).size < 2:
        return dict(silhouette=np.nan, calinski=np.nan, davies=np.nan)
    return dict(
        silhouette = float(silhouette_score(X_eval, y_eval)),
        calinski   = float(calinski_harabasz_score(X_eval, y_eval)),
        davies     = float(davies_bouldin_score(X_eval, y_eval)),
    )


In [13]:
# %% [markdown]
# ## Lancer les tests (avec assainissement des NaN/Inf) et sauvegarder les résumés

# %%
from sklearn.impute import SimpleImputer
import numpy as np
import pandas as pd
import os

def sanitize_X(X: np.ndarray):
    """Replace ±Inf with NaN, then impute NaN with the per-column median.

    Works on a copy: ``np.asarray(X, dtype=float)`` returns the caller's own
    array when it is already float, so the original in-place Inf→NaN
    assignment modified the input matrix as a side effect.

    Prints the number of imputed values and the input shape.

    Returns:
        The imputed matrix produced by ``SimpleImputer(strategy="median")``.
    """
    X = np.array(X, dtype=float)  # np.array copies by default
    X[~np.isfinite(X)] = np.nan   # ±Inf -> NaN
    n_nan = np.isnan(X).sum()
    X_clean = SimpleImputer(strategy="median").fit_transform(X)
    print(f"[sanitize] NaN imputés: {n_nan} | shape={X.shape}")
    return X_clean

def _round_or_nan(value, ndigits):
    """Round ``value`` to ``ndigits`` decimals, passing NaN through unchanged."""
    return round(value, ndigits) if not np.isnan(value) else np.nan


def _evaluate_method(path, method, X, y, k):
    """Run bootstrap stability, internal indices and noise robustness for one method.

    Returns the summary row (dict) and the per-noise-level DataFrame, the
    latter tagged with a ``method`` column.
    """
    ari_mu, ari_sd, nmi_mu, nmi_sd = stability_bootstrap(X, y, method, k, N_BOOTSTRAPS)
    ints = internal_indices(X, y)
    noise_df = robustness_noise(X, y, method, k, NOISE_LEVELS)
    noise_df["method"] = method

    row = {
        "file": os.path.basename(path),
        "method": method,
        "n": len(y),
        "k": k,
        "ARI_mean": round(ari_mu, 4),
        "ARI_std": round(ari_sd, 4),
        "NMI_mean": round(nmi_mu, 4),
        "NMI_std": round(nmi_sd, 4),
        "silhouette": _round_or_nan(ints["silhouette"], 4),
        "calinski": _round_or_nan(ints["calinski"], 2),
        "davies": _round_or_nan(ints["davies"], 4),
    }
    return row, noise_df


# Sanitize the matrices before running the tests.
X_agglo_clean  = sanitize_X(X_agglo)
X_kmeans_clean = sanitize_X(X_kmeans)

# Same pipeline for both methods (agglomerative first, then k-means).
row_agglo, df_noise_agg = _evaluate_method(
    FILE_AGGLO, "agglomerative", X_agglo_clean, y_agglo, k_agglo
)
row_kmeans, df_noise_km = _evaluate_method(
    FILE_KMEANS, "kmeans", X_kmeans_clean, y_kmeans, k_kmeans
)

summ_rows = [row_agglo, row_kmeans]
noise_details = [df_noise_agg, df_noise_km]

# ---- Save the reports ----
df_summary = pd.DataFrame(summ_rows)
df_noise   = pd.concat(noise_details, ignore_index=True)

sum_path   = os.path.join(OUT_DIR, "robustness_stability_summary.csv")
noise_path = os.path.join(OUT_DIR, "robustness_noise_sensitivity.csv")

df_summary.to_csv(sum_path, index=False)
df_noise.to_csv(noise_path, index=False)

display(df_summary)
display(df_noise)

print(f"✅ Sauvegardes :\n - {sum_path}\n - {noise_path}")


[sanitize] NaN imputés: 6 | shape=(341, 8)
[sanitize] NaN imputés: 6 | shape=(341, 8)


Unnamed: 0,file,method,n,k,ARI_mean,ARI_std,NMI_mean,NMI_std,silhouette,calinski,davies
0,clusters_annotes_clusters_umap_agglomerative.csv,agglomerative,341,5,0.0741,0.0364,0.1637,0.0355,-0.0721,17.28,7.1092
1,clusters_annotes_clusters_umap_kmeans_5.csv,kmeans,341,5,0.07,0.0277,0.1335,0.0349,-0.0448,13.71,8.1248


Unnamed: 0,noise_pct,ARI,NMI,method
0,1,0.041946,0.118427,agglomerative
1,3,0.137377,0.210646,agglomerative
2,5,0.114104,0.208644,agglomerative
3,1,0.04952,0.101829,kmeans
4,3,0.042615,0.098538,kmeans
5,5,0.042878,0.090091,kmeans


✅ Sauvegardes :
 - ../../cluster_results/robustness/robustness_stability_summary.csv
 - ../../cluster_results/robustness/robustness_noise_sensitivity.csv


# ## Interprétation (à insérer dans le rapport)
#
# - **Stabilité (bootstrap / ARI, NMI)** : des moyennes élevées (et faibles écarts-types)
#   signifient que la partition est peu sensible au rééchantillonnage des joueurs.
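# - **Robustesse au bruit** : si ARI/NMI restent élevés quand on injecte un bruit gaussien
#   d'amplitude 1–5 % de l'écart-type de chaque feature, la structure des clusters est robuste
#   aux petites variations des coordonnées utilisées pour le clustering (colonnes de l'embedding
#   UMAP si elles sont présentes dans le CSV, sinon les features numériques hors métadonnées).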
# - **Indices internes** :
#   - Silhouette ↑ et Davies-Bouldin ↓ indiquent une meilleure séparation et une meilleure compacité ;
#     Calinski-Harabasz ↑ reflète des clusters compacts et bien séparés.
# - **Comparaison méthodes** :
#   - Si Agglomerative > K-Means sur ARI/NMI et indices internes, il capte mieux des sous-profils.
#   - Si K-Means est proche en scores, **sa simplicité** en fait un très bon compromis.
