In [29]:
# %% [markdown]
# # Clustering t-SNE & UMAP — Ligue 1
# Cette section charge les données réduites et prépare l'environnement.

# %%
import os
import numpy as np
import pandas as pd

from sklearn.cluster import AffinityPropagation, AgglomerativeClustering, DBSCAN, KMeans
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

# --- Input files: reduced embeddings, one CSV per technique. ---
# The paths are relative, so they resolve against the current working directory.
file_path_PCA    = "../../reduced_data/pca/embeddings/joueurs_ligue1_PCA_custom.csv"
file_path_tSNE   = "../../reduced_data/tsne/embeddings/joueurs_ligue1_tSNE_custom_GK.csv"
file_path_ISOMAP = "../../reduced_data/isomap/embeddings/joueurs_ligue1_ISOMap_raw.csv"
file_path_UMAP   = "../../reduced_data/umap/embeddings/joueurs_ligue1_2024_2025_clean_per90_umap3d_best_embedding.csv"

# Output directory for the exported cluster CSVs (created here if it is missing).
OUT_DIR = "../../cluster_results"
os.makedirs(OUT_DIR, exist_ok=True)

# Metadata columns to carry over into the exports when a dataset has them.
META_COLS = ["player_name", "positions", "equipe", "team", "tag"]


In [30]:
# %% [markdown]
# ## Chargement des jeux de données

# %%
def load_dataset(path: str) -> pd.DataFrame:
    """Read one embedding CSV into a DataFrame.

    Parameters
    ----------
    path : str
        Location of the CSV file on the local filesystem.

    Returns
    -------
    pd.DataFrame
        The result of ``pd.read_csv(path)`` with default options.

    Raises
    ------
    FileNotFoundError
        If ``path`` does not point to an existing regular file. This includes
        the case where ``path`` is a directory.
    """
    # `isfile` rather than `exists`: a directory "exists" too, and would
    # otherwise slip through and fail later inside pandas with an unrelated
    # error instead of this explicit message.
    if not os.path.isfile(path):
        raise FileNotFoundError(f"Fichier introuvable: {path}")
    return pd.read_csv(path)

# Load the four reduced embeddings, one DataFrame per reduction technique.
df_pca = load_dataset(file_path_PCA)
df_tsne = load_dataset(file_path_tSNE)
df_isomap = load_dataset(file_path_ISOMAP)
df_umap = load_dataset(file_path_UMAP)

# Report (rows, columns) for every dataset on a single line.
# NOTE(review): the recorded output shows 356 rows for t-SNE versus 333 for the
# other three files, so the embeddings do not cover the same rows. Confirm
# this is intended before comparing metrics across datasets.
print(
    "Shapes →",
    ", ".join(
        f"{label}={frame.shape}"
        for label, frame in (
            ("PCA", df_pca),
            ("tSNE", df_tsne),
            ("ISOMAP", df_isomap),
            ("UMAP", df_umap),
        )
    ),
)


Shapes → PCA=(333, 37), tSNE=(356, 7), ISOMAP=(333, 8), UMAP=(333, 9)


In [31]:
# %% [markdown]
# ## Sélection des features (colonnes numériques)

# %%
def select_feature_matrix(df: pd.DataFrame, meta_cols=None, label_cols=("cluster",)) -> pd.DataFrame:
    """Return the numeric feature columns of ``df`` to feed to a clusterer.

    Metadata columns and label columns are removed *before* the numeric
    selection. Without this, a numeric metadata column (for example a numeric
    ``tag``) or a ``cluster`` column left over from an earlier export would be
    used as a clustering feature. ``export_clusters`` already treats
    ``cluster`` as a label rather than a coordinate, and this function now
    does the same.

    Parameters
    ----------
    df : pd.DataFrame
        Source table. It is not modified.
    meta_cols : iterable of str, optional
        Column names that are metadata. Defaults to the module-level
        ``META_COLS`` when left as ``None``.
    label_cols : iterable of str, optional
        Column names holding cluster labels. Defaults to ``("cluster",)``.

    Returns
    -------
    pd.DataFrame
        A new frame with the remaining numeric columns, in their original
        order. If no numeric column remains, falls back to every column that
        is neither metadata nor a label (these may be non-numeric).
    """
    if meta_cols is None:
        meta_cols = META_COLS
    excluded = set(meta_cols) | set(label_cols)

    # Boolean column mask: names absent from `df` are simply ignored.
    candidates = df.loc[:, ~df.columns.isin(list(excluded))]
    X = candidates.select_dtypes(include=[np.number]).copy()
    if X.shape[1] == 0:  # fallback: nothing numeric left, hand back what remains
        X = candidates.copy()
    return X

# Build one numeric feature matrix per embedding.
X_pca = select_feature_matrix(df_pca)
X_tsne = select_feature_matrix(df_tsne)
X_isomap = select_feature_matrix(df_isomap)
X_umap = select_feature_matrix(df_umap)

# Report how many feature columns each matrix ended up with.
print(
    "Num features →",
    ", ".join(
        f"{label}={matrix.shape[1]}"
        for label, matrix in (
            ("PCA", X_pca),
            ("tSNE", X_tsne),
            ("ISOMAP", X_isomap),
            ("UMAP", X_umap),
        )
    ),
)


Num features → PCA=34, tSNE=4, ISOMAP=5, UMAP=4


In [32]:
# %% [markdown]
# ## Métriques & export

# %%
from typing import Optional, Tuple

def safe_metrics(X: pd.DataFrame, labels: np.ndarray) -> Tuple[Optional[float], Optional[float], Optional[float], int, Optional[int]]:
    """Compute internal clustering scores without ever raising.

    Points labelled ``-1`` are treated as noise. They are excluded from the
    cluster count and from the data handed to the three scores, so the count
    and the scores describe the same set of clusters.

    Parameters
    ----------
    X : pd.DataFrame
        Feature matrix, one row per sample (any array-like works).
    labels : np.ndarray
        One cluster label per row of ``X``. Lists are accepted as well.

    Returns
    -------
    tuple
        ``(silhouette, calinski_harabasz, davies_bouldin, n_clusters, n_noise)``.
        ``n_clusters`` counts the distinct labels other than ``-1``.
        ``n_noise`` is the number of ``-1`` labels, or ``None`` when there are
        none. The three scores are ``None`` when fewer than two clusters exist
        or when the corresponding metric fails; a failure also emits a
        ``RuntimeWarning``.
    """
    import warnings

    # Coerce first: on a plain list `labels == -1` is a scalar False, which
    # would silently report zero noise points.
    labels = np.asarray(labels)
    noise_mask = labels == -1
    has_noise = bool(noise_mask.any())

    n_clusters = int(np.unique(labels[~noise_mask]).size)
    n_noise = int(noise_mask.sum()) if has_noise else None

    if n_clusters <= 1:
        return None, None, None, n_clusters, n_noise

    if has_noise:
        # Score only the clustered points; otherwise the metrics would treat
        # the noise bucket as one more cluster.
        keep = ~noise_mask
        X_eval = np.asarray(X)[keep]
        labels_eval = labels[keep]
    else:
        X_eval, labels_eval = X, labels

    def _score(metric):
        # Best effort: one failing metric must not abort the whole run, but
        # the reason is surfaced instead of being swallowed.
        try:
            return metric(X_eval, labels_eval)
        except Exception as exc:
            warnings.warn(
                f"{getattr(metric, '__name__', metric)} failed: {exc}",
                RuntimeWarning,
            )
            return None

    sil = _score(silhouette_score)
    ch = _score(calinski_harabasz_score)
    db = _score(davies_bouldin_score)
    return sil, ch, db, n_clusters, n_noise

# %% [markdown]
# ## Export clusters — version complète (métadonnées + coordonnées automatiques)

# %%
def export_clusters(df_src: pd.DataFrame, labels: np.ndarray, dataset_key: str, method_name: str) -> str:
    """Write the cluster assignment of one run to a CSV file and return its path.

    The exported table contains, in this order:
      - the metadata columns listed in ``META_COLS`` that exist in ``df_src``,
      - a ``cluster`` column holding ``labels``,
      - every numeric column of ``df_src`` except one named ``cluster``.

    The file is saved as ``clusters_{dataset_key}_{method_name}.csv`` inside
    ``OUT_DIR``, without the index.
    """
    kept_meta = [name for name in META_COLS if name in df_src.columns]
    numeric_names = df_src.select_dtypes(include=[np.number]).columns
    coord_cols = [name for name in numeric_names if name != "cluster"]

    # Start from the metadata, or from an empty frame sharing the source index.
    if kept_meta:
        exported = df_src[kept_meta].copy()
    else:
        exported = pd.DataFrame(index=df_src.index)

    exported["cluster"] = labels
    for name in coord_cols:
        # `.values` copies positionally, ignoring index alignment.
        exported[name] = df_src[name].values

    destination = os.path.join(OUT_DIR, f"clusters_{dataset_key}_{method_name}.csv")
    exported.to_csv(destination, index=False)
    return destination



In [33]:
# %% [markdown]
# ## Définition des algorithmes (hyperparamètres imposés)

# %%
def run_affinity_tsne(X: pd.DataFrame) -> np.ndarray:
    """Fit Affinity Propagation on ``X`` and return the cluster labels.

    Fixed hyperparameters chosen for the t-SNE embedding:
    damping=0.75, preference=-40, random_state=42.
    """
    settings = {"damping": 0.75, "preference": -40, "random_state": 42}
    return AffinityPropagation(**settings).fit_predict(X)

def run_gmm_tsne(X: pd.DataFrame) -> np.ndarray:
    """Fit a Gaussian mixture on ``X`` and return the predicted component per row.

    Fixed hyperparameters chosen for the t-SNE embedding:
    n_components=5, covariance_type="full", random_state=42.
    """
    settings = {"n_components": 5, "covariance_type": "full", "random_state": 42}
    # `fit` returns the fitted estimator, so the prediction can be chained.
    return GaussianMixture(**settings).fit(X).predict(X)

def run_dbscan_tsne(X: pd.DataFrame) -> np.ndarray:
    """Run DBSCAN on ``X`` and return the labels (``-1`` marks noise points).

    Fixed hyperparameters chosen for the t-SNE embedding:
    eps=0.9521711055021824, min_samples=20.
    """
    settings = {"eps": 0.9521711055021824, "min_samples": 20}
    return DBSCAN(**settings).fit_predict(X)

def run_agglomerative_umap(X: pd.DataFrame) -> np.ndarray:
    """Run agglomerative clustering on ``X`` and return the cluster labels.

    Fixed hyperparameters chosen for the UMAP embedding:
    n_clusters=5, linkage="ward".
    """
    settings = {"n_clusters": 5, "linkage": "ward"}
    return AgglomerativeClustering(**settings).fit_predict(X)

def run_kmeans_umap(X: pd.DataFrame) -> np.ndarray:
    """Run k-means on ``X`` and return the cluster labels.

    Fixed hyperparameters chosen for the UMAP embedding:
    n_clusters=5, n_init=10, random_state=42.
    """
    settings = {"n_clusters": 5, "n_init": 10, "random_state": 42}
    return KMeans(**settings).fit_predict(X)


In [34]:
# %% [markdown]
# ## Exécution — t-SNE : Affinity, GMM, DBSCAN

# %%
# Summary rows, one dict per (method, dataset) run. Reset here because this
# is the first execution cell.
results = []

# The three t-SNE runs share the same pipeline (cluster -> metrics -> export
# -> record -> log), so they are described as data and driven by one loop
# instead of three copy-pasted stanzas.
# Fields: log tag, runner, "method" key in `results`, export file suffix,
#         whether the log line reports the noise count.
tsne_runs = [
    ("Affinity/tSNE", run_affinity_tsne, "affinity_tsne", "affinity", False),  # 1) Affinity (t-SNE)
    ("GMM/tSNE", run_gmm_tsne, "gmm_tsne", "gmm", False),                      # 2) GMM (t-SNE)
    ("DBSCAN/tSNE", run_dbscan_tsne, "dbscan_tsne", "dbscan", True),           # 3) DBSCAN (t-SNE)
]

for log_tag, runner, method_key, file_suffix, show_noise in tsne_runs:
    labels = runner(X_tsne)
    sil, ch, db, n_cl, n_noise = safe_metrics(X_tsne, labels)
    path = export_clusters(df_tsne, labels, "tsne", file_suffix)
    results.append({"method": method_key, "dataset": "tsne", "n_clusters": n_cl, "n_noise": n_noise,
                    "silhouette": sil, "calinski_harabasz": ch, "davies_bouldin": db, "clusters_csv": path})
    # Only DBSCAN can produce noise, so only its log line mentions it.
    noise_part = f", noise={n_noise}" if show_noise else ""
    print(f"[{log_tag}] clusters={n_cl}{noise_part}, sil={sil}, ch={ch}, db={db} → {path}")


[Affinity/tSNE] clusters=12, sil=0.305359449009325, ch=232.84069377413527, db=1.1282189966630074 → ../../cluster_results/clusters_tsne_affinity.csv
[GMM/tSNE] clusters=5, sil=0.3284210534968433, ch=248.30416988612498, db=1.1642518000146584 → ../../cluster_results/clusters_tsne_gmm.csv
[DBSCAN/tSNE] clusters=1, noise=336, sil=None, ch=None, db=None → ../../cluster_results/clusters_tsne_dbscan.csv


In [35]:
# %% [markdown]
# ## Exécution — UMAP : Agglomerative(ward), KMeans(5)

# %%
# The two UMAP runs share the same pipeline (cluster -> metrics -> export ->
# record -> log), so they are described as data and driven by one loop.
# Fields: log tag, runner, "method" key in `results`, export file suffix.
umap_runs = [
    ("Agglo/UMAP", run_agglomerative_umap, "agg_umap", "agglomerative"),  # 4) Agglomerative (UMAP)
    ("KMeans/UMAP", run_kmeans_umap, "kmeans_umap", "kmeans_5"),          # 5) KMeans (UMAP)
]

# Make the cell idempotent: drop rows left by a previous execution of this
# cell so re-running it does not append duplicate entries to `results`.
# On a fresh Run All this filter removes nothing.
umap_method_keys = {run[2] for run in umap_runs}
results = [row for row in results if row["method"] not in umap_method_keys]

for log_tag, runner, method_key, file_suffix in umap_runs:
    labels = runner(X_umap)
    sil, ch, db, n_cl, n_noise = safe_metrics(X_umap, labels)
    path = export_clusters(df_umap, labels, "umap", file_suffix)
    results.append({"method": method_key, "dataset": "umap", "n_clusters": n_cl, "n_noise": n_noise,
                    "silhouette": sil, "calinski_harabasz": ch, "davies_bouldin": db, "clusters_csv": path})
    print(f"[{log_tag}] clusters={n_cl}, sil={sil}, ch={ch}, db={db} → {path}")


[Agglo/UMAP] clusters=5, sil=0.2829845995242783, ch=196.4034339336274, db=1.05369096786117 → ../../cluster_results/clusters_umap_agglomerative.csv
[KMeans/UMAP] clusters=5, sil=0.30292913527555065, ch=218.0853831742956, db=1.0237728497593295 → ../../cluster_results/clusters_umap_kmeans_5.csv
