In [9]:
# %% [markdown]
# # Visualisation des clusters (tous les CSV du dossier)

# %%
import os, re, glob
import pandas as pd
import numpy as np
import plotly.express as px

# Dossier o√π tu as export√© les clusters
OUT_DIR = "../../cluster_results"
PLOT_DIR = os.path.join(OUT_DIR, "plots_html")
os.makedirs(PLOT_DIR, exist_ok=True)

META_COLS = ["player_name", "positions", "equipe", "team", "tag"]


In [10]:
# %% [markdown]
# ## Chargement des fichiers clusters_*.csv

# %%
cluster_files = sorted(glob.glob(os.path.join(OUT_DIR, "clusters_*.csv")))
if not cluster_files:
    raise FileNotFoundError(f"Aucun fichier clusters_*.csv trouv√© dans {OUT_DIR}")

dfs = []
for fp in cluster_files:
    try:
        df = pd.read_csv(fp)
        df["__source_file"] = os.path.basename(fp)
        dfs.append(df)
    except Exception as e:
        print(f"[warn] Impossible de lire {fp}: {e}")

print(f"Charg√©: {len(dfs)} fichiers")


Charg√©: 7 fichiers


In [11]:
# %% [markdown]
# ## Utilitaires pour d√©tecter les colonnes (t-SNE / UMAP / PCA / ISOMAP)

# %%
def detect_coord_columns(df: pd.DataFrame):
    """
    D√©tecte automatiquement les colonnes d'embedding en privil√©giant les pr√©fixes connus,
    et les ordonne (1,2,3). Si rien n'est trouv√©, prend les premi√®res colonnes num√©riques.
    """
    prefixes = ["tsne", "tSNE", "UMAP", "umap", "PCA", "pca", "isomap", "ISOMap", "ISOMAP"]
    candidates = []

    # Cherche des patterns du type prefix_num (ex: tsne_1, UMAP_2, PCA_3)
    for p in prefixes:
        patt = re.compile(rf"^{p}[_\s]*([0-9]+)$", re.IGNORECASE)
        hits = []
        for c in df.columns:
            m = patt.match(c.replace("__", "_").replace(" ", "_"))
            if m:
                try:
                    hits.append((int(m.group(1)), c))
                except Exception:
                    pass
        if hits:
            hits = [name for _, name in sorted(hits, key=lambda x: x[0])]
            candidates = hits
            break

    # Si rien trouv√©: fallback sur num√©riques hors meta/cluster
    if not candidates:
        banned = set(META_COLS + ["cluster", "__source_file"])
        num_cols = [c for c in df.select_dtypes(include=[np.number]).columns if c not in banned]
        candidates = num_cols

    # Garde jusqu'√† 3 dimensions
    coords = candidates[:3]
    return coords


In [12]:
# %% [markdown]
# ## Profils de clusters (normalisation min‚Äìmax par colonne) + sauvegarde des plots

# %%
import os, re, pandas as pd, numpy as np, matplotlib.pyplot as plt, seaborn as sns, plotly.express as px

# --- Stats √† utiliser ---
stats_cols = [
    'per90_gls',         # buts par 90 min
    'per90_ast',         # passes d√©cisives par 90 min
    'carries_prog',      # conduites de balle progressives
    'pct_take_on_suc',   # % de dribbles r√©ussis
    'tkl_plus_int',      # tacles + interceptions
    'pct_air_dual_won',  # % de duels a√©riens gagn√©s
    'ball_recov',        # ballons r√©cup√©r√©s
    'age'                # √¢ge du joueur
]

# --- Donn√©es de base (stats) ---
file_path_base = "../../raw_data/joueurs_ligue1_2024_2025.csv"
data_base = pd.read_csv(file_path_base, encoding="utf-8")

# --- Dossiers de sortie ---
PROFILE_DIR = os.path.join(OUT_DIR, "profiles_normed")
PLOT_DIR = os.path.join(PROFILE_DIR, "plots")
os.makedirs(PROFILE_DIR, exist_ok=True)
os.makedirs(PLOT_DIR, exist_ok=True)

def plot_cluster_profiles_normed(df_cluster: pd.DataFrame,
                                 data_base: pd.DataFrame,
                                 stats_cols,
                                 cluster_name: str = "Cluster",
                                 save_png: bool = True,
                                 save_html: bool = True):
    """
    Jointure cluster ‚Üí stats, calcule les moyennes par cluster,
    normalise chaque colonne (0‚Äì1) et sauvegarde les plots.
    """
    if "player_name" not in df_cluster.columns or "cluster" not in df_cluster.columns:
        print(f"[skip] {cluster_name}: colonnes manquantes ('player_name', 'cluster').")
        return None

    # --- Jointure ---
    df_clusters = df_cluster[['player_name', 'cluster']].copy()
    df_stats = data_base[['player_name'] + stats_cols].copy()
    df_merged = pd.merge(df_clusters, df_stats, on='player_name', how='left')

    # --- Moyennes et tailles ---
    cluster_means = df_merged.groupby('cluster', dropna=False)[stats_cols].mean(numeric_only=True)
    cluster_sizes = df_merged.groupby('cluster', dropna=False).size().rename("cluster_size")
    cluster_means['cluster_size'] = cluster_sizes

    # --- Normalisation min‚Äìmax par colonne ---
    cluster_means_normed = cluster_means.copy()
    for col in stats_cols:
        cmin, cmax = cluster_means[col].min(), cluster_means[col].max()
        cluster_means_normed[col] = (
            (cluster_means[col] - cmin) / (cmax - cmin)
            if cmax > cmin else 0.0
        )

    # --- Sauvegarde CSV ---
    safe_name = re.sub(r"[^A-Za-z0-9_\-]+", "_", cluster_name)
    csv_path = os.path.join(PROFILE_DIR, f"profiles_normed_{safe_name}.csv")
    cluster_means_normed.to_csv(csv_path, index=True)

    # --- Heatmap (matplotlib) ---
    plt.figure(figsize=(12, 6))
    sns.heatmap(cluster_means_normed[stats_cols], annot=True, fmt=".2f", cmap="coolwarm")
    plt.title(f"Profils normalis√©s (0‚Äì1) par cluster ‚Äî {cluster_name}")
    plt.ylabel("Cluster")
    plt.xlabel("Statistiques")
    plt.tight_layout()

    if save_png:
        png_path = os.path.join(PLOT_DIR, f"profiles_normed_{safe_name}.png")
        plt.savefig(png_path, dpi=150)
    plt.close()

    # --- Plot interactif (plotly) ---
    if save_html:
        fig = px.imshow(
            cluster_means_normed[stats_cols],
            labels=dict(x="Statistiques", y="Cluster", color="Valeur normalis√©e"),
            title=f"Profils normalis√©s par cluster ‚Äî {cluster_name}",
            color_continuous_scale="RdBu",
            aspect="auto"
        )
        html_path = os.path.join(PLOT_DIR, f"profiles_normed_{safe_name}.html")
        fig.write_html(html_path, include_plotlyjs="cdn")

    print(f"‚úÖ Sauv√©: {csv_path}")
    if save_png: print(f"üìä PNG: {png_path}")
    if save_html: print(f"üåê HTML: {html_path}")

    return cluster_means_normed


# --- Appliquer sur tous les fichiers de clusters d√©j√† charg√©s ---
profiles_normed = {}
for df in dfs:
    src = df["__source_file"].iloc[0] if "__source_file" in df.columns else "clusters"
    cluster_name = os.path.splitext(src)[0]
    res = plot_cluster_profiles_normed(df, data_base, stats_cols, cluster_name=cluster_name)
    profiles_normed[cluster_name] = res

# --- R√©capitulatif rapide ---
pd.DataFrame([{"file": k, "n_clusters": len(v)} for k, v in profiles_normed.items() if v is not None])


‚úÖ Sauv√©: ../../cluster_results/profiles_normed/profiles_normed_clusters_tsne_affinity.csv
üìä PNG: ../../cluster_results/profiles_normed/plots/profiles_normed_clusters_tsne_affinity.png
üåê HTML: ../../cluster_results/profiles_normed/plots/profiles_normed_clusters_tsne_affinity.html
‚úÖ Sauv√©: ../../cluster_results/profiles_normed/profiles_normed_clusters_tsne_birch.csv
üìä PNG: ../../cluster_results/profiles_normed/plots/profiles_normed_clusters_tsne_birch.png
üåê HTML: ../../cluster_results/profiles_normed/plots/profiles_normed_clusters_tsne_birch.html
‚úÖ Sauv√©: ../../cluster_results/profiles_normed/profiles_normed_clusters_tsne_dbscan.csv
üìä PNG: ../../cluster_results/profiles_normed/plots/profiles_normed_clusters_tsne_dbscan.png
üåê HTML: ../../cluster_results/profiles_normed/plots/profiles_normed_clusters_tsne_dbscan.html
‚úÖ Sauv√©: ../../cluster_results/profiles_normed/profiles_normed_clusters_tsne_gmm.csv
üìä PNG: ../../cluster_results/profiles_normed/plots/profil

Unnamed: 0,file,n_clusters
0,clusters_tsne_affinity,12
1,clusters_tsne_birch,8
2,clusters_tsne_dbscan,2
3,clusters_tsne_gmm,5
4,clusters_umap_agglomerative,5
5,clusters_umap_kmeans_5,5
6,clusters_umap_ward,10


In [13]:
# %% [markdown]
# ## G√©n√©ration des figures pour chaque fichier de clusters

# %%
exports = []
for df in dfs:
    src = df["__source_file"].iloc[0] if "__source_file" in df.columns else "clusters"
    title = f"Clusters ‚Äî {src}"
    fig, out_html = plot_clusters_df(df, title)
    if out_html:
        exports.append({"file": src, "html": out_html, "n": len(df), "n_clusters": df["cluster"].nunique()})

# R√©cap des exports
exp_df = pd.DataFrame(exports)
display(exp_df)
print(f"Exports HTML dans: {PLOT_DIR}")


NameError: name 'plot_clusters_df' is not defined