# Agglomerative Clustering – Multi-Method Pipeline

For each method, we:
1) Load the selected embedding — the one with the lowest MSE in `best_results`, falling back to the first embedding file — and lower-case its column names.
2) Scale the embedding columns (StandardScaler).
3) Grid search Agglomerative Clustering over (n_clusters, linkage).
4) Compute internal validity metrics:
      - Silhouette (↑ better)
      - Calinski–Harabasz (↑ better)
      - Davies–Bouldin (↓ better)
5) Select the best config by: Silhouette ↑, then CH ↑, then DB ↓.
6) Save the grid, the best row, the cluster assignments, and a scatter plot of the best solution over the first two embedding dimensions.


# Imports

In [17]:
import os
import re
import glob
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import (
    silhouette_score,
    calinski_harabasz_score,
    davies_bouldin_score,
)


# Constants and Directory Setup


In [18]:
# Root folder holding the dimensionality-reduction outputs, one sub-folder per method.
REDUCED_ROOT = "reduced_data"

# Methods iterated by the driver loop at the bottom of the file.
METHODS = ["umap", "pca", "tsne", "isomap"]

# UMAP-specific folders (UMAP_BEST_DIR is not referenced in the visible code).
UMAP_DIR = os.path.join(REDUCED_ROOT, "umap")
UMAP_EMB_DIR = os.path.join(UMAP_DIR, "embeddings")
UMAP_BEST_DIR = os.path.join(UMAP_DIR, "best_results")

# Embedding folders for the remaining methods.
PCA_EMB_DIR = os.path.join(REDUCED_ROOT, "pca", "embeddings")
TSNE_EMB_DIR = os.path.join(REDUCED_ROOT, "tsne", "embeddings")
ISOMAP_EMB_DIR = os.path.join(REDUCED_ROOT, "isomap", "embeddings")

# Output folders for the clustering results.
CLUST_ROOT = os.path.join("clusters", "agglomerative_clustering")
GRID_DIR = os.path.join(CLUST_ROOT, "grid_search")
BEST_DIR = os.path.join(CLUST_ROOT, "best_results")
CLUSTERS_DIR = os.path.join(CLUST_ROOT, "clusters")
PLOTS_DIR = os.path.join(CLUST_ROOT, "plots")

# Create the output tree up front (side effect at import / cell execution time).
for d in [CLUST_ROOT, GRID_DIR, BEST_DIR, CLUSTERS_DIR, PLOTS_DIR]:
    os.makedirs(d, exist_ok=True)

# Hyperparameters
# Previous grid, kept for reference: N_CLUSTERS_GRID = list(range(2, 11))
# Current grid: n_clusters from 3 to 10 inclusive.
N_CLUSTERS_GRID = list(range(3, 11))
LINKAGE_METHODS = ["ward", "complete", "average"]
# Order in which metrics break ties when picking the best configuration.
METRIC_PRIORITY = ["silhouette", "calinski_harabasz", "davies_bouldin"]

# Lower-case column names that pick_id_columns treats as identifiers when present.
KNOWN_ID_COLS = [
    "player_name", "equipe", "positions", "age",
    "player_id", "player_country_code"
]

# Silence every UserWarning for the rest of the session.
warnings.filterwarnings("ignore", category=UserWarning)


# Utilities

In [19]:
def save_csv(df: pd.DataFrame, path: str) -> None:
    """Write ``df`` to ``path`` as a UTF-8 CSV without the index column.

    Missing parent directories are created first, and the destination is
    echoed to stdout once the file has been written.

    Args:
        df: Table to persist.
        path: Destination file path; may be a bare filename.
    """
    parent = os.path.dirname(path)
    # A bare filename has an empty dirname and os.makedirs("") raises
    # FileNotFoundError, so only create the directory when there is one.
    if parent:
        os.makedirs(parent, exist_ok=True)
    df.to_csv(path, index=False, encoding="utf-8")
    print(f"💾 Saved: {path}")

def to_lowercase_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` whose column labels are lower-cased.

    The input frame is left untouched; only the copy's labels change.
    """
    lowered_labels = list(map(lambda label: label.lower(), df.columns))
    result = df.copy()
    result.columns = lowered_labels
    return result

def detect_embedding_columns(df: pd.DataFrame, method_prefix: str) -> list:
    """List the columns named ``<prefix><n>`` or ``<prefix>_<n>``, ordered by ``n``.

    Matching is case-sensitive and ``n`` is compared as an integer, so
    ``umap_10`` sorts after ``umap_2``. Columns sharing the same ``n`` keep
    their original relative order.
    """
    matcher = re.compile(rf"^{re.escape(method_prefix)}_?(\d+)$")
    hits = ((col, matcher.match(col)) for col in df.columns)
    indexed = [(col, int(found.group(1))) for col, found in hits if found]
    # list.sort is stable, which preserves column order among equal indices.
    indexed.sort(key=lambda pair: pair[1])
    return [col for col, _ in indexed]

def pick_id_columns(df: pd.DataFrame) -> list:
    """Choose the identifier columns of ``df``.

    Columns listed in ``KNOWN_ID_COLS`` are returned in that list's order
    when at least one is present; otherwise every non-numeric column is
    returned instead.
    """
    known = [name for name in KNOWN_ID_COLS if name in df.columns]
    if not known:
        # No recognised identifier: fall back to all non-numeric columns.
        return df.select_dtypes(exclude=[np.number]).columns.tolist()
    return known

def scale_embedding(X: np.ndarray) -> np.ndarray:
    """Fit a default-configured ``StandardScaler`` on ``X`` and return the transformed array."""
    return StandardScaler().fit_transform(X)

def evaluate_labels(X_scaled: np.ndarray, labels: np.ndarray) -> dict:
    """Compute internal clustering quality metrics for one labelling.

    Args:
        X_scaled: Feature matrix the labels were produced from.
        labels: Cluster label of each row of ``X_scaled``.

    Returns:
        A dict with keys ``silhouette``, ``calinski_harabasz``,
        ``davies_bouldin`` and ``n_clusters``. The three scores are NaN when
        they cannot be computed; ``n_clusters`` is the number of distinct
        labels.
    """
    n_clusters = len(set(labels))
    metrics = {"silhouette": np.nan, "calinski_harabasz": np.nan, "davies_bouldin": np.nan,
               "n_clusters": int(n_clusters)}

    # All three scores need at least two clusters and fewer clusters than
    # samples; outside that range scikit-learn raises, so skip the calls.
    if not 1 < n_clusters < len(labels):
        return metrics

    try:
        sil = silhouette_score(X_scaled, labels)
        ch = calinski_harabasz_score(X_scaled, labels)
        db = davies_bouldin_score(X_scaled, labels)
    except Exception as exc:
        # Stay best-effort (scores remain NaN) but say why, instead of
        # silently dropping the failure.
        print(f"⚠️ Could not compute clustering metrics: {exc}")
    else:
        metrics.update({
            "silhouette": float(sil),
            "calinski_harabasz": float(ch),
            "davies_bouldin": float(db)
        })

    return metrics

def select_best_by_metrics(df: pd.DataFrame, priority_list: list[str]) -> pd.Series:
    """Pick the best row of a results table by lexicographic metric ranking.

    Each metric in ``priority_list`` that exists in ``df`` gets a
    ``<metric>_rank`` column (rank 1 is best). ``davies_bouldin`` is ranked
    ascending (lower is better), every other metric descending, and NaN
    values are ranked last. Rows are then ordered by the rank columns in
    priority order and the first row is returned.

    Args:
        df: Results table, one row per configuration.
        priority_list: Metric column names, most important first.

    Returns:
        The winning row, including the added ``*_rank`` columns.

    Raises:
        ValueError: If ``df`` has no rows.
    """
    if len(df) == 0:
        raise ValueError("Cannot select a best configuration from an empty results table.")

    ranked_df = df.copy()
    for metric in priority_list:
        if metric not in df.columns:
            continue
        ascending = metric.lower() == 'davies_bouldin'
        ranked_df[f"{metric}_rank"] = ranked_df[metric].rank(
            method="min", ascending=ascending, na_option="bottom"
        )

    rank_cols = [f"{m}_rank" for m in priority_list if f"{m}_rank" in ranked_df.columns]
    if rank_cols:
        # Use a stable sort so that fully tied rows keep their original order
        # when only one rank column is available.
        ranked_df = ranked_df.sort_values(by=rank_cols, ascending=True, kind="mergesort")
    # With no usable metric there is nothing to sort by: keep the input order.
    return ranked_df.iloc[0]

def plot_best_scatter(df_full: pd.DataFrame, emb_cols: list, labels_col: str,
                      title: str, outpath: str) -> None:
    """Save a 2-D scatter plot of the first two embedding columns, coloured by cluster.

    Nothing is drawn (a notice is printed instead) when fewer than two
    embedding columns are supplied.

    Args:
        df_full: Frame holding the embedding columns and the label column.
        emb_cols: Embedding column names; only the first two are plotted.
        labels_col: Name of the column holding the cluster labels.
        title: Figure title.
        outpath: Image file to write (saved at 300 dpi).
    """
    if len(emb_cols) < 2:
        print("⚠️ Less than 2 embedding dimensions – skipping plot.")
        return

    fig = plt.figure(figsize=(8, 6))
    try:
        n_clusters = len(set(df_full[labels_col]))
        # One palette colour per distinct label.
        palette = sns.color_palette(None, max(1, n_clusters))
        sns.scatterplot(
            data=df_full,
            x=emb_cols[0], y=emb_cols[1],
            hue=labels_col, palette=palette, s=45, alpha=0.9, edgecolor="none"
        )
        plt.title(title)
        plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left")
        plt.tight_layout()
        plt.savefig(outpath, dpi=300)
    finally:
        # Always release the figure, even if plotting or saving fails, so
        # repeated calls do not accumulate open figures.
        plt.close(fig)
    print(f"📈 Saved plot: {outpath}")

# Embedding Resolution

In [20]:
def _pick_one_csv_from_dir_by_method(emb_dir: str, method_keyword: str) -> str:
    """Return one CSV path from ``emb_dir``, preferring files named after the method.

    Files whose basename contains ``method_keyword`` (case-insensitive) are
    preferred; if none match, every CSV in the directory is a candidate. The
    alphabetically first candidate is returned.

    Args:
        emb_dir: Directory to search (non-recursive).
        method_keyword: Substring looked for in the file names.

    Raises:
        FileNotFoundError: If ``emb_dir`` is not a directory or holds no CSV.
    """
    if not os.path.isdir(emb_dir):
        raise FileNotFoundError(f"Embeddings directory not found: {emb_dir}")

    all_csv = glob.glob(os.path.join(emb_dir, "*.csv"))
    if not all_csv:
        # Fail with a clear message instead of an IndexError on an empty list.
        raise FileNotFoundError(f"No CSV files found in embeddings directory: {emb_dir}")

    keyword = method_keyword.lower()
    cand = [p for p in all_csv if keyword in os.path.basename(p).lower()]
    # min() yields the same element as sorting and taking the first one.
    return min(cand or all_csv)

def resolve_all_embedding_files_for_method(method: str) -> list[str]:
    """Return every CSV in the embeddings folder of ``method``, sorted by path.

    ``method`` is matched case-insensitively against umap, pca, tsne and
    isomap; the number of files found is printed.

    Raises:
        ValueError: If ``method`` is not one of the supported methods.
    """
    embedding_dirs = {
        "umap": UMAP_EMB_DIR,
        "pca": PCA_EMB_DIR,
        "tsne": TSNE_EMB_DIR,
        "isomap": ISOMAP_EMB_DIR,
    }
    key = method.lower()
    if key not in embedding_dirs:
        raise ValueError(f"Unsupported method: {method}")

    base_dir = embedding_dirs[key]
    all_csvs = glob.glob(os.path.join(base_dir, "*.csv"))
    all_csvs.sort()
    print(f"→ Found {len(all_csvs)} embeddings for {method.upper()} in {base_dir}")
    return all_csvs

def select_best_embedding_path(method: str) -> str:
    """Select the embedding file whose metrics file reports the lowest MSE.

    Every CSV in ``<REDUCED_ROOT>/<method>/best_results`` is read and the
    first value of its ``mse`` column is compared. The metrics file name,
    minus its ``_<method>_metrics.csv`` suffix, is the tag used to locate the
    embedding in ``<REDUCED_ROOT>/<method>/embeddings``, preferring files
    named ``<tag>_*best_embedding*.csv``.

    Falls back to the alphabetically first embedding when there are no
    metrics files or none of them holds a usable MSE.

    Args:
        method: Reduction method name; matched case-insensitively.

    Returns:
        Path of the chosen embedding CSV.

    Raises:
        FileNotFoundError: If no embedding file can be found.
    """
    method = method.lower()
    base_best_dir = os.path.join(REDUCED_ROOT, method, "best_results")
    base_emb_dir = os.path.join(REDUCED_ROOT, method, "embeddings")

    def _first_embedding() -> str:
        """Return the alphabetically first embedding CSV, or raise if none exist."""
        all_emb = sorted(glob.glob(os.path.join(base_emb_dir, "*.csv")))
        if not all_emb:
            raise FileNotFoundError(f"No embeddings found for {method.upper()}.")
        return all_emb[0]

    # Sorted so that ties on MSE resolve the same way on every run and platform.
    best_files = sorted(glob.glob(os.path.join(base_best_dir, "*.csv")))
    if not best_files:
        print(f"⚠️ No best_results found for {method.upper()}, using first embedding instead.")
        return _first_embedding()

    rows = []
    for bf in best_files:
        try:
            dfm = pd.read_csv(bf)
            if "mse" not in dfm.columns or dfm.empty:
                continue
            mse = float(dfm["mse"].iloc[0])
        except Exception as exc:
            # Best effort: an unreadable metrics file is skipped, but reported.
            print(f"⚠️ Could not read MSE from {bf}: {exc}")
            continue
        # NaN compares false against everything and would corrupt min() below.
        if np.isnan(mse):
            continue
        rows.append({"path": bf, "mse": mse})

    if not rows:
        print(f"⚠️ No valid MSE entries found for {method.upper()}, using first embedding file.")
        return _first_embedding()

    # Pick the dataset with the lowest MSE
    best_path = min(rows, key=lambda r: r["mse"])["path"]
    best_name = os.path.basename(best_path)
    suffix = f"_{method}_metrics.csv"
    if best_name.endswith(suffix):
        tag = best_name.removesuffix(suffix)
    else:
        # Unexpected file name: use it without its extension as the tag.
        tag = os.path.splitext(best_name)[0]

    # Locate corresponding embedding. The literal prefix is escaped so glob
    # metacharacters in the directory or tag are matched verbatim.
    prefix = glob.escape(os.path.join(base_emb_dir, f"{tag}_"))
    candidates = glob.glob(f"{prefix}*best_embedding*.csv")
    if not candidates:
        candidates = glob.glob(f"{prefix}*.csv")
    if not candidates:
        raise FileNotFoundError(f"No embedding found for tag '{tag}' in {base_emb_dir}")

    chosen = min(candidates)
    print(f"✅ Selected {method.upper()} embedding with lowest MSE: {os.path.basename(chosen)}")
    return chosen


# Grid Search for Agglomerative Clustering

In [21]:
def grid_search_agglomerative_on_embedding(df: pd.DataFrame,
                                           emb_cols: list,
                                           method: str,
                                           tag: str) -> tuple[pd.DataFrame, pd.Series, pd.DataFrame]:
    """Grid-search Agglomerative Clustering on one embedding and label it with the best fit.

    The embedding columns are scaled, then every combination of
    ``N_CLUSTERS_GRID`` x ``LINKAGE_METHODS`` is fitted and scored. A
    configuration that fails is reported and skipped. The best row is chosen
    with ``select_best_by_metrics`` using ``METRIC_PRIORITY``.

    Args:
        df: Frame containing the embedding columns.
        emb_cols: Names of the embedding columns to cluster on.
        method: Reduction method name, recorded in the grid.
        tag: Embedding identifier, recorded in the grid.

    Returns:
        ``(grid_df, best_row, df_best)``: the full grid, its winning row, and
        a copy of ``df`` with a ``cluster`` column from the winning
        configuration.

    Raises:
        ValueError: If ``emb_cols`` is empty or no configuration could be
            fitted.
    """
    if not emb_cols:
        raise ValueError(f"No embedding columns to cluster for method '{method}' (tag '{tag}').")

    X = df[emb_cols].to_numpy(dtype=float, copy=True)
    X_scaled = scale_embedding(X)

    rows = []
    # Labels of every successful fit, so the winner does not need refitting
    # (the fit takes no random seed, so a refit would only repeat the work).
    labels_by_config = {}
    for n_clust in N_CLUSTERS_GRID:
        for link in LINKAGE_METHODS:
            try:
                model = AgglomerativeClustering(
                    n_clusters=n_clust,
                    linkage=link
                )
                labels = model.fit_predict(X_scaled)
                metrics = evaluate_labels(X_scaled, labels)
            except Exception as e:
                print(f"⚠️ Error with linkage={link}, n_clusters={n_clust}: {e}")
                continue
            rows.append({
                "method": method,
                "tag": tag,
                "n_clusters_param": n_clust,
                "linkage": link,
                **metrics
            })
            labels_by_config[(int(n_clust), link)] = labels

    if not rows:
        raise ValueError(
            f"Agglomerative grid search produced no result for method '{method}' (tag '{tag}')."
        )

    grid_df = pd.DataFrame(rows)
    best_row = select_best_by_metrics(grid_df, METRIC_PRIORITY)

    # Reuse the labels computed for the winning configuration.
    best_labels = labels_by_config[(int(best_row["n_clusters_param"]), best_row["linkage"])]
    df_best = df.copy()
    df_best["cluster"] = best_labels

    return grid_df, best_row, df_best

# Orchestration for All Methods / Embeddings

In [22]:
def process_method(method: str) -> None:
    """Cluster the single selected embedding of ``method`` and save all outputs.

    Resolves the embedding to use (lowest MSE, with fallback), runs the
    Agglomerative grid search, then writes the grid, the best row, the
    cluster assignments, a scatter plot and, when a ``positions`` column
    exists, a positions-per-cluster count plot.

    If no embedding can be resolved, a warning is printed and the function
    returns without raising.

    Args:
        method: Reduction method name (e.g. ``"umap"``).

    Raises:
        ValueError: If the embedding file has no column matching the method
            prefix.
    """
    print(f"\n=== Processing method: {method.upper()} ===")

    # --- Select the embedding with lowest MSE (fallback to first if not found) ---
    try:
        emb_path = select_best_embedding_path(method)
    except Exception as e:
        print(f"⚠️ Could not select best embedding for {method.upper()}: {e}")
        return

    tag = os.path.splitext(os.path.basename(emb_path))[0]
    print(f"→ Embedding file: {emb_path}")

    # Load & prepare data
    df = pd.read_csv(emb_path)
    df = to_lowercase_columns(df)
    # Column names were just lower-cased, so the prefix must be lower-cased
    # too or an upper/mixed-case method name would match nothing.
    emb_cols = detect_embedding_columns(df, method.lower())
    id_cols = pick_id_columns(df)
    print(f"Detected embedding columns ({len(emb_cols)}): {emb_cols[:8]}{' ...' if len(emb_cols) > 8 else ''}")
    print(f"ID columns retained: {id_cols}")

    if not emb_cols:
        raise ValueError(
            f"No embedding columns with prefix '{method.lower()}' found in {emb_path}"
        )

    # --- Grid search ---
    grid_df, best_row, df_best = grid_search_agglomerative_on_embedding(df, emb_cols, method, tag)

    # --- Define paths ---
    grid_out = os.path.join(GRID_DIR, method, f"{tag}_agglo_grid.csv")
    best_out = os.path.join(BEST_DIR, method, f"{tag}_agglo_best.csv")
    clusters_out = os.path.join(CLUSTERS_DIR, method, f"{tag}_agglo_clusters.csv")
    plot_out = os.path.join(PLOTS_DIR, method, f"{tag}_agglo_best.png")
    comp_plot = os.path.join(PLOTS_DIR, method, f"{tag}_positions_per_cluster.png")

    # --- Ensure directories exist BEFORE saving ---
    for p in [grid_out, best_out, clusters_out, plot_out, comp_plot]:
        os.makedirs(os.path.dirname(p), exist_ok=True)

    # --- Save outputs ---
    save_csv(grid_df, grid_out)
    save_csv(pd.DataFrame([best_row]), best_out)
    keep_cols = id_cols + emb_cols + ["cluster"]
    keep_cols = [c for c in keep_cols if c in df_best.columns]
    save_csv(df_best[keep_cols], clusters_out)

    # --- Plot clusters ---
    title = (f"{method.upper()} – Agglomerative Best "
             f"(n_clusters={int(best_row['n_clusters_param'])}, linkage={best_row['linkage']})")
    plot_best_scatter(df_best, emb_cols, "cluster", title, plot_out)

    # --- Optional composition plot ---
    fig = None
    try:
        if "positions" in df_best.columns:
            fig = plt.figure(figsize=(10, 6))
            sns.countplot(
                data=df_best,
                x="cluster", hue="positions", palette="tab10"
            )
            plt.title(f"{method.upper()} – Positions per Cluster")
            plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left")
            plt.tight_layout()
            plt.savefig(comp_plot, dpi=300)
            print(f"📊 Saved composition plot: {comp_plot}")
    except Exception as e:
        print(f"⚠️ Could not plot composition: {e}")
    finally:
        # Close the figure on failure as well, so it does not stay open.
        if fig is not None:
            plt.close(fig)

    # --- Console summary ---
    print("\n🏆 Best configuration:")
    print(f"  n_clusters={int(best_row['n_clusters_param'])} | linkage={best_row['linkage']}")
    print(f"  silhouette={best_row['silhouette']:.3f} | "
          f"CH={best_row['calinski_harabasz']:.1f} | "
          f"DB={best_row['davies_bouldin']:.3f}")
    print(f"✅ Outputs:\n"
          f"    grid → {grid_out}\n"
          f"    best → {best_out}\n"
          f"    clusters → {clusters_out}\n"
          f"    plots → {plot_out}")


In [23]:
def process_all_embeddings_per_method(method: str) -> None:
    """Cluster every embedding file of ``method`` and save the outputs of each.

    For each CSV returned by ``resolve_all_embedding_files_for_method`` this
    runs the Agglomerative grid search and writes the grid, the best row, the
    cluster assignments, a scatter plot and, when a ``positions`` column
    exists, a positions-per-cluster count plot. A failure on one embedding is
    reported and the loop moves on to the next file.

    Args:
        method: Reduction method name (e.g. ``"umap"``).
    """
    all_embeddings = resolve_all_embedding_files_for_method(method)
    for emb_path in all_embeddings:
        try:
            tag = os.path.splitext(os.path.basename(emb_path))[0]
            print(f"\n=== Processing {method.upper()} embedding: {tag} ===")

            df = pd.read_csv(emb_path)
            df = to_lowercase_columns(df)
            # Column names were just lower-cased, so the prefix must be
            # lower-cased too or a mixed-case method name would match nothing.
            emb_cols = detect_embedding_columns(df, method.lower())
            id_cols = pick_id_columns(df)

            print(f"Detected embedding columns ({len(emb_cols)}): {emb_cols[:8]}{' ...' if len(emb_cols) > 8 else ''}")
            print(f"ID columns retained: {id_cols}")

            if not emb_cols:
                raise ValueError(
                    f"No embedding columns with prefix '{method.lower()}' found in {emb_path}"
                )

            grid_df, best_row, df_best = grid_search_agglomerative_on_embedding(df, emb_cols, method, tag)

            grid_out = os.path.join(GRID_DIR, method, f"{tag}_agglo_grid.csv")
            best_out = os.path.join(BEST_DIR, method, f"{tag}_agglo_best.csv")
            clusters_out = os.path.join(CLUSTERS_DIR, method, f"{tag}_agglo_clusters.csv")
            plot_out = os.path.join(PLOTS_DIR, method, f"{tag}_agglo_best.png")
            comp_plot = os.path.join(PLOTS_DIR, method, f"{tag}_positions_per_cluster.png")

            # Plot folders are not created by the plotting code, so make
            # every output folder up front.
            for p in [grid_out, best_out, clusters_out, plot_out, comp_plot]:
                os.makedirs(os.path.dirname(p), exist_ok=True)

            # --- Save results ---
            save_csv(grid_df, grid_out)
            save_csv(pd.DataFrame([best_row]), best_out)
            keep_cols = id_cols + emb_cols + ["cluster"]
            keep_cols = [c for c in keep_cols if c in df_best.columns]
            save_csv(df_best[keep_cols], clusters_out)

            # --- Plot scatter ---
            title = (f"{method.upper()} – Agglomerative Best "
                    f"(n_clusters={int(best_row['n_clusters_param'])}, linkage={best_row['linkage']})")
            plot_best_scatter(df_best, emb_cols, "cluster", title, plot_out)

            # Optional composition plot by position
            fig = None
            try:
                if "positions" in df_best.columns:
                    fig = plt.figure(figsize=(10, 6))
                    sns.countplot(
                        data=df_best,
                        x="cluster", hue="positions", palette="tab10"
                    )
                    plt.title(f"{method.upper()} – Positions per Cluster")
                    plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left")
                    plt.tight_layout()
                    plt.savefig(comp_plot, dpi=300)
                    print(f"📊 Saved composition plot: {comp_plot}")
            except Exception as e:
                print(f"⚠️ Could not plot composition for {tag}: {e}")
            finally:
                # Close the figure on failure as well; leaked figures would
                # otherwise pile up across the embeddings of this loop.
                if fig is not None:
                    plt.close(fig)

            print("\n🏆 Best configuration:")
            print(f"  n_clusters={int(best_row['n_clusters_param'])} | linkage={best_row['linkage']}")
            print(f"  silhouette={best_row['silhouette']:.3f} | "
                    f"CH={best_row['calinski_harabasz']:.1f} | "
                    f"DB={best_row['davies_bouldin']:.3f}")
            print(f"✅ Outputs:\n"
                    f"    grid → {grid_out}\n"
                    f"    best → {best_out}\n"
                    f"    clusters → {clusters_out}\n"
                    f"    plot → {plot_out}")

        except Exception as e:
            print(f"⚠️ Skipping embedding {emb_path} due to error: {e}")

In [24]:
# Driver: run the pipeline once per method in METHODS. An error raised for one
# method is reported and the loop continues with the next method.
print(f"Scanning reduced embeddings under: {REDUCED_ROOT}")
for method in METHODS:
    try:
        # Cluster only the single selected embedding of this method.
        process_method(method)
        # Alternative (disabled): cluster every embedding file of the method.
        # process_all_embeddings_per_method(method)
    except Exception as e:
        print(f"❌ Skipping method {method.upper()} due to error: {e}")


Scanning reduced embeddings under: reduced_data

=== Processing method: UMAP ===
✅ Selected UMAP embedding with lowest MSE: joueurs_ligue1_2024_2025_clean_per90_umap5d_best_embedding.csv
→ Embedding file: reduced_data\umap\embeddings\joueurs_ligue1_2024_2025_clean_per90_umap5d_best_embedding.csv
Detected embedding columns (5): ['umap_1', 'umap_2', 'umap_3', 'umap_4', 'umap_5']
ID columns retained: ['player_name', 'equipe', 'positions', 'age', 'player_id', 'player_country_code']
💾 Saved: clusters\agglomerative_clustering\grid_search\umap\joueurs_ligue1_2024_2025_clean_per90_umap5d_best_embedding_agglo_grid.csv
💾 Saved: clusters\agglomerative_clustering\best_results\umap\joueurs_ligue1_2024_2025_clean_per90_umap5d_best_embedding_agglo_best.csv
💾 Saved: clusters\agglomerative_clustering\clusters\umap\joueurs_ligue1_2024_2025_clean_per90_umap5d_best_embedding_agglo_clusters.csv
📈 Saved plot: clusters\agglomerative_clustering\plots\umap\joueurs_ligue1_2024_2025_clean_per90_umap5d_best_embed