# K-Means Clustering – Multi-Method Pipeline

For each dimensionality reduction method (UMAP, PCA, t-SNE, ISOMAP):
1) Load the selected embedding (case-insensitive column handling).
2) Scale embedding columns with StandardScaler.
3) Grid search over n_clusters (3–12 with the default `N_CLUSTERS_GRID`).
4) Compute internal validity metrics:
      - Silhouette Score (↑ better)
      - Calinski–Harabasz Index (↑ better)
      - Davies–Bouldin Index (↓ better)
5) Select the best config by ranking every metric and sorting on those ranks in priority order (Silhouette first, then CH, then DB as tie-breakers).
6) Save grid, best row, clusters, and plots to per-method subfolders.

# Imports

In [1]:
import os
import re
import glob
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import (
    silhouette_score,
    calinski_harabasz_score,
    davies_bouldin_score,
)


# Constants and Directory Setup

In [2]:

# Root folder holding the dimensionality-reduction outputs.
REDUCED_ROOT = "reduced_data"

# Reduction methods processed by the driver loop at the bottom of the notebook.
METHODS = ["umap", "pca", "tsne", "isomap"]

# Where each method's embedding CSV files are read from.
UMAP_EMB_DIR = os.path.join(REDUCED_ROOT, "umap", "embeddings")
PCA_EMB_DIR = os.path.join(REDUCED_ROOT, "pca", "embeddings")
TSNE_EMB_DIR = os.path.join(REDUCED_ROOT, "tsne", "embeddings")
ISOMAP_EMB_DIR = os.path.join(REDUCED_ROOT, "isomap", "embeddings")

# Where the K-Means results are written (one subfolder per kind of artefact).
CLUST_ROOT = os.path.join("clusters", "kmeans")
GRID_DIR, BEST_DIR, CLUSTERS_DIR, PLOTS_DIR = (
    os.path.join(CLUST_ROOT, leaf)
    for leaf in ("grid_search", "best_results", "clusters", "plots")
)

# Create the output tree up front so later writes never hit a missing folder.
for d in (CLUST_ROOT, GRID_DIR, BEST_DIR, CLUSTERS_DIR, PLOTS_DIR):
    os.makedirs(d, exist_ok=True)

# K-Means hyper-parameters: candidate cluster counts 3..12, k-means++ seeding,
# 20 restarts per fit, fixed seed.
N_CLUSTERS_GRID = [*range(3, 13)]
INIT = "k-means++"
N_INIT = 20
RANDOM_STATE = 42

# Metrics in decreasing order of importance when picking the best row.
METRIC_PRIORITY = ["silhouette", "calinski_harabasz", "davies_bouldin"]

# Lower-cased column names treated as identifiers rather than features.
KNOWN_ID_COLS = [
    "player_name",
    "equipe",
    "positions",
    "age",
    "player_id",
    "player_country_code",
]

# Silence UserWarning for the whole session.
warnings.filterwarnings("ignore", category=UserWarning)


# Utilities

In [3]:
def save_csv(df: pd.DataFrame, path: str) -> None:
    """Write *df* to *path* as a UTF-8 CSV without the index column.

    Missing parent folders are created first. A bare filename (no directory
    component) is written to the current working directory.

    Args:
        df: Table to serialise.
        path: Destination file path.
    """
    parent = os.path.dirname(path)
    # os.makedirs("") raises FileNotFoundError, so only create the parent when
    # the path actually has a directory component.
    if parent:
        os.makedirs(parent, exist_ok=True)
    df.to_csv(path, index=False, encoding="utf-8")
    print(f"💾 Saved: {path}")


def to_lowercase_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with every column label lower-cased.

    The input frame is left untouched.
    """
    lowered = df.copy()
    lowered.columns = [label.lower() for label in lowered.columns]
    return lowered


def detect_embedding_columns(df: pd.DataFrame, method_prefix: str) -> list:
    """List the embedding columns of *df* for one reduction method.

    A column qualifies when its name is *method_prefix*, an optional
    underscore, then digits (``umap1`` or ``umap_1`` for prefix ``umap``).
    The names come back ordered by that numeric suffix; columns sharing a
    suffix keep their order of appearance in *df*.
    """
    matcher = re.compile(rf"^{re.escape(method_prefix)}_?(\d+)$")
    indexed = [
        (int(hit.group(1)), column)
        for column in df.columns
        if (hit := matcher.match(column))
    ]
    # list.sort is stable, so equal suffixes stay in column order.
    indexed.sort(key=lambda pair: pair[0])
    return [column for _, column in indexed]


def pick_id_columns(df: pd.DataFrame) -> list:
    """Choose the identifier columns of *df*.

    Columns listed in ``KNOWN_ID_COLS`` are preferred (kept in that list's
    order). When none of them is present, every non-numeric column is
    returned instead.
    """
    known = [name for name in KNOWN_ID_COLS if name in df.columns]
    if not known:
        # No recognised identifier: treat anything non-numeric as an ID.
        return df.select_dtypes(exclude=[np.number]).columns.tolist()
    return known


def scale_embedding(X: np.ndarray) -> np.ndarray:
    """Return *X* transformed by a freshly fitted, default ``StandardScaler``."""
    return StandardScaler().fit_transform(X)


def evaluate_labels(X_scaled: np.ndarray, labels: np.ndarray) -> dict:
    """Compute internal clustering metrics for one labelling.

    Args:
        X_scaled: Feature matrix the labels were produced on.
        labels: Cluster label of every row of *X_scaled*.

    Returns:
        A dict with the keys ``silhouette``, ``calinski_harabasz``,
        ``davies_bouldin`` and ``n_clusters`` (number of distinct labels).
        The three scores are NaN when there is a single cluster or when
        scoring fails; scoring is all-or-nothing, so one failing metric
        leaves all three as NaN.
    """
    n_clusters = len(set(labels))
    metrics = {
        "silhouette": np.nan,
        "calinski_harabasz": np.nan,
        "davies_bouldin": np.nan,
        "n_clusters": int(n_clusters),
    }

    # The scores are undefined for a single cluster; keep the NaN placeholders.
    if n_clusters < 2:
        return metrics

    try:
        scores = {
            "silhouette": float(silhouette_score(X_scaled, labels)),
            "calinski_harabasz": float(calinski_harabasz_score(X_scaled, labels)),
            "davies_bouldin": float(davies_bouldin_score(X_scaled, labels)),
        }
    except Exception as exc:
        # Best effort: the grid search should go on, but the failure must be
        # visible instead of silently turning into NaN rows.
        print(f"⚠️ Could not compute clustering metrics (n_clusters={n_clusters}): {exc}")
    else:
        metrics.update(scores)

    return metrics


def select_best_by_metrics(df: pd.DataFrame, priority_list: list[str]) -> pd.Series:
    """Pick the best grid-search row by prioritised metric ranks.

    Every metric of *priority_list* that exists as a column is converted to a
    rank (1 = best; ties share the lowest rank; NaN ranks last).
    ``davies_bouldin`` is ranked ascending (lower is better), every other
    metric descending. Rows are then sorted on the rank columns in priority
    order, so later metrics only break ties of earlier ones.

    Args:
        df: Grid-search results, one row per configuration.
        priority_list: Metric column names, most important first.

    Returns:
        The winning row, including the added ``<metric>_rank`` columns.

    Raises:
        ValueError: If *df* has no rows.
    """
    if len(df) == 0:
        # Previously surfaced as an opaque IndexError from ``iloc[0]``.
        raise ValueError("Cannot select a best configuration: the results grid is empty.")

    ranked_df = df.copy()
    for metric in priority_list:
        if metric not in df.columns:
            continue
        lower_is_better = metric.lower() == "davies_bouldin"
        ranked_df[f"{metric}_rank"] = ranked_df[metric].rank(
            method="min", ascending=lower_is_better, na_option="bottom"
        )

    rank_cols = [f"{m}_rank" for m in priority_list if f"{m}_rank" in ranked_df.columns]
    if rank_cols:
        # A stable sort keeps the original row order among exact ties, which
        # makes the selection deterministic.
        ranked_df = ranked_df.sort_values(by=rank_cols, ascending=True, kind="mergesort")
    # Without any rank column the first row of the grid is returned as-is.
    return ranked_df.iloc[0]


def plot_best_scatter(df_full: pd.DataFrame, emb_cols: list, labels_col: str,
                      title: str, outpath: str) -> None:
    """Save a 2-D scatter of the first two embedding columns coloured by cluster.

    Args:
        df_full: Frame holding the embedding columns and the label column.
        emb_cols: Embedding column names; only the first two are plotted.
        labels_col: Column of *df_full* used as hue.
        title: Figure title.
        outpath: Destination image path (parent folder is created if missing).

    Nothing is drawn when fewer than two embedding columns are given.
    """
    if len(emb_cols) < 2:
        print("⚠️ Less than 2 embedding dimensions – skipping plot.")
        return

    out_dir = os.path.dirname(outpath)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    fig = plt.figure(figsize=(8, 6))
    try:
        n_clusters = len(set(df_full[labels_col]))
        # One colour per cluster from the current default palette.
        palette = sns.color_palette(None, max(1, n_clusters))
        sns.scatterplot(
            data=df_full,
            x=emb_cols[0], y=emb_cols[1],
            hue=labels_col, palette=palette, s=45, alpha=0.9, edgecolor="none"
        )
        plt.title(title)
        # Legend is placed outside the axes, to the right.
        plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left")
        plt.tight_layout()
        plt.savefig(outpath, dpi=300)
    finally:
        # Always release the figure, even when drawing or saving fails;
        # otherwise figures pile up across a multi-embedding run.
        plt.close(fig)
    print(f"📈 Saved plot: {outpath}")

# Embedding Resolution

In [4]:
def resolve_embedding_file_for_method(method: str) -> str:
    """
    Return the path of the representative embedding file for *method*.

    Selection rule per method (matching is case-insensitive on *method*):
      - UMAP   → delegated to ``select_best_embedding_path("umap")``
      - PCA    → first file (sorted) whose name contains 'custom'
      - t-SNE  → first file (sorted) whose name contains 'custom_gk'
      - ISOMAP → first file (sorted) whose name contains 'raw'

    If no file matches the preferred pattern, the first CSV of the method's
    embeddings folder is used. Raises ``ValueError`` for an unknown method and
    ``FileNotFoundError`` when the folder holds no CSV at all.
    """
    m = method.lower()

    if m == "umap":
        return select_best_embedding_path("umap")

    # method → (preferred filename pattern, embeddings folder)
    lookup = {
        "pca": ("*custom*.csv", PCA_EMB_DIR),
        "tsne": ("*custom_gk*.csv", TSNE_EMB_DIR),
        "isomap": ("*raw*.csv", ISOMAP_EMB_DIR),
    }
    if m not in lookup:
        raise ValueError(f"Unsupported method: {method}")
    pattern, base_dir = lookup[m]

    candidates = sorted(glob.glob(os.path.join(base_dir, pattern)))
    if not candidates:
        print(f"⚠️ No embeddings matching {pattern} found for {m.upper()}, using fallback.")
        candidates = sorted(glob.glob(os.path.join(base_dir, "*.csv")))
    if not candidates:
        raise FileNotFoundError(f"No embeddings found in {base_dir} for {m.upper()}")

    chosen = candidates[0]
    print(f"✅ Selected {m.upper()} embedding: {os.path.basename(chosen)}")
    return chosen


def resolve_all_embedding_files_for_method(method: str) -> list[str]:
    """Return every embedding CSV of *method*, sorted by path.

    Raises ``ValueError`` when *method* (case-insensitive) is not one of
    umap, pca, tsne or isomap.
    """
    folders = {
        "umap": UMAP_EMB_DIR,
        "pca": PCA_EMB_DIR,
        "tsne": TSNE_EMB_DIR,
        "isomap": ISOMAP_EMB_DIR,
    }
    base_dir = folders.get(method.lower())
    if base_dir is None:
        raise ValueError(f"Unsupported method: {method}")

    all_csvs = glob.glob(os.path.join(base_dir, "*.csv"))
    all_csvs.sort()
    print(f"→ Found {len(all_csvs)} embeddings for {method.upper()} in {base_dir}")
    return all_csvs

def _first_embedding_file(base_emb_dir: str, method: str) -> str:
    """Return the first (sorted) CSV of *base_emb_dir* or raise FileNotFoundError."""
    all_emb = sorted(glob.glob(os.path.join(base_emb_dir, "*.csv")))
    if not all_emb:
        raise FileNotFoundError(f"No embeddings found for {method.upper()}.")
    return all_emb[0]


def select_best_embedding_path(method: str) -> str:
    """
    Select the embedding file corresponding to the lowest MSE
    from the best_results folder for the given method.

    Each CSV of ``<REDUCED_ROOT>/<method>/best_results`` contributes the first
    value of its ``mse`` column; files without that column, empty files,
    unreadable files and NaN values are ignored. The winning file name, minus
    the ``_<method>_metrics.csv`` suffix, is the dataset tag used to look up
    the embedding (``<tag>_*best_embedding*.csv`` first, then ``<tag>_*.csv``).

    Falls back to the first embedding if no MSE data is available.

    Raises:
        FileNotFoundError: If no embedding file can be found.
    """
    method = method.lower()
    base_best_dir = os.path.join(REDUCED_ROOT, method, "best_results")
    base_emb_dir = os.path.join(REDUCED_ROOT, method, "embeddings")

    # Sorted so that equal MSE values always resolve to the same file.
    best_files = sorted(glob.glob(os.path.join(base_best_dir, "*.csv")))
    if not best_files:
        print(f"⚠️ No best_results found for {method.upper()}, using first embedding file instead.")
        return _first_embedding_file(base_emb_dir, method)

    rows = []
    for bf in best_files:
        try:
            dfm = pd.read_csv(bf)
            if "mse" not in dfm.columns or dfm.empty:
                continue
            mse = float(dfm["mse"].iloc[0])
        except Exception as exc:
            # Best effort: one broken metrics file must not abort the selection.
            print(f"⚠️ Ignoring {os.path.basename(bf)}: {exc}")
            continue
        # NaN compares false against everything and would corrupt min().
        if np.isnan(mse):
            continue
        rows.append({"path": bf, "mse": mse})

    if not rows:
        print(f"⚠️ No valid MSE entries found for {method.upper()}, using first embedding file.")
        return _first_embedding_file(base_emb_dir, method)

    # Pick lowest-MSE dataset
    best_entry = min(rows, key=lambda r: r["mse"])
    best_path = best_entry["path"]
    tag = os.path.basename(best_path).replace(f"_{method}_metrics.csv", "")

    # Find corresponding embedding; the tag is escaped so that glob
    # metacharacters in a dataset name are matched literally.
    safe_tag = glob.escape(tag)
    candidates = glob.glob(os.path.join(base_emb_dir, f"{safe_tag}_*best_embedding*.csv"))
    if not candidates:
        candidates = glob.glob(os.path.join(base_emb_dir, f"{safe_tag}_*.csv"))
    if not candidates:
        raise FileNotFoundError(f"No embedding file found for tag '{tag}' in {base_emb_dir}")
    candidates.sort()
    chosen = candidates[0]

    print(f"✅ Selected {method.upper()} embedding with lowest MSE: {os.path.basename(chosen)}")
    return chosen



# Grid Search for K-Means Clustering

In [5]:
def grid_search_kmeans_on_embedding(df: pd.DataFrame,
                                    emb_cols: list,
                                    method: str,
                                    tag: str) -> tuple[pd.DataFrame, pd.Series, pd.DataFrame]:
    """Grid-search K-Means over ``N_CLUSTERS_GRID`` on one embedding.

    The embedding columns are scaled with ``scale_embedding``, one K-Means is
    fitted per candidate cluster count, each labelling is scored with
    ``evaluate_labels`` and the winner is chosen by ``select_best_by_metrics``
    using ``METRIC_PRIORITY``.

    Args:
        df: Frame containing the embedding columns.
        emb_cols: Names of the embedding columns to cluster on.
        method: Reduction method name, stored in the grid for traceability.
        tag: Embedding identifier, stored in the grid for traceability.

    Returns:
        ``(grid_df, best_row, df_best)``: the per-configuration metrics, the
        selected row, and a copy of *df* with an added ``cluster`` column.

    Raises:
        ValueError: If *emb_cols* is empty.
        RuntimeError: If no configuration could be fitted.
    """
    if not emb_cols:
        raise ValueError(f"No embedding columns to cluster for method '{method}' (tag '{tag}').")

    X = df[emb_cols].to_numpy(dtype=float, copy=True)
    X_scaled = scale_embedding(X)

    rows = []
    labels_by_k = {}
    for n_clust in N_CLUSTERS_GRID:
        try:
            model = KMeans(
                n_clusters=n_clust,
                init=INIT,
                n_init=N_INIT,
                random_state=RANDOM_STATE
            )
            labels = model.fit_predict(X_scaled)
            metrics = evaluate_labels(X_scaled, labels)
        except Exception as e:
            print(f"⚠️ Error at n_clusters={n_clust}: {e}")
            continue
        rows.append({
            "method": method,
            "tag": tag,
            "n_clusters_param": n_clust,
            **metrics
        })
        labels_by_k[n_clust] = labels

    if not rows:
        raise RuntimeError(f"K-Means failed for every n_clusters in the grid (tag '{tag}').")

    grid_df = pd.DataFrame(rows)
    best_row = select_best_by_metrics(grid_df, METRIC_PRIORITY)

    # Reuse the labels already computed for the winning configuration: a refit
    # with the same seed is redundant work, and this guarantees the saved
    # labels are exactly the ones the reported metrics were computed on.
    best_labels = labels_by_k[int(best_row["n_clusters_param"])]
    df_best = df.copy()
    df_best["cluster"] = best_labels

    return grid_df, best_row, df_best

# Orchestration for Single and Multiple Embeddings

In [6]:
def process_method(method: str) -> None:
    """
    Run K-Means clustering on a representative embedding for a given method.

    Steps: resolve the embedding file, load it with lower-cased columns,
    detect embedding and identifier columns, grid-search K-Means, then write
    the grid CSV, the best-row CSV, the clustered rows CSV, the scatter plot
    and (when a ``positions`` column exists) a positions-per-cluster plot
    into the per-method output subfolders.

    Raises:
        ValueError: If the file holds no column named like ``<method><n>`` or
            ``<method>_<n>``.
    """
    print(f"\n=== Processing method: {method.upper()} ===")

    emb_path = resolve_embedding_file_for_method(method)
    if not emb_path:
        print(f"⚠️ No embeddings found for {method.upper()}")
        return
    tag = os.path.splitext(os.path.basename(emb_path))[0]
    print(f"→ Embedding file: {emb_path}")

    df = pd.read_csv(emb_path)
    df = to_lowercase_columns(df)
    emb_cols = detect_embedding_columns(df, method)
    if not emb_cols:
        # Fail with an actionable message instead of an obscure error raised
        # further down when an empty matrix reaches the scaler.
        raise ValueError(
            f"No embedding columns named '{method}<n>' or '{method}_<n>' in {emb_path}"
        )
    id_cols = pick_id_columns(df)

    grid_df, best_row, df_best = grid_search_kmeans_on_embedding(df, emb_cols, method, tag)

    # Paths
    grid_out = os.path.join(GRID_DIR, method, f"{tag}_kmeans_grid.csv")
    best_out = os.path.join(BEST_DIR, method, f"{tag}_kmeans_best.csv")
    clusters_out = os.path.join(CLUSTERS_DIR, method, f"{tag}_kmeans_clusters.csv")
    plot_out = os.path.join(PLOTS_DIR, method, f"{tag}_kmeans_best.png")
    comp_plot = os.path.join(PLOTS_DIR, method, f"{tag}_positions_per_cluster.png")

    for p in [grid_out, best_out, clusters_out, plot_out, comp_plot]:
        os.makedirs(os.path.dirname(p), exist_ok=True)

    save_csv(grid_df, grid_out)
    save_csv(pd.DataFrame([best_row]), best_out)
    # Persist identifiers, embedding coordinates and the assigned cluster only.
    keep_cols = id_cols + emb_cols + ["cluster"]
    keep_cols = [c for c in keep_cols if c in df_best.columns]
    save_csv(df_best[keep_cols], clusters_out)

    title = (f"{method.upper()} – KMeans Best "
             f"(n_clusters={int(best_row['n_clusters_param'])})")
    plot_best_scatter(df_best, emb_cols, "cluster", title, plot_out)

    # Composition plot (best effort: a plotting failure must not lose the run)
    comp_fig = None
    try:
        if "positions" in df_best.columns:
            comp_fig = plt.figure(figsize=(10, 6))
            sns.countplot(
                data=df_best,
                x="cluster", hue="positions", palette="tab10"
            )
            plt.title(f"{method.upper()} – Positions per Cluster")
            plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left")
            plt.tight_layout()
            plt.savefig(comp_plot, dpi=300)
            print(f"📊 Saved composition plot: {comp_plot}")
    except Exception as e:
        print(f"⚠️ Could not plot composition: {e}")
    finally:
        # Close the figure on failure too, so it does not stay open.
        if comp_fig is not None:
            plt.close(comp_fig)

    print("\n🏆 Best configuration:")
    print(f"  n_clusters={int(best_row['n_clusters_param'])}")
    print(f"  silhouette={best_row['silhouette']:.3f} | "
          f"CH={best_row['calinski_harabasz']:.1f} | "
          f"DB={best_row['davies_bouldin']:.3f}")
    print(f"✅ Outputs:\n"
          f"    grid → {grid_out}\n"
          f"    best → {best_out}\n"
          f"    clusters → {clusters_out}\n"
          f"    plots → {plot_out}")


In [7]:
def process_all_embeddings_per_method(method: str) -> None:
    """Run the K-Means pipeline on every embedding file of *method*.

    For each CSV returned by ``resolve_all_embedding_files_for_method`` the
    file is loaded with lower-cased columns, grid-searched, and its grid,
    best row, clustered rows and plots are written to the per-method output
    subfolders. A failure on one embedding is reported and the loop moves on
    to the next file.
    """
    all_embeddings = resolve_all_embedding_files_for_method(method)
    for emb_path in all_embeddings:
        try:
            tag = os.path.splitext(os.path.basename(emb_path))[0]
            print(f"\n=== Processing {method.upper()} embedding: {tag} ===")

            df = pd.read_csv(emb_path)
            df = to_lowercase_columns(df)
            emb_cols = detect_embedding_columns(df, method)
            if not emb_cols:
                # Reported by the outer handler with an actionable message
                # instead of an obscure error from the scaler.
                raise ValueError(
                    f"No embedding columns named '{method}<n>' or '{method}_<n>' in {emb_path}"
                )
            id_cols = pick_id_columns(df)

            grid_df, best_row, df_best = grid_search_kmeans_on_embedding(df, emb_cols, method, tag)

            # Paths
            grid_out = os.path.join(GRID_DIR, method, f"{tag}_kmeans_grid.csv")
            best_out = os.path.join(BEST_DIR, method, f"{tag}_kmeans_best.csv")
            clusters_out = os.path.join(CLUSTERS_DIR, method, f"{tag}_kmeans_clusters.csv")
            plot_out = os.path.join(PLOTS_DIR, method, f"{tag}_kmeans_best.png")
            comp_plot = os.path.join(PLOTS_DIR, method, f"{tag}_positions_per_cluster.png")

            for p in [grid_out, best_out, clusters_out, plot_out, comp_plot]:
                os.makedirs(os.path.dirname(p), exist_ok=True)

            save_csv(grid_df, grid_out)
            save_csv(pd.DataFrame([best_row]), best_out)
            # Persist identifiers, embedding coordinates and the cluster only.
            keep_cols = id_cols + emb_cols + ["cluster"]
            keep_cols = [c for c in keep_cols if c in df_best.columns]
            save_csv(df_best[keep_cols], clusters_out)

            title = (f"{method.upper()} – KMeans Best "
                     f"(n_clusters={int(best_row['n_clusters_param'])})")
            plot_best_scatter(df_best, emb_cols, "cluster", title, plot_out)

            # Composition plot (best effort)
            comp_fig = None
            try:
                if "positions" in df_best.columns:
                    comp_fig = plt.figure(figsize=(10, 6))
                    sns.countplot(
                        data=df_best,
                        x="cluster", hue="positions", palette="tab10"
                    )
                    plt.title(f"{method.upper()} – Positions per Cluster")
                    plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left")
                    plt.tight_layout()
                    plt.savefig(comp_plot, dpi=300)
                    print(f"📊 Saved composition plot: {comp_plot}")
            except Exception as e:
                print(f"⚠️ Could not plot composition for {tag}: {e}")
            finally:
                # Close on failure too; this loop may handle many files and
                # leaked figures would accumulate.
                if comp_fig is not None:
                    plt.close(comp_fig)

            print("\n🏆 Best configuration:")
            print(f"  n_clusters={int(best_row['n_clusters_param'])}")
            print(f"  silhouette={best_row['silhouette']:.3f} | "
                  f"CH={best_row['calinski_harabasz']:.1f} | "
                  f"DB={best_row['davies_bouldin']:.3f}")
            print(f"✅ Outputs:\n"
                  f"    grid → {grid_out}\n"
                  f"    best → {best_out}\n"
                  f"    clusters → {clusters_out}\n"
                  f"    plots → {plot_out}")

        except Exception as e:
            print(f"⚠️ Skipping embedding {emb_path} due to error: {e}")


In [8]:

# Driver: cluster one representative embedding per reduction method.
# A failure in one method is reported and the remaining methods still run.
print(f"Scanning reduced embeddings under: {REDUCED_ROOT}")
for method in METHODS:
    try:
        # To sweep every embedding file of the method instead, call
        # process_all_embeddings_per_method(method) here.
        process_method(method)
    except Exception as err:
        print(f"❌ Skipping {method.upper()} due to error: {err}")

Scanning reduced embeddings under: reduced_data

=== Processing method: UMAP ===
✅ Selected UMAP embedding with lowest MSE: joueurs_ligue1_2024_2025_clean_per90_umap5d_best_embedding.csv
→ Embedding file: reduced_data\umap\embeddings\joueurs_ligue1_2024_2025_clean_per90_umap5d_best_embedding.csv
💾 Saved: clusters\kmeans\grid_search\umap\joueurs_ligue1_2024_2025_clean_per90_umap5d_best_embedding_kmeans_grid.csv
💾 Saved: clusters\kmeans\best_results\umap\joueurs_ligue1_2024_2025_clean_per90_umap5d_best_embedding_kmeans_best.csv
💾 Saved: clusters\kmeans\clusters\umap\joueurs_ligue1_2024_2025_clean_per90_umap5d_best_embedding_kmeans_clusters.csv
📈 Saved plot: clusters\kmeans\plots\umap\joueurs_ligue1_2024_2025_clean_per90_umap5d_best_embedding_kmeans_best.png
📊 Saved composition plot: clusters\kmeans\plots\umap\joueurs_ligue1_2024_2025_clean_per90_umap5d_best_embedding_positions_per_cluster.png

🏆 Best configuration:
  n_clusters=5
  silhouette=0.355 | CH=231.1 | DB=0.993
✅ Outputs:
    gr