# DBSCAN Clustering – Multi-Method Pipeline

For each method, we:
1) Load the selected embedding (case-insensitive column handling).
2) Scale the embedding columns (StandardScaler).
3) Build a data-driven eps grid from k-distance percentiles (10%–90%).
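4) Grid search DBSCAN over (eps, min_samples) and compute internal validity metrics on the non-noise points only (and only when at least 2 clusters are found; otherwise the metrics are NaN):
      - Silhouette (↑ better), Calinski–Harabasz (↑ better), Davies–Bouldin (↓ better).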
5) Select the best config by: Silhouette ↑, then CH ↑, then DB ↓.
6) Save grid, best row, cluster assignments, and a plot of the best solution.

# Imports

In [1]:
import os
import re
import glob
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import DBSCAN
from sklearn.metrics import (
    silhouette_score,
    calinski_harabasz_score,
    davies_bouldin_score,
)

# Constants and Directory Setup

In [2]:
# Root folder holding one sub-folder per dimensionality-reduction method
REDUCED_ROOT = "reduced_data"

# Methods processed by the driver loop (names are lowercased again downstream)
METHODS = ["umap", "pca", "tsne", "isomap"]

# --- Inputs -----------------------------------------------------------------
# UMAP has both an embeddings folder and a best_results (metrics) folder
UMAP_DIR = os.path.join(REDUCED_ROOT, "umap")
UMAP_EMB_DIR, UMAP_BEST_DIR = (
    os.path.join(UMAP_DIR, sub) for sub in ("embeddings", "best_results")
)

# The other methods only expose <method>/embeddings
PCA_EMB_DIR, TSNE_EMB_DIR, ISOMAP_EMB_DIR = (
    os.path.join(REDUCED_ROOT, name, "embeddings") for name in ("pca", "tsne", "isomap")
)

# --- Outputs ----------------------------------------------------------------
CLUST_ROOT = os.path.join("clusters", "dbscan")
GRID_DIR, BEST_DIR, CLUSTERS_DIR, PLOTS_DIR = (
    os.path.join(CLUST_ROOT, leaf)
    for leaf in ("grid_search", "best_results", "clusters", "plots")
)

# Create the whole output tree up front (no error if it already exists)
for d in (CLUST_ROOT, GRID_DIR, BEST_DIR, CLUSTERS_DIR, PLOTS_DIR):
    os.makedirs(d, exist_ok=True)

# --- Grid-search hyperparameters ---------------------------------------------
MIN_SAMPLES_GRID = [5, 7, 10, 15, 20]
# k used for the k-distance curve: the largest min_samples in the grid
K_FOR_KDIST = max(MIN_SAMPLES_GRID)
# Percentile window of the k-distance curve from which eps candidates are drawn
PERCENT_RANGE = (10, 90)
# How many eps values are scanned inside that window
N_EPS_CANDIDATES = 15

# ID columns that are kept in the cluster output whenever they are present
KNOWN_ID_COLS = [
    "player_name",
    "equipe",
    "positions",
    "age",
    "player_id",
    "player_country_code",
]

# Order in which metrics break ties when selecting the best configuration
METRIC_PRIORITY = ['silhouette', 'calinski_harabasz', 'davies_bouldin']


In [3]:
# Ignore every UserWarning from here on. The filter is process-wide, so it also
# hides UserWarnings emitted by imported libraries, not only by this notebook's code.
warnings.filterwarnings("ignore", category=UserWarning)

# Utilities

In [4]:
def save_csv(df: pd.DataFrame, path: str) -> None:
    """
    Write `df` to `path` as a UTF-8 CSV without the index and report the location.

    The parent directory is created when missing. A bare filename (no directory
    component) is written to the current working directory: `os.makedirs("")`
    would raise, so directory creation is skipped in that case.
    """
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    df.to_csv(path, index=False, encoding="utf-8")
    print(f"💾 Saved: {path}")


def to_lowercase_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of `df` with every column label lowercased; the input is left untouched."""
    lowered = df.copy()
    lowered.columns = [label.lower() for label in lowered.columns]
    return lowered


def detect_embedding_columns_old(df: pd.DataFrame, method_prefix: str) -> list:
    """
    Legacy detector: list the columns named exactly f'{method_prefix}<int>'.

    `method_prefix` is matched literally, so pass e.g. 'umap_' to find 'umap_1', 'umap_2', ...
    Column names are compared as-is (the DataFrame is expected to be lowercased already).
    The result is ordered by the integer suffix, not alphabetically.
    """
    suffix_re = re.compile(rf"^{re.escape(method_prefix)}(\d+)$")

    def suffix_number(column):
        return int(suffix_re.match(column).group(1))

    # keep only the columns the pattern accepts, then order them numerically
    matching = list(filter(suffix_re.match, df.columns))
    matching.sort(key=suffix_number)
    return matching

# new
def detect_embedding_columns(df: pd.DataFrame, method_prefix: str) -> list:
    """
    Find the embedding columns of one method, ordered by their integer suffix.

    Both naming styles are recognised: 'pca_1', 'pca_2', ... and 'pca1', 'pca2', ...
    Names are matched as given, so `df` should already have lowercased columns.
    Columns sharing the same suffix keep their original relative order.
    """
    # <prefix>, an optional underscore, then digits only
    suffix_re = re.compile(rf"^{re.escape(method_prefix)}_?(\d+)$")

    candidates = ((column, suffix_re.match(column)) for column in df.columns)
    numbered = [(int(hit.group(1)), column) for column, hit in candidates if hit]
    numbered.sort(key=lambda pair: pair[0])  # stable: sort on the number only
    return [column for _, column in numbered]


def pick_id_columns(df: pd.DataFrame) -> list:
    """
    Choose the identifier columns to keep alongside the cluster labels.

    Columns from KNOWN_ID_COLS that exist in `df` are returned (in KNOWN_ID_COLS order).
    When none of them is present, the fallback returns every non-numeric column;
    it filters by dtype only, so numeric columns are never picked by the fallback.
    """
    known = [name for name in KNOWN_ID_COLS if name in df.columns]
    if not known:
        # No recognised ID column: treat anything non-numeric as an identifier.
        return df.select_dtypes(exclude=[np.number]).columns.tolist()
    return known


def scale_embedding(X: np.ndarray) -> np.ndarray:
    """Return `X` transformed by a StandardScaler fitted on `X` itself."""
    return StandardScaler().fit_transform(X)


def k_distance_values(X_scaled: np.ndarray, k: int) -> np.ndarray:
    """
    Return the ascending k-distance curve used to derive eps candidates.

    For every row of `X_scaled`, the k nearest rows of `X_scaled` are queried and the
    distance stored in the last (k-th) neighbor column is kept; the kept values are sorted.

    NOTE(review): the query set is the fitted set itself, so each point presumably appears
    as its own first neighbor (distance 0) — confirm this is the intended k convention.
    """
    knn = NearestNeighbors(n_neighbors=k, n_jobs=None).fit(X_scaled)
    neighbor_dist, _ = knn.kneighbors(X_scaled)
    kth_column = neighbor_dist[:, k - 1]
    return np.sort(kth_column)


def build_eps_grid_from_percentiles(kdist: np.ndarray,
                                    prange=(10, 90),
                                    n_candidates=15) -> np.ndarray:
    """
    Build `n_candidates` evenly spaced eps values from a percentile window of the k-distances.

    Parameters
    ----------
    kdist : k-distance values (any order).
    prange : (low, high) percentiles delimiting the window.
    n_candidates : number of eps values to return.

    Returns
    -------
    Strictly positive eps values in ascending order.

    Degenerate inputs:
      - non-finite percentiles → window is median*0.5 .. median*1.5 of the finite values;
      - flat curve (high <= low) → window is low*0.8 .. low*1.2;
      - window collapsing at or below the 1e-6 floor (e.g. all k-distances are 0)
        → a small positive window starting at the floor, so no candidate is ever 0.

    Raises
    ------
    ValueError
        If the percentiles are non-finite and `kdist` holds no finite value at all.
    """
    eps_floor = 1e-6  # DBSCAN rejects eps <= 0
    p_low, p_high = np.percentile(kdist, prange)

    if not (np.isfinite(p_low) and np.isfinite(p_high)):
        finite = np.asarray(kdist, dtype=float)
        finite = finite[np.isfinite(finite)]
        if finite.size == 0:
            raise ValueError("Cannot build an eps grid: k-distances contain no finite value.")
        center = float(np.median(finite))
        lo, hi = center * 0.5, center * 1.5
    elif p_high <= p_low:
        # Flat curve; expand slightly around p_low
        lo, hi = p_low * 0.8, p_low * 1.2
    else:
        lo, hi = p_low, p_high

    lo = max(eps_floor, lo)
    if hi <= lo:
        # The window collapsed onto the floor; keep it positive and ascending.
        hi = lo * 1.5
    return np.linspace(lo, hi, n_candidates)


def evaluate_labels(X_scaled: np.ndarray, labels: np.ndarray) -> dict:
    """
    Compute internal validation metrics for one DBSCAN labelling.

    Noise points (label -1) are dropped before scoring, so the metrics describe the
    clustered points only. Scores are computed only when at least 2 clusters remain;
    otherwise they stay NaN.

    Returns
    -------
    dict with keys 'silhouette', 'calinski_harabasz', 'davies_bouldin' (float or NaN),
    'n_clusters' (clusters excluding noise) and 'n_noise' (number of -1 labels).

    A failure while scoring is reported on stdout and leaves all three scores NaN
    (best-effort: one bad configuration must not abort the grid search).
    """
    labels = np.asarray(labels)
    keep = labels != -1
    n_clusters = len(set(labels[keep].tolist()))
    n_noise = int((~keep).sum())

    metrics = {"silhouette": np.nan, "calinski_harabasz": np.nan, "davies_bouldin": np.nan,
               "n_clusters": int(n_clusters), "n_noise": n_noise}

    # At least 2 clusters also guarantees at least 2 non-noise points.
    if n_clusters < 2:
        return metrics

    Xv = np.asarray(X_scaled)[keep]
    yv = labels[keep]
    try:
        scores = {
            "silhouette": float(silhouette_score(Xv, yv)),
            "calinski_harabasz": float(calinski_harabasz_score(Xv, yv)),
            "davies_bouldin": float(davies_bouldin_score(Xv, yv)),
        }
    except Exception as exc:
        # All-or-nothing: a partial set of scores would distort the ranking.
        print(f"⚠️ Could not compute validity metrics for {n_clusters} clusters: {exc}")
    else:
        metrics.update(scores)

    return metrics


def lexicographic_best(df: pd.DataFrame) -> pd.Series:
    """
    Return the best grid row under a fixed lexicographic order:
      1) highest silhouette
      2) highest calinski_harabasz
      3) lowest davies_bouldin
    NaN values always receive the worst rank of their column. The returned row
    carries the three helper columns 'silhouette_rank', 'ch_rank' and 'db_rank'.
    """
    # (rank column, metric column, rank ascending?) in priority order
    criteria = (
        ("silhouette_rank", "silhouette", False),
        ("ch_rank", "calinski_harabasz", False),
        ("db_rank", "davies_bouldin", True),
    )

    scored = df.copy()
    for rank_name, metric, lower_is_better in criteria:
        scored[rank_name] = df[metric].rank(
            method="min", ascending=lower_is_better, na_option="bottom"
        )

    sort_keys = [rank_name for rank_name, _, _ in criteria]
    return scored.sort_values(by=sort_keys, ascending=True).iloc[0]


def plot_best_scatter(df_full: pd.DataFrame,
                      emb_cols: list,
                      labels_col: str,
                      title: str,
                      outpath: str) -> None:
    """
    Save a scatter plot of the first two embedding dimensions colored by cluster label.

    Parameters
    ----------
    df_full : frame holding the embedding columns and the label column.
    emb_cols : embedding column names; only the first two are plotted.
    labels_col : name of the column holding cluster labels (-1 = noise).
    title : figure title.
    outpath : image path passed to `plt.savefig` (its directory must already exist).

    Nothing is drawn when fewer than 2 embedding dimensions are available.
    Every cluster gets its own color from the current seaborn palette, and noise
    (-1) is drawn in gray so it cannot be confused with a real cluster.
    """
    if len(emb_cols) < 2:
        print("⚠️ Less than 2 embedding dimensions – skipping plot.")
        return

    # Explicit label → color mapping: a plain list sized to the cluster count would be
    # one color short whenever noise is present.
    labels_present = sorted(df_full[labels_col].unique())
    cluster_ids = [label for label in labels_present if label != -1]
    colors = sns.color_palette(None, max(1, len(cluster_ids)))
    palette = dict(zip(cluster_ids, colors))
    if len(cluster_ids) != len(labels_present):
        palette[-1] = (0.6, 0.6, 0.6)

    plt.figure(figsize=(8, 6))
    try:
        sns.scatterplot(
            data=df_full,
            x=emb_cols[0], y=emb_cols[1],
            hue=labels_col, palette=palette, s=45, alpha=0.9, edgecolor="none"
        )
        plt.title(title)
        plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left")
        plt.tight_layout()
        plt.savefig(outpath, dpi=300)
    finally:
        # Always release the figure, even if drawing or saving failed.
        plt.close()
    print(f"📈 Saved plot: {outpath}")



# UMAP Best-Embedding Selection

In [5]:
def select_best_umap_embedding_path() -> str:
    """
    Pick the UMAP embedding CSV that belongs to the metrics file with the lowest MSE.

    Every CSV in `UMAP_BEST_DIR` is read; the 'mse' value of its first row is compared
    across files and the single global minimum wins. Files that cannot be read, have no
    'mse' column, are empty, or hold a non-numeric/NaN MSE are skipped with a message.

    Naming assumptions (from `umap_pipeline.py`):
    - metrics file:   `{tag}_umap_metrics.csv`
    - embedding file: `embeddings/{tag}_umap{Nd}d_best_embedding.csv`
      (searched as `{tag}_*best_embedding*.csv` inside `UMAP_EMB_DIR`)

    Returns
    -------
    Path of the chosen embedding CSV, as produced by `glob` (i.e. relative whenever
    `UMAP_EMB_DIR` is relative). With several matches, the lexicographically first is used.

    Raises
    ------
    FileNotFoundError
        No metrics CSV exists, or no embedding matches the winning tag.
    RuntimeError
        No metrics CSV holds a usable MSE.
    """
    metrics_suffix = "_umap_metrics.csv"

    # Sorted so that ties on MSE are resolved the same way on every run.
    best_files = sorted(glob.glob(os.path.join(UMAP_BEST_DIR, "*.csv")))
    if not best_files:
        raise FileNotFoundError(f"No best_results CSVs found in {UMAP_BEST_DIR}")

    # Collect one (path, mse) entry per usable metrics file.
    rows = []
    for bf in best_files:
        try:
            dfm = pd.read_csv(bf)
        except (OSError, ValueError) as exc:  # pandas parsing errors derive from ValueError
            print(f"⚠️ Skipping unreadable metrics file {bf}: {exc}")
            continue
        if "mse" not in dfm.columns or dfm.empty:
            continue
        try:
            mse = float(dfm["mse"].iloc[0])
        except (TypeError, ValueError):
            print(f"⚠️ Skipping {bf}: non-numeric MSE.")
            continue
        if np.isnan(mse):
            # NaN never compares as smaller or larger, so it would corrupt min().
            print(f"⚠️ Skipping {bf}: MSE is NaN.")
            continue
        rows.append({"path": bf, "mse": mse})

    if not rows:
        raise RuntimeError("No valid UMAP best_results rows with MSE found.")

    best_entry = min(rows, key=lambda r: r["mse"])
    best_name = os.path.basename(best_entry["path"])
    if best_name.endswith(metrics_suffix):
        tag = best_name[: -len(metrics_suffix)]
    else:
        # Unexpected file name: fall back to the name without its extension.
        tag = os.path.splitext(best_name)[0]

    # Escape the tag so characters such as '[' are matched literally by glob.
    pattern = os.path.join(UMAP_EMB_DIR, f"{glob.escape(tag)}_*best_embedding*.csv")
    candidates = sorted(glob.glob(pattern))
    if not candidates:
        raise FileNotFoundError(f"No embedding CSV with best_embedding found for tag '{tag}' in {UMAP_EMB_DIR}")

    chosen = candidates[0]
    print(f"✅ Selected UMAP embedding for DBSCAN: {os.path.basename(chosen)} (from tag: {tag})")
    return chosen



# Metric-based Model Selection

In [6]:
def select_best_by_metrics(df: pd.DataFrame, priority_list: list[str]) -> pd.Series:
    """
    Pick the best grid-search row according to an ordered list of metrics.

    Parameters
    ----------
    df : pd.DataFrame
        Grid search results (one row per configuration) including the metric columns.
    priority_list : list[str]
        Metric names, most important first, e.g.
        ['silhouette', 'calinski_harabasz', 'davies_bouldin'].
        Names absent from `df` are reported and ignored.

    Returns
    -------
    pd.Series
        The top row after sorting by the per-metric ranks; it includes the added
        '<metric>_rank' columns.

    Raises
    ------
    ValueError
        If no rank column is available to sort on.
    """
    scored = df.copy()

    for name in priority_list:
        if name not in df.columns:
            print(f"⚠️ Metric '{name}' not found in grid; skipping.")
            continue
        # Only Davies–Bouldin is "lower is better"; every other metric ranks descending.
        lower_is_better = name.lower() == 'davies_bouldin'
        scored[f"{name}_rank"] = scored[name].rank(
            method="min", ascending=lower_is_better, na_option="bottom"
        )

    # Rank columns in priority order
    sort_keys = []
    for name in priority_list:
        rank_name = f"{name}_rank"
        if rank_name in scored.columns:
            sort_keys.append(rank_name)

    if not sort_keys:
        raise ValueError("No valid metrics found for ranking in the grid DataFrame.")

    return scored.sort_values(by=sort_keys, ascending=True).iloc[0]

# Method → Embedding File Resolution

In [7]:
def _pick_one_csv_from_dir_by_method(emb_dir: str, method_keyword: str) -> str:
    """
    Return one CSV path from `emb_dir` for the given method keyword.

    The pool is every CSV whose file name contains `method_keyword` (case-insensitive);
    if no name contains it, every CSV of the directory is considered instead.
    Within the pool the choice is, in order:
      1) a file name containing 'custom'
      2) a file name containing 'raw'
      3) any file
    and ties are resolved by taking the lexicographically smallest path.

    Raises FileNotFoundError when the directory is missing or holds no CSV.
    """
    if not os.path.isdir(emb_dir):
        raise FileNotFoundError(f"Embeddings directory not found: {emb_dir}")

    def name_has(path, fragment):
        return fragment in os.path.basename(path).lower()

    every_csv = glob.glob(os.path.join(emb_dir, "*.csv"))
    needle = method_keyword.lower()
    # fall back to all CSVs when the keyword does not appear in any file name
    pool = [p for p in every_csv if name_has(p, needle)] or list(every_csv)
    if not pool:
        raise FileNotFoundError(f"No CSV embeddings found in {emb_dir}")

    for marker in ("custom", "raw"):
        preferred = [p for p in pool if name_has(p, marker)]
        if preferred:
            return min(preferred)

    return min(pool)


# def resolve_embedding_file_for_method(method: str) -> str:
#     m = method.lower()
#     if m == "umap":
#         return select_best_umap_embedding_path()
#     elif m == "pca":
#         return _pick_one_csv_from_dir_by_method(PCA_EMB_DIR, "pca")
#     elif m == "tsne":
#         # cover tsne/tSNE naming
#         return _pick_one_csv_from_dir_by_method(TSNE_EMB_DIR, "tsne")
#     elif m == "isomap":
#         return _pick_one_csv_from_dir_by_method(ISOMAP_EMB_DIR, "isomap")
#     else:
#         raise ValueError(f"Unsupported method: {method}")

def resolve_embedding_file_for_method(method: str) -> str:
    """
    Return the single embedding CSV to cluster for `method` (case-insensitive).

    Selection rule per method:
      - UMAP   → delegated to `select_best_umap_embedding_path()`
      - PCA    → first file matching '*custom*.csv'
      - t-SNE  → first file matching '*custom_gk*.csv'
      - ISOMAP → first file matching '*raw*.csv'
    "First" means first in sorted order. If the preferred pattern matches nothing,
    the first CSV of the method's embeddings folder is used and a notice is printed.

    Raises ValueError for an unknown method and FileNotFoundError when the folder has no CSV.
    """
    m = method.lower()
    if m == "umap":
        return select_best_umap_embedding_path()

    # method → (embeddings folder, preferred file-name pattern)
    sources = {
        "pca": (PCA_EMB_DIR, "*custom*.csv"),
        "tsne": (TSNE_EMB_DIR, "*custom_gk*.csv"),
        "isomap": (ISOMAP_EMB_DIR, "*raw*.csv"),
    }
    if m not in sources:
        raise ValueError(f"Unsupported method: {method}")
    base_dir, pattern = sources[m]

    def sorted_matches(glob_pattern):
        return sorted(glob.glob(os.path.join(base_dir, glob_pattern)))

    candidates = sorted_matches(pattern)
    if not candidates:
        # preferred keyword absent: settle for any CSV of this method
        print(f"⚠️ No embeddings matching {pattern} found for {m.upper()}, falling back to first available.")
        candidates = sorted_matches("*.csv")
        if not candidates:
            raise FileNotFoundError(f"No embeddings found in {base_dir} for {m.upper()}")

    chosen = candidates[0]
    print(f"✅ Selected {m.upper()} embedding: {os.path.basename(chosen)}")
    return chosen


def resolve_all_embedding_files_for_method(method: str) -> list[str]:
    """
    List every embedding CSV stored for `method` (case-insensitive).

    All '*.csv' files directly inside the method's embeddings folder are returned in
    sorted order; the paths are built from that folder constant as-is.

    Raises ValueError for an unknown method and FileNotFoundError when the folder is
    missing or contains no CSV.
    """
    folders = {
        "umap": UMAP_EMB_DIR,
        "pca": PCA_EMB_DIR,
        "tsne": TSNE_EMB_DIR,
        "isomap": ISOMAP_EMB_DIR,
    }
    base_dir = folders.get(method.lower())
    if base_dir is None:
        raise ValueError(f"Unsupported method: {method}")

    if not os.path.isdir(base_dir):
        raise FileNotFoundError(f"Embeddings directory not found: {base_dir}")

    all_csvs = glob.glob(os.path.join(base_dir, "*.csv"))
    all_csvs.sort()
    if not all_csvs:
        raise FileNotFoundError(f"No embedding CSVs found for {method.upper()} in {base_dir}")

    print(f"→ Found {len(all_csvs)} embeddings for {method.upper()} in {base_dir}")
    return all_csvs

# Loading & Preparing an Embedding

In [8]:
def load_and_prepare_embedding(embedding_csv_path: str, method: str) -> tuple[pd.DataFrame, list, list]:
    """
    Read an embedding CSV and work out which columns are coordinates and which are IDs.

    The CSV is loaded, its column names are lowercased, and the embedding columns are
    detected from the method name used as prefix ('t-sne' is accepted as an alias of 'tsne').

    Returns (lowercased DataFrame, embedding column names, ID column names).

    Raises ValueError for an unsupported method and RuntimeError when no embedding
    column is found in the file.
    """
    frame = to_lowercase_columns(pd.read_csv(embedding_csv_path))

    supported = ("umap", "pca", "tsne", "isomap")
    prefix = method.lower()
    if prefix == "t-sne":
        # defensive: normalise the hyphenated spelling
        prefix = "tsne"
    if prefix not in supported:
        raise ValueError(f"Unsupported method prefix: {prefix}")

    emb_cols = detect_embedding_columns(frame, prefix)
    if len(emb_cols) == 0:
        raise RuntimeError(f"No embedding columns like '{prefix}1'/ detected in {embedding_csv_path}")

    return frame, emb_cols, pick_id_columns(frame)


# DBSCAN Grid Search

In [9]:
def grid_search_dbscan_on_embedding(df: pd.DataFrame,
                                    emb_cols: list,
                                    method: str,
                                    tag: str) -> tuple[pd.DataFrame, pd.Series, pd.DataFrame]:
    """
    Grid-search DBSCAN over (eps, min_samples) on the scaled embedding columns.

    Parameters
    ----------
    df : frame containing the embedding columns (other columns are carried along).
    emb_cols : names of the embedding columns to cluster on.
    method, tag : identifiers copied into every grid row.

    Returns
    -------
    grid_df : one row per configuration with
              (method, tag, eps, min_samples, metrics, n_clusters, n_noise)
    best_row : the configuration chosen by `select_best_by_metrics` with METRIC_PRIORITY
    df_best : copy of `df` with a 'cluster' column holding the labels of `best_row`

    Notes
    -----
    eps candidates come from the k-distance curve; if that fails, a fixed
    data-independent range (0.1 … 2.0) is used instead.
    """
    X = df[emb_cols].to_numpy(dtype=float, copy=True)
    X_scaled = scale_embedding(X)

    # Build eps candidates from k-distance
    try:
        kdist = k_distance_values(X_scaled, k=K_FOR_KDIST)
        eps_candidates = build_eps_grid_from_percentiles(
            kdist, prange=PERCENT_RANGE, n_candidates=N_EPS_CANDIDATES
        )
    except Exception as e:
        print(f"⚠️ Could not build eps grid from k-distance ({e}); using fallback linspace.")
        # Fallback: fixed range, independent of the data
        eps_candidates = np.linspace(0.1, 2.0, N_EPS_CANDIDATES)

    rows = []
    # Labels of every configuration, so the winner does not have to be fitted again.
    labels_by_config = {}
    for ms in MIN_SAMPLES_GRID:
        for eps in eps_candidates:
            eps_value, ms_value = float(eps), int(ms)
            labels = DBSCAN(eps=eps_value, min_samples=ms_value).fit_predict(X_scaled)
            labels_by_config[(eps_value, ms_value)] = labels
            rows.append({
                "method": method,
                "tag": tag,
                "eps": eps_value,
                "min_samples": ms_value,
                **evaluate_labels(X_scaled, labels),
            })

    grid_df = pd.DataFrame(rows)

    if "silhouette" in grid_df.columns and grid_df["silhouette"].isna().all():
        # Every rank ties in this case, so the row picked below carries no real preference.
        print("⚠️ No configuration produced valid metrics (fewer than 2 clusters everywhere); "
              "the selected 'best' row is arbitrary.")

    best_row = select_best_by_metrics(grid_df, METRIC_PRIORITY)

    best_eps = float(best_row["eps"])
    best_ms = int(best_row["min_samples"])
    best_labels = labels_by_config.get((best_eps, best_ms))
    if best_labels is None:
        # Should not happen; refit rather than fail if the key cannot be found.
        best_labels = DBSCAN(eps=best_eps, min_samples=best_ms).fit_predict(X_scaled)

    df_best = df.copy()
    df_best["cluster"] = best_labels

    return grid_df, best_row, df_best


# Orchestration per Method

In [10]:
def process_method(method: str) -> None:
    """
    Run the DBSCAN pipeline on the single embedding selected for `method`.

    Steps: resolve the embedding CSV, load it, grid-search DBSCAN, then write
      - the full grid table,
      - the best row,
      - the cluster assignments (ID columns + embedding columns + 'cluster'),
      - a scatter plot of the best solution,
      - a positions-per-cluster count plot, only when a 'positions' column exists.
    A summary of the best configuration and of the written files is printed last.

    Errors while resolving, loading or clustering propagate to the caller; only the
    positions-per-cluster plot is best-effort (a failure there is reported and ignored).
    """
    print(f"\n=== Processing method: {method.upper()} ===")
    emb_path = resolve_embedding_file_for_method(method)
    tag = os.path.splitext(os.path.basename(emb_path))[0]

    print(f"→ Embedding file: {emb_path}")

    df, emb_cols, id_cols = load_and_prepare_embedding(emb_path, method)
    print(
        f"Detected embedding columns ({len(emb_cols)}): {emb_cols[:8]}{' ...' if len(emb_cols) > 8 else ''}")
    print(f"ID columns retained: {id_cols}")

    # --- Run DBSCAN grid search ---
    grid_df, best_row, df_best = grid_search_dbscan_on_embedding(df, emb_cols, method, tag)

    # --- Define method-specific output paths ---
    grid_out = os.path.join(GRID_DIR, method, f"{tag}_dbscan_grid.csv")
    best_out = os.path.join(BEST_DIR, method, f"{tag}_dbscan_best.csv")
    clusters_out = os.path.join(CLUSTERS_DIR, method, f"{tag}_dbscan_clusters.csv")
    plot_out = os.path.join(PLOTS_DIR, method, f"{tag}_dbscan_best.png")
    comp_plot = os.path.join(PLOTS_DIR, method, f"{tag}_positions_per_cluster.png")

    # --- Ensure directories exist BEFORE saving (the plots are not written via save_csv) ---
    for p in [grid_out, best_out, clusters_out, plot_out, comp_plot]:
        os.makedirs(os.path.dirname(p), exist_ok=True)

    # --- Save results ---
    keep_cols = id_cols + emb_cols + ["cluster"]
    keep_cols = [c for c in keep_cols if c in df_best.columns]
    save_csv(grid_df, grid_out)
    save_csv(pd.DataFrame([best_row]), best_out)
    save_csv(df_best[keep_cols], clusters_out)

    # --- Plot scatter ---
    title = (f"{method.upper()} – DBSCAN Best "
             f"(eps={best_row['eps']:.3f}, min_samples={int(best_row['min_samples'])})")
    plot_best_scatter(df_best, emb_cols, "cluster", title, plot_out)

    # --- Optional: which player positions dominate each cluster (excluding noise) ---
    comp_saved = False
    if "positions" in df_best.columns:
        try:
            plt.figure(figsize=(10, 6))
            sns.countplot(
                data=df_best[df_best["cluster"] != -1],
                x="cluster", hue="positions", palette="tab10"
            )
            plt.title(f"{method.upper()} – Positions per Cluster (excluding noise)")
            plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left")
            plt.tight_layout()
            plt.savefig(comp_plot, dpi=300)
            comp_saved = True
            print(f"📊 Saved composition plot: {comp_plot}")
        except Exception as e:
            print(f"⚠️ Could not plot cluster composition: {e}")
        finally:
            # Release the figure even when plotting failed.
            plt.close()

    # --- Console summary ---
    print("\n🏆 Best configuration:")
    print(f"  eps={best_row['eps']:.4f} | min_samples={int(best_row['min_samples'])}")
    print(f"  silhouette={best_row['silhouette']:.3f} | "
          f"CH={best_row['calinski_harabasz']:.1f} | "
          f"DB={best_row['davies_bouldin']:.3f}")
    print(f"  n_clusters={int(best_row['n_clusters'])} | n_noise={int(best_row['n_noise'])}")
    print(f"✅ Outputs:\n"
          f"    grid → {grid_out}\n"
          f"    best → {best_out}\n"
          f"    clusters → {clusters_out}\n"
          f"    plots → {plot_out}")
    # Only advertise the extra plot when it was actually written.
    if comp_saved:
        print(f"    extra → {comp_plot}")



# Run on All Methods

In [11]:
def process_all_embeddings_per_method(method: str) -> None:
    """
    Run the DBSCAN pipeline on every embedding CSV found for `method`.

    For each embedding: load it, grid-search DBSCAN, then write the grid table, the best
    row, the cluster assignments, a scatter plot of the best solution and — only when a
    'positions' column exists — a positions-per-cluster count plot, followed by a summary.

    A failure on one embedding is reported and the loop moves on to the next file.
    Failing to list the embeddings at all (unknown method, missing folder, no CSV)
    propagates to the caller.
    """
    all_embeddings = resolve_all_embedding_files_for_method(method)
    for emb_path in all_embeddings:
        try:
            tag = os.path.splitext(os.path.basename(emb_path))[0]
            print(f"\n=== Processing {method.upper()} embedding: {tag} ===")

            df, emb_cols, id_cols = load_and_prepare_embedding(emb_path, method)
            print(
                f"Detected embedding columns ({len(emb_cols)}): {emb_cols[:8]}{' ...' if len(emb_cols) > 8 else ''}")
            print(f"ID columns retained: {id_cols}")

            # Run DBSCAN grid search
            grid_df, best_row, df_best = grid_search_dbscan_on_embedding(
                df, emb_cols, method, tag
            )

            # --- Define paths ---
            grid_out = os.path.join(GRID_DIR, method, f"{tag}_dbscan_grid.csv")
            best_out = os.path.join(BEST_DIR, method, f"{tag}_dbscan_best.csv")
            clusters_out = os.path.join(CLUSTERS_DIR, method, f"{tag}_dbscan_clusters.csv")
            plot_out = os.path.join(PLOTS_DIR, method, f"{tag}_dbscan_best.png")
            comp_plot = os.path.join(PLOTS_DIR, method, f"{tag}_positions_per_cluster.png")

            # --- Ensure directories exist BEFORE saving ---
            for p in [grid_out, best_out, clusters_out, plot_out, comp_plot]:
                os.makedirs(os.path.dirname(p), exist_ok=True)

            # --- Save results ---
            keep_cols = id_cols + emb_cols + ["cluster"]
            keep_cols = [c for c in keep_cols if c in df_best.columns]
            save_csv(grid_df, grid_out)
            save_csv(pd.DataFrame([best_row]), best_out)
            save_csv(df_best[keep_cols], clusters_out)

            # --- Plot best scatter ---
            title = (f"{method.upper()} – DBSCAN Best "
                     f"(eps={best_row['eps']:.3f}, min_samples={int(best_row['min_samples'])})")
            plot_best_scatter(df_best, emb_cols, "cluster", title, plot_out)

            # --- Optional composition plot (best-effort) ---
            comp_saved = False
            if "positions" in df_best.columns:
                try:
                    plt.figure(figsize=(10, 6))
                    sns.countplot(
                        data=df_best[df_best["cluster"] != -1],
                        x="cluster", hue="positions", palette="tab10"
                    )
                    plt.title(f"{method.upper()} – Positions per Cluster (excluding noise)")
                    plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left")
                    # Same layout step as in process_method, so the outside legend is not clipped.
                    plt.tight_layout()
                    plt.savefig(comp_plot, dpi=300)
                    comp_saved = True
                    print(f"📊 Saved composition plot: {comp_plot}")
                except Exception as e:
                    print(f"⚠️ Could not plot cluster composition for {tag}: {e}")
                finally:
                    # Release the figure even when plotting failed.
                    plt.close()

            # Console summary
            print("\n🏆 Best configuration:")
            print(
                f"  eps={best_row['eps']:.4f} | min_samples={int(best_row['min_samples'])}")
            print(f"  silhouette={best_row['silhouette']:.3f} | "
                  f"CH={best_row['calinski_harabasz']:.1f} | "
                  f"DB={best_row['davies_bouldin']:.3f}")
            print(
                f"  n_clusters={int(best_row['n_clusters'])} | n_noise={int(best_row['n_noise'])}")
            print(f"✅ Outputs:\n"
                  f"    grid → {grid_out}\n"
                  f"    best → {best_out}\n"
                  f"    clusters → {clusters_out}\n"
                  f"    plots → {plot_out}")
            # Only advertise the extra plot when it was actually written.
            if comp_saved:
                print(f"    extra → {comp_plot}")
        except Exception as e:
            print(f"⚠️ Skipping embedding {emb_path} due to error: {e}")

In [12]:
print(f"Scanning reduced embeddings under: {REDUCED_ROOT}")
# Run the single-embedding pipeline once per configured method. A failure in one
# method is reported and does not stop the remaining methods.
for method in METHODS:
    try:
        process_method(method)
        # Alternative: cluster every embedding CSV of the method instead of a single one.
        # process_all_embeddings_per_method(method)
    except Exception as e:
        print(f"❌ Skipping {method.upper()} due to error: {e}")

Scanning reduced embeddings under: reduced_data

=== Processing method: UMAP ===
✅ Selected UMAP embedding for DBSCAN: joueurs_ligue1_2024_2025_clean_per90_umap5d_best_embedding.csv (from tag: joueurs_ligue1_2024_2025_clean_per90)
→ Embedding file: reduced_data\umap\embeddings\joueurs_ligue1_2024_2025_clean_per90_umap5d_best_embedding.csv
Detected embedding columns (5): ['umap_1', 'umap_2', 'umap_3', 'umap_4', 'umap_5']
ID columns retained: ['player_name', 'equipe', 'positions', 'age', 'player_id', 'player_country_code']
💾 Saved: clusters\dbscan\grid_search\umap\joueurs_ligue1_2024_2025_clean_per90_umap5d_best_embedding_dbscan_grid.csv
💾 Saved: clusters\dbscan\best_results\umap\joueurs_ligue1_2024_2025_clean_per90_umap5d_best_embedding_dbscan_best.csv
💾 Saved: clusters\dbscan\clusters\umap\joueurs_ligue1_2024_2025_clean_per90_umap5d_best_embedding_dbscan_clusters.csv
📈 Saved plot: clusters\dbscan\plots\umap\joueurs_ligue1_2024_2025_clean_per90_umap5d_best_embedding_dbscan_best.png
📊 Sa