# UMAP Dimensionality Reduction – Multi-Dataset Pipeline



# Imports

In [1]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import pairwise_distances
from sklearn.manifold import trustworthiness
from scipy.stats import spearmanr
import umap
import warnings


  from .autonotebook import tqdm as notebook_tqdm


In [2]:

warnings.filterwarnings(
    "ignore",
    message="n_jobs value 1 overridden to 1 by setting random_state",
    category=UserWarning
)

## Constants and Directory Setup

In [3]:
INPUT_DIR = "cleaned_data"
OUTPUT_DIR = "reduced_data"

UMAP_DIR = os.path.join(OUTPUT_DIR, "umap")
EMB_DIR = os.path.join(UMAP_DIR, "embeddings")
GRID_DIR = os.path.join(UMAP_DIR, "grid_search")
BEST_DIR = os.path.join(UMAP_DIR, "best_results")
PLOT_DIR = os.path.join(UMAP_DIR, "plots")

for d in [OUTPUT_DIR, UMAP_DIR, EMB_DIR, GRID_DIR, BEST_DIR, PLOT_DIR]:
    os.makedirs(d, exist_ok=True)

ID_COLS = [
    "player_name", "equipe", "positions", "age",
    "player_id", "player_country_code"
]

# UMAP parameter grid
N_NEIGHBORS = [5, 10, 15, 30, 50]
MIN_DIST = [0.0, 0.1, 0.3, 0.5]
# N_COMPONENTS = [2, 3, 5, 10]
N_COMPONENTS = [2, 3]
METRIC = "euclidean"
SEED = 42

# Quality metric neighborhood size
K_TRUST_CONT = 10


## Quality Metric Functions

In [4]:
def continuity(X, X_embedded, k=10):
    n = X.shape[0]
    D_high = pairwise_distances(X)
    D_low = pairwise_distances(X_embedded)
    orig_neighbors = np.argsort(D_high, axis=1)[:, 1:k+1]
    emb_neighbors = np.argsort(D_low, axis=1)[:, 1:k+1]
    total = 0
    for i in range(n):
        orig = set(orig_neighbors[i])
        emb = set(emb_neighbors[i])
        missing = orig - emb
        total += sum([list(orig_neighbors[i]).index(m) + 1 for m in missing])
    return 1 - (2 / (n * k * (2 * n - 3 * k - 1))) * total


def mrre(X, X_embedded):
    n = X.shape[0]
    D_high = pairwise_distances(X)
    D_low = pairwise_distances(X_embedded)
    R_high = np.argsort(np.argsort(D_high, axis=1), axis=1)
    R_low = np.argsort(np.argsort(D_low, axis=1), axis=1)
    error = np.abs(R_high - R_low) / np.maximum(R_high, 1)
    np.fill_diagonal(error, 0)
    return error.sum() / (n * (n - 1))


def evaluate_umap_quality(X_high, X_low, k=10):
    trust = trustworthiness(X_high, X_low, n_neighbors=k)
    cont = continuity(X_high, X_low, k)
    corr, _ = spearmanr(pairwise_distances(X_high).ravel(),
                        pairwise_distances(X_low).ravel())
    mrre_score = mrre(X_high, X_low)
    return {
        'trustworthiness': trust,
        'continuity': cont,
        'distance_corr': corr,
        'mrre': mrre_score
    }

## Utility Functions

In [5]:
def scale_features(df, id_cols):
    X = df[[c for c in df.columns if c not in id_cols]].select_dtypes(include=[np.number])
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    return X_scaled

def run_umap(X_scaled, n_neighbors, min_dist, n_components, metric=METRIC, seed=SEED):
    reducer = umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        n_components=n_components,
        metric=metric,
        random_state=seed
    )
    embedding = reducer.fit_transform(X_scaled)
    return embedding

def save_csv(df, path):
    os.makedirs(os.path.dirname(path), exist_ok=True)
    df.to_csv(path, index=False, encoding='utf-8')
    print(f"💾 Saved: {path}")

def pick_best_row_by_mse(results_df):
    idx = results_df['mse'].idxmin()
    return results_df.loc[idx].to_dict()


## Parameter Grid Search for UMAP Quality

In [6]:
def grid_search_umap(X_scaled, k_eval=K_TRUST_CONT):
    results = []
    for nn in N_NEIGHBORS:
        for md in MIN_DIST:
            for nc in N_COMPONENTS:
                emb = run_umap(X_scaled, nn, md, nc)
                m = evaluate_umap_quality(X_scaled, emb, k=k_eval)
                m.update({
                    "n_neighbors": nn,
                    "min_dist": md,
                    "n_components": nc
                })
                results.append(m)
                print(f"✓ n_neighbors={nn}, min_dist={md}, n_components={nc} | "
                      f"trust={m['trustworthiness']:.3f}, cont={m['continuity']:.3f}, "
                      f"dcorr={m['distance_corr']:.3f}, mrre={m['mrre']:.3f}")
    return pd.DataFrame(results)

## Composite Score and MSE

In [7]:
def add_composite_score(results_df):
    # Normalize ↑ metrics; invert MRRE so higher is better
    scaler = MinMaxScaler()
    up_cols = ['trustworthiness', 'continuity', 'distance_corr']
    up_scaled = scaler.fit_transform(results_df[up_cols])

    mrre_scaled = 1 - MinMaxScaler().fit_transform(results_df[['mrre']])
    composite = np.mean(np.column_stack([up_scaled, mrre_scaled]), axis=1)

    out = results_df.copy()
    out['composite_score'] = composite
    return out

def add_mse_score(results_df):
    """
    Compute MSE relative to the ideal target [1, 1, 1, 0]
    using the raw (non-normalized) metrics.
    Lower MSE means closer to the perfect embedding.
    """
    df = results_df.copy()

    mse = (
        (1 - df['trustworthiness']) ** 2 +
        (1 - df['continuity']) ** 2 +
        (1 - df['distance_corr']) ** 2 +
        (df['mrre'] - 0) ** 2
    ) / 4

    df['mse'] = mse
    return df



## Best Embedding Visualization (2D or 3D)

In [8]:
def plot_best_embedding(df_ids, embedding, best, tag):
    """Visualize the best UMAP embedding — only by position."""
    n_components = int(best['n_components'])
    cols = [f"UMAP_{i+1}" for i in range(n_components)]
    umap_df = pd.concat(
        [pd.DataFrame(embedding, columns=cols),
         df_ids.reset_index(drop=True)],
        axis=1
    )

    os.makedirs(PLOT_DIR, exist_ok=True)

    if n_components == 2:
        plt.figure(figsize=(8, 6))
        sns.scatterplot(
            data=umap_df,
            x='UMAP_1', y='UMAP_2',
            hue='positions',
            palette='tab10', s=40, alpha=0.9
        )
        plt.title(f"{tag} – Best 2D UMAP by Position")
        plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.tight_layout()
        plt.savefig(f"{PLOT_DIR}/{tag}_umap2d_positions.png", dpi=300)
        plt.close()
    else:
        from mpl_toolkits.mplot3d import Axes3D  # noqa
        fig = plt.figure(figsize=(8, 6))
        ax = fig.add_subplot(111, projection='3d')
        ax.scatter(
            umap_df['UMAP_1'], umap_df['UMAP_2'], umap_df['UMAP_3'],
            c=pd.Categorical(umap_df['positions']).codes,
            s=20, alpha=0.8
        )
        ax.set_title(
            f"{tag} – Best UMAP by Position (showing first 3 of {int(best['n_components'])} dims)")
        ax.set_ylabel('UMAP_2')
        ax.set_zlabel('UMAP_3')
        plt.tight_layout()
        plt.savefig(f"{PLOT_DIR}/{tag}_umap3d_positions.png", dpi=300)
        plt.close()

    return umap_df

## Process a Single Dataset

In [9]:
def process_dataset(file_path):
    fname = os.path.basename(file_path)
    tag = os.path.splitext(fname)[0]
    print(f"\n=== Processing dataset: {tag} ===")

    df = pd.read_csv(file_path)
    df_ids = df[ID_COLS].copy()
    X_scaled = scale_features(df, ID_COLS)

    # Grid search over 2D and 3D
    grid_df = grid_search_umap(X_scaled, k_eval=K_TRUST_CONT)
    grid_df = add_composite_score(grid_df)
    grid_df = add_mse_score(grid_df)

    # --- Reorder columns to match best_metrics schema ---
    desired_order = [
        'n_neighbors', 'min_dist', 'n_components',
        'trustworthiness', 'continuity', 'distance_corr',
        'mrre', 'composite_score', 'mse'
    ]
    grid_df = grid_df[desired_order]

    # Save grid results
    grid_path = f"{GRID_DIR}/{tag}_umap_grid_results.csv"
    save_csv(grid_df, grid_path)

    # Pick best row and re-fit UMAP once
    best = pick_best_row_by_mse(grid_df)
    print(f"🏆 Best config for {tag}: "
          f"nn={int(best['n_neighbors'])}, md={float(best['min_dist']):.2f}, "
          f"nc={int(best['n_components'])}, comp={float(best['composite_score']):.3f}")

    emb_best = run_umap(
        X_scaled,
        n_neighbors=int(best['n_neighbors']),
        min_dist=float(best['min_dist']),
        n_components=int(best['n_components']),
        metric=METRIC,
        seed=SEED
    )

    # Build DF (UMAP first + ids) and visualize
    umap_df = plot_best_embedding(df_ids, emb_best, best, tag)

    # Save ONLY the best embedding
    dim_suffix = f"umap{int(best['n_components'])}d"
    emb_cols = [c for c in umap_df.columns if c.startswith("UMAP_")]
    emb_out = umap_df[emb_cols + ID_COLS]
    emb_path = f"{EMB_DIR}/{tag}_{dim_suffix}_best_embedding.csv"
    save_csv(emb_out, emb_path)

    # Save metrics for the best row only
    metrics_out = pd.DataFrame([{
        'n_neighbors': int(best['n_neighbors']),
        'min_dist': float(best['min_dist']),
        'n_components': int(best['n_components']),
        'trustworthiness': float(best['trustworthiness']),
        'continuity': float(best['continuity']),
        'distance_corr': float(best['distance_corr']),
        'mrre': float(best['mrre']),
        'composite_score': float(best['composite_score']),
        'mse': float(best['mse'])
    }])
    metrics_path = f"{BEST_DIR}/{tag}_umap_metrics.csv"
    save_csv(metrics_out, metrics_path)

## Run on All Datasets in cleaned_data/

In [None]:
all_files = [os.path.join(INPUT_DIR, f) for f in os.listdir(INPUT_DIR)
             if f.endswith(".csv")]

print(f"Found {len(all_files)} datasets in {INPUT_DIR}:")
for f in all_files:
    print(" -", os.path.basename(f))

for path in all_files:
    process_dataset(path)

Found 4 datasets in cleaned_data:
 - joueurs_ligue1_2024_2025_clean_custom.csv
 - joueurs_ligue1_2024_2025_clean_custom_GK.csv
 - joueurs_ligue1_2024_2025_clean_per90.csv
 - joueurs_ligue1_2024_2025_clean_raw.csv

=== Processing dataset: joueurs_ligue1_2024_2025_clean_custom ===
✓ n_neighbors=5, min_dist=0.0, n_components=2 | trust=0.916, cont=0.988, dcorr=0.567, mrre=0.760
✓ n_neighbors=5, min_dist=0.0, n_components=3 | trust=0.922, cont=0.988, dcorr=0.563, mrre=0.752
✓ n_neighbors=5, min_dist=0.1, n_components=2 | trust=0.916, cont=0.988, dcorr=0.534, mrre=0.761
✓ n_neighbors=5, min_dist=0.1, n_components=3 | trust=0.924, cont=0.988, dcorr=0.590, mrre=0.701
✓ n_neighbors=5, min_dist=0.3, n_components=2 | trust=0.912, cont=0.988, dcorr=0.546, mrre=0.748
✓ n_neighbors=5, min_dist=0.3, n_components=3 | trust=0.925, cont=0.988, dcorr=0.610, mrre=0.649
✓ n_neighbors=5, min_dist=0.5, n_components=2 | trust=0.906, cont=0.988, dcorr=0.564, mrre=0.730
✓ n_neighbors=5, min_dist=0.5, n_componen