In [66]:
import sys
import os

# Ajouter le dossier parent au path
sys.path.append(os.path.abspath(os.path.join('..')))

# Ensuite tu peux importer ton module
import utils


In [67]:
# Cell 1 — Imports & paramètres
import os
import numpy as np
import pandas as pd

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.manifold import trustworthiness
from sklearn.metrics import pairwise_distances

import utils  # doit fournir utils.load_data(pattern)

# Réglages
INPUT_GLOB = "../../cleaned_data/*.csv"
OUT_DIR_COORDS = "./reduced_data_pca"
OUT_METRICS_CSV = "./reduced_data_pca/pca_metrics.csv"
VAR_THRESHOLD = 0.90
N_NEIGHBORS_TRUST = 10
RANDOM_STATE = 0
K_NEIGHBORS_EVAL = 3  # k pour trustworthiness, continuity, mrre (sera clampé au n-1)


os.makedirs(OUT_DIR_COORDS, exist_ok=True)


In [68]:
# Cell 2 — Utilitaires
def ensure_dict_datasets(obj, default_name="merged"):
    """Accepte un DataFrame unique OU un dict[str, DataFrame]."""
    if isinstance(obj, pd.DataFrame):
        return {default_name: obj}
    if isinstance(obj, dict):
        return obj
    raise TypeError("utils.load_data doit retourner un DataFrame ou un dict[str, DataFrame].")

def pick_meta(df: pd.DataFrame) -> pd.DataFrame:
    """Retourne les colonnes méta disponibles : player_name, equipe, positions."""
    tmp = df.copy()
    if "positions" not in tmp.columns and "position" in tmp.columns:
        tmp = tmp.rename(columns={"position": "positions"})
    meta_cols = [c for c in ["player_name", "equipe", "positions"] if c in tmp.columns]
    return tmp[meta_cols] if meta_cols else pd.DataFrame(index=tmp.index)

def compute_distance_correlation(X_high: np.ndarray, X_low: np.ndarray) -> float:
    """Corrélation de Pearson entre matrices de distances (triangle supérieur)."""
    D_high = pairwise_distances(X_high, metric="euclidean")
    D_low  = pairwise_distances(X_low,  metric="euclidean")
    iu = np.triu_indices_from(D_high, k=1)
    v1, v2 = D_high[iu], D_low[iu]
    if v1.std() == 0 or v2.std() == 0:
        return np.nan
    return float(np.corrcoef(v1, v2)[0, 1])

def clamp_neighbors(n_samples: int, n_neighbors: int) -> int:
    return max(1, min(n_neighbors, max(1, n_samples - 1)))

def continuity(X: np.ndarray, X_embedded: np.ndarray, k: int) -> float:
    """Continuity (Kaski & Venna): 1 = parfait, 0 = mauvais."""
    from sklearn.metrics import pairwise_distances
    n = X.shape[0]
    k = clamp_neighbors(n, k)

    orig_nn = np.argsort(pairwise_distances(X), axis=1)[:, 1:k+1]
    emb_nn  = np.argsort(pairwise_distances(X_embedded), axis=1)[:, 1:k+1]

    total = 0
    for i in range(n):
        orig = list(orig_nn[i])
        emb_set = set(emb_nn[i])
        missing = [m for m in orig if m not in emb_set]
        # pénalité = rang (1..k) des voisins manquants dans l'espace original
        total += sum(orig.index(m) + 1 for m in missing)

    denom = n * k * (2 * n - 3 * k - 1)
    if denom <= 0:
        return np.nan
    return 1 - (2 / denom) * total

def mrre(X: np.ndarray, X_embedded: np.ndarray, k: int) -> float:
    """Mean Relative Rank Error (MRRE): 0 = parfait (mieux), plus grand = pire."""
    from sklearn.metrics import pairwise_distances
    n = X.shape[0]
    k = clamp_neighbors(n, k)

    D_high = pairwise_distances(X)
    D_low  = pairwise_distances(X_embedded)

    # Rangs (1..n)
    R_high = np.argsort(np.argsort(D_high, axis=1), axis=1) + 1
    R_low  = np.argsort(np.argsort(D_low,  axis=1), axis=1) + 1

    err = 0.0
    cnt = 0
    for i in range(n):
        for j in range(n):
            if i == j:
                continue
            denom = R_high[i, j]
            if denom <= 0:
                continue
            err += abs(R_high[i, j] - R_low[i, j]) / denom
            cnt += 1
    return float(err / cnt) if cnt > 0 else float('nan')




In [69]:
# Cell - 3
def run_pca_once(df: pd.DataFrame, var_threshold=0.90, random_state=0):
    Xnum = df.select_dtypes(include=[np.number]).copy()
    if Xnum.shape[1] < 2:
        return None

    X_imp = SimpleImputer(strategy="mean").fit_transform(Xnum.values)
    X_std = StandardScaler().fit_transform(X_imp)

    pca = PCA(n_components=None, random_state=random_state)
    scores_all = pca.fit_transform(X_std)
    cumvar = np.cumsum(pca.explained_variance_ratio_)
    k = int(np.searchsorted(cumvar, var_threshold) + 1)

    scores_k = scores_all[:, :k]
    cols = [f"pca{i+1}" for i in range(k)]
    coords_df = pd.DataFrame(scores_k, index=df.index, columns=cols)

    # Métriques
    nn_trust = clamp_neighbors(X_std.shape[0], N_NEIGHBORS_TRUST)
    tw = trustworthiness(X_std, scores_k, n_neighbors=nn_trust)

    nn_eval = clamp_neighbors(X_std.shape[0], K_NEIGHBORS_EVAL)
    cont = continuity(X_std, scores_k, k=nn_eval)
    mrre_score = mrre(X_std, scores_k, k=K_NEIGHBORS_EVAL)
    
    dist_corr = compute_distance_correlation(X_std, scores_k)


    # cast sûrs (évite float(None))
    def safe_float(x):
        try:
            return float(x)
        except Exception:
            return float('nan')

    return {
        "k": k,
        "coords": coords_df,
        "explained_var": safe_float(pca.explained_variance_ratio_[:k].sum()),
        "trustworthiness": safe_float(tw),
        "continuity": safe_float(cont),
        "mrre": safe_float(mrre_score),
        "distance_correlation": safe_float(dist_corr),

    }



In [70]:
# Cell 4 — Chargement des jeux de données (robuste)
import glob

def _to_dict_datasets(obj):
    """Normalise en dict[str, DataFrame] les formes possibles."""
    if isinstance(obj, pd.DataFrame):
        return {"merged": obj}
    if isinstance(obj, dict):
        return obj
    if isinstance(obj, (list, tuple)):
        out = {}
        for i, item in enumerate(obj):
            # format: (name, df)
            if isinstance(item, (list, tuple)) and len(item) == 2 and isinstance(item[1], pd.DataFrame):
                out[str(item[0])] = item[1]
            # format: df seul
            elif isinstance(item, pd.DataFrame):
                out[f"df_{i}"] = item
        if out:
            return out
    return None

try:
    raw = utils.load_data(INPUT_GLOB)  # peut renvoyer df / dict / liste
except Exception as e:
    print(f"utils.load_data a échoué: {e}")
    raw = None

datasets = _to_dict_datasets(raw)

# Fallback: lecture directe des CSV si rien d'exploitable
if datasets is None:
    print("Fallback: lecture directe des CSV via glob.")
    paths = sorted(glob.glob(INPUT_GLOB))
    if not paths:
        raise FileNotFoundError(f"Aucun fichier trouvé avec le motif: {INPUT_GLOB}")
    datasets = {
        os.path.splitext(os.path.basename(p))[0]: pd.read_csv(p)
        for p in paths
    }

print(f"{len(datasets)} jeux trouvés.")
list(datasets.keys())[:5]


Error loading data: [Errno 2] No such file or directory: '../../cleaned_data/*.csv'
Fallback: lecture directe des CSV via glob.
4 jeux trouvés.


['joueurs_ligue1_2024_2025_clean_custom',
 'joueurs_ligue1_2024_2025_clean_custom_no_GK',
 'joueurs_ligue1_2024_2025_clean_per90',
 'joueurs_ligue1_2024_2025_clean_raw']

In [71]:
# Cell 5 — Boucle: PCA + sauvegardes + collecte métriques
metrics_rows = []

for name, df in datasets.items():
    res = run_pca_once(df, var_threshold=VAR_THRESHOLD, random_state=RANDOM_STATE)
    if res is None:
        print(f"[SKIP] {name}: < 2 colonnes numériques.")
        continue

    k = res["k"]
    coords = res["coords"]  # pca1..pcak
    meta = pick_meta(df)
    out_df = coords.join(meta, how="left")

    # nom fichier sortie
    base = str(name)
    if base.endswith(".csv"):
        base = os.path.splitext(os.path.basename(base))[0]
    safe_name = base.replace("/", "_").replace("\\", "_")
    out_path = os.path.join(OUT_DIR_COORDS, f"{safe_name}_pca_{k}d.csv")

    out_df.to_csv(out_path, index=False)

    metrics_rows.append({
        "dataset": safe_name,
        "n_components": k,
        "explained_variance_cum": res["explained_var"],
        "trustworthiness_k": res["trustworthiness"],
        "continuity_k": res["continuity"],
        "mrre_k": res["mrre"],
        "out_file": out_path,
        "distance_corr": res["distance_correlation"]
    })

    print(f"[OK] {safe_name}: k={k}, var={res['explained_var']:.3f}, "
          f"trust={res['trustworthiness']:.3f}, dist_corr={res['distance_correlation']:.3f} -> {out_path}")


[OK] joueurs_ligue1_2024_2025_clean_custom: k=33, var=0.901, trust=0.998, dist_corr=0.997 -> ./reduced_data_pca/joueurs_ligue1_2024_2025_clean_custom_pca_33d.csv
[OK] joueurs_ligue1_2024_2025_clean_custom_no_GK: k=34, var=0.902, trust=0.998, dist_corr=0.998 -> ./reduced_data_pca/joueurs_ligue1_2024_2025_clean_custom_no_GK_pca_34d.csv
[OK] joueurs_ligue1_2024_2025_clean_per90: k=29, var=0.900, trust=0.998, dist_corr=0.997 -> ./reduced_data_pca/joueurs_ligue1_2024_2025_clean_per90_pca_29d.csv
[OK] joueurs_ligue1_2024_2025_clean_raw: k=23, var=0.904, trust=0.998, dist_corr=0.998 -> ./reduced_data_pca/joueurs_ligue1_2024_2025_clean_raw_pca_23d.csv


In [72]:
# Cell 6 — Sauvegarde & aperçu des métriques
if metrics_rows:
    metrics_df = pd.DataFrame(metrics_rows)
    metrics_df.to_csv(OUT_METRICS_CSV, index=False)
    display(metrics_df.head())
    print(f"Métriques sauvegardées -> {OUT_METRICS_CSV}")
else:
    print("Aucun jeu traité.")


Unnamed: 0,dataset,n_components,explained_variance_cum,trustworthiness_k,continuity_k,mrre_k,out_file,distance_corr
0,joueurs_ligue1_2024_2025_clean_custom,33,0.901,0.997791,0.99845,0.065161,./reduced_data_pca/joueurs_ligue1_2024_2025_cl...,0.997459
1,joueurs_ligue1_2024_2025_clean_custom_no_GK,34,0.902434,0.997912,0.998422,0.063516,./reduced_data_pca/joueurs_ligue1_2024_2025_cl...,0.997852
2,joueurs_ligue1_2024_2025_clean_per90,29,0.900209,0.997883,0.998059,0.06781,./reduced_data_pca/joueurs_ligue1_2024_2025_cl...,0.997254
3,joueurs_ligue1_2024_2025_clean_raw,23,0.904024,0.997656,0.998129,0.067003,./reduced_data_pca/joueurs_ligue1_2024_2025_cl...,0.998275


Métriques sauvegardées -> ./reduced_data_pca/pca_metrics.csv
