In [14]:
# Cell: Spectral diagnostic on selected bootstrap subsamples (log-log plots + fit details)
import os
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from scipy import sparse
from scipy.sparse.linalg import eigsh
from scipy import stats
import matplotlib.pyplot as plt

# Parameters (modifiable)
csv_path = 'data/sunspots_raw/Sunspots.csv'
# Column names tried in order; if none is present the last numeric column is used instead.
value_col_candidates = ['Number', 'Total Sunspot', 'Total Sunspot Number', 'Monthly Mean']
embedding_dim = 10          # number of delay coordinates per embedded point
tau = 1                     # delay, in samples, between consecutive coordinates
k_neighbors = 10            # neighbours per node in the kNN graph
n_eig = 200                 # eigenvalues to compute
n_diagnostics = 20          # number of bootstrap subsamples to inspect
subsample_frac = 0.6        # fraction of embedded points kept in each subsample
lambda_max_list = [0.1, 0.2, 0.4]  # lambda ranges to display on plots (we still fit per chosen lambda_max below)
lambda_max_fit = 0.2        # primary lambda_max used for the numeric fit reported
min_points_for_fit = 6      # minimum number of (lambda, N) points required before fitting
rng_seed = 42               # seed of the generator that draws the subsamples

# All CSV and PNG outputs of this cell are written under out_dir.
out_dir = 'results/spectral_diagnostics'
os.makedirs(out_dir, exist_ok=True)

# Load series and build embedding
df0 = pd.read_csv(csv_path)
# First candidate column present in the file, or None when no candidate matches.
col = next((c for c in value_col_candidates if c in df0.columns), None)
if col is None:
    # Fallback: take the last numeric column of the file.
    numeric_cols = df0.select_dtypes(include=[np.number]).columns.tolist()
    if not numeric_cols:
        raise RuntimeError("No numeric column found in CSV.")
    col = numeric_cols[-1]
# Non-numeric entries are coerced to NaN and dropped, leaving a 1-D array of values.
series = pd.to_numeric(df0[col], errors='coerce').dropna().values

def takens_embed(x, dim, tau):
    """Build the delay-coordinate (Takens) embedding of a 1-D series.

    Row ``r`` of the result is ``(x[r], x[r + tau], ..., x[r + (dim - 1) * tau])``.

    Parameters
    ----------
    x : sequence
        The scalar series.
    dim : int
        Number of delay coordinates (columns of the result).
    tau : int
        Lag, in samples, between consecutive coordinates.

    Returns
    -------
    numpy.ndarray or None
        Float array of shape ``(len(x) - (dim - 1) * tau, dim)``, or ``None``
        when the series is too short to produce a single row.
    """
    n_vectors = len(x) - tau * (dim - 1)
    if n_vectors < 1:
        return None
    delayed = np.empty((n_vectors, dim))
    for coord in range(dim):
        start = coord * tau
        # Column `coord` is the series shifted by coord * tau samples.
        delayed[:, coord] = x[start:start + n_vectors]
    return delayed

# Embed the series; takens_embed returns None when the series is too short for the
# requested embedding_dim/tau, which is turned into a hard error here.
X_full = takens_embed(series, embedding_dim, tau)
if X_full is None:
    raise RuntimeError("Embedding too short for embedding_dim/tau")

# Number of embedded points (rows of X_full); subsamples are drawn from these rows.
n_nodes = X_full.shape[0]
print(f"Embedding nodes: {n_nodes}, embedding_dim={embedding_dim}")

def build_laplacian_eigs(X_points, k_neighbors, n_eig):
    """Return the smallest eigenvalues of the normalized Laplacian of a kNN graph.

    The graph is unweighted and symmetric: nodes i and j are linked when either
    one is among the other's nearest neighbours. The Laplacian is
    ``I - D^{-1/2} A D^{-1/2}``.

    Parameters
    ----------
    X_points : numpy.ndarray of shape (n_points, n_features)
        Points to connect; one graph node per row.
    k_neighbors : int
        Neighbours requested per node (effectively capped at ``n_points - 1``).
    n_eig : int
        Number of eigenvalues requested (capped at ``n_points - 1``).

    Returns
    -------
    numpy.ndarray or None
        Eigenvalues in ascending order, or ``None`` when both the sparse solver
        and the dense fallback raised.
    """
    n_nodes_local = X_points.shape[0]
    nbrs = NearestNeighbors(n_neighbors=min(k_neighbors + 1, n_nodes_local), algorithm='auto').fit(X_points)
    _, indices = nbrs.kneighbors(X_points)

    # Remove each node from its own neighbour list. The query point is usually in
    # column 0, but with exact duplicate rows another zero-distance point can take
    # that slot, so the self entry is located explicitly instead of assuming its
    # position. If a node is absent from its own list (more duplicates than
    # requested neighbours) the farthest entry is dropped so every node keeps the
    # same number of neighbours.
    node_ids = np.arange(n_nodes_local)
    is_self = indices == node_ids[:, None]
    is_self[~is_self.any(axis=1), -1] = True
    keep = ~is_self
    src = np.broadcast_to(node_ids[:, None], indices.shape)[keep]
    dst = indices[keep]

    # Insert every edge in both directions; converting COO to CSR sums duplicate
    # entries (mutual neighbours), so the stored values are reset to 1 afterwards.
    adj = sparse.coo_matrix(
        (np.ones(2 * src.size, dtype=np.float32),
         (np.concatenate([src, dst]), np.concatenate([dst, src]))),
        shape=(n_nodes_local, n_nodes_local),
    ).tocsr()
    adj.data[:] = 1.0

    deg = np.array(adj.sum(axis=1)).flatten()
    deg[deg == 0] = 1.0  # isolated nodes: avoid dividing by zero below
    D_inv_sqrt = sparse.diags(1.0 / np.sqrt(deg))
    identity = sparse.identity(n_nodes_local, format='csr')
    L_norm = identity - D_inv_sqrt @ adj @ D_inv_sqrt
    n_eig_local = min(n_eig, n_nodes_local - 1)
    try:
        eigvals, _ = eigsh(L_norm, k=n_eig_local, which='SM', tol=1e-6, maxiter=5000)
    except Exception as e:
        # Best-effort fallback to a dense solver (only practical for small matrices).
        try:
            from scipy.linalg import eigh
            Ld = L_norm.toarray()
            eigvals_all = eigh(Ld, eigvals_only=True)
            eigvals = np.sort(eigvals_all)[:n_eig_local]
        except Exception as e2:
            print("Eigen decomposition failed:", e, e2)
            return None
    return np.sort(eigvals)

def spectral_counting(eigvals):
    """Compute the eigenvalue counting function N(lambda) over the non-zero spectrum.

    Eigenvalues not exceeding ``1e-12`` are treated as zero modes and ignored.
    The input does not need to be sorted.

    Parameters
    ----------
    eigvals : array-like
        Eigenvalues, in any order.

    Returns
    -------
    lam_vals : numpy.ndarray
        Distinct retained eigenvalues, ascending.
    N_vals : numpy.ndarray
        ``N_vals[i]`` is the number of retained eigenvalues ``<= lam_vals[i]``.
    """
    eps = 1e-12
    eigvals = np.asarray(eigvals)
    lams = eigvals[eigvals > eps]
    # Cumulative multiplicities give the count of eigenvalues <= each distinct
    # value without relying on the input order.
    lam_vals, multiplicity = np.unique(lams, return_counts=True)
    N_vals = np.cumsum(multiplicity)
    return lam_vals, N_vals

def fit_loglog(lam_fit, N_fit):
    """Least-squares line through ``(log(lam_fit), log(N_fit))``.

    Returns the tuple ``(slope, intercept, r_value, p_value, stderr)`` taken
    from ``scipy.stats.linregress``.
    """
    regression = stats.linregress(np.log(lam_fit), np.log(N_fit))
    return (
        regression.slope,
        regression.intercept,
        regression.rvalue,
        regression.pvalue,
        regression.stderr,
    )

# Draw the diagnostic subsamples deterministically: a seeded generator samples row
# indices of X_full without replacement. The size is capped at n_nodes because
# rng.choice(..., replace=False) raises ValueError when asked for more items than
# exist, which the uncapped max(100, ...) did for embeddings with fewer than 100 points.
rng = np.random.default_rng(rng_seed)
subsample_size = min(n_nodes, max(100, int(np.floor(subsample_frac * n_nodes))))
indices_list = [rng.choice(np.arange(n_nodes), size=subsample_size, replace=False)
                for _ in range(n_diagnostics)]

# Colours of the overlay fits; cycled so lambda_max_list may hold more than three entries.
colors = ['red', 'orange', 'green']

summary_rows = []
for i, idx in enumerate(indices_list, start=1):
    X_sub = X_full[idx, :]
    eigvals = build_laplacian_eigs(X_sub, k_neighbors, n_eig)
    if eigvals is None:
        print(f"Sample {i}: eig computation failed; skipping.")
        continue
    lam_vals, N_vals = spectral_counting(eigvals)
    # Save the raw eigenvalues and the counting function for this diagnostic
    pd.DataFrame({'eig_index': np.arange(1, len(eigvals)+1), 'eigval': eigvals}).to_csv(f"{out_dir}/diag_{i:03d}_eigvals.csv", index=False)
    pd.DataFrame({'lambda': lam_vals, 'N_lambda': N_vals}).to_csv(f"{out_dir}/diag_{i:03d}_counting.csv", index=False)
    # Numeric fit restricted to lambda <= lambda_max_fit; the number of points is reported too
    mask = lam_vals <= lambda_max_fit
    lam_fit = lam_vals[mask]
    N_fit = N_vals[mask]
    fit_ok = len(lam_fit) >= min_points_for_fit
    slope = intercept = r_value = p_value = stderr = np.nan
    if fit_ok:
        slope, intercept, r_value, p_value, stderr = fit_loglog(lam_fit, N_fit)
    # The dimension estimate is twice the log-log slope; it stays NaN when no fit was made.
    d_s_est = 2.0 * slope
    summary_rows.append({
        'diag': i,
        'n_nodes_sub': X_sub.shape[0],
        'n_eig_computed': len(eigvals),
        'n_lambda_total': len(lam_vals),
        'n_points_fit': int(len(lam_fit)),
        'lambda_max_fit': float(lambda_max_fit),
        'fit_ok': bool(fit_ok),
        'slope': float(slope),
        'stderr_slope': float(stderr),
        'r_value': float(r_value),
        'd_s_est': float(d_s_est)
    })
    # Log-log plot of N(lambda) with one overlay fit per entry of lambda_max_list
    plt.figure(figsize=(6,4))
    plt.loglog(lam_vals, N_vals, marker='o', markersize=4, linestyle='none', alpha=0.6, label='N(lambda)')
    for j, lm in enumerate(lambda_max_list):
        maskj = lam_vals <= lm
        if maskj.sum() >= 2:
            lam_line = lam_vals[maskj]
            N_line = N_vals[maskj]
            # Linear fit on that range, for visualization only
            try:
                s, itc, rv, pv, se = fit_loglog(lam_line, N_line)
                label = f'fit <={lm} slope={s:.3f}'
                lam_plot = np.linspace(lam_line.min(), lam_line.max(), 100)
                N_plot = np.exp(itc) * lam_plot**(s)
                plt.loglog(lam_plot, N_plot, color=colors[j % len(colors)], lw=1.5, label=label)
            except Exception as exc:
                # The overlay is optional: report the failure instead of hiding it,
                # then carry on with the remaining ranges.
                print(f"Sample {i}: overlay fit for lambda_max={lm} failed: {exc}")
    plt.xlabel('lambda (eigenvalue)')
    plt.ylabel('N(lambda)')
    plt.title(f'Spectral diagnostic sample {i} (n_sub={X_sub.shape[0]})')
    plt.legend(fontsize=8)
    plt.grid(alpha=0.25, which='both')
    plt.tight_layout()
    plt.savefig(f"{out_dir}/diag_{i:03d}_loglog.png", dpi=150)
    plt.close()
    print(f"Saved diagnostic {i}: eigs={len(eigvals)}, fit_points={len(lam_fit)}, fit_ok={fit_ok}")

# Save summary CSV
pd.DataFrame(summary_rows).to_csv(f"{out_dir}/spectral_diagnostics_summary.csv", index=False)
print("Saved diagnostics to", out_dir)


Embedding nodes: 3256, embedding_dim=10
Saved diagnostic 1: eigs=200, fit_points=17, fit_ok=True
Saved diagnostic 2: eigs=200, fit_points=17, fit_ok=True
Saved diagnostic 3: eigs=200, fit_points=16, fit_ok=True
Saved diagnostic 4: eigs=200, fit_points=17, fit_ok=True
Saved diagnostic 5: eigs=200, fit_points=17, fit_ok=True
Saved diagnostic 6: eigs=200, fit_points=17, fit_ok=True
Saved diagnostic 7: eigs=200, fit_points=17, fit_ok=True
Saved diagnostic 8: eigs=200, fit_points=18, fit_ok=True
Saved diagnostic 9: eigs=200, fit_points=16, fit_ok=True
Saved diagnostic 10: eigs=200, fit_points=16, fit_ok=True
Saved diagnostic 11: eigs=200, fit_points=18, fit_ok=True
Saved diagnostic 12: eigs=200, fit_points=16, fit_ok=True
Saved diagnostic 13: eigs=200, fit_points=17, fit_ok=True
Saved diagnostic 14: eigs=200, fit_points=17, fit_ok=True
Saved diagnostic 15: eigs=200, fit_points=16, fit_ok=True
Saved diagnostic 16: eigs=200, fit_points=16, fit_ok=True
Saved diagnostic 17: eigs=200, fit_points

### Résumé des diagnostics spectraux (ce que montrent tes fichiers)
- Les diagnostics sur 20 sous‑échantillons montrent des fits stables sur λ ≤ 0.2 : nombre de points de fit ≈ 16–18 et slopes ≈ 0.14–0.15 → d_s spectrale ≈ 0.28–0.31 (cohérent avec ton résultat initial)【results/spectral_diagnostics_summary.csv】.  
- Levina–Bickel MLE donne une dimension locale ≈ 7.94 (95% ≈ [7.84, 8.03]) — très différent de la d_s spectrale mais cohérent avec une ID locale élevée pour l’embedding Takens (dim embedding = 10).  
- Diagnostic clair : le fit spectral small‑λ est appliqué sur une plage courte (≈ 16–18 points), avec une pente stable ≈ 0.15. Cela indique que la pente est estimée de façon cohérente pour λ ≤ 0.2, mais qu'elle reste faible comparée aux estimateurs locaux (LB). La discordance signifie que ces méthodes mesurent des propriétés différentes (spectral scaling vs ID locale) ou que la plage de λ choisie impose une estimation de faible pente.

---

### Verdict opérationnel rapide
- Le signe de T_log (négatif) est robuste : même en explorant les hyperparamètres et les modèles nuls (nulls), T_log reste < 0 pour toutes les configurations testées.  
- La magnitude numérique de T_log dépend fortement de l'estimateur de dimension et des hyperparamètres (emb,k,lambda_max). Avant de rapporter une valeur absolue pour d_s/T_log il faut décider quelle définition on revendique et documenter sensibilité.

---




Cellule Python — Sensibilité de la pente spectrale en fonction de lambda_max

In [15]:
# Cell: Sweep lambda_max and report slope stability across selected diagnostic subsamples
import os
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from scipy import sparse
from scipy.sparse.linalg import eigsh
from scipy import stats
import matplotlib.pyplot as plt

# Parameters (modifiable)
csv_path = 'data/sunspots_raw/Sunspots.csv'
# Column names tried in order; if none is present the last numeric column is used instead.
value_col_candidates = ['Number', 'Total Sunspot', 'Total Sunspot Number', 'Monthly Mean']
embedding_dim = 10
tau = 1
k_neighbors = 10
n_eig = 200
subsample_frac = 0.6
n_diagnostics = 20       # use the same diagnostic indices as in the previous cell
rng_seed = 42

# Grid of lambda_max values to test
lambda_max_grid = [0.05, 0.1, 0.2, 0.4, 0.8]

min_points_for_fit = 4   # relaxed threshold to explore robustness (n_points is reported)
out_dir = 'results/spectral_lambda_sensitivity'
os.makedirs(out_dir, exist_ok=True)

# Load the series and build the embedding
df0 = pd.read_csv(csv_path)
# First candidate column present in the file, or None when no candidate matches.
col = next((c for c in value_col_candidates if c in df0.columns), None)
if col is None:
    # Fallback: take the last numeric column of the file.
    numeric_cols = df0.select_dtypes(include=[np.number]).columns.tolist()
    if not numeric_cols:
        raise RuntimeError("No numeric column found in CSV.")
    col = numeric_cols[-1]
# Non-numeric entries are coerced to NaN and dropped.
series = pd.to_numeric(df0[col], errors='coerce').dropna().values

def takens_embed(x, dim, tau):
    """Build the delay-coordinate (Takens) embedding of a 1-D series.

    Row ``r`` of the result is ``(x[r], x[r + tau], ..., x[r + (dim - 1) * tau])``.

    Parameters
    ----------
    x : sequence
        The scalar series.
    dim : int
        Number of delay coordinates (columns of the result).
    tau : int
        Lag, in samples, between consecutive coordinates.

    Returns
    -------
    numpy.ndarray or None
        Float array of shape ``(len(x) - (dim - 1) * tau, dim)``, or ``None``
        when the series is too short to produce a single row.
    """
    n_vectors = len(x) - tau * (dim - 1)
    if n_vectors < 1:
        return None
    delayed = np.empty((n_vectors, dim))
    for coord in range(dim):
        start = coord * tau
        # Column `coord` is the series shifted by coord * tau samples.
        delayed[:, coord] = x[start:start + n_vectors]
    return delayed

# Embed the series; takens_embed returns None when the series is too short for the
# requested embedding_dim/tau, which is turned into a hard error here.
X_full = takens_embed(series, embedding_dim, tau)
if X_full is None:
    raise RuntimeError("Embedding too short for given embedding_dim/tau.")
# Number of embedded points (rows of X_full); subsamples are drawn from these rows.
n_nodes = X_full.shape[0]
print(f"Embedding nodes={n_nodes}, emb_dim={embedding_dim}")

def build_laplacian_eigs(X_points, k_neighbors, n_eig):
    """Return the smallest eigenvalues of the normalized Laplacian of a kNN graph.

    The graph is unweighted and symmetric: nodes i and j are linked when either
    one is among the other's nearest neighbours. The Laplacian is
    ``I - D^{-1/2} A D^{-1/2}``.

    Parameters
    ----------
    X_points : numpy.ndarray of shape (n_points, n_features)
        Points to connect; one graph node per row.
    k_neighbors : int
        Neighbours requested per node (effectively capped at ``n_points - 1``).
    n_eig : int
        Number of eigenvalues requested (capped at ``n_points - 1``).

    Returns
    -------
    numpy.ndarray or None
        Eigenvalues in ascending order, or ``None`` when both the sparse solver
        and the dense fallback raised.
    """
    n_nodes_local = X_points.shape[0]
    nbrs = NearestNeighbors(n_neighbors=min(k_neighbors + 1, n_nodes_local), algorithm='auto').fit(X_points)
    _, indices = nbrs.kneighbors(X_points)

    # Remove each node from its own neighbour list. The query point is usually in
    # column 0, but with exact duplicate rows another zero-distance point can take
    # that slot, so the self entry is located explicitly instead of assuming its
    # position. If a node is absent from its own list (more duplicates than
    # requested neighbours) the farthest entry is dropped so every node keeps the
    # same number of neighbours.
    node_ids = np.arange(n_nodes_local)
    is_self = indices == node_ids[:, None]
    is_self[~is_self.any(axis=1), -1] = True
    keep = ~is_self
    src = np.broadcast_to(node_ids[:, None], indices.shape)[keep]
    dst = indices[keep]

    # Insert every edge in both directions; converting COO to CSR sums duplicate
    # entries (mutual neighbours), so the stored values are reset to 1 afterwards.
    adj = sparse.coo_matrix(
        (np.ones(2 * src.size, dtype=np.float32),
         (np.concatenate([src, dst]), np.concatenate([dst, src]))),
        shape=(n_nodes_local, n_nodes_local),
    ).tocsr()
    adj.data[:] = 1.0

    deg = np.array(adj.sum(axis=1)).flatten()
    deg[deg == 0] = 1.0  # isolated nodes: avoid dividing by zero below
    D_inv_sqrt = sparse.diags(1.0 / np.sqrt(deg))
    identity = sparse.identity(n_nodes_local, format='csr')
    L_norm = identity - D_inv_sqrt @ adj @ D_inv_sqrt
    n_eig_local = min(n_eig, n_nodes_local - 1)
    try:
        eigvals, _ = eigsh(L_norm, k=n_eig_local, which='SM', tol=1e-6, maxiter=5000)
    except Exception:
        # Best-effort fallback to a dense solver (only practical for small matrices).
        try:
            from scipy.linalg import eigh
            Ld = L_norm.toarray()
            eigvals_all = eigh(Ld, eigvals_only=True)
            eigvals = np.sort(eigvals_all)[:n_eig_local]
        except Exception as e:
            print("Eigen decomposition failed:", e)
            return None
    return np.sort(eigvals)

def spectral_counting(eigvals):
    """Compute the eigenvalue counting function N(lambda) over the non-zero spectrum.

    Eigenvalues not exceeding ``1e-12`` are treated as zero modes and ignored.
    The input does not need to be sorted.

    Parameters
    ----------
    eigvals : array-like
        Eigenvalues, in any order.

    Returns
    -------
    lam_vals : numpy.ndarray
        Distinct retained eigenvalues, ascending.
    N_vals : numpy.ndarray
        ``N_vals[i]`` is the number of retained eigenvalues ``<= lam_vals[i]``.
    """
    eps = 1e-12
    eigvals = np.asarray(eigvals)
    lams = eigvals[eigvals > eps]
    # Cumulative multiplicities give the count of eigenvalues <= each distinct
    # value without relying on the input order.
    lam_vals, multiplicity = np.unique(lams, return_counts=True)
    N_vals = np.cumsum(multiplicity)
    return lam_vals, N_vals

def fit_loglog(lam_fit, N_fit):
    """Least-squares line through ``(log(lam_fit), log(N_fit))``.

    Returns the tuple ``(slope, intercept, r_value, p_value, stderr)`` taken
    from ``scipy.stats.linregress``.
    """
    regression = stats.linregress(np.log(lam_fit), np.log(N_fit))
    return (
        regression.slope,
        regression.intercept,
        regression.rvalue,
        regression.pvalue,
        regression.stderr,
    )

# Deterministic: recreate the same diagnostic indices (same seed as the previous cell).
# The size is capped at n_nodes because rng.choice(..., replace=False) raises
# ValueError when asked for more items than exist, which the uncapped max(100, ...)
# did for embeddings with fewer than 100 points. For n_nodes >= 100 the size, and
# therefore the drawn indices, are unchanged.
rng = np.random.default_rng(rng_seed)
subsample_size = min(n_nodes, max(100, int(np.floor(subsample_frac * n_nodes))))
indices_list = [rng.choice(np.arange(n_nodes), size=subsample_size, replace=False)
                for _ in range(n_diagnostics)]

rows = []
for i, idx in enumerate(indices_list, start=1):
    X_sub = X_full[idx, :]
    # The spectrum is computed once per subsample and reused for every lambda_max.
    eigvals = build_laplacian_eigs(X_sub, k_neighbors, n_eig)
    if eigvals is None:
        print(f"sample {i}: eig failed")
        continue
    lam_vals, N_vals = spectral_counting(eigvals)
    for lm in lambda_max_grid:
        mask = lam_vals <= lm
        lam_fit = lam_vals[mask]
        N_fit = N_vals[mask]
        n_points = int(len(lam_fit))
        fit_ok = n_points >= min_points_for_fit
        # Fit outputs stay NaN when the window holds too few points.
        slope = intercept = r_value = p_value = stderr = np.nan
        if fit_ok:
            slope, intercept, r_value, p_value, stderr = fit_loglog(lam_fit, N_fit)
        rows.append({
            'diag': i,
            'n_nodes_sub': X_sub.shape[0],
            'lambda_max': lm,
            'n_points_fit': n_points,
            'fit_ok': bool(fit_ok),
            'slope': float(slope),
            'stderr_slope': float(stderr),
            'r_value': float(r_value),
            'd_s_est': float(2.0 * slope)
        })
    print(f"Processed diag {i}: total lam unique={len(lam_vals)}")

df_out = pd.DataFrame(rows)
df_out.to_csv(f"{out_dir}/lambda_max_sweep_summary.csv", index=False)

# Aggregate: median and IQR of slope/d_s by lambda_max
agg = df_out.groupby('lambda_max').agg(
    n_samples=('diag', 'count'),
    med_slope=('slope', 'median'),
    med_d_s=('d_s_est', 'median'),
    slope_iqr_lower=('slope', lambda x: np.quantile(x.dropna(), 0.25) if x.dropna().size>0 else np.nan),
    slope_iqr_upper=('slope', lambda x: np.quantile(x.dropna(), 0.75) if x.dropna().size>0 else np.nan),
    med_n_points=('n_points_fit','median')
).reset_index()
agg.to_csv(f"{out_dir}/lambda_max_sweep_aggregate.csv", index=False)

# Plot median d_s vs lambda_max with IQR ribbon
plt.figure(figsize=(6,3.5))
xs = agg['lambda_max'].values
ys = agg['med_d_s'].values
# The quartiles are stored for the slope; doubling them puts the ribbon on the d_s scale.
ylo = agg['slope_iqr_lower'].values * 2
yhi = agg['slope_iqr_upper'].values * 2
plt.plot(xs, ys, '-o', color='darkblue', label='median d_s (2*slope)')
plt.fill_between(xs, ylo, yhi, color='lightblue', alpha=0.4, label='IQR (2*slope)')
plt.xscale('log')
plt.xlabel('lambda_max (log scale)')
plt.ylabel('d_s estimate (median and IQR)')
plt.title('Sensitivity of spectral slope to lambda_max')
plt.legend()
plt.grid(alpha=0.3, which='both')
plt.tight_layout()
plt.savefig(f"{out_dir}/lambda_max_sensitivity_plot.png", dpi=150)
plt.close()

print("Saved sweep summary:", f"{out_dir}/lambda_max_sweep_summary.csv")
print("Saved aggregate:", f"{out_dir}/lambda_max_sweep_aggregate.csv")
print("Saved plot:", f"{out_dir}/lambda_max_sensitivity_plot.png")


Embedding nodes=3256, emb_dim=10
Processed diag 1: total lam unique=200
Processed diag 2: total lam unique=200
Processed diag 3: total lam unique=200
Processed diag 4: total lam unique=200
Processed diag 5: total lam unique=200
Processed diag 6: total lam unique=200
Processed diag 7: total lam unique=200
Processed diag 8: total lam unique=200
Processed diag 9: total lam unique=200
Processed diag 10: total lam unique=200
Processed diag 11: total lam unique=200
Processed diag 12: total lam unique=200
Processed diag 13: total lam unique=200
Processed diag 14: total lam unique=200
Processed diag 15: total lam unique=200
Processed diag 16: total lam unique=200
Processed diag 17: total lam unique=200
Processed diag 18: total lam unique=200
Processed diag 19: total lam unique=200
Processed diag 20: total lam unique=200
Saved sweep summary: results/spectral_lambda_sensitivity/lambda_max_sweep_summary.csv
Saved aggregate: results/spectral_lambda_sensitivity/lambda_max_sweep_aggregate.csv
Saved 

### Bref constat (5 lignes)
- La pente spectrale dépend fortement de la plage de fit : lambda_max = 0.05 → médiane d_s ≈ 0.16; 0.1 → ≈ 0.21; 0.2 → ≈ 0.30; 0.4 → ≈ 0.51; 0.8 → ≈ 0.87 (médianes sur l'ensemble des diagnostics).  
- Pour la plage small‑λ (≤ 0.2) la pente est stable sur les sous‑échantillons (n_points_fit ≈ 16–18, slope ≈ 0.14–0.16 → d_s ≈ 0.28–0.32).  
- Levina–Bickel (MLE) donne une intrinsic dim locale ≈ 7.94 (CI 7.84–8.03) sur les mêmes embeddings.  
- Interprétation clé : les deux estimateurs mesurent des propriétés différentes — le comptage spectral sur petites valeurs propres renvoie une très faible pente locale, tandis que les estimateurs locaux (LB) révèlent une haute dimension locale.  
- Conséquence pratique : le signe de T_log (négatif) reste robuste, mais la valeur numérique de d_s/T_log dépend fortement de la définition (spectral vs local) et du choix de lambda_max.



Cellule Python — Comparaison appariée Levina–Bickel vs pente spectrale (lambda_max = 0.1, 0.2, 0.4)

In [16]:
# Cell: Paired comparison between Levina-Bickel MLE and spectral slope (same bootstrap subsamples)
import os
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from scipy import sparse
from scipy.sparse.linalg import eigsh
from scipy import stats
import matplotlib.pyplot as plt
from numpy.random import default_rng

# Parameters (modifiable)
csv_path = 'data/sunspots_raw/Sunspots.csv'
levina_samples_path = 'results/levina_bickel_boot_samples.csv'   # produced by an earlier step
out_dir = 'results/paired_levina_spectral'
embedding_dim = 10
tau = 1
k_neighbors = 10
n_eig = 200
subsample_frac = 0.6
rng_seed = 42
lambda_max_list = [0.1, 0.2, 0.4]   # fit ranges to test (paired)
min_points_for_fit = 4

os.makedirs(out_dir, exist_ok=True)

# Load Levina samples (to pair by bootstrap index)
if not os.path.exists(levina_samples_path):
    raise RuntimeError(f"Levina samples not found at {levina_samples_path}")
lev_df = pd.read_csv(levina_samples_path)
# Expect columns: b, n_nodes_sub, levina_mle (only 'b' and 'levina_mle' are checked)
if 'b' not in lev_df.columns or 'levina_mle' not in lev_df.columns:
    raise RuntimeError("Levina samples file missing required columns 'b' or 'levina_mle'")

# The largest bootstrap index in the file sets how many subsamples are regenerated below.
n_boot = int(lev_df['b'].max())
print(f"Loaded Levina samples: n_boot={n_boot}, valid={len(lev_df)}")

# Load series and build embedding
df0 = pd.read_csv(csv_path)
# Column names tried in order; if none is present the last numeric column is used instead.
value_col_candidates = ['Number', 'Total Sunspot', 'Total Sunspot Number', 'Monthly Mean']
col = next((c for c in value_col_candidates if c in df0.columns), None)
if col is None:
    numeric_cols = df0.select_dtypes(include=[np.number]).columns.tolist()
    if not numeric_cols:
        raise RuntimeError("No numeric column found in CSV.")
    col = numeric_cols[-1]
# Non-numeric entries are coerced to NaN and dropped.
series = pd.to_numeric(df0[col], errors='coerce').dropna().values

def takens_embed(x, dim, tau):
    """Build the delay-coordinate (Takens) embedding of a 1-D series.

    Row ``r`` of the result is ``(x[r], x[r + tau], ..., x[r + (dim - 1) * tau])``.

    Parameters
    ----------
    x : sequence
        The scalar series.
    dim : int
        Number of delay coordinates (columns of the result).
    tau : int
        Lag, in samples, between consecutive coordinates.

    Returns
    -------
    numpy.ndarray or None
        Float array of shape ``(len(x) - (dim - 1) * tau, dim)``, or ``None``
        when the series is too short to produce a single row.
    """
    n_vectors = len(x) - tau * (dim - 1)
    if n_vectors < 1:
        return None
    delayed = np.empty((n_vectors, dim))
    for coord in range(dim):
        start = coord * tau
        # Column `coord` is the series shifted by coord * tau samples.
        delayed[:, coord] = x[start:start + n_vectors]
    return delayed

# Embed the series; takens_embed returns None when the series is too short for the
# requested embedding_dim/tau, which is turned into a hard error here.
X_full = takens_embed(series, embedding_dim, tau)
if X_full is None:
    raise RuntimeError("Embedding too short for given embedding_dim/tau.")
# Number of embedded points (rows of X_full); subsamples are drawn from these rows.
n_nodes = X_full.shape[0]
print(f"Embedding: emb_dim={embedding_dim}, n_nodes={n_nodes}")

def build_laplacian_eigs(X_points, k_neighbors, n_eig):
    """Return the smallest eigenvalues of the normalized Laplacian of a kNN graph.

    The graph is unweighted and symmetric: nodes i and j are linked when either
    one is among the other's nearest neighbours. The Laplacian is
    ``I - D^{-1/2} A D^{-1/2}``.

    Parameters
    ----------
    X_points : numpy.ndarray of shape (n_points, n_features)
        Points to connect; one graph node per row.
    k_neighbors : int
        Neighbours requested per node (effectively capped at ``n_points - 1``).
    n_eig : int
        Number of eigenvalues requested (capped at ``n_points - 1``).

    Returns
    -------
    numpy.ndarray or None
        Eigenvalues in ascending order, or ``None`` when both the sparse solver
        and the dense fallback raised.
    """
    n_nodes_local = X_points.shape[0]
    nbrs = NearestNeighbors(n_neighbors=min(k_neighbors + 1, n_nodes_local), algorithm='auto').fit(X_points)
    _, indices = nbrs.kneighbors(X_points)

    # Remove each node from its own neighbour list. The query point is usually in
    # column 0, but with exact duplicate rows another zero-distance point can take
    # that slot, so the self entry is located explicitly instead of assuming its
    # position. If a node is absent from its own list (more duplicates than
    # requested neighbours) the farthest entry is dropped so every node keeps the
    # same number of neighbours.
    node_ids = np.arange(n_nodes_local)
    is_self = indices == node_ids[:, None]
    is_self[~is_self.any(axis=1), -1] = True
    keep = ~is_self
    src = np.broadcast_to(node_ids[:, None], indices.shape)[keep]
    dst = indices[keep]

    # Insert every edge in both directions; converting COO to CSR sums duplicate
    # entries (mutual neighbours), so the stored values are reset to 1 afterwards.
    adj = sparse.coo_matrix(
        (np.ones(2 * src.size, dtype=np.float32),
         (np.concatenate([src, dst]), np.concatenate([dst, src]))),
        shape=(n_nodes_local, n_nodes_local),
    ).tocsr()
    adj.data[:] = 1.0

    deg = np.array(adj.sum(axis=1)).flatten()
    deg[deg == 0] = 1.0  # isolated nodes: avoid dividing by zero below
    D_inv_sqrt = sparse.diags(1.0 / np.sqrt(deg))
    identity = sparse.identity(n_nodes_local, format='csr')
    L_norm = identity - D_inv_sqrt @ adj @ D_inv_sqrt
    n_eig_local = min(n_eig, n_nodes_local - 1)
    try:
        eigvals, _ = eigsh(L_norm, k=n_eig_local, which='SM', tol=1e-6, maxiter=5000)
    except Exception:
        # Best-effort fallback to a dense solver (only practical for small matrices).
        try:
            from scipy.linalg import eigh
            Ld = L_norm.toarray()
            eigvals_all = eigh(Ld, eigvals_only=True)
            eigvals = np.sort(eigvals_all)[:n_eig_local]
        except Exception as e:
            print("Eigen decomposition failed:", e)
            return None
    return np.sort(eigvals)

def spectral_counting(eigvals):
    """Compute the eigenvalue counting function N(lambda) over the non-zero spectrum.

    Eigenvalues not exceeding ``1e-12`` are treated as zero modes and ignored.
    The input does not need to be sorted.

    Parameters
    ----------
    eigvals : array-like
        Eigenvalues, in any order.

    Returns
    -------
    lam_vals : numpy.ndarray
        Distinct retained eigenvalues, ascending.
    N_vals : numpy.ndarray
        ``N_vals[i]`` is the number of retained eigenvalues ``<= lam_vals[i]``.
    """
    eps = 1e-12
    eigvals = np.asarray(eigvals)
    lams = eigvals[eigvals > eps]
    # Cumulative multiplicities give the count of eigenvalues <= each distinct
    # value without relying on the input order.
    lam_vals, multiplicity = np.unique(lams, return_counts=True)
    N_vals = np.cumsum(multiplicity)
    return lam_vals, N_vals

def fit_loglog(lam_fit, N_fit):
    """Least-squares line through ``(log(lam_fit), log(N_fit))``.

    Returns the tuple ``(slope, intercept, r_value, p_value, stderr)`` taken
    from ``scipy.stats.linregress``.
    """
    regression = stats.linregress(np.log(lam_fit), np.log(N_fit))
    return (
        regression.slope,
        regression.intercept,
        regression.rvalue,
        regression.pvalue,
        regression.stderr,
    )

# Recreate deterministic bootstrap indices (must match Levina run).
# NOTE(review): the pairing is only valid if the Levina-Bickel run drew its subsamples
# with the same seed, n_nodes, subsample_frac and draw order -- confirm against that run.
# The size is capped at n_nodes because rng.choice(..., replace=False) raises ValueError
# when asked for more items than exist; for n_nodes >= 100 the size is unchanged.
rng = default_rng(rng_seed)
subsample_size = min(n_nodes, max(100, int(np.floor(subsample_frac * n_nodes))))
indices_list = [rng.choice(np.arange(n_nodes), size=subsample_size, replace=False)
                for _ in range(n_boot)]

rows = []
for row in lev_df.itertuples(index=False):
    b_idx = int(row.b)
    levina_val = float(row.levina_mle) if np.isfinite(row.levina_mle) else np.nan
    # Get the indices for this bootstrap (b is 1-based in lev_df)
    if b_idx < 1 or b_idx > len(indices_list):
        print(f"Skipping b={b_idx}: index out of range")
        continue
    idx = indices_list[b_idx - 1]
    X_sub = X_full[idx, :]
    eigvals = build_laplacian_eigs(X_sub, k_neighbors, n_eig)
    if eigvals is None:
        # Keep one placeholder row per lambda_max so the failed sample stays visible.
        for lm in lambda_max_list:
            rows.append({'b': b_idx, 'levina_mle': levina_val, 'lambda_max': lm,
                         'n_points_fit': 0, 'fit_ok': False, 'slope': np.nan, 'stderr_slope': np.nan, 'd_s': np.nan})
        continue
    lam_vals, N_vals = spectral_counting(eigvals)
    for lm in lambda_max_list:
        mask = lam_vals <= lm
        lam_fit = lam_vals[mask]
        N_fit = N_vals[mask]
        n_points = int(len(lam_fit))
        fit_ok = n_points >= min_points_for_fit
        # Fit outputs stay NaN when the window holds too few points.
        slope = intercept = r_value = p_value = stderr = np.nan
        if fit_ok:
            slope, intercept, r_value, p_value, stderr = fit_loglog(lam_fit, N_fit)
        d_s = 2.0 * slope
        rows.append({
            'b': b_idx,
            'levina_mle': levina_val,
            'lambda_max': lm,
            'n_points_fit': n_points,
            'fit_ok': bool(fit_ok),
            'slope': float(slope),
            'stderr_slope': float(stderr),
            'd_s': float(d_s)
        })
    print(f"Paired sample b={b_idx}: levina={levina_val:.3f}, eigvals={len(eigvals)}, lam_unique={len(lam_vals)}")

paired_df = pd.DataFrame(rows)
paired_df.to_csv(f"{out_dir}/paired_levina_spectral_raw.csv", index=False)

# Aggregate paired comparisons and compute differences levina - spectral
summary_rows = []
for lm in lambda_max_list:
    subset = paired_df[paired_df['lambda_max'] == lm]
    # Keep only pairs where both estimates are available
    valid = subset.dropna(subset=['levina_mle','d_s'])
    n_valid = len(valid)
    if n_valid == 0:
        med_lev = med_spec = diff_med = np.nan
    else:
        med_lev = float(np.median(valid['levina_mle']))
        med_spec = float(np.median(valid['d_s']))
        # Difference of the two medians (not the median of the per-pair differences).
        diff_med = med_lev - med_spec
    summary_rows.append({
        'lambda_max': lm,
        'n_pairs': int(len(subset)),
        'n_valid_pairs': int(n_valid),
        'median_levina': med_lev,
        'median_d_s_spectral': med_spec,
        'median_diff_levina_minus_spectral': diff_med
    })

pd.DataFrame(summary_rows).to_csv(f"{out_dir}/paired_levina_spectral_summary.csv", index=False)
print("Saved paired raw:", f"{out_dir}/paired_levina_spectral_raw.csv")
print("Saved paired summary:", f"{out_dir}/paired_levina_spectral_summary.csv")

# Plots: scatter paired (levina vs d_s) for each lambda_max
for lm in lambda_max_list:
    subset = paired_df[paired_df['lambda_max'] == lm].dropna(subset=['levina_mle','d_s'])
    plt.figure(figsize=(5,4))
    plt.scatter(subset['levina_mle'], subset['d_s'], alpha=0.7, s=20)
    # Identity line spanning the joint range of both estimates
    mn = min(subset['levina_mle'].min(), subset['d_s'].min()) if not subset.empty else 0
    mx = max(subset['levina_mle'].max(), subset['d_s'].max()) if not subset.empty else 1
    plt.plot([mn,mx],[mn,mx], color='gray', linestyle='--', linewidth=1)
    plt.xlabel('Levina-Bickel MLE (m_hat)')
    plt.ylabel('Spectral d_s (2*slope)')
    plt.title(f'Paired: Levina vs spectral (lambda_max={lm})')
    plt.grid(alpha=0.3)
    plt.tight_layout()
    plt.savefig(f"{out_dir}/paired_scatter_lambda_{lm}.png", dpi=150)
    plt.close()

# Paired difference boxplots (levina - spectral) across lambda_max
plt.figure(figsize=(6,3.5))
data = []
labels = []
for lm in lambda_max_list:
    subset = paired_df[paired_df['lambda_max'] == lm].dropna(subset=['levina_mle','d_s'])
    diff = subset['levina_mle'] - subset['d_s']
    data.append(diff.values)
    labels.append(str(lm))
# Tick labels are set through xticks rather than boxplot(labels=...), which Matplotlib
# deprecated in 3.9; boxes are drawn at positions 1..len(data).
plt.boxplot(data, showfliers=False)
plt.xticks(np.arange(1, len(labels) + 1), labels)
plt.xlabel('lambda_max')
plt.ylabel('Levina - spectral d_s')
plt.title('Paired differences across lambda_max')
plt.tight_layout()
plt.savefig(f"{out_dir}/paired_diff_boxplot.png", dpi=150)
plt.close()

print("Saved paired plots in", out_dir)


Loaded Levina samples: n_boot=150, valid=150
Embedding: emb_dim=10, n_nodes=3256
Paired sample b=1: levina=7.857, eigvals=200, lam_unique=200
Paired sample b=2: levina=7.934, eigvals=200, lam_unique=200
Paired sample b=3: levina=7.910, eigvals=200, lam_unique=200
Paired sample b=4: levina=7.854, eigvals=200, lam_unique=200
Paired sample b=5: levina=7.886, eigvals=200, lam_unique=200
Paired sample b=6: levina=7.898, eigvals=200, lam_unique=200
Paired sample b=7: levina=7.937, eigvals=200, lam_unique=200
Paired sample b=8: levina=7.912, eigvals=200, lam_unique=200
Paired sample b=9: levina=8.002, eigvals=200, lam_unique=200
Paired sample b=10: levina=7.906, eigvals=200, lam_unique=200
Paired sample b=11: levina=7.906, eigvals=200, lam_unique=200
Paired sample b=12: levina=7.877, eigvals=200, lam_unique=200
Paired sample b=13: levina=7.958, eigvals=200, lam_unique=200
Paired sample b=14: levina=7.941, eigvals=200, lam_unique=200
Paired sample b=15: levina=7.957, eigvals=200, lam_unique=20

Constat principal

Les estimations sont fortement discordantes : Levina‑Bickel MLE médiane ≈ 7.94, tandis que les pentes spectrales donnent d_s médian ≈ 0.21 (λ=0.1), 0.30 (λ=0.2), 0.52 (λ=0.4). La médiane des différences (Levina − spectral) est ≈ 7.73, 7.64, 7.42 respectivement.

Les scatter plots montrent un regroupement des points vers la droite-bas : Levina élevé, spectral très bas — l’accord est absent à l’échelle absolue.

Interprétation technique rapide
Les deux estimateurs mesurent des propriétés différentes : Levina‑Bickel estime une dimension intrinsèque locale fondée sur distances de voisinage ; la pente spectrale (comptage des petites valeurs propres) capte l’échelle spectrale choisie par λ_max.

La valeur numérique de d_s issue de la pente spectrale varie fortement avec λ_max (signe d’une pente qui s’accroît quand on élargit la plage), donc la sensibilité en λ_max explique la majeure partie de l’écart.

Sur petites valeurs propres (λ ≤ 0.2) la pente spectrale reste très faible → d_s ≪ m_hat, ce qui est cohérent avec les visualisations log‑log si la plage small‑λ est presque plate.

Cellule Python — Tests appariés (Wilcoxon, t), Spearman, tailles d’effet et figures annotées

In [17]:
# Cell: Paired tests (Wilcoxon, paired t), Spearman correlation, Cohen's d for paired differences,
# and annotated scatter + boxplot figures.
import os
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt

# Results are read from and written back to the same directory.
in_dir = 'results/paired_levina_spectral'
out_dir = in_dir
os.makedirs(out_dir, exist_ok=True)

# Raw paired table; the code below reads its 'lambda_max', 'levina_mle' and 'd_s' columns.
paired_raw = os.path.join(in_dir, 'paired_levina_spectral_raw.csv')
if not os.path.exists(paired_raw):
    raise RuntimeError(f"Paired file not found: {paired_raw}")

df = pd.read_csv(paired_raw)

# Distinct lambda_max values found in the file, in ascending order.
lambda_max_list = sorted(df['lambda_max'].unique())

def cohen_d_paired(x, y):
    """Cohen's d for paired samples: ``mean(diff) / sd(diff)`` with ``diff = x - y``.

    Parameters
    ----------
    x, y : array-like of equal length
        Paired measurements; lists, tuples and NumPy arrays are accepted.
        Pairs whose difference is NaN are discarded.

    Returns
    -------
    float
        The effect size, using the sample standard deviation (``ddof=1``).
        ``nan`` when fewer than two valid pairs remain. If all differences are
        equal the standard deviation is zero and the division yields ``inf``
        or ``nan``.
    """
    # Coerce first so that plain Python sequences support element-wise subtraction.
    diffs = np.asarray(x, dtype=float) - np.asarray(y, dtype=float)
    diffs = diffs[~np.isnan(diffs)]
    if diffs.size < 2:
        return np.nan
    return float(np.mean(diffs) / np.std(diffs, ddof=1))

# One summary row (and one annotated scatter plot) per lambda_max value.
# NOTE(review): the p-values below treat the pairs as independent observations; if the
# pairs come from overlapping subsamples of one series that assumption should be checked.
summary_rows = []

for lm in lambda_max_list:
    subset = df[df['lambda_max'] == lm].dropna(subset=['levina_mle', 'd_s'])
    n_pairs = len(subset)
    if n_pairs == 0:
        # Same keys, in the same order, as the populated rows below, so the CSV
        # column layout does not depend on which lambda_max happens to be empty.
        summary_rows.append({
            'lambda_max': lm, 'n_pairs': 0,
            'median_levina': np.nan, 'median_spectral': np.nan,
            'mean_diff': np.nan, 'std_diff': np.nan,
            'wilcoxon_stat': np.nan, 'wilcoxon_p': np.nan,
            'paired_t_stat': np.nan, 'paired_t_p': np.nan,
            'spearman_rho': np.nan, 'spearman_p': np.nan,
            'cohens_d_paired': np.nan
        })
        continue

    lev = subset['levina_mle'].values
    spec = subset['d_s'].values
    diff = lev - spec

    # Descriptives
    med_lev = float(np.median(lev))
    med_spec = float(np.median(spec))
    mean_diff = float(np.mean(diff))
    std_diff = float(np.std(diff, ddof=1))

    # Each test is best-effort: a failure is recorded as NaN instead of aborting the loop.
    # Wilcoxon signed-rank test (two-sided)
    try:
        wil_res = stats.wilcoxon(lev, spec, alternative='two-sided', zero_method='wilcox')
        wil_stat, wil_p = float(wil_res.statistic), float(wil_res.pvalue)
    except Exception:
        wil_stat, wil_p = np.nan, np.nan

    # Paired t-test
    try:
        t_res = stats.ttest_rel(lev, spec, nan_policy='omit')
        t_stat, t_p = float(t_res.statistic), float(t_res.pvalue)
    except Exception:
        t_stat, t_p = np.nan, np.nan

    # Spearman correlation
    try:
        rho, rho_p = stats.spearmanr(lev, spec, nan_policy='omit')
        rho, rho_p = float(rho), float(rho_p)
    except Exception:
        rho, rho_p = np.nan, np.nan

    # Cohen's d (paired)
    d_cohen = cohen_d_paired(lev, spec)

    summary_rows.append({
        'lambda_max': lm,
        'n_pairs': int(n_pairs),
        'median_levina': med_lev,
        'median_spectral': med_spec,
        'mean_diff': mean_diff,
        'std_diff': std_diff,
        'wilcoxon_stat': wil_stat,
        'wilcoxon_p': wil_p,
        'paired_t_stat': t_stat,
        'paired_t_p': t_p,
        'spearman_rho': rho,
        'spearman_p': rho_p,
        'cohens_d_paired': d_cohen
    })

    # Annotated scatter
    plt.figure(figsize=(5,4))
    plt.scatter(lev, spec, alpha=0.7, s=18)
    # Identity line spanning the joint range of both estimates
    mn = min(np.min(lev), np.min(spec))
    mx = max(np.max(lev), np.max(spec))
    plt.plot([mn, mx], [mn, mx], color='gray', linestyle='--', linewidth=1)
    plt.xlabel('Levina-Bickel MLE (m_hat)')
    plt.ylabel('Spectral d_s (2*slope)')
    plt.title(f'Levina vs spectral (lambda_max={lm})')
    annotation = (
        f"n={n_pairs}\nmedian_lev={med_lev:.3f}\nmedian_spec={med_spec:.3f}\n"
        f"mean_diff={mean_diff:.3f} ± {std_diff:.3f}\n"
        f"Wilcoxon p={wil_p:.2e}\npaired t p={t_p:.2e}\nSpearman rho={rho:.2f} (p={rho_p:.2e})\n"
        f"Cohen d_paired={d_cohen:.2f}"
    )
    plt.gca().text(0.02, 0.98, annotation, transform=plt.gca().transAxes,
                   fontsize=8, va='top', ha='left',
                   bbox=dict(facecolor='white', alpha=0.85, edgecolor='none'))
    plt.grid(alpha=0.25)
    plt.tight_layout()
    plt.savefig(os.path.join(out_dir, f'paired_scatter_lambda_{lm}_annotated_tests.png'), dpi=150)
    plt.close()

# Save summary CSV
summary_df = pd.DataFrame(summary_rows)
summary_df.to_csv(os.path.join(out_dir, 'paired_levina_spectral_tests_summary.csv'), index=False)

# Boxplot of paired differences with Wilcoxon p-values annotated
plt.figure(figsize=(6,3.5))
data = []
labels = []
p_texts = []
for lm in lambda_max_list:
    subset = df[df['lambda_max'] == lm].dropna(subset=['levina_mle', 'd_s'])
    diff = (subset['levina_mle'] - subset['d_s']).values
    data.append(diff)
    labels.append(str(lm))
    row = summary_df[summary_df['lambda_max'] == lm]
    pval = float(row['wilcoxon_p'].values[0]) if not row.empty else np.nan
    p_texts.append(f"p={pval:.1e}" if np.isfinite(pval) else "p=NA")

# Tick labels are set through xticks rather than boxplot(labels=...), which Matplotlib
# deprecated in 3.9; boxes are drawn at positions 1..len(data).
plt.boxplot(data, showfliers=False)
plt.xticks(np.arange(1, len(labels) + 1), labels)
ymax = plt.ylim()[1]
# One p-value label just below the top of the axes, above each box.
for xi, txt in enumerate(p_texts, start=1):
    plt.text(xi, ymax*0.98, txt, ha='center', va='top', fontsize=8)
plt.xlabel('lambda_max')
plt.ylabel('Levina - spectral d_s')
plt.title('Paired differences (Levina - spectral) with Wilcoxon p-values')
plt.tight_layout()
plt.savefig(os.path.join(out_dir, 'paired_diff_boxplot_annotated_tests.png'), dpi=150)
plt.close()

print("Saved tests summary:", os.path.join(out_dir, 'paired_levina_spectral_tests_summary.csv'))
print("Saved annotated scatter plots and annotated boxplot in", out_dir)


Saved tests summary: results/paired_levina_spectral\paired_levina_spectral_tests_summary.csv
Saved annotated scatter plots and annotated boxplot in results/paired_levina_spectral
