In [61]:
# Self-contained cell: download Sunspots (Kaggle) -> Takens embedding -> k-NN graph
# -> normalized Laplacian -> eigenvalues -> d_s (Theil-Sen) -> bootstrap -> T_log
#
# Path is imported here because the configuration below builds Path objects and
# the main import section of this cell only comes after it; without this line
# the cell fails with NameError on a fresh kernel.
from pathlib import Path

# CONFIGURATION (edit as needed)
use_kaggle = True                        # True to download from Kaggle (requires ~/.kaggle/kaggle.json or ./kaggle.json)
kaggle_dataset = "robervalt/sunspots"    # Kaggle dataset to download
download_path = Path("data/sunspots_kaggle")
extract_dir = download_path / "extracted"
results_dir = Path("results/sunspots_external")
results_dir.mkdir(parents=True, exist_ok=True)

# Embedding / graph params
embedding_dim = 10                        # number of delay coordinates per embedded point
tau = 1                                   # delay (in samples) between coordinates
k_neighbors = 10                          # neighbours per node in the k-NN graph
n_eig = 200                               # number of eigenvalues to extract
lambda_max = 0.2                          # only eigenvalues <= lambda_max enter the Theil-Sen fit
n_boot = 200                              # number of bootstrap resamples
subsample_frac = 0.8                      # resample size as a fraction of the positive eigenvalues
random_seed = 42                          # seed for the global and bootstrap RNGs

# ---------- imports ----------
from pathlib import Path
import os, json, zipfile, math
import numpy as np, pandas as pd
import scipy.sparse as sp
from sklearn.neighbors import kneighbors_graph
from sklearn.linear_model import TheilSenRegressor
from sklearn.metrics import pairwise_distances
import networkx as nx
import matplotlib.pyplot as plt
import random
# Seed NumPy's global generator, then Python's, with the configured seed.
np.random.seed(random_seed)
random.seed(random_seed)

def log_event(level, msg):
    """Print ``msg`` to stdout, prefixed with the upper-cased ``level`` in brackets."""
    print("[{}] {}".format(level.upper(), msg))

# ---------- Kaggle download helper ----------
def find_kaggle_config():
    """Return the contents of the first usable ``kaggle.json``, or ``None``.

    Two locations are tried in order: ``~/.kaggle/kaggle.json`` and
    ``kaggle.json`` in the current working directory. A candidate is skipped
    when it cannot be opened, is not valid JSON, or does not hold a JSON
    object (the caller reads it with ``.get``, so only a dict is usable).

    Returns:
        The parsed dict, or ``None`` when no candidate is usable.
    """
    locations = [
        os.path.join(os.path.expanduser('~'), '.kaggle', 'kaggle.json'),
        os.path.join(os.getcwd(), 'kaggle.json'),
    ]
    for loc in locations:
        try:
            with open(loc, 'r', encoding='utf-8') as f:
                cfg = json.load(f)
        except (OSError, ValueError):
            # Missing/unreadable file, or malformed JSON / bad encoding
            # (json.JSONDecodeError and UnicodeDecodeError are ValueErrors).
            continue
        if isinstance(cfg, dict):
            return cfg
    return None

def kaggle_download(dataset, dst):
    """Download a Kaggle dataset archive into ``dst`` and return the ZIP path.

    Credentials are read with ``find_kaggle_config`` and exported as
    ``KAGGLE_USERNAME`` / ``KAGGLE_KEY`` *before* the ``kaggle`` package is
    imported.

    Args:
        dataset: Kaggle dataset slug, e.g. ``"owner/name"``.
        dst: ``pathlib.Path`` of the download directory (created if missing).

    Returns:
        Path of the most recently modified ``.zip`` file found in ``dst``.

    Raises:
        FileNotFoundError: no usable ``kaggle.json``, or no ZIP after download.
        ValueError: ``kaggle.json`` lacks a ``username`` or ``key`` entry.
        RuntimeError: the ``kaggle`` package is not installed.
    """
    cfg = find_kaggle_config()
    if not cfg:
        raise FileNotFoundError("kaggle.json not trouvé; place-le dans ~/.kaggle/ ou le répertoire courant.")
    username, key = cfg.get('username'), cfg.get('key')
    if not username or not key:
        # os.environ only accepts strings; a missing entry would otherwise
        # surface as an opaque TypeError.
        raise ValueError("kaggle.json must define non-empty 'username' and 'key' entries.")
    os.environ['KAGGLE_USERNAME'] = str(username)
    os.environ['KAGGLE_KEY'] = str(key)
    # NOTE(review): the kaggle package appears to authenticate as a side effect
    # of being imported, so the variables are set first — confirm against the
    # installed version. Only ImportError is translated, so that other
    # import-time failures are not misreported as "package required".
    try:
        import kaggle
    except ImportError as e:
        raise RuntimeError("kaggle package required: pip install kaggle") from e
    kaggle.api.authenticate()
    dst.mkdir(parents=True, exist_ok=True)
    kaggle.api.dataset_download_files(dataset, path=str(dst), unzip=False)
    zips = [p for p in dst.iterdir() if p.suffix == '.zip']
    if not zips:
        raise FileNotFoundError("Aucun ZIP téléchargé dans " + str(dst))
    # The directory may hold archives from earlier runs; take the newest one
    # rather than whichever iterdir() happens to list first.
    return max(zips, key=lambda p: p.stat().st_mtime)

# ---------- Step 1: download and extract ----------
# Outcome of this step: `chosen`, the path of the CSV to analyse.
if not use_kaggle:
    # Offline mode: take the first CSV found under data/ or external/.
    local_candidates = [*Path("data").rglob("*.csv"), *Path("external").rglob("*.csv")]
    if len(local_candidates) == 0:
        raise FileNotFoundError("Aucun CSV local trouvé. Active use_kaggle or place CSV in data/ or external/.")
    chosen = local_candidates[0]
else:
    log_event("info", f"Downloading {kaggle_dataset} to {download_path}")
    zip_path = kaggle_download(kaggle_dataset, download_path)
    log_event("info", f"Downloaded: {zip_path}")
    extract_dir.mkdir(parents=True, exist_ok=True)
    with zipfile.ZipFile(zip_path, 'r') as zf:
        zf.extractall(extract_dir)
    log_event("info", f"Extracted into: {extract_dir}")
    candidates = list(extract_dir.rglob("*.csv"))
    if not candidates:
        raise FileNotFoundError("Aucun CSV trouvé dans l'archive extraite.")
    # Prefer a file whose name contains "sun" (case-insensitive); otherwise
    # fall back to the first CSV of the archive.
    chosen = next((p for p in candidates if 'sun' in p.name.lower()), candidates[0])

log_event("info", f"Using CSV: {chosen}")

# ---------- Step 2: load sunspots series ----------
df = pd.read_csv(chosen)
cols = df.columns.tolist()

# Column selection: first column whose name contains "sun" (case-insensitive);
# otherwise the first column holding at least one numeric value.
col_candidates = [c for c in cols if 'sun' in str(c).lower()]
series_col = col_candidates[0] if col_candidates else None
if series_col is None:
    for c in cols:
        try:
            arr = pd.to_numeric(df[c], errors="coerce")
        except (TypeError, ValueError):
            continue
        if arr.notna().sum() > 0:
            series_col = c
            break
if series_col is None:
    # Previously this case left series_col unbound and failed further down
    # with a NameError that did not mention the CSV.
    raise ValueError(f"No numeric column found in {chosen}; cannot build the series.")

# Non-numeric entries become NaN and are dropped.
series = pd.to_numeric(df[series_col], errors="coerce").dropna().values
n_system = len(series)
log_event("info", f"Loaded series column '{series_col}' with n={n_system}")

# ---------- Step 3: Takens embedding ----------
def takens_embed(x, emb_dim, tau):
    """Build the delay-coordinate (Takens) embedding of a 1-D sequence.

    Row ``j`` of the result is ``(x[j], x[j + tau], ..., x[j + (emb_dim-1)*tau])``.

    Args:
        x: sequence supporting ``len`` and slicing.
        emb_dim: number of columns (delay coordinates).
        tau: delay, in samples, between consecutive columns.

    Returns:
        Array of shape ``(len(x) - (emb_dim - 1) * tau, emb_dim)``.

    Raises:
        ValueError: if the series is too short to yield at least one row.
    """
    n_rows = len(x) - (emb_dim - 1) * tau
    if n_rows <= 0:
        raise ValueError("Time series too short for embedding with given dim/tau")
    embedded = np.empty((n_rows, emb_dim))
    for col in range(emb_dim):
        start = col * tau
        # Each column is the series shifted by `col` delays.
        embedded[:, col] = x[start:start + n_rows]
    return embedded

X = takens_embed(series, embedding_dim, tau)
log_event("info", f"Embedding shape: {X.shape}")

# ---------- Step 4: construct k-NN graph (symmetric unweighted) ----------
# kneighbors_graph returns a sparse directed connectivity matrix.
A = kneighbors_graph(
    X,
    n_neighbors=k_neighbors,
    mode='connectivity',
    include_self=False,
    n_jobs=1,
)
# Symmetrize: keep an edge if it exists in either direction, with 0/1 weights.
A = sp.csr_matrix((0.5 * (A + A.T) > 0).astype(int))
log_event("info", f"k-NN adjacency shape: {A.shape}, nnz={A.nnz}")

# ---------- Step 5: normalized Laplacian and eigenvalues ----------
# Node degrees = row sums of the adjacency matrix.
deg = np.array(A.sum(axis=1)).flatten()
# Isolated nodes get degree 1 so that 1/sqrt(deg) stays finite.
deg[deg == 0] = 1.0
D_inv_sqrt = sp.diags(1.0 / np.sqrt(deg))
# Normalized Laplacian: I - D^{-1/2} A D^{-1/2}
L = sp.eye(A.shape[0]) - D_inv_sqrt @ A @ D_inv_sqrt
log_event("info", "Constructed normalized Laplacian")

# Smallest eigenvalues via scipy.sparse.linalg.eigsh.
from scipy.sparse.linalg import eigsh
n_nodes = L.shape[0]
if n_nodes > 3:
    k_eig = min(n_eig, n_nodes - 2)
else:
    k_eig = min(1, n_nodes - 1)
if k_eig < 1:
    raise ValueError("Graph too small for eigen decomposition with requested n_eig")
# NOTE(review): which='SM' is kept as in the original run; shift-invert mode
# (sigma=...) is the usual alternative if convergence becomes a problem.
vals, vecs = eigsh(L.asfptype(), k=k_eig, which='SM', tol=1e-6, maxiter=5000)
vals = np.sort(np.real(vals))
log_event("info", f"Computed {len(vals)} eigenvalues (smallest)")

# Persist the sorted eigenvalues in a single "lambda" column.
eig_fp = results_dir / "laplacian_eigenvalues.csv"
pd.DataFrame({"lambda": vals}).to_csv(eig_fp, index=False)
log_event("info", f"Wrote eigenvalues to {eig_fp}")

# ---------- Step 6: estimate d_s using Theil-Sen on N(lambda) <= lambda_max ----------
# Strictly positive eigenvalues (ascending) and their ranks 1..n as the counting function.
lam = vals[vals > 0]
Nlam = np.arange(1, len(lam) + 1)

# Fit window: eigenvalues not exceeding lambda_max.
mask = lam <= lambda_max
if mask.sum() >= 3:
    # Slope of log N versus log lambda; d_s is twice that slope.
    x = np.log(lam[mask]).reshape(-1, 1)
    y = np.log(Nlam[mask])
    reg = TheilSenRegressor(random_state=random_seed).fit(x, y)
    slope = float(reg.coef_[0])
    intercept = float(reg.intercept_)
    d_s_point = 2.0 * slope
    log_event("info", f"Point estimate d_s = {d_s_point:.6f} using {mask.sum()} points (lambda<= {lambda_max})")
else:
    # d_s_point is deliberately not assigned here; step 8 checks for its existence.
    log_event("warn", f"Only {mask.sum()} eigenvalues with lambda <= {lambda_max}; Theil-Sen fit requires >=3 points")

# ---------- Step 7: bootstrap d_s by subsampling eigenvalues with replacement ----------
def estimate_ds_from_eigs(eigs_arr, lambda_max_local=lambda_max):
    """Estimate d_s from an array of eigenvalues.

    Positive eigenvalues are ranked 1..n in the order given (callers pass them
    sorted ascending); a Theil-Sen line is fitted to log(rank) versus
    log(lambda) over the entries not exceeding ``lambda_max_local``.

    Args:
        eigs_arr: NumPy array of eigenvalues.
        lambda_max_local: upper bound of the fit window; defaults to the value
            of ``lambda_max`` at the time this function is defined.

    Returns:
        ``2 * slope`` as a float, or ``None`` when fewer than 3 points fall in
        the fit window.
    """
    positive = eigs_arr[eigs_arr > 0]
    in_window = positive <= lambda_max_local
    if in_window.sum() < 3:
        return None
    ranks = np.arange(1, len(positive) + 1)
    log_lam = np.log(positive[in_window]).reshape(-1, 1)
    log_rank = np.log(ranks[in_window])
    fit = TheilSenRegressor(random_state=None).fit(log_lam, log_rank)
    return 2.0 * float(fit.coef_[0])

# Dedicated generator so the resampling is reproducible from random_seed.
rng = np.random.RandomState(random_seed)
ds_samples = []
n_e = len(lam)
# Resample size does not change between iterations; at least 3 points are
# drawn because the fit needs 3.
boot_size = max(3, int(math.floor(n_e * subsample_frac)))
if n_e == 0:
    # rng.choice(0, ...) raises; skip so the "no samples retained" path
    # further down is reached instead of a crash.
    log_event("warn", "No positive eigenvalues available; bootstrap skipped")
else:
    for i in range(n_boot):
        # Draw eigenvalues with replacement, re-sort, and re-estimate d_s.
        idx = rng.choice(n_e, size=boot_size, replace=True)
        samp = np.sort(lam[idx])
        est_ds = estimate_ds_from_eigs(samp)
        if est_ds is not None:
            ds_samples.append(est_ds)
ds_samples = np.array(ds_samples)
log_event("info", f"Bootstrap complete, retained samples: {len(ds_samples)} out of {n_boot}")

# ---------- Step 8: propagate to T_log and save results ----------
if len(ds_samples) == 0:
    log_event("warn", "No bootstrap d_s samples retained. Check lambda_max or eigen-spectrum quality.")
else:
    # Median and 2.5% / 97.5% quantiles of the bootstrap d_s samples.
    ds_med = float(np.median(ds_samples))
    ds_q025, ds_q975 = np.quantile(ds_samples, [0.025, 0.975])
    # T_log = (d_s - 4) * ln(n_system), computed per bootstrap sample.
    Tlog_samples = (ds_samples - 4.0) * math.log(n_system)
    Tlog_med = float(np.median(Tlog_samples))
    Tlog_q025, Tlog_q975 = np.quantile(Tlog_samples, [0.025, 0.975])

    # One-row summary; d_s_point only exists if the step-6 fit ran.
    out_summary = {
        "n_system": int(n_system),
        "d_s_point": float(d_s_point) if 'd_s_point' in locals() else None,
        "d_s_boot_median": ds_med,
        "d_s_boot_q025": float(ds_q025),
        "d_s_boot_q975": float(ds_q975),
        "Tlog_median": Tlog_med,
        "Tlog_q025": float(Tlog_q025),
        "Tlog_q975": float(Tlog_q975),
        "n_boot_retained": int(len(ds_samples)),
    }
    pd.DataFrame([out_summary]).to_csv(results_dir / "external_Tlog_summary.csv", index=False)
    pd.DataFrame({"d_s": ds_samples}).to_csv(results_dir / "external_ds_boot.csv", index=False)
    pd.DataFrame({"T_log": Tlog_samples}).to_csv(results_dir / "external_Tlog_boot.csv", index=False)

    # Histogram of the propagated T_log samples with the median marked.
    plt.figure(figsize=(6, 3.5))
    plt.hist(Tlog_samples, bins=30, color='C0', alpha=0.85)
    plt.axvline(Tlog_med, color='k', linestyle='--', label=f"median {Tlog_med:.2f}")
    plt.title("Bootstrap T_log distribution (propagated from d_s)")
    plt.xlabel("T_log")
    plt.ylabel("count")
    plt.legend()
    hist_fp = results_dir / "external_Tlog_hist.png"
    plt.tight_layout()
    plt.savefig(hist_fp, dpi=150)
    plt.close()

    log_event("info", f"Summary and plots written to {results_dir}")
    print("\nRESULTS SUMMARY:")
    for key, value in out_summary.items():
        print(f"- {key}: {value}")

# ---------- Optional: quick null-model (temporal shuffle) basic check ----------
# Shuffle the original series, rebuild embedding -> k-NN graph -> normalized
# Laplacian on the shuffled copy, and compute a single d_s estimate per run
# (no bootstrap, to limit runtime). Any failure is logged as a warning only.
try:
    null_runs = 1
    null_ds = []
    for rep in range(null_runs):
        # In-place shuffle of a copy, using the bootstrap generator.
        s_shuf = np.copy(series)
        rng.shuffle(s_shuf)
        Xn = takens_embed(s_shuf, embedding_dim, tau)

        # Same graph construction as the main run: symmetric 0/1 adjacency.
        An = kneighbors_graph(Xn, n_neighbors=k_neighbors, mode='connectivity', include_self=False)
        An = 0.5 * (An + An.T)
        An = (An > 0).astype(int)
        An = sp.csr_matrix(An)

        degn = np.array(An.sum(axis=1)).flatten()
        degn[degn == 0] = 1.0
        D_inv_sqrtn = sp.diags(1.0 / np.sqrt(degn))
        Ln = sp.eye(An.shape[0]) - D_inv_sqrtn @ An @ D_inv_sqrtn

        n_null_eigs = min(k_eig, Ln.shape[0] - 1)
        valsn, _ = eigsh(Ln.asfptype(), k=n_null_eigs, which='SM')
        valsn = np.sort(np.real(valsn))

        estn = estimate_ds_from_eigs(valsn)
        if estn is not None:
            null_ds.append(estn)

    if not null_ds:
        print("\nNull-model quick check produced no valid d_s (insufficient small-lambda points).")
    else:
        print("\nNull-model (temporal shuffle) quick check: d_s_null =", null_ds)
except Exception as e:
    log_event("warn", f"Null-model quick check failed: {e}")

# End of cell


[INFO] Downloading robervalt/sunspots to data\sunspots_kaggle
Dataset URL: https://www.kaggle.com/datasets/robervalt/sunspots
[INFO] Downloaded: data\sunspots_kaggle\sunspots.zip
[INFO] Extracted into: data\sunspots_kaggle\extracted
[INFO] Using CSV: data\sunspots_kaggle\extracted\Sunspots.csv
[INFO] Loaded series column 'Monthly Mean Total Sunspot Number' with n=3265
[INFO] Embedding shape: (3256, 10)
[INFO] k-NN adjacency shape: (3256, 3256), nnz=48972
[INFO] Constructed normalized Laplacian
[INFO] Computed 200 eigenvalues (smallest)
[INFO] Wrote eigenvalues to results\sunspots_external\laplacian_eigenvalues.csv
[INFO] Point estimate d_s = 1.999479 using 19 points (lambda<= 0.2)
[INFO] Bootstrap complete, retained samples: 200 out of 200
[INFO] Summary and plots written to results\sunspots_external

RESULTS SUMMARY:
- n_system: 3265
- d_s_point: 1.9994790122804786
- d_s_boot_median: 2.050927876374394
- d_s_boot_q025: 1.0731509417247878
- d_s_boot_q975: 4.422765618088611
- Tlog_median

Interprétation rapide des résultats

Estimation ponctuelle : d_s ≈ 2.00 (ajusté sur 19 points λ ≤ 0.2).

Bootstrap : médiane d_s ≈ 2.05, mais intervalle 95% très large [≈1.07, 4.42].

Propagation T_log (n=3265) : médiane T_log ≈ −15.77 (donc régime Divergence) mais CI 95% pour T_log inclut des valeurs > 0 (≈ [−23.68, +3.42]) à cause de la queue haute de la distribution de d_s.

Null-model rapide (temporal shuffle) renvoie un d_s très élevé (~15) — signe qu’au moins un des éléments suivants se produit : (a) le shuffle a changé complètement la structure de l’embedding + graphe (attendu), (b) l’ajustement Theil‑Sen a extrapolé sur très peu de points ou sur valeurs atypiques, ou (c) une erreur de paramétrage pour le null-run (k_eig, taille du graphe) a produit un spectre avec peu de petites λ valides mais des formes qui mènent à pentes extrêmes.

Ce que ça signifie pour ton équation T_log

Confirmation locale : la médiane T_log négative soutient l’hypothèse (pour Sunspots) que le système est dans le régime Divergence.

Incertitude : la large dispersion bootstrap et la CI de T_log traversant zéro indiquent qu’il faut investiguer les sources d’instabilité statistique (points influents, outliers bootstrap, choix de λ_max, sensibilité à embedding/k).

Null-model utile : le shuffle radicalise la structure — utile pour test directionnel, mais il faut des nulls construits strictement comparables (mêmes étapes embedding→graph→Laplacien) et plusieurs réplications.


 Je fournis une cellule autonome unique qui :

génère des diagnostics clairs pour cette exécution Sunspots :

plot du spectre (lambda vs index) et N(lambda) log-log avec la zone λ ≤ 0.2 marquée,

scatter log N vs log λ avec la droite Theil‑Sen (point estimate) et annotations,

histogramme des échantillons d_s (bootstrap) avec médiane et CI marqués et marquage des outliers (d_s > 4 ou d_s < 0.5),

tableau des 10 plus grandes valeurs bootstrap (pour repérer la queue haute),

sauvegarde des figures et d'un CSV de diagnostic (results/sunspots_external/diagnostics/diagnostic_summary.csv).

imprime des recommandations courtes basées sur ce diagnostic (p.ex. réduire λ_max, inspecter les échantillons bootstrap extrêmes, vérifier embedding/k).



In [62]:
# Cellule unique de diagnostic détaillé (Sunspots run) :
# Produit : spectre, N(lambda) log-log + fit Theil-Sen, histogramme bootstrap d_s, table outliers, sauvegarde fichiers
from pathlib import Path
import numpy as np, pandas as pd, math
import matplotlib.pyplot as plt
from sklearn.linear_model import TheilSenRegressor

root = Path.cwd()
results_dir = Path("results/sunspots_external")
eig_fp = results_dir / "laplacian_eigenvalues.csv"
ds_boot_fp = results_dir / "external_ds_boot.csv"
out_dir = results_dir / "diagnostics"
out_dir.mkdir(parents=True, exist_ok=True)

# Load the eigenvalues written by the main run; keep the positive ones, ascending.
if not eig_fp.exists():
    raise FileNotFoundError(f"Eigenvalues file not found: {eig_fp}")
eig = pd.read_csv(eig_fp)["lambda"].to_numpy(dtype=float)
eig = np.sort(eig[eig > 0])

# Load the bootstrap d_s samples written by the main run.
if not ds_boot_fp.exists():
    raise FileNotFoundError(f"Bootstrap d_s file not found: {ds_boot_fp}")
ds_boot = pd.read_csv(ds_boot_fp)["d_s"].to_numpy(dtype=float)

# Parameters
lambda_max = 0.2
n_system = 3265  # same as run; adjust if needed

# 1) Spectrum plot (lambda vs index), eigenvalues on a log y-axis
fig, ax = plt.subplots(figsize=(6, 3))
ranks = np.arange(1, len(eig) + 1)
ax.plot(ranks, eig, '-o', markersize=3)
ax.set_yscale('log')
ax.set_xlabel('index (rank)')
ax.set_ylabel('lambda (log scale)')
ax.set_title('Spectrum: laplacian eigenvalues (ascending)')
ax.grid(True, which='both', ls=':', alpha=0.5)
spec_fp = out_dir / "spectrum_index_lambda.png"
fig.tight_layout()
fig.savefig(spec_fp, dpi=150)
plt.close(fig)

# 2) N(lambda) log-log and Theil-Sen fit on lambda <= lambda_max
lam = eig
Nlam = np.arange(1, len(lam) + 1)
mask = lam <= lambda_max
lam_sel, N_sel = lam[mask], Nlam[mask]
fit_ok = len(lam_sel) >= 3
if fit_ok:
    x = np.log(lam_sel).reshape(-1, 1)
    y = np.log(N_sel)
    reg = TheilSenRegressor(random_state=0).fit(x, y)
    slope = float(reg.coef_[0])
    intercept = float(reg.intercept_)
    d_s_point = 2.0 * slope
else:
    # Fewer than 3 points in the window: no fit, all outputs NaN.
    slope = intercept = d_s_point = np.nan

fig, ax = plt.subplots(figsize=(5.5, 4))
ax.loglog(lam, Nlam, 'o', markersize=3, label='N(lambda)')
if fit_ok:
    # Draw the fitted line across the range of the fit window only.
    x_line = np.linspace(np.log(lam_sel).min(), np.log(lam_sel).max(), 200)
    ax.loglog(np.exp(x_line), np.exp(intercept + slope * x_line), color='C1', lw=2, label=f"Theil-Sen slope={slope:.3f}")
ax.axvline(lambda_max, color='gray', linestyle='--', label=f'lambda_max={lambda_max}')
ax.set_xlabel('lambda (log)')
ax.set_ylabel('N(lambda) (log)')
ax.set_title('N(lambda) and Theil-Sen fit (lambda <= lambda_max)')
ax.legend(fontsize=8)
ax.grid(True, which='both', ls=':', alpha=0.5)
nlam_fp = out_dir / "Nlambda_loglog_fit.png"
fig.tight_layout()
fig.savefig(nlam_fp, dpi=150)
plt.close(fig)

# 3) Histogram of bootstrap d_s with median and CI, mark outliers
if len(ds_boot) == 0:
    med = np.nan
    q025 = np.nan
    q975 = np.nan
    hist_ds_fp = None
else:
    med = np.median(ds_boot)
    q025, q975 = np.quantile(ds_boot, [0.025, 0.975])

    fig, ax = plt.subplots(figsize=(6, 3.5))
    ax.hist(ds_boot, bins=40, color='C0', alpha=0.85)
    ax.axvline(med, color='k', linestyle='--', label=f"median {med:.3f}")
    ax.axvline(q025, color='gray', linestyle=':', label='CI 2.5%')
    ax.axvline(q975, color='gray', linestyle=':', label='CI 97.5%')

    # Heuristic outlier thresholds: overlay the samples beyond them in colour.
    out_high = ds_boot > 4.0
    out_low = ds_boot < 0.5
    if out_high.any():
        ax.hist(ds_boot[out_high], bins=20, color='red', alpha=0.6, label=f'high outliers (>{4.0}): {out_high.sum()}')
    if out_low.any():
        ax.hist(ds_boot[out_low], bins=20, color='purple', alpha=0.6, label=f'low outliers (<{0.5}): {out_low.sum()}')

    ax.set_xlabel('d_s (bootstrap samples)')
    ax.set_ylabel('count')
    ax.set_title('Bootstrap d_s distribution')
    ax.legend(fontsize=8)
    hist_ds_fp = out_dir / "ds_boot_hist.png"
    fig.tight_layout()
    fig.savefig(hist_ds_fp, dpi=150)
    plt.close(fig)

# 4) Table of top 10 highest d_s boot values (to inspect tail)
# The slice previously took 20 values although this step is specified as a
# top-10 table; the count is now a named constant matching that description.
n_top_high = 10
if len(ds_boot) > 0:
    # Largest values first.
    top_high = np.sort(ds_boot)[-n_top_high:][::-1]
    df_high = pd.DataFrame({"d_s_high": top_high})
    df_high.to_csv(out_dir / "ds_boot_top_high.csv", index=False)
else:
    df_high = pd.DataFrame()

# 5) Save diagnostic CSV summary
def _none_if_nan(value):
    """Return ``float(value)``, or ``None`` when ``value`` is NaN."""
    return None if np.isnan(value) else float(value)


summary = {
    "n_lambda_total": int(len(lam)),
    "n_points_fit_used": int(len(lam_sel)),
    "d_s_point": _none_if_nan(d_s_point),
    "d_s_boot_median": _none_if_nan(med),
    "d_s_boot_q025": _none_if_nan(q025),
    "d_s_boot_q975": _none_if_nan(q975),
    "n_boot_retained": int(len(ds_boot)),
}
# Written as a single-row CSV.
pd.DataFrame([summary]).to_csv(out_dir / "diagnostic_summary.csv", index=False)

# 6) Print concise human-readable results and recommendations
print("Diagnostic files written to:", out_dir)
print("\nQuick summary:")
for k, v in summary.items():
    print(f"- {k}: {v}")

print("\nRecommendations (single actionable checks):")
# Recommendation 1 names the spectrum plot and the fit plot; it used to print
# the fit plot's file name twice.
print("1) Inspect", spec_fp.name, "and", nlam_fp.name, "to confirm fit alignment on lambda <= lambda_max.")
print("2) If bootstrap shows heavy right tail (many d_s > 4), inspect", out_dir / "ds_boot_top_high.csv", "to see if a few resamples cause the tail.")
print("3) Consider re-running with a smaller lambda_max (e.g. 0.1) or removing the smallest k eigenvalues if small-lambda noise dominates.")
print("4) If null-model produced extreme d_s, run several null replicates and compare medians (nulls should be processed identically).")

# Paths of the generated artifacts (the histogram path is None when there
# were no bootstrap samples).
print("\nFigures:")
print("- spectrum index-lambda:", spec_fp)
print("- N(lambda) log-log with fit:", nlam_fp)
print("- d_s bootstrap histogram:", hist_ds_fp)
print("- top-high d_s CSV:", out_dir / "ds_boot_top_high.csv")
print("- diagnostic summary CSV:", out_dir / "diagnostic_summary.csv")


Diagnostic files written to: results\sunspots_external\diagnostics

Quick summary:
- n_lambda_total: 199
- n_points_fit_used: 19
- d_s_point: 1.9994790122804766
- d_s_boot_median: 2.050927876374394
- d_s_boot_q025: 1.0731509417247878
- d_s_boot_q975: 4.422765618088611
- n_boot_retained: 200

Recommendations (single actionable checks):
1) Inspect Nlambda_loglog_fit.png and Nlambda_loglog_fit.png to confirm fit alignment on lambda <= lambda_max.
2) If bootstrap shows heavy right tail (many d_s > 4), inspect results\sunspots_external\diagnostics\ds_boot_top_high.csv to see if a few resamples cause the tail.
3) Consider re-running with a smaller lambda_max (e.g. 0.1) or removing the smallest k eigenvalues if small-lambda noise dominates.
4) If null-model produced extreme d_s, run several null replicates and compare medians (nulls should be processed identically).

Figures:
- spectrum index-lambda: results\sunspots_external\diagnostics\spectrum_index_lambda.png
- N(lambda) log-log with fit:

Résumé interprétatif (en 4 points)

Estimation ponctuelle et bootstrap

d_s point ≈ 2.00 ; bootstrap médian ≈ 2.05 — ceci suggère une dimension spectrale ≈ 2 pour Sunspots (dans ta configuration d'embedding/k).

Incertitude et queue haute

L’IC bootstrap est large : [≈1.07, 4.42]. La distribution a une queue droite avec ~10 échantillons > 4 (ds_boot_top_high.csv). Cette queue élargit la CI et fait que T_log CI traverse zéro malgré une médiane très négative.

Origine probable de l’instabilité

Peu de points utilisés pour le fit (19 points λ ≤ 0.2) ; Theil‑Sen extrapole sur peu de données.

Certains rééchantillonnages bootstrap sélectionnent des configurations de valeurs propres qui génèrent des pentes très raides → valeurs aberrantes (outliers) dans d_s.

Le null-model shuffle (un seul essai) a donné un d_s énorme ; cela requiert réplication contrôlée (multiple nulls traités identiquement) pour interpréter.

Conclusion sur la question scientifique

Ton pipeline fournit une validation empirique locale (Sunspots) orientée : médiane T_log fortement négative → régime « Divergence » pour ces paramètres.

Ce n’est pas une démonstration universelle ; il faut réduire les sources d’incertitude avant d’arguer généralité.

Proposition unique et prioritaire (une cellule — exécute-la si tu veux avancer maintenant)

Ce que fait la cellule : refit l’estimation en testant une petite grille de prétraitements qui ciblent exactement les causes d’instabilité observées :

Essayez lambda_max = 0.1 (plus conservateur) et lambda_max = 0.2 (référence).

Pour chaque lambda_max, refaites l’ajustement Theil‑Sen après avoir exclu les k plus petites valeurs propres, avec k ∈ {0, 1, 2, 3}.

Pour chaque combinaison, calcule point estimate d_s, nombre de points utilisés, et la propagation T_log (avec n_system = 3265).

Écris un petit CSV summary (results/sunspots_external/robust_grid_lambda_exclude_summary.csv) et prints succincts des combinaisons qui stabilisent la médiane et réduisent la queue droite.

Pourquoi : cela teste directement si la queue droite est due à bruit aux plus petites valeurs propres ou au fait que λ_max est trop large.

In [63]:
# Cellule unique : grid test pour lambda_max ∈ {0.1,0.2} et exclusion des k smallest eigenvals k=0..3
# Produit : CSV summary and console table showing stability of d_s and T_log
from pathlib import Path
import numpy as np, pandas as pd, math
from sklearn.linear_model import TheilSenRegressor

root = Path.cwd()
results_dir = Path("results/sunspots_external")
eig_fp = results_dir / "laplacian_eigenvalues.csv"
out_fp = results_dir / "robust_grid_lambda_exclude_summary.csv"

if not eig_fp.exists():
    raise FileNotFoundError(f"Eigenvalues file not found: {eig_fp}")

# Eigenvalues from the main run: positive values only, sorted ascending.
eig = pd.read_csv(eig_fp)["lambda"].to_numpy(dtype=float)
eig = np.sort(eig[eig > 0])

n_system = 3265   # same as run; adjust if needed

# Grid: fit-window upper bounds x number of smallest eigenvalues to drop.
lambda_tests = [0.1, 0.2]
exclude_k = [0, 1, 2, 3]

def estimate_ds(lam_arr, lambda_max):
    """Fit log(rank) against log(lambda) with Theil-Sen and derive d_s.

    Args:
        lam_arr: NumPy array of eigenvalues; entry ``i`` is given rank ``i + 1``.
        lambda_max: only entries ``<= lambda_max`` take part in the fit.

    Returns:
        ``None`` when fewer than 3 entries are in the fit window; otherwise a
        dict with ``"d_s"`` (twice the slope), ``"n_points"`` (number of
        fitted points) and ``"slope"``.
    """
    in_window = lam_arr <= lambda_max
    n_points = int(in_window.sum())
    if n_points < 3:
        return None
    ranks = np.arange(1, len(lam_arr) + 1)
    log_lam = np.log(lam_arr[in_window]).reshape(-1, 1)
    log_rank = np.log(ranks[in_window])
    fit = TheilSenRegressor(random_state=0).fit(log_lam, log_rank)
    slope = float(fit.coef_[0])
    return {"d_s": 2.0 * slope, "n_points": n_points, "slope": slope}

def _grid_row(lam_max, k, n_total, n_used=None, d_s=None, T_log=None, note=""):
    """Build one result row; the key order fixes the CSV column order."""
    return {
        "lambda_max": lam_max,
        "exclude_k": k,
        "n_points_total": n_total,
        "n_points_used": n_used,
        "d_s": d_s,
        "T_log": T_log,
        "note": note,
    }


rows = []
for lam_max in lambda_tests:
    for k in exclude_k:
        if k >= len(eig) - 1:
            # can't exclude more than available
            rows.append(_grid_row(lam_max, k, len(eig), note="exclude exceeds eigencount"))
            continue

        # Drop the k smallest eigenvalues.
        # NOTE(review): estimate_ds re-ranks the remaining values from 1, so
        # the counting function itself changes with k, not just the fit
        # window — confirm this is intended.
        lam_trim = eig[k:]
        est = estimate_ds(lam_trim, lam_max)
        if est is None:
            # Report how many points were actually in the window (0, 1 or 2)
            # instead of a hard-coded 0.
            n_in_window = int((lam_trim <= lam_max).sum())
            rows.append(_grid_row(lam_max, k, len(lam_trim), n_used=n_in_window,
                                  note="too few points <= lambda_max"))
            continue

        d_s = float(est["d_s"])
        n_used = int(est["n_points"])
        T_log = (d_s - 4.0) * math.log(n_system)
        # Brief diagnostic flag for d_s outside the heuristic [0.5, 4] range.
        if d_s > 4.0:
            flag = "high_ds_outlier"
        elif d_s < 0.5:
            flag = "low_ds_outlier"
        else:
            flag = ""
        rows.append(_grid_row(lam_max, k, int(len(lam_trim)), n_used=n_used,
                              d_s=d_s, T_log=T_log, note=flag))

df_res = pd.DataFrame(rows)
df_res.to_csv(out_fp, index=False)

# Print the table in grid order (lambda_max outer, exclude_k inner); no
# sorting is applied.
print("Grid results written to:", out_fp)
print(df_res.to_string(index=False))


Grid results written to: results\sunspots_external\robust_grid_lambda_exclude_summary.csv
 lambda_max  exclude_k  n_points_total  n_points_used      d_s      T_log            note
        0.1          0             199              9 1.441367 -20.701939                
        0.1          1             198              8 1.976626 -16.371146                
        0.1          2             197              7 2.820601  -9.542536                
        0.1          3             196              6 4.082882   0.670600 high_ds_outlier
        0.2          0             199             19 1.999479 -16.186245                
        0.2          1             198             18 2.304848 -13.715498                
        0.2          2             197             17 2.764757  -9.994368                
        0.2          3             196             16 3.236706  -6.175827                


Résumé bref

Résultats de la grille : les configurations avec plus de points utilisés (n_points_used élevé) donnent des estimations plus stables; les configurations qui excluent trop de petits λ (exclude_k ≥ 3) produisent des d_s très élevés ou instables (queue droite).

Choix recommandé (meilleur compromis stabilité / signal) : lambda_max = 0.2, exclude_k = 0.

Raison : utilise 19 points pour le fit; d_s ≈ 1.999 (≈2.0) et T_log ≈ −16.19 — clairement négatif, cohérent avec la conclusion « Divergence », tout en évitant la forte sensibilité due à l’exclusion de petits λ ou à une λ_max trop petite.

In [64]:
# Cellule unique : appliquer la configuration choisie (lambda_max=0.2, exclude_k=0),
# recalculer fit Theil-Sen, sauvegarder summary final, annoter et réécrire le plot final,
# et enregistrer une note dans run_ready_for_commit.txt
from pathlib import Path
import numpy as np, pandas as pd, math
import matplotlib.pyplot as plt
from sklearn.linear_model import TheilSenRegressor
from datetime import datetime

# Selected configuration
results_dir = Path("results/sunspots_external")
eig_fp = results_dir / "laplacian_eigenvalues.csv"
out_summary_fp = results_dir / "final_choice_summary.csv"
diag_dir = results_dir / "diagnostics"
diag_dir.mkdir(parents=True, exist_ok=True)
lambda_max = 0.2
exclude_k = 0
n_system = 3265

if not eig_fp.exists():
    raise FileNotFoundError(f"Eigenvalues file not found: {eig_fp}")

# Positive eigenvalues, ascending; then drop the exclude_k smallest.
eig = pd.read_csv(eig_fp)["lambda"].to_numpy(dtype=float)
eig = np.sort(eig[eig > 0])
if exclude_k >= len(eig):
    raise ValueError("exclude_k too large for eigenvalue count")

lam = eig[exclude_k:]
Nlam = np.arange(1, len(lam) + 1)   # rank of each remaining eigenvalue
mask = lam <= lambda_max            # fit window
n_used = int(mask.sum())

# The fit needs at least 3 points.
if n_used < 3:
    raise RuntimeError(f"Too few points <= {lambda_max} after excluding {exclude_k} smallest lambdas: {n_used} points")

# fit Theil-Sen
# Local import: this cell only imports `datetime` from the datetime module,
# and the timestamp below needs `timezone`.
from datetime import timezone

x = np.log(lam[mask]).reshape(-1, 1)
y = np.log(Nlam[mask])
reg = TheilSenRegressor(random_state=0).fit(x, y)
slope = float(reg.coef_[0])
intercept = float(reg.intercept_)
d_s = 2.0 * slope
T_log = (d_s - 4.0) * math.log(n_system)

# datetime.utcnow() is deprecated; take an aware UTC time and strip tzinfo so
# the text keeps the "<naive ISO timestamp>Z" form used by earlier runs.
timestamp_utc = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z"

# write final summary CSV (one row)
summary = {
    "timestamp": timestamp_utc,
    "lambda_max": float(lambda_max),
    "exclude_k": int(exclude_k),
    "n_lambda_total": int(len(lam)),
    "n_points_used": int(n_used),
    "slope": float(slope),
    "d_s": float(d_s),
    "T_log": float(T_log)
}
pd.DataFrame([summary]).to_csv(out_summary_fp, index=False)

# Regenerate the annotated N(lambda) log-log plot for the chosen configuration.
fig, ax = plt.subplots(figsize=(6, 4))
ax.loglog(lam, Nlam, 'o', markersize=3, label='N(lambda)')
# Fitted line, drawn over the range of the fit window only.
log_lam_fit = np.log(lam[mask])
x_line = np.linspace(log_lam_fit.min(), log_lam_fit.max(), 200)
ax.loglog(np.exp(x_line), np.exp(intercept + slope * x_line), color='C1', lw=2,
          label=f"Theil-Sen slope={slope:.3f}  d_s={d_s:.3f}")
ax.axvline(lambda_max, color='gray', linestyle='--', label=f'lambda_max={lambda_max}')
ax.set_xlabel('lambda (log)')
ax.set_ylabel('N(lambda) (log)')
ax.set_title('Final N(lambda) fit (chosen config)')
ax.legend(fontsize=9)
ax.grid(True, which='both', ls=':', alpha=0.4)
final_plot_fp = diag_dir / "Nlambda_loglog_fit_final.png"
fig.tight_layout()
fig.savefig(final_plot_fp, dpi=150)
plt.close(fig)

# append note to run_ready_for_commit.txt
# Local import: this cell only imports `datetime` from the datetime module.
from datetime import timezone

run_ready = Path("results/ds_remove_small_lambda/run_ready_for_commit.txt")
# datetime.utcnow() is deprecated; the aware UTC time is stripped of tzinfo so
# the note keeps the same "[<naive ISO timestamp>Z]" prefix as before.
note_stamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()
note = (f"[{note_stamp}Z] Final choice: lambda_max={lambda_max}, exclude_k={exclude_k}, "
        f"d_s={d_s:.6f}, T_log={T_log:.6f}, n_points_used={n_used}\n")
run_ready.parent.mkdir(parents=True, exist_ok=True)
# Append mode: earlier notes in the file are preserved.
with open(run_ready, "a", encoding="utf8") as f:
    f.write(note)

# Print concise output
print("Final configuration applied and artifacts written:")
print("- final summary:", out_summary_fp)
print("- annotated fit plot:", final_plot_fp)
print("- appended note to:", run_ready)
print("\nFinal values:")
for k, v in summary.items():
    print(f"- {k}: {v}")


Final configuration applied and artifacts written:
- final summary: results\sunspots_external\final_choice_summary.csv
- annotated fit plot: results\sunspots_external\diagnostics\Nlambda_loglog_fit_final.png
- appended note to: results\ds_remove_small_lambda\run_ready_for_commit.txt

Final values:
- timestamp: 2025-11-11T03:49:59.602064Z
- lambda_max: 0.2
- exclude_k: 0
- n_lambda_total: 199
- n_points_used: 19
- slope: 0.9997395061402383
- d_s: 1.9994790122804766
- T_log: -16.186245402896272
