# 03_statistical_tests.ipynb

Objetivo: realizar pruebas estadísticas reproducibles sobre el resumen de ángulos (results/angles_summary.csv). Contiene: bootstrap para el IC de la media, permutation test para la diferencia de medias entre dos subconjuntos (primera y segunda mitad de los eventos, por orden de índice), y ensayos de surrogate (circular shift) para evaluar la autocorrelación.

Autor: ChatGPT — implementación técnica.

In [None]:
# Imports
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

# The results directory holds both the input summary CSV and every output
# produced by this notebook; create it up front so later saves cannot fail.
OUT_DIR = 'results'
os.makedirs(OUT_DIR, exist_ok=True)
CSV_PATH = os.path.join(OUT_DIR, 'angles_summary.csv')

# Stop early, with a hint about the upstream notebook, when the input is absent.
csv_missing = not os.path.exists(CSV_PATH)
if csv_missing:
    raise SystemExit(f"File not found: {CSV_PATH}\nEjecuta notebooks/02_selection_and_angles.ipynb primero para generar results/angles_summary.csv")

print('Loading', CSV_PATH)
df = pd.read_csv(CSV_PATH)
print('Rows:', len(df))

# Keep only rows with a non-missing mean angle (per the original note, rows
# with a missing value are events without any pair) and renumber from 0.
has_angle = df['mean_angle_deg'].notna()
df = df[has_angle].reset_index(drop=True)
print('Rows with angle data:', len(df))


In [None]:
# Exploratory stats: summary statistics and a histogram of the per-event
# mean angle column.
angles = df['mean_angle_deg']
obs_mean, obs_median, obs_std = angles.mean(), angles.median(), angles.std()
for label, value in (
    ('Observed mean (deg):', obs_mean),
    ('Observed median (deg):', obs_median),
    ('Observed std (deg):', obs_std),
):
    print(label, value)

# Histogram of the column, saved to OUT_DIR and then displayed.
fig, ax = plt.subplots(figsize=(8, 4))
ax.hist(angles, bins=80, color='C0', alpha=0.7)
ax.set_xlabel('mean_angle_deg')
ax.set_ylabel('counts')
ax.set_title('Distribution of per-event mean angle (deg)')
ax.grid(alpha=0.2)
fig.tight_layout()
fig.savefig(os.path.join(OUT_DIR, 'mean_angle_distribution.png'))
plt.show()


In [None]:
# 1) Bootstrap for CI of the mean
# Resample the per-event values with replacement n_boot times and take the
# 2.5th / 97.5th percentiles of the resampled means as a 95% interval.
n_boot = 5000
rng = np.random.default_rng(42)  # fixed seed; this generator is reused by later cells
vals = df['mean_angle_deg'].to_numpy()
N = len(vals)

# One resample of size N per replicate; only its mean is kept.
means = np.array(
    [rng.choice(vals, size=N, replace=True).mean() for _ in range(n_boot)],
    dtype=float,
)

ci_bounds = np.percentile(means, [2.5, 97.5])
ci_lower, ci_upper = ci_bounds
print(f'Bootstrap {n_boot} CI for mean: [{ci_lower:.4f}, {ci_upper:.4f}] (deg)')

# Persist the replicate means so the distribution can be inspected later.
np.save(os.path.join(OUT_DIR, 'bootstrap_means.npy'), means)

# Bootstrap distribution with the observed mean and both interval bounds marked.
fig, ax = plt.subplots(figsize=(7, 4))
ax.hist(means, bins=60, color='C2', alpha=0.8)
ax.axvline(obs_mean, color='k', lw=2, label=f'obs mean={obs_mean:.3f}')
for bound, tag in ((ci_lower, '2.5%'), (ci_upper, '97.5%')):
    ax.axvline(bound, color='r', ls='--', label=tag)
ax.legend()
ax.set_title('Bootstrap distribution of mean (per-event)')
fig.tight_layout()
fig.savefig(os.path.join(OUT_DIR, 'bootstrap_mean_hist.png'))
plt.show()


In [None]:
# 2) Permutation test: difference between first-half and second-half means
# Groups are defined by index order, so this checks for a shift in the mean
# between the first and second half of the rows (stationarity check).
mid = N // 2
groupA = vals[:mid]
groupB = vals[mid:]
obs_diff = groupA.mean() - groupB.mean()
print('Observed diff (A - B):', obs_diff)

n_perm = 5000
diffs = np.empty(n_perm)
all_vals = vals.copy()  # shuffled in place below; `vals` itself stays untouched
for i in range(n_perm):
    rng.shuffle(all_vals)
    diffs[i] = all_vals[:mid].mean() - all_vals[mid:].mean()

# Two-sided Monte Carlo p-value. The observed statistic is counted as one of
# the permutations ("add-one" estimator), so the estimate can never be exactly
# zero; with n_perm random permutations the smallest reportable p-value is
# 1 / (n_perm + 1).
if np.isfinite(obs_diff):
    n_extreme = np.count_nonzero(np.abs(diffs) >= np.abs(obs_diff))
    p_val = (n_extreme + 1) / (n_perm + 1)
else:
    # Fewer than two values: a group is empty and the statistic is undefined,
    # so report NaN instead of a misleadingly small p-value.
    p_val = np.nan
print(f'Permutation test p-value (two-sided) = {p_val:.5f}')

plt.figure(figsize=(7,4))
plt.hist(diffs, bins=60, color='C3', alpha=0.8)
plt.axvline(obs_diff, color='k', lw=2, label=f'obs diff={obs_diff:.3f}')
plt.legend()
plt.title('Permutation distribution: mean(A)-mean(B)')
plt.tight_layout()
plt.savefig(os.path.join(OUT_DIR,'permutation_diff_hist.png'))
plt.show()


In [None]:
# 3) Surrogate test for lag-1 autocorrelation of the mean_angle_deg series
# Null hypothesis: no temporal dependence between consecutive rows.
#
# Surrogates are random permutations of the series: they keep the marginal
# distribution exactly and destroy the ordering. A circular shift is NOT a
# valid surrogate here: rolling a single series keeps every adjacent pair
# except one, so the lag-1 autocorrelation of each shifted copy is almost
# identical to the observed value and the resulting p-value carries no
# information.
x = vals.copy()
# demean
x_d = x - x.mean()
obs_ac1 = np.corrcoef(x_d[:-1], x_d[1:])[0,1]
print('Observed lag-1 autocorrelation:', obs_ac1)

n_surr = 2000
ac_surr = np.empty(n_surr)
for i in range(n_surr):
    xs = rng.permutation(x_d)  # returns a shuffled copy; x_d is left intact
    ac_surr[i] = np.corrcoef(xs[:-1], xs[1:])[0,1]

# Two-sided Monte Carlo p-value with the add-one estimator (the observed
# statistic counts as one surrogate), so the estimate is never exactly zero.
if np.isfinite(obs_ac1):
    n_extreme = np.count_nonzero(np.abs(ac_surr) >= np.abs(obs_ac1))
    p_ac = (n_extreme + 1) / (n_surr + 1)
else:
    # Series too short or constant: the autocorrelation is undefined.
    p_ac = np.nan
print(f'Surrogate test (lag-1) p-value = {p_ac:.5f}')

plt.figure(figsize=(7,4))
plt.hist(ac_surr, bins=60, color='C4', alpha=0.8)
plt.axvline(obs_ac1, color='k', lw=2, label=f'obs ac1={obs_ac1:.3f}')
plt.legend()
plt.title('Surrogate distribution of lag-1 autocorrelation')
plt.tight_layout()
plt.savefig(os.path.join(OUT_DIR,'surrogate_ac1_hist.png'))
plt.show()


In [None]:
# Save a short report as CSV/JSON
# Everything is cast to plain Python floats/ints so that numpy scalars
# serialize cleanly.
import json

report = dict(
    obs_mean=float(obs_mean),
    obs_median=float(obs_median),
    obs_std=float(obs_std),
    bootstrap_ci_lower=float(ci_lower),
    bootstrap_ci_upper=float(ci_upper),
    perm_pvalue=float(p_val),
    surrogate_pvalue_ac1=float(p_ac),
    n_events=int(N),
)

# JSON copy of the report.
json_path = os.path.join(OUT_DIR, 'stat_tests_report.json')
with open(json_path, 'w') as fh:
    fh.write(json.dumps(report, indent=2))

# Single-row CSV with the same fields.
csv_path = os.path.join(OUT_DIR, 'stat_tests_report.csv')
pd.DataFrame([report]).to_csv(csv_path, index=False)
print('Saved report to results/stat_tests_report.*')



Interpretación y recomendaciones:

- El bootstrap provee un IC para la media de la distribución de ángulos por evento; un IC muy estrecho indica alta estabilidad del estimador.
- El permutation test evalúa si la media en la primera mitad difiere de la de la segunda; un p-valor pequeño sugiere no estacionariedad temporal.
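- El surrogate test con circular shifts evalúa si la autocorrelación lag-1 observada difiere, en valor absoluto, de la esperada por azar (respetando la distribución marginal). Nota: un desplazamiento circular de una única serie conserva casi todos los pares adyacentes, por lo que su autocorrelación lag-1 apenas cambia; para contrastar la hipótesis nula de independencia temporal conviene usar surrogates por permutación aleatoria.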

Siguientes pasos: si obtienes p-valores pequeños, investiga las dependencias temporales, repite los tests en submuestras y controla por otras variables (n_pairs, run, etc.).