
# AR Likelihood for CBC Inference — GWOSC Demo (Clean + Injection)

This notebook demonstrates a **time-domain autoregressive (AR) likelihood** for compact binary coalescence (CBC) inference using **real LIGO data** from **GWOSC**. It:

- downloads a short strain segment with **GWPy**
- pre-processes (high-pass, optional resample)
- estimates an **AR(p)** noise model on off-source data (Yule–Walker)
- builds an **AR-whitened Gaussian likelihood**
- injects a CBC (via **PyCBC**), then compares log-likelihoods for:
  - **h = 0** (noise-only)
  - **h = template** (correct signal model)

> Requirements: `gwpy`, `pycbc`, `numpy`, `scipy`, `matplotlib`  
> Run the install cell below if needed.


In [None]:

# If running on a fresh environment, uncomment to install dependencies:
# %pip install -q gwpy pycbc numpy scipy matplotlib


In [None]:

import numpy as np
import matplotlib.pyplot as plt

from gwpy.timeseries import TimeSeries
from scipy.signal import welch
from scipy.signal.windows import tukey
from scipy.linalg import solve_toeplitz
from scipy.special import erfinv

# PyCBC for CBC waveform generation
from pycbc.waveform import get_td_waveform


In [None]:

# ---------------- Configuration ----------------
# All tunable constants for the notebook live in this cell; later cells only read them.

# Data span near GW150914 (public, convenient)
# NOTE(review): the fetched segment is centred on EVENT_GPS, so the "clean" strain
# presumably still contains the real event at its midpoint — confirm this is intended.
EVENT_GPS   = 1126259462.4
SPAN_SEC    = 32.0
IFO         = "L1"

# Preprocessing
HIGHPASS_HZ = 20.0
RESAMPLE_HZ = 2048.0       # set to None to keep native rate

# AR model
AR_ORDER    = 80           # try 40..200; larger -> finer spectral detail in the PSD model (needs more training data)

# Segments used to estimate AR from off-source (in seconds relative to segment start)
# We'll estimate AR on two 6 s regions away from center to avoid any real signal
AR_TRAIN_WINDOWS = [(2.0, 8.0), (SPAN_SEC-8.0, SPAN_SEC-2.0)]

# Injection (PyCBC / IMRPhenomD, non-spinning)
DO_INJECTION = True
M1_Msun      = 30.0
M2_Msun      = 30.0
DIST_Mpc     = 400.0        # lower -> louder
F_LOWER      = 30.0
APPROXIMANT  = "IMRPhenomD"
INJ_INCL     = 0.0          # face-on
INJ_PHI      = 0.0
INJ_GPS      = EVENT_GPS    # coalescence time at center


In [None]:

# ---------------- Fetch GWOSC data ----------------
# GPS bounds of a SPAN_SEC-long window centred on the event time.
half_span = SPAN_SEC / 2
start, stop = EVENT_GPS - half_span, EVENT_GPS + half_span
print(f"Fetching {IFO} strain from {start:.1f} to {stop:.1f} GPS ...")

# Download the public strain (needs internet), high-pass it, then optionally resample.
data = TimeSeries.fetch_open_data(IFO, start, stop)
data = data.highpass(HIGHPASS_HZ)
data = data if RESAMPLE_HZ is None else data.resample(RESAMPLE_HZ)

# Plain NumPy views used by every later cell: sample times, strain, rate, length.
x0 = data.value.astype(float)
t = data.times.value
n = len(x0)
fs = float(data.sample_rate.value)

print(f"Sample rate: {fs:.1f} Hz, samples: {n}, duration: {n/fs:.1f} s")

# Quick look at the conditioned strain.
fig, ax = plt.subplots(figsize=(11, 3.5))
ax.plot(t - t[0], x0, lw=0.6)
ax.set_xlabel("Time since segment start [s]")
ax.set_ylabel("Strain (hp)")
ax.set_title(f"{IFO} strain — {SPAN_SEC:.0f}s, fs={fs:.0f} Hz")
fig.tight_layout()
plt.show()


In [None]:

# ---------------- AR estimation (Yule–Walker) ----------------
def autocorr_biased(x, maxlag):
    """Biased sample autocorrelation of a 1-D series for lags ``0..maxlag``.

    The lag-``k`` value is ``sum_t x[t+k] * conj(x[t]) / n`` where ``n = len(x)``;
    every lag is divided by the same ``n`` (the "biased" estimator). The mean is
    NOT removed here — callers that need a covariance must centre ``x`` first.

    Only the requested lags are evaluated (``O(n * maxlag)`` work) instead of
    building the full ``2n - 1`` point correlation.

    Parameters
    ----------
    x : array_like
        One-dimensional input series.
    maxlag : int
        Largest lag to return. Lags beyond ``n - 1`` do not exist, so the output
        is truncated to ``n`` values when ``maxlag >= n``.

    Returns
    -------
    numpy.ndarray
        Array ``r`` with ``r[k]`` the lag-``k`` autocorrelation,
        of length ``min(maxlag, n - 1) + 1``.

    Raises
    ------
    ValueError
        If ``x`` is empty.
    """
    x = np.asarray(x)
    n = len(x)
    if n == 0:
        raise ValueError("autocorr_biased requires a non-empty input series")
    # Lags larger than n-1 have no overlapping samples; clamp like a slice would.
    n_lags = min(maxlag, n - 1) + 1
    # np.vdot conjugates its first argument, matching np.correlate's convention.
    sums = np.array([np.vdot(x[:n - k], x[k:]) for k in range(n_lags)])
    return sums / n

def estimate_ar_yw(x, order):
    """Fit an AR(``order``) model to ``x`` with the Yule–Walker equations.

    The series is mean-removed, its biased autocorrelation ``r[0..order]`` is
    taken from ``autocorr_biased``, and the symmetric Toeplitz system built from
    ``r[0..order-1]`` is solved against ``r[1..order]``.

    Returns
    -------
    phi : numpy.ndarray
        AR coefficients, length ``order``.
    sigma2 : float
        Innovation variance ``r[0] - phi · r[1:]``, floored at the smallest
        positive normal double so it is never zero or negative.
    """
    centered = np.asarray(x) - np.mean(x)
    acf = autocorr_biased(centered, order)  # lags 0..order

    # First column and first row of the Toeplitz matrix are both r[0..p-1].
    toeplitz_edge = acf[:-1]
    rhs = acf[1:]
    phi = solve_toeplitz((toeplitz_edge, toeplitz_edge), rhs)

    innovation_var = acf[0] - np.dot(phi, rhs)
    floor = np.finfo(float).tiny
    return phi, float(max(innovation_var, floor))

def apply_ar_whitener(x, phi):
    """Apply the AR prediction-error (whitening) filter to a 1-D series.

    For every index ``t >= p`` (with ``p = len(phi)``) the output is
    ``y[t] = x[t] - sum_{k=1..p} phi[k-1] * x[t-k]``. The first ``p`` samples,
    which lack a full history, are left at zero.

    The output is always floating point (at least float64), so integer input is
    no longer truncated, and the filter is evaluated with a single vectorised
    convolution rather than a Python loop over samples.

    Parameters
    ----------
    x : array_like
        One-dimensional input series.
    phi : array_like
        AR coefficients ``phi[0..p-1]``; may be empty, in which case ``y == x``.

    Returns
    -------
    numpy.ndarray
        Whitened series with the same length as ``x``.
    """
    phi = np.asarray(phi)
    x = np.asarray(x)
    p = len(phi)

    out_dtype = np.result_type(x.dtype, phi.dtype, np.float64)
    y = np.zeros(x.shape, dtype=out_dtype)
    if len(x) <= p:
        # Not a single sample has p predecessors: nothing can be whitened.
        return y

    # Prediction-error filter taps [1, -phi_1, ..., -phi_p]; 'valid' mode yields
    # exactly the outputs for t = p .. len(x)-1.
    taps = np.concatenate(([1.0], -phi)).astype(out_dtype)
    y[p:] = np.convolve(x.astype(out_dtype, copy=False), taps, mode="valid")
    return y

def loglike_ar(d, h, phi, sigma2, drop=0):
    """Gaussian log-likelihood of data ``d`` given model ``h`` under AR noise.

    Both series are passed through ``apply_ar_whitener`` with coefficients
    ``phi``; the difference of the whitened series is treated as i.i.d.
    Gaussian innovations of variance ``sigma2``.

    Parameters
    ----------
    d, h : array_like
        Data and signal model.
    phi : array_like
        AR coefficients.
    sigma2 : float
        Innovation variance.
    drop : int, optional
        Number of leading residual samples to discard (e.g. ``len(phi)``) so the
        filter start-up samples are excluded. Values ``<= 0`` keep everything.

    Returns
    -------
    float
        ``-0.5 * (r·r / sigma2 + N * log(2*pi*sigma2))`` with ``N`` the number of
        residual samples kept.
    """
    resid = apply_ar_whitener(d, phi) - apply_ar_whitener(h, phi)
    resid = resid[drop:] if drop > 0 else resid

    n_used = len(resid)
    chi2 = np.dot(resid, resid) / sigma2
    log_norm = n_used * np.log(2*np.pi*sigma2)
    return -0.5 * (chi2 + log_norm)


In [None]:

# ---------------- Build AR training vector ----------------
def time_to_index(t0, t1, t_array):
    """Map a relative time window ``[t0, t1)`` onto a slice of ``t_array``.

    Times are measured from ``t_array[0]``. Each bound is the first index whose
    relative time is not less than the requested value (left-sided search), so
    the slice covers samples with ``t0 <= t - t_array[0] < t1``.
    """
    elapsed = t_array - t_array[0]
    bounds = [np.searchsorted(elapsed, edge, side='left') for edge in (t0, t1)]
    return slice(*bounds)

# Cut each configured off-source window out of x0; windows that yield no more
# than AR_ORDER + 10 samples are discarded as too short to be useful.
window_slices = [time_to_index(w_start, w_stop, t) for (w_start, w_stop) in AR_TRAIN_WINDOWS]
train_chunks = [x0[sl] for sl in window_slices if sl.stop - sl.start > AR_ORDER + 10]

if len(train_chunks) == 0:
    raise RuntimeError("No valid AR training chunks; adjust AR_TRAIN_WINDOWS.")

# NOTE(review): the chunks are joined end to end, so autocorrelation lags that
# straddle a seam mix samples from different windows.
x_train = np.concatenate(train_chunks)
phi, sigma2 = estimate_ar_yw(x_train, AR_ORDER)
print(f"Estimated AR(p={AR_ORDER}) sigma^2 (innovation): {sigma2:.3e}")


In [None]:

# ---------------- Diagnostics: whitening & PSD ----------------
# Whiten the whole segment with the fitted AR coefficients and inspect it by eye.
xw = apply_ar_whitener(x0, phi)

fig, ax = plt.subplots(figsize=(11, 3.5))
ax.plot(t - t[0], xw, lw=0.5)
ax.set_xlabel("Time since segment start [s]")
ax.set_ylabel("AR-whitened strain")
ax.set_title("AR-whitened time series (should look ~white)")
fig.tight_layout()
plt.show()

# Welch PSDs
def plot_psd(sig, fs, label, color=None):
    """Draw the Welch PSD of ``sig`` on the current matplotlib axes (log y-axis).

    Segments are ~2 s long (``int(2.0 * fs)`` samples) with 50 % overlap and
    constant detrending.

    Parameters
    ----------
    sig : array_like
        Time series to analyse.
    fs : float
        Sampling rate in Hz.
    label : str
        Legend label for the curve.
    color : optional
        Matplotlib colour for the curve. When ``None`` (default) the axes'
        colour cycle is used. Previously this argument was accepted but ignored.
    """
    nper = int(2.0 * fs)  # ~2 s windows
    nover = int(0.5 * nper)
    f, Pxx = welch(sig, fs=fs, nperseg=nper, noverlap=nover, detrend='constant')
    # Only forward an explicit colour so the default cycle is untouched otherwise.
    line_kwargs = {} if color is None else {"color": color}
    plt.semilogy(f, Pxx, label=label, **line_kwargs)

# Overlay the PSD of the conditioned strain and of the AR-whitened strain
# (the first AR_ORDER whitened samples are excluded).
fig, ax = plt.subplots(figsize=(11, 3.8))
plot_psd(x0, fs, "raw (hp)")
plot_psd(xw[AR_ORDER:], fs, "AR-whitened (drop p)")
ax.set_xlabel("Frequency [Hz]")
ax.set_ylabel("PSD [strain^2/Hz]")
ax.legend()
fig.tight_layout()
plt.show()


In [None]:

# ---------------- Generate CBC injection (time-domain) ----------------
def make_td_injection(fs, n, start_gps, inj_gps, m1, m2, dist_mpc, f_lower, incl, phi0, approximant):
    """Build a length-``n`` strain vector holding a plus-polarisation CBC waveform.

    A non-spinning waveform is generated with PyCBC's ``get_td_waveform`` at
    sample spacing ``1 / fs`` and added into an all-zero array so that the
    waveform's t = 0 sample lands on ``inj_gps``. Parts of the waveform falling
    outside ``[0, n)`` are clipped.

    PyCBC time-stamps the returned series so that the merger sits at t ≈ 0, with
    a negative ``start_time`` and ringdown samples after zero. Anchoring the
    *last* sample to ``inj_gps`` (the previous behaviour) therefore put the
    coalescence earlier than requested; the offset is now taken from
    ``hp.start_time``.

    Parameters
    ----------
    fs : float
        Sampling rate in Hz.
    n : int
        Length of the output array.
    start_gps : float
        GPS time of output sample 0.
    inj_gps : float
        GPS time at which the waveform's t = 0 (coalescence) is placed.
    m1, m2 : float
        Component masses passed as ``mass1`` / ``mass2``.
    dist_mpc : float
        Passed as ``distance``.
    f_lower : float
        Passed as ``f_lower``.
    incl, phi0 : float
        Passed as ``inclination`` / ``coa_phase``.
    approximant : str
        Waveform approximant name.

    Returns
    -------
    numpy.ndarray
        Float array of length ``n``; zero wherever the waveform does not overlap.
    """
    dt = 1.0 / fs
    hp, _hc = get_td_waveform(approximant=approximant,
                              mass1=m1, mass2=m2, spin1z=0, spin2z=0,
                              f_lower=f_lower, delta_t=dt, distance=dist_mpc,
                              inclination=incl, coa_phase=phi0)
    h = np.asarray(hp.numpy(), dtype=float)  # plus polarization only
    wf_len = len(h)

    # Index inside the waveform that corresponds to t = 0 (coalescence).
    merger_idx = int(round(-float(hp.start_time) * fs))
    # Index inside the output array that corresponds to inj_gps.
    inj_index = int(round((inj_gps - start_gps) * fs))

    x_model = np.zeros(n, dtype=float)
    a = inj_index - merger_idx   # output index of the waveform's first sample
    b = a + wf_len               # one past the waveform's last sample
    aa, bb = max(a, 0), min(b, n)
    if bb > aa:
        ha = aa - a              # samples clipped off the front of the waveform
        x_model[aa:bb] += h[ha:ha + (bb - aa)]
    return x_model

# Signal template on the same sample grid as the data.
template_td = make_td_injection(
    fs, n, t[0], INJ_GPS, M1_Msun, M2_Msun, DIST_Mpc,
    F_LOWER, INJ_INCL, INJ_PHI, APPROXIMANT,
)

# Two data sets for the comparison: the untouched strain and strain + template.
x_clean = x0.copy()
if DO_INJECTION:
    x_inj = x0 + template_td
else:
    x_inj = x0  # no injection: this is the very same array object as x0

t_since_start = t - t[0]
ttl = f"Time series (fs={fs:.0f} Hz)"

fig, ax = plt.subplots(figsize=(11, 3.5))
if DO_INJECTION:
    ax.plot(t_since_start, x_inj, lw=0.6, label="data + injection")
    ax.plot(t_since_start, template_td, lw=0.6, alpha=0.8, label="injected template")
    ttl += f"  — inj {M1_Msun:.0f}+{M2_Msun:.0f} Msun @ {DIST_Mpc:.0f} Mpc"
else:
    ax.plot(t_since_start, x_inj, lw=0.6, label="data")
ax.set_xlabel("Time since segment start [s]")
ax.set_ylabel("Strain")
ax.set_title(ttl)
ax.legend()
fig.tight_layout()
plt.show()


In [None]:

# ---------------- AR likelihood comparison ----------------
# The first p whitened samples have no full history, so exclude them from the likelihood.
drop = AR_ORDER

def ll_summary(x_data, name):
    """Print and return the AR log-likelihoods of ``x_data`` under two models.

    The models are ``h = 0`` (noise only) and ``h = template_td``; both use the
    notebook-level ``phi``, ``sigma2`` and ``drop``. One formatted line with
    both values and their difference is printed.

    Returns
    -------
    tuple
        ``(logL(h=0), logL(h=template))``.
    """
    null_model = np.zeros_like(x_data)
    ll_h0, ll_ht = (
        loglike_ar(x_data, model, phi, sigma2, drop=drop)
        for model in (null_model, template_td)
    )
    print(f"{name:12s}  logL(h=0): {ll_h0: .3f}    logL(h=template): {ll_ht: .3f}    Δ: {ll_ht-ll_h0: .3f}")
    return ll_h0, ll_ht

# Report the untouched strain first, then the (optionally) injected strain.
print("AR-likelihood (white innovations) results:")
if DO_INJECTION:
    inj_label = "injected"
else:
    inj_label = "data"
ll_clean = ll_summary(x_clean, "clean")
ll_inj = ll_summary(x_inj, inj_label)


In [None]:

# ---------------- QQ plot of AR-whitened residuals ----------------
def qq_plot(z, title):
    """Show a normal quantile–quantile plot of the finite entries of ``z``.

    Sorted sample values are plotted against standard-normal quantiles evaluated
    at the plotting positions ``(i + 0.5) / N``. A reference diagonal is drawn
    out to the 99.5th percentile of ``|z|``. ``z`` itself is not modified.
    """
    ordered = np.sort(z[np.isfinite(z)])
    count = len(ordered)
    probs = (np.arange(count) + 0.5)/count
    normal_q = np.sqrt(2) * erfinv(2*probs - 1)

    fig, ax = plt.subplots(figsize=(4.6, 4.6))
    ax.plot(normal_q, ordered, ".", ms=2)

    # y = x reference line spanning the bulk of the sample.
    lim = np.percentile(np.abs(ordered), 99.5)
    ax.plot([-lim, lim], [-lim, lim], lw=1)

    ax.set_xlabel("Normal quantiles")
    ax.set_ylabel("Residual quantiles")
    ax.set_title(title)
    fig.tight_layout()
    plt.show()

# residuals for injected case, under h=0 and h=template
# Whiten the injected data and both candidate models, discarding the first `drop` samples.
wd, wh0, wht = (
    apply_ar_whitener(series, phi)[drop:]
    for series in (x_inj, np.zeros_like(x_inj), template_td)
)

r_h0 = wd - wh0
r_ht = wd - wht

# Standardise by the innovation standard deviation before comparing with N(0, 1).
innovation_std = np.sqrt(sigma2)
for _resid, _qq_title in (
    (r_h0, "QQ: whitened residuals (h=0 model)"),
    (r_ht, "QQ: whitened residuals (template model)"),
):
    qq_plot(_resid/innovation_std, _qq_title)



## Notes & Next Steps

- **Model order (`AR_ORDER`)**: try 40–200. A higher order captures finer spectral detail; beware of overfitting if the off-source data are too short.
- **Training windows**: change `AR_TRAIN_WINDOWS` to avoid any on-source signal; use longer, quieter data to learn noise better.
- **Injection**: modify masses/distance, or set `DO_INJECTION=False` to analyze clean data.
- **Extensions**:
  - Student-t AR innovations (robust to glitches)
  - Time-varying AR (Kalman) for non-stationary drift
  - Multi-detector combination with per-site AR models
  - Wrap this likelihood inside an MCMC/Nested sampler to recover posteriors
