In [62]:
import os
import numpy as np
import h5py
import matplotlib.pyplot as plt

from scipy.signal import butter, filtfilt
from scipy import sparse
from scipy.sparse.linalg import spsolve

try:
    from sklearn.linear_model import Lasso
except Exception:
    Lasso = None
from pybaselines import Baseline
plt.rcParams["figure.figsize"] = (14, 4)
plt.rcParams["axes.grid"] = True


In [63]:
def list_doric_channels(path):
    with h5py.File(path, "r") as f:
        base = f["DataAcquisition"]["FPConsole"]["Signals"]["Series0001"]
        chans = []
        if "LockInAOUT02" in base:
            for k in base["LockInAOUT02"].keys():
                if k.startswith("AIN"):
                    chans.append(k)
        chans = sorted(chans)

        digital = []
        if "DigitalIO" in base:
            for k in base["DigitalIO"].keys():
                if k.startswith("DIO"):
                    digital.append(k)
        return chans, digital
def list_doric_files(folder_path):
    """
    Return a list of full paths to all .doric files in the given folder.
    """
    doric_files = []
    for fname in os.listdir(folder_path):
        if fname.lower().endswith(".doric"):
            doric_files.append(os.path.join(folder_path, fname))
    return doric_files

def load_doric(path, channel="AIN01", signal_folder="LockInAOUT02", ref_folder="LockInAOUT01",
              trigger_name=None):
    """
    Returns dict with:
      time, sig465, ref405, fs, (optional) trig_time, trig
    """
    with h5py.File(path, "r") as f:
        base = f["DataAcquisition"]["FPConsole"]["Signals"]["Series0001"]

        sig = np.asarray(base[signal_folder][channel][()], float)
        ref = np.asarray(base[ref_folder][channel][()], float)

        # time: prefer the matching folder time if size matches
        t_sig = np.asarray(base[signal_folder]["Time"][()], float) if "Time" in base[signal_folder] else np.array([])
        t_ref = np.asarray(base[ref_folder]["Time"][()], float) if "Time" in base[ref_folder] else np.array([])

        if t_sig.size == sig.size:
            t = t_sig
        elif t_ref.size == sig.size:
            t = t_ref
        else:
            # fallback
            dt = np.nanmedian(np.diff(t_sig)) if t_sig.size > 2 else 1/1000
            t = np.arange(sig.size) * dt

        # if ref length differs, interpolate onto t if possible
        if ref.size != sig.size:
            if t_ref.size == ref.size:
                ref = np.interp(t, t_ref, ref)
            else:
                ref = np.resize(ref, sig.size)

        # sampling rate
        fs = 1.0 / float(np.nanmedian(np.diff(t))) if t.size > 2 else np.nan

        # optional digital trigger overlay
        trig_time = None
        trig = None
        if trigger_name:
            if "DigitalIO" in base and trigger_name in base["DigitalIO"]:
                dio = base["DigitalIO"]
                trig = np.asarray(dio[trigger_name][()], float)
                trig_time = np.asarray(dio["Time"][()], float) if "Time" in dio else None

                # if lengths mismatch, interpolate signals to trigger time (like your Doric logic)
                if trig_time is not None and trig_time.size and trig_time.size != t.size:
                    sig = np.interp(trig_time, t, sig)
                    ref = np.interp(trig_time, t, ref)
                    t = trig_time
                    fs = 1.0 / float(np.nanmedian(np.diff(t))) if t.size > 2 else fs

    out = {"time": t, "sig465": sig, "ref405": ref, "fs": fs}
    if trig is not None and trig_time is not None:
        out["trig_time"] = trig_time
        out["trig"] = trig
    return out


In [64]:
import os
from pathlib import Path

import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import pearsonr

# -----------------------------
# Helpers
# -----------------------------
# =========================
# Cell 2 — Helpers + Robust Fit
# =========================
def _as_float_1d(a):
    return np.asarray(a, dtype=float).ravel()

def mad_sigma(r):
    med = np.median(r)
    return 1.4826 * np.median(np.abs(r - med)) + 1e-12

def huber_irls(x, y, delta=1.5, max_iter=50, tol=1e-10):
    """
    Robust linear fit y ~ a*x + b using Huber IRLS.
    Returns (a, b, sigma_robust).
    """
    x = _as_float_1d(x)
    y = _as_float_1d(y)
    X = np.column_stack([x, np.ones_like(x)])
    beta = np.linalg.lstsq(X, y, rcond=None)[0]  # initial OLS

    for _ in range(max_iter):
        r = y - X @ beta
        s = mad_sigma(r)
        u = r / (s * delta)

        w = np.ones_like(u)
        mask = np.abs(u) > 1
        w[mask] = 1.0 / (np.abs(u[mask]) + 1e-12)

        Xw = X * w[:, None]
        yw = y * w
        beta_new = np.linalg.lstsq(Xw, yw, rcond=None)[0]

        if np.max(np.abs(beta_new - beta)) < tol:
            beta = beta_new
            break
        beta = beta_new

    r = y - X @ beta
    s = mad_sigma(r)
    return float(beta[0]), float(beta[1]), float(s)

def rolling_corr(x, y, win):
    """
    Rolling Pearson correlation in non-overlapping windows of size win.
    Returns corr array and window centers (indices).
    """
    n = min(len(x), len(y))
    nwin = n // win
    if nwin < 2:
        return np.array([]), np.array([])

    x2 = x[:nwin * win].reshape(nwin, win)
    y2 = y[:nwin * win].reshape(nwin, win)

    xmu = x2.mean(axis=1, keepdims=True)
    ymu = y2.mean(axis=1, keepdims=True)
    xv = x2 - xmu
    yv = y2 - ymu
    num = np.sum(xv * yv, axis=1)
    den = np.sqrt(np.sum(xv * xv, axis=1) * np.sum(yv * yv, axis=1)) + 1e-12
    c = num / den

    centers = (np.arange(nwin) * win + win / 2.0)
    return c, centers

def clipping_fraction(a, lo_q=0.001, hi_q=0.999):
    """
    Heuristic clipping: fraction of samples extremely close to extreme quantiles.
    Works even if you don't know ADC min/max.
    """
    a = _as_float_1d(a)
    a = a[np.isfinite(a)]
    if a.size < 10:
        return np.nan, np.nan, np.nan

    lo = np.quantile(a, lo_q)
    hi = np.quantile(a, hi_q)
    span = (hi - lo) + 1e-12
    eps = 0.001 * span
    frac_lo = np.mean(a <= (lo + eps))
    frac_hi = np.mean(a >= (hi - eps))
    return float(frac_lo + frac_hi), float(lo), float(hi)

def flatline_fraction(a, tol=1e-12):
    """
    Fraction of successive differences that are ~0 (flatlining / stuck values).
    """
    a = _as_float_1d(a)
    a = a[np.isfinite(a)]
    if a.size < 3:
        return np.nan
    d = np.diff(a)
    return float(np.mean(np.abs(d) < tol))


In [65]:
# =========================
# Cell 3 — Artifact Detection + Removal (Adaptive MAD)
# =========================
def adaptive_mad_artifact_mask(
    y: np.ndarray,
    fs: float,
    *,
    k: float = 6.0,
    window_s: float = 1.0,
    pad_s: float = 0.2,
    use_derivative: bool = True,
    min_mad: float = 1e-12,
) -> np.ndarray:
    """
    Build an artifact mask using Adaptive MAD (windowed).

    Detection is performed on dx=diff(y) if use_derivative=True, else directly on y.
    Within each non-overlapping window, compute median and MAD, then flag samples where:
        |x - median| > k * MAD

    Mask is returned at signal sample resolution (len(y)).
    """
    y = np.asarray(y, float)
    n = y.size
    if n == 0:
        return np.zeros((0,), dtype=bool)

    if not np.isfinite(fs) or fs <= 0:
        raise ValueError(f"adaptive_mad_artifact_mask: invalid fs={fs}")

    x = np.diff(y) if use_derivative else y.copy()
    nx = x.size
    if nx == 0:
        return np.zeros((n,), dtype=bool)

    win = int(round(window_s * fs))
    win = max(5, win)

    flagged_x = np.zeros((nx,), dtype=bool)

    for start in range(0, nx, win):
        stop = min(start + win, nx)
        seg = x[start:stop]

        seg_f = seg[np.isfinite(seg)]
        if seg_f.size < 5:
            continue

        med = np.median(seg_f)
        mad = np.median(np.abs(seg_f - med))
        mad = max(float(mad), float(min_mad))

        flagged_x[start:stop] = np.abs(seg - med) > (k * mad)

    mask = np.zeros((n,), dtype=bool)
    if use_derivative:
        hit = np.where(flagged_x)[0]
        mask[hit] = True
        mask[hit + 1] = True
    else:
        mask[:nx] = flagged_x

    pad_n = int(round(pad_s * fs))
    if pad_n > 0 and np.any(mask):
        kernel = np.ones((2 * pad_n + 1,), dtype=int)
        mask = (np.convolve(mask.astype(int), kernel, mode="same") > 0)

    return mask

def _nan_interp_1d(y: np.ndarray) -> np.ndarray:
    """
    Linearly interpolate NaNs in a 1D array.
    Edge NaNs are filled with nearest valid value.
    """
    y = np.asarray(y, float).copy()
    n = y.size
    if n == 0:
        return y

    isnan = ~np.isfinite(y)
    if not np.any(isnan):
        return y

    x = np.arange(n)
    good = np.isfinite(y)
    if np.sum(good) == 0:
        return y

    y[isnan] = np.interp(x[isnan], x[good], y[good])
    return y

def remove_artifacts_adaptive_mad(
    time: np.ndarray,
    sig465: np.ndarray,
    ref405: np.ndarray,
    fs: float = None,
    *,
    k: float = 6.0,
    window_s: float = 1.0,
    pad_s: float = 0.2,
    union_channels: bool = True,
    use_derivative: bool = True,
) -> dict:
    """
    Detect + remove artifacts using adaptive MAD (windowed) with padding.

    By default, artifacts are detected on BOTH channels and unioned (recommended),
    then removed from BOTH channels consistently.

    Returns a dict with cleaned signals and the artifact mask/regions.
    """
    t = np.asarray(time, float)
    s = np.asarray(sig465, float)
    r = np.asarray(ref405, float)

    n = min(t.size, s.size, r.size)
    t, s, r = t[:n], s[:n], r[:n]

    if fs is None:
        fs = 1.0 / float(np.nanmedian(np.diff(t))) if t.size > 2 else np.nan
    if not np.isfinite(fs) or fs <= 0:
        raise ValueError(f"remove_artifacts_adaptive_mad: invalid fs={fs}")

    m_s = adaptive_mad_artifact_mask(
        s, fs, k=k, window_s=window_s, pad_s=pad_s, use_derivative=use_derivative
    )
    m_r = adaptive_mad_artifact_mask(
        r, fs, k=k, window_s=window_s, pad_s=pad_s, use_derivative=use_derivative
    )
    mask = (m_s | m_r) if union_channels else m_s

    s_clean = s.copy()
    r_clean = r.copy()
    s_clean[mask] = np.nan
    r_clean[mask] = np.nan

    s_clean = _nan_interp_1d(s_clean)
    r_clean = _nan_interp_1d(r_clean)

    regions = []
    if np.any(mask):
        idx = np.where(mask)[0]
        breaks = np.where(np.diff(idx) > 1)[0]
        starts = np.r_[idx[0], idx[breaks + 1]]
        ends   = np.r_[idx[breaks], idx[-1]]
        for a, b in zip(starts, ends):
            regions.append((float(t[a]), float(t[b])))

    return {
        "time": t,
        "sig465_clean": s_clean,
        "ref405_clean": r_clean,
        "artifact_mask": mask,
        "artifact_regions_s": regions,
        "fs": float(fs),
    }



In [66]:
# =========================
# Cell A — Z-score helper (median + std)
# =========================
def zscore_median_std(x, eps=1e-12, ddof=0):
    """
    Z-score using median centering and (classical) std scaling:
        z = (x - median(x)) / std(x)

    Notes:
      - This is not a robust scale estimator (unlike MAD), but matches your requested definition.
      - If you want robust scaling, replace std with mad_sigma(x - median).
    """
    x = np.asarray(x, float)
    m = np.isfinite(x)
    if np.sum(m) < 10:
        return np.full_like(x, np.nan, dtype=float)

    med = np.median(x[m])
    sd = np.std(x[m], ddof=ddof)
    sd = sd if np.isfinite(sd) and sd > 0 else eps
    z = (x - med) / (sd + eps)
    return z


In [67]:
# =========================
# Cell B — dF/F computation (baseline via low-pass; stable, no extra deps)
# =========================
from scipy.signal import butter, filtfilt

def lowpass_baseline(x, fs, cutoff_hz=0.01, order=2):
    """
    Smooth baseline using a low-pass Butterworth filter (zero-phase filtfilt).
    cutoff_hz should be small (e.g., 0.005–0.05 Hz) depending on session length.
    """
    x = np.asarray(x, float)
    m = np.isfinite(x)
    if np.sum(m) < 10 or not np.isfinite(fs) or fs <= 0:
        return np.full_like(x, np.nan)

    # fill NaNs for filtering
    x_f = x.copy()
    x_f[~m] = np.interp(np.flatnonzero(~m), np.flatnonzero(m), x[m])

    nyq = 0.5 * fs
    cutoff = min(max(cutoff_hz / nyq, 1e-6), 0.99)
    b, a = butter(order, cutoff, btype="low")
    base = filtfilt(b, a, x_f)
    return base

def dff_from_baseline(x, fs, cutoff_hz=0.01, eps=1e-12):
    """
    dF/F = (x - baseline) / baseline
    """
    base = lowpass_baseline(x, fs, cutoff_hz=cutoff_hz)
    dff = (x - base) / (base + eps)
    return dff, base


In [68]:
# =========================
# Cell C — QC on z-scored dF/F (z_ref vs z_sig; Z = z_sig - z_ref)
# =========================
def qc_one_file_zscore(
    path,
    rec,
    outdir,
    *,
    # artifact removal params
    do_artifact_removal=True,
    ar_k=6.0,
    ar_window_s=1.0,
    ar_pad_s=0.2,
    ar_union_channels=True,
    ar_use_derivative=True,
    # dF/F baseline params
    baseline_cutoff_hz=0.01,
    # QC params
    corr_win_seconds=10.0,
    max_plot_points=150_000,
):
    """
    QC in z-score space:
      - artifact removal on raw sig/ref
      - compute dff_sig, dff_ref from low-pass baseline
      - compute z_sig, z_ref with zscore_median_std
      - define Z = z_sig - z_ref
      - correlate z_ref vs z_sig
      - plot Z distribution as PDF and annotate metrics
    """
    # --- load raw ---
    t_raw = _as_float_1d(rec["time"])
    sig_raw = _as_float_1d(rec["sig465"])
    ref_raw = _as_float_1d(rec["ref405"])
    fs = float(rec.get("fs", np.nan))

    n0 = min(len(t_raw), len(sig_raw), len(ref_raw))
    t_raw, sig_raw, ref_raw = t_raw[:n0], sig_raw[:n0], ref_raw[:n0]

    finite = np.isfinite(t_raw) & np.isfinite(sig_raw) & np.isfinite(ref_raw)
    t_raw, sig_raw, ref_raw = t_raw[finite], sig_raw[finite], ref_raw[finite]
    n = len(t_raw)

    # --- time sanity ---
    dt = np.diff(t_raw)
    dt_med = np.median(dt) if dt.size else np.nan
    nonmono_frac = float(np.mean(dt <= 0)) if dt.size else np.nan

    # --- artifact removal ---
    if do_artifact_removal:
        ar = remove_artifacts_adaptive_mad(
            t_raw,
            sig_raw,
            ref_raw,
            fs=fs if np.isfinite(fs) and fs > 0 else None,
            k=ar_k,
            window_s=ar_window_s,
            pad_s=ar_pad_s,
            union_channels=ar_union_channels,
            use_derivative=ar_use_derivative,
        )
        t = ar["time"]
        sig = ar["sig465_clean"]
        ref = ar["ref405_clean"]
        art_mask = ar["artifact_mask"]
        art_regions = ar["artifact_regions_s"]
        fs_eff = ar["fs"]
    else:
        t, sig, ref = t_raw, sig_raw, ref_raw
        art_mask = np.zeros_like(t, dtype=bool)
        art_regions = []
        fs_eff = fs if np.isfinite(fs) else (1.0 / np.nanmedian(np.diff(t)) if t.size > 2 else np.nan)

    art_frac = float(np.mean(art_mask)) if art_mask.size else 0.0

    # --- dF/F for each channel ---
    dff_sig, base_sig = dff_from_baseline(sig, fs_eff, cutoff_hz=baseline_cutoff_hz)
    dff_ref, base_ref = dff_from_baseline(ref, fs_eff, cutoff_hz=baseline_cutoff_hz)

    # --- z-score each dF/F ---
    z_sig = zscore_median_std(dff_sig)
    z_ref = zscore_median_std(dff_ref)

    # --- your Z trace definition ---
    Z = z_sig - z_ref

    # --- correlation + robust regression in z-space ---
    m = np.isfinite(z_sig) & np.isfinite(z_ref)
    if np.sum(m) >= 10:
        r_global, p_global = pearsonr(z_ref[m], z_sig[m])
        a, b, s_rob = huber_irls(z_ref[m], z_sig[m], delta=1.5)
    else:
        r_global, p_global = np.nan, np.nan
        a, b, s_rob = np.nan, np.nan, np.nan

    # --- rolling correlation (z_ref vs z_sig) ---
    if np.isfinite(fs_eff) and fs_eff > 0:
        win = int(max(10, round(fs_eff * corr_win_seconds)))
    else:
        win = 5000

    # Use full arrays (rolling_corr expects aligned arrays)
    r_roll, centers = rolling_corr(z_ref, z_sig, win=win)
    r_roll_med = float(np.nanmedian(r_roll)) if r_roll.size else np.nan
    r_roll_min = float(np.nanmin(r_roll)) if r_roll.size else np.nan
    pct_roll_gt_05 = float(np.mean(r_roll > 0.5) * 100.0) if r_roll.size else np.nan

    # --- Z distribution metrics ---
    Zf = Z[np.isfinite(Z)]
    if Zf.size:
        q25, q50, q75 = np.quantile(Zf, [0.25, 0.50, 0.75])
        iqr = q75 - q25
        halfwidth = 0.5 * iqr
        frac_gt3 = float(np.mean(np.abs(Zf) > 3.0) * 100.0)
        frac_gt5 = float(np.mean(np.abs(Zf) > 5.0) * 100.0)
    else:
        q25 = q50 = q75 = iqr = halfwidth = np.nan
        frac_gt3 = frac_gt5 = np.nan

    # --- Z AUC metrics (per second is most comparable) ---
    if t.size >= 2 and np.isfinite(Z).any():
        auc_signed = float(np.trapz(np.nan_to_num(Z, nan=0.0), t))
        auc_abs = float(np.trapz(np.abs(np.nan_to_num(Z, nan=0.0)), t))
        duration = float(t[-1] - t[0])
        auc_signed_per_s = auc_signed / duration if duration > 0 else np.nan
        auc_abs_per_s = auc_abs / duration if duration > 0 else np.nan
    else:
        auc_signed = auc_abs = auc_signed_per_s = auc_abs_per_s = np.nan

    # --- additional useful QC metric: Z rolling variability ---
    # Use same windowing as rolling corr, compute windowed std of Z
    zstd_roll = np.array([])
    if win and Z.size >= win * 2:
        nwin = (len(Z) // win)
        Z2 = Z[:nwin * win].reshape(nwin, win)
        zstd_roll = np.nanstd(Z2, axis=1)

    zstd_med = float(np.nanmedian(zstd_roll)) if zstd_roll.size else np.nan

    # -----------------------------
    # Plotting
    # -----------------------------
    fname = Path(path).stem
    outdir = Path(outdir)
    outdir = Path(outdir)
    outdir.mkdir(parents=True, exist_ok=True)

    if n > max_plot_points:
        idx = np.linspace(0, n - 1, max_plot_points).astype(int)
    else:
        idx = np.arange(n)

    # scatter downsample
    idx_sc = idx
    z_ref_sc = z_ref[idx_sc]
    z_sig_sc = z_sig[idx_sc]
    m_sc = np.isfinite(z_ref_sc) & np.isfinite(z_sig_sc)
    z_ref_sc = z_ref_sc[m_sc]
    z_sig_sc = z_sig_sc[m_sc]

    fig = plt.figure(figsize=(15, 10), dpi=150)
    gs = fig.add_gridspec(4, 3, height_ratios=[1.0, 1.0, 1.0, 1.0])

    ax0 = fig.add_subplot(gs[0, :])     # z_sig, z_ref, Z
    ax1 = fig.add_subplot(gs[1, 0:2])   # scatter z_ref vs z_sig
    ax2 = fig.add_subplot(gs[1, 2])     # Z PDF
    ax3 = fig.add_subplot(gs[2, :])     # Z over time (with AUC)
    ax4 = fig.add_subplot(gs[3, 0])     # rolling corr
    ax5 = fig.add_subplot(gs[3, 1])     # rolling std of Z
    ax6 = fig.add_subplot(gs[3, 2])     # dff overview (optional sanity)

    # A) z traces
    ax0.plot(t[idx], z_sig[idx], lw=0.7, color="C0", label="z_sig = z(dff_sig)")
    ax0.plot(t[idx], z_ref[idx], lw=0.7, color="C1", label="z_ref = z(dff_ref)")
    # ax0.plot(t[idx], Z[idx],     lw=0.9, color="k",  alpha=0.7, label="Z = z_sig - z_ref")

    if do_artifact_removal and art_regions:
        for (t0, t1) in art_regions[:20]:
            ax0.axvspan(t0, t1, color="crimson", alpha=0.05)

    ax0.set_title(f"{fname} | z-space QC | art_removed={art_frac*100:.2f}% | baseline_cutoff={baseline_cutoff_hz} Hz")
    ax0.set_xlabel("time (s)")
    ax0.set_ylabel("z units")
    ax0.grid(True, alpha=0.25)
    ax0.legend(loc="upper right", ncols=3, fontsize=8)

    # B) scatter z_ref vs z_sig + robust fit
    ax1.scatter(z_ref_sc, z_sig_sc, s=6, alpha=0.25, edgecolor="none")
    if np.isfinite(a) and np.isfinite(b) and z_ref_sc.size:
        xline = np.linspace(np.nanmin(z_ref_sc), np.nanmax(z_ref_sc), 200)
        ax1.plot(xline, a * xline + b, lw=2.0, color="C3", label="robust fit")
    ax1.set_xlabel("z_ref")
    ax1.set_ylabel("z_sig")
    ax1.set_title("Correlation in z-space")
    ax1.grid(True, alpha=0.25)

    stats_txt = (
        f"Pearson r={r_global:.4f}\n"
        f"rolling r med={r_roll_med:.3f}, min={r_roll_min:.3f}\n"
        f"% windows r>0.5={pct_roll_gt_05:.1f}%\n"
        f"fit: z_sig={a:.3g}*z_ref + {b:.3g}\n"
        f"dt_med={dt_med:.6g}  nonmono={nonmono_frac*100:.3f}%\n"
        f"Z roll-std median={zstd_med:.3g}"
    )
    ax1.text(
        0.02, 0.98, stats_txt,
        transform=ax1.transAxes, va="top", ha="left",
        bbox=dict(boxstyle="round,pad=0.35", facecolor="white", alpha=0.9, edgecolor="0.7"),
        fontsize=8
    )

    # C) Z distribution as PDF (hist density + optional KDE)
    ax2.set_title("Z distribution (PDF)")
    ax2.set_xlabel("Z = z_sig - z_ref")
    ax2.set_ylabel("density")
    ax2.grid(True, alpha=0.25)

    pdf_area_hist = np.nan
    pdf_area_kde = np.nan
    pdf_peak_kde = np.nan
    pdf_entropy_kde = np.nan

    if Zf.size:
        # histogram PDF + area
        hist_density, edges = np.histogram(Zf, bins=80, density=True)
        bw = np.diff(edges)
        centers_h = edges[:-1] + bw / 2
        pdf_area_hist = float(np.sum(hist_density * bw))  # should be ~1.0
        ax2.bar(centers_h, hist_density, width=bw, color="0.6", alpha=0.55, edgecolor="none", label="hist PDF")

        # KDE over central range (0.1%–99.9%) and area over that truncated range
        try:
            from scipy.stats import gaussian_kde
            kde = gaussian_kde(Zf)
            x_lo = float(np.quantile(Zf, 0.001))
            x_hi = float(np.quantile(Zf, 0.999))
            xs = np.linspace(x_lo, x_hi, 500)
            ys = kde(xs)

            pdf_area_kde = float(np.trapz(ys, xs))          # < 1 because truncated
            pdf_peak_kde = float(np.max(ys))

            # entropy over the plotted range (normalized)
            ys_norm = ys / (pdf_area_kde + 1e-12)
            pdf_entropy_kde = float(-np.trapz(ys_norm * np.log(ys_norm + 1e-12), xs))

            ax2.plot(xs, ys, color="C0", lw=2, label="KDE")
        except Exception:
            pass

        # quantiles overlay
        ax2.axvline(q50, color="C0", lw=2, label="median")
        ax2.axvspan(q25, q75, color="C0", alpha=0.12, label="25–75%")

        pdf_txt = (
            f"Hist area≈{pdf_area_hist:.5f} (≈1.0 expected)\n"
            f"KDE area (0.1–99.9%)≈{pdf_area_kde:.5f}\n"
            f"peak KDE={pdf_peak_kde:.4g}\n"
            f"entropy KDE={pdf_entropy_kde:.4g}\n"
            f"median={q50:.3g}\n"
            f"IQR={iqr:.3g}  halfwidth={halfwidth:.3g}\n"
            f"|Z|>3: {frac_gt3:.2f}%  |Z|>5: {frac_gt5:.2f}%"
        )
        ax2.text(
            0.02, 0.98, pdf_txt,
            transform=ax2.transAxes, va="top", ha="left",
            bbox=dict(boxstyle="round,pad=0.30", facecolor="white", alpha=0.9, edgecolor="0.7"),
            fontsize=8
        )
        ax2.legend(loc="best", fontsize=8)

    # D) Z over time + AUC annotation
    ax3.plot(t[idx], Z[idx], lw=0.7, color="k", alpha=0.9)
    ax3.axhline(0, color="k", lw=1, alpha=0.4)
    ax3.set_title("Z over time")
    ax3.set_xlabel("time (s)")
    ax3.set_ylabel("Z (z units)")
    ax3.grid(True, alpha=0.25)

    auc_txt = (
        f"AUC signed={auc_signed:.4g}\n"
        f"AUC abs={auc_abs:.4g}\n"
        f"signed/s={auc_signed_per_s:.4g}\n"
        f"abs/s={auc_abs_per_s:.4g}"
    )
    ax3.text(
        0.02, 0.98, auc_txt,
        transform=ax3.transAxes, va="top", ha="left",
        bbox=dict(boxstyle="round,pad=0.30", facecolor="white", alpha=0.9, edgecolor="0.7"),
        fontsize=8
    )

    # E) rolling corr
    ax4.set_title(f"Rolling corr(z_ref, z_sig) ({corr_win_seconds:.0f}s windows)")
    ax4.set_xlabel("time (s)")
    ax4.set_ylabel("r")
    ax4.grid(True, alpha=0.25)
    if r_roll.size:
        t_cent = t[(centers.astype(int)).clip(0, n - 1)]
        ax4.plot(t_cent, r_roll, lw=1.0)
        ax4.axhline(0.5, color="0.3", lw=1, ls="--", alpha=0.7)
        ax4.set_ylim(-1.05, 1.05)
        ax4.text(
            0.02, 0.98,
            f"% windows r>0.5: {pct_roll_gt_05:.1f}%",
            transform=ax4.transAxes, va="top", ha="left",
            bbox=dict(boxstyle="round,pad=0.25", facecolor="white", alpha=0.9, edgecolor="0.7"),
            fontsize=8,
        )

    # F) rolling std(Z)
    ax5.set_title("Rolling std(Z)")
    ax5.set_xlabel("window #")
    ax5.set_ylabel("std")
    ax5.grid(True, alpha=0.25)
    if zstd_roll.size:
        ax5.plot(zstd_roll, lw=1.0)
        ax5.axhline(zstd_med, color="C3", lw=1.2, ls="--", alpha=0.8, label=f"median={zstd_med:.3g}")
        ax5.legend(loc="best", fontsize=8)

    # G) dF/F sanity (downsample)
    ax6.set_title("dF/F sanity (sig vs ref)")
    ax6.plot(t[idx], dff_sig[idx], lw=0.7, color="C0", alpha=0.8, label="dff_sig")
    ax6.plot(t[idx], dff_ref[idx], lw=0.7, color="C1", alpha=0.8, label="dff_ref")
    ax6.axhline(0, color="k", lw=1, alpha=0.4)
    ax6.set_xlabel("time (s)")
    ax6.set_ylabel("dF/F")
    ax6.grid(True, alpha=0.25)
    ax6.legend(loc="best", fontsize=8)

    fig.tight_layout()
    outpath = outdir / f"{fname}_QC_zscore.png"
    fig.savefig(outpath, bbox_inches="tight")
    plt.close(fig)

    return dict(
        file=Path(path).name,
        n=n,
        fs=fs_eff,
        dt_median=dt_med,
        nonmono_frac=nonmono_frac,
        artifact_frac=art_frac,
        n_artifact_regions=len(art_regions),

        # z-space correlation metrics
        pearson_r_z=r_global,
        pearson_p_z=p_global,
        rolling_r_median_z=r_roll_med,
        rolling_r_min_z=r_roll_min,
        rolling_pct_gt_05_z=pct_roll_gt_05,
        fit_slope_z=a,
        fit_intercept_z=b,

        # Z distribution metrics
        Z_median=q50,
        Z_q25=q25,
        Z_q75=q75,
        Z_iqr=iqr,
        Z_halfwidth=halfwidth,
        Z_tail_frac_gt3=frac_gt3,
        Z_tail_frac_gt5=frac_gt5,

        # Z AUC
        Z_auc_signed=auc_signed,
        Z_auc_abs=auc_abs,
        Z_auc_signed_per_s=auc_signed_per_s,
        Z_auc_abs_per_s=auc_abs_per_s,

        # PDF metrics
        Z_pdf_area_hist=pdf_area_hist,                  # ~1.0 (sanity check)
        Z_pdf_area_kde_001_999=pdf_area_kde,            # area over truncated range
        Z_pdf_peak_kde=pdf_peak_kde,
        Z_pdf_entropy_kde=pdf_entropy_kde,

        # rolling Z variability
        Z_roll_std_median=zstd_med,

        qc_figure=str(outpath),
    )


In [69]:
# =========================
# Cell D — Folder runner (z-score QC)
# =========================
def qc_folder_zscore(
    folder,
    doric_paths,
    load_doric,
    *,
    out_subdir="QC_reports_zscore",
    do_artifact_removal=True,
    ar_k=6.0,
    ar_window_s=1.0,
    ar_pad_s=0.2,
    ar_union_channels=True,
    ar_use_derivative=True,
    baseline_cutoff_hz=0.01,
    corr_win_seconds=10.0,
    max_plot_points=150_000,
):
    folder = Path(folder)
    outdir = folder / out_subdir
    outdir.mkdir(parents=True, exist_ok=True)

    all_metrics = []
    for path in doric_paths:
        rec = load_doric(path)
        m = qc_one_file_zscore(
            path, rec, outdir,
            do_artifact_removal=do_artifact_removal,
            ar_k=ar_k, ar_window_s=ar_window_s, ar_pad_s=ar_pad_s,
            ar_union_channels=ar_union_channels, ar_use_derivative=ar_use_derivative,
            baseline_cutoff_hz=baseline_cutoff_hz,
            corr_win_seconds=corr_win_seconds,
            max_plot_points=max_plot_points,
        )
        all_metrics.append(m)
        print(
            f"{m['file']:<22s} "
            f"r_z={m['pearson_r_z']:.3f} "
            f"roll_med={m['rolling_r_median_z']:.3f} "
            f"%r>0.5={m['rolling_pct_gt_05_z']:.1f}% "
            f"|Z|>3={m['Z_tail_frac_gt3']:.2f}% "
            f"art%={100*m['artifact_frac']:.2f} "
            f"fig={Path(m['qc_figure']).name}"
        )

    return all_metrics


In [70]:
# =========================
# Cell E — Run (z-score QC)
# =========================
folder = r"C:\Analysis\fiber_photometry_app\test_data\batch1_test"
doric_paths = list_doric_files(folder)

metrics_z = qc_folder_zscore(
    folder,
    doric_paths,
    load_doric,
    out_subdir="QC_reports_zscore",
    do_artifact_removal=True,
    ar_k=6.0,
    ar_window_s=1.0,
    ar_pad_s=0.2,
    baseline_cutoff_hz=0.01,   # tune based on session length
    corr_win_seconds=10.0,
)


  auc_signed = float(np.trapz(np.nan_to_num(Z, nan=0.0), t))
  auc_abs = float(np.trapz(np.abs(np.nan_to_num(Z, nan=0.0)), t))
  pdf_area_kde = float(np.trapz(ys, xs))          # < 1 because truncated
  pdf_entropy_kde = float(-np.trapz(ys_norm * np.log(ys_norm + 1e-12), xs))


30546-test_0005.doric  r_z=0.803 roll_med=0.748 %r>0.5=66.7% |Z|>3=0.00% art%=0.00 fig=30546-test_0005_QC_zscore.png
30547-test_0006.doric  r_z=0.477 roll_med=-0.222 %r>0.5=6.9% |Z|>3=0.64% art%=0.00 fig=30547-test_0006_QC_zscore.png
30550-test_0001.doric  r_z=0.918 roll_med=0.813 %r>0.5=88.9% |Z|>3=0.00% art%=0.00 fig=30550-test_0001_QC_zscore.png
30551-test_0001.doric  r_z=0.805 roll_med=0.807 %r>0.5=100.0% |Z|>3=0.00% art%=0.00 fig=30551-test_0001_QC_zscore.png
