In [1]:
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from math import sqrt
from pathlib import Path



In [8]:
# ---- CONFIG ----
# Input workbook and the sheet inside it that the load cell reads with pd.read_excel.
collapsed_excel = r"Y:\public\projects\AnAl_20240405_Neuromod_PE\PE_mapping\processed_data\den_collapsed_matrix.xlsx"
sheet_name      = "mean_cells_per_mm3"
# Folder that receives the PNG plots, and the workbook that receives the effect-size tables.
out_dir         = r"Y:\public\projects\AnAl_20240405_Neuromod_PE\PE_mapping\processed_data\ES_plots_normalized"
out_excel       = str(Path(out_dir) / "effect_sizes_PE_vs_CT_by_hemi.xlsx")
# Only the value "brain_mean" triggers normalization; the value is also embedded
# in the names of the sheets written to out_excel.
normalization   = "brain_mean"       # "none" or "brain_mean"

In [9]:
# With mapping_csv left as None, genotype/condition are parsed from the column names.
# If names don't encode genotype/condition cleanly, provide a mapping CSV:
mapping_csv = None  # CSV with columns: mouse_base, genotype (WT/Shank3), condition (PE/CT)

# Create the output folder (including missing parents); no error if it already exists.
Path(out_dir).mkdir(parents=True, exist_ok=True)

In [10]:
# Helper functions
def extract_base_and_hemi(col):
    """Split a column name of the form ``<base>_<L|R>`` into its two parts.

    Parameters
    ----------
    col : str
        Column name to inspect.

    Returns
    -------
    tuple
        ``(base, hemi)`` where ``hemi`` is ``"L"`` or ``"R"`` (upper case only),
        or ``(None, None)`` when the name does not end in ``_L`` / ``_R`` or has
        nothing in front of that suffix.
    """
    found = re.match(r"^(?P<base>.+)_(?P<hemi>[LR])$", col)
    return (found.group("base"), found.group("hemi")) if found else (None, None)

def parse_labels(base_name):
    """Infer genotype and condition labels from a mouse base name.

    Matching is case-insensitive. Genotype is a plain substring test ("wt" is
    checked before "shank3"). Condition is looked up first as a whole word
    (regex word boundaries), then as a token after splitting the name on
    underscores, hyphens and whitespace; in both passes PE is checked before
    the control spellings (ct / ctrl / control).

    Returns
    -------
    tuple
        ``(genotype, condition)`` with genotype in {"WT", "Shank3", None} and
        condition in {"PE", "CT", None}.
    """
    lowered = base_name.lower()

    if "wt" in lowered:
        genotype = "WT"
    elif "shank3" in lowered:
        genotype = "Shank3"
    else:
        genotype = None

    # Pass 1: whole-word regex match (underscores count as word characters,
    # so names like "wt_pe_1" fall through to pass 2).
    if re.search(r"\bpe\b", lowered):
        return genotype, "PE"
    if re.search(r"\b(ct|ctrl|control)\b", lowered):
        return genotype, "CT"

    # Pass 2: token match after splitting on separators.
    tokens = re.split(r"[_\-\s]+", lowered)
    if "pe" in tokens:
        return genotype, "PE"
    for tok in tokens:
        if tok in {"ct","ctrl","control"}:
            return genotype, "CT"
    return genotype, None

def cohens_d(x, y):
    """Cohen's d (pooled SD) for two independent samples, with a normal-approximation 95% CI.

    Non-finite entries (NaN / inf) are dropped from both samples first.

    Parameters
    ----------
    x, y : array-like of numbers
        The two samples; d is computed as (mean(x) - mean(y)) / pooled SD.

    Returns
    -------
    tuple
        ``(d, ci_lo, ci_hi, n1, n2, m1, m2, s1, s2)`` where n1/n2 are the counts
        of finite values, m1/m2 the means and s1/s2 the sample SDs (ddof=1).
        If either sample has fewer than two finite values, everything except
        n1 and n2 is NaN. If the pooled variance is not positive, d and its CI
        are NaN while the means and SDs are still returned.
    """
    nan = np.nan
    first = np.asarray(x, float)
    second = np.asarray(y, float)
    first = first[np.isfinite(first)]
    second = second[np.isfinite(second)]
    n1 = len(first)
    n2 = len(second)
    if n1 < 2 or n2 < 2:
        return nan, nan, nan, n1, n2, nan, nan, nan, nan

    m1, m2 = first.mean(), second.mean()
    s1, s2 = first.std(ddof=1), second.std(ddof=1)

    # Pooled variance, weighting each sample variance by its degrees of freedom.
    sp2 = (((n1-1)*s1**2) + ((n2-1)*s2**2)) / (n1+n2-2)

    d = nan
    if sp2 > 0:
        sp = sqrt(sp2)
        if sp > 0:
            d = (m1 - m2) / sp

    # Large-sample variance of d; the CI is d +/- 1.96 standard errors.
    var_d = nan
    if np.isfinite(d):
        var_d = (n1+n2)/(n1*n2) + (d**2)/(2*(n1+n2-2))

    se_d = nan
    if np.isfinite(var_d) and var_d >= 0:
        se_d = sqrt(var_d)

    ci_lo = ci_hi = nan
    if np.isfinite(se_d):
        ci_lo = d - 1.96*se_d
        ci_hi = d + 1.96*se_d

    return d, ci_lo, ci_hi, n1, n2, m1, m2, s1, s2

def hedges_g_from_d(d, n1, n2, ci_lo_d, ci_hi_d):
    """Convert Cohen's d (and its CI) to Hedges' g using the small-sample correction.

    The correction factor is ``J = 1 - 3 / (4*(n1 + n2) - 9)``. The CI bounds
    are simply multiplied by the same factor, which is an approximation.

    Returns
    -------
    tuple
        ``(g, g_lo, g_hi)``; all NaN when ``d`` is not finite or either sample
        size is None.
    """
    unusable = (not np.isfinite(d)) or n1 is None or n2 is None
    if unusable:
        return np.nan, np.nan, np.nan
    total_n = n1 + n2
    J = 1.0 - 3.0 / (4.0*total_n - 9.0)
    return J * d, J*ci_lo_d, J*ci_hi_d

def build_registry(df, mapping_csv=None):
    """Build a registry describing which data columns belong to which mouse/hemisphere/group.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose numeric, non-metadata columns are named ``<base>_<L|R>``.
    mapping_csv : str or path-like, optional
        CSV with columns ``mouse_base``, ``genotype`` and ``condition``. When
        given, labels come from this file (left-joined on the base name)
        instead of being parsed from the column names.

    Returns
    -------
    tuple
        ``(reg, meta_cols, value_cols)``. ``reg`` has one row per usable data
        column with columns ``col``, ``base``, ``hemi``, ``genotype``,
        ``condition`` and keeps only rows whose genotype is WT/Shank3 and whose
        condition is PE/CT. ``meta_cols`` are the known metadata columns present
        in ``df``; ``value_cols`` are all numeric non-metadata columns.

    Raises
    ------
    pandas.errors.MergeError
        If ``mapping_csv`` lists the same ``mouse_base`` more than once (this
        would otherwise silently duplicate that mouse's columns).
    """
    meta_candidates = ["region_id","acronym","name","structure_id_path","depth","structure_name"]
    meta_cols = [c for c in meta_candidates if c in df.columns]
    value_cols = [c for c in df.columns if c not in meta_cols and pd.api.types.is_numeric_dtype(df[c])]

    rows = []
    for c in value_cols:
        # Non-string headers (e.g. numeric Excel headers) cannot carry a _L/_R suffix.
        if not isinstance(c, str):
            continue
        base, hemi = extract_base_and_hemi(c)
        if base is None:
            continue
        rows.append({"col": c, "base": base, "hemi": hemi})

    # Explicit columns keep the frame well-formed even when no column matched,
    # so the lookups below do not fail with a KeyError on an empty registry.
    reg = pd.DataFrame(rows, columns=["col", "base", "hemi"])

    if mapping_csv is None:
        labels = [parse_labels(b) for b in reg["base"]]
        reg["genotype"] = [genotype for genotype, _ in labels]
        reg["condition"] = [condition for _, condition in labels]
    else:
        mp = pd.read_csv(mapping_csv)
        mp["mouse_base"] = mp["mouse_base"].astype(str)
        # many_to_one: several registry rows (L and R) may share a base, but each
        # base must appear only once in the mapping.
        reg = reg.merge(mp.rename(columns={"mouse_base":"base"}), on="base", how="left",
                        validate="many_to_one")

    reg = reg[reg["genotype"].isin(["WT","Shank3"]) & reg["condition"].isin(["PE","CT"])]
    return reg, meta_cols, value_cols

def normalize_brain_mean(df, reg):
    """Scale each mouse's L/R columns by that mouse's brain-wide mean.

    For every base name in ``reg`` the first L column and first R column (those
    present in ``df``) are averaged per row, and the mean of those row averages
    is the mouse's scale factor. Both columns are divided by it. Mice whose
    factor is zero or non-finite are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Data table; it is not modified.
    reg : pandas.DataFrame
        Registry with at least the columns ``base``, ``hemi`` and ``col``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the selected columns rescaled.
    """
    scaled = df.copy()
    for mouse in sorted(reg["base"].unique()):
        mouse_rows = reg[reg["base"] == mouse]

        # First registered column per hemisphere, L before R.
        chosen = []
        for side in ("L", "R"):
            side_cols = mouse_rows.loc[mouse_rows["hemi"] == side, "col"]
            if len(side_cols) > 0:
                chosen.append(side_cols.iloc[0])
        chosen = [name for name in chosen if name in scaled.columns]
        if not chosen:
            continue

        # Average the hemispheres per row, then average those values over rows.
        per_row = pd.concat([scaled[name] for name in chosen], axis=1).mean(axis=1)
        factor = per_row.mean(skipna=True)
        if not np.isfinite(factor) or factor == 0:
            continue

        for name in chosen:
            scaled[name] = scaled[name] / factor
    return scaled



In [11]:
# ---- Load data and (optionally) normalize ----
# Fail fast on an unknown mode: a typo would otherwise skip normalization
# silently while the output sheets are still labelled with that value.
if normalization not in ("none", "brain_mean"):
    raise ValueError(
        f"Unknown normalization {normalization!r}; expected 'none' or 'brain_mean'."
    )

df = pd.read_excel(collapsed_excel, sheet_name=sheet_name)
reg, meta_cols, value_cols = build_registry(df, mapping_csv)

if normalization == "brain_mean":
    # Rescale each mouse's columns by its brain-wide mean (see normalize_brain_mean).
    df = normalize_brain_mean(df, reg)


In [12]:
# ---- Quick plots (per genotype) ----
def make_scatter(df, cols_L, cols_R, title, outpath):
    """Save a left-vs-right scatter plot of per-row group means.

    Each point is one row of ``df``: x is the mean over ``cols_L``, y the mean
    over ``cols_R`` (NaNs skipped). A line from (min, min) to (max, max) of the
    plotted values is drawn for reference.

    Parameters
    ----------
    df : pandas.DataFrame
        Data table containing the listed columns.
    cols_L, cols_R : list of str
        Left- and right-hemisphere column names.
    title : str
        Figure title.
    outpath : str
        Destination image path (written at 150 dpi).
    """
    fig, ax = plt.subplots()
    left_mean = df[cols_L].mean(axis=1, skipna=True)
    right_mean = df[cols_R].mean(axis=1, skipna=True)
    ax.scatter(left_mean, right_mean, alpha=0.7)

    # Identity line spanning the full range of both axes.
    lo = np.nanmin([left_mean.min(), right_mean.min()])
    hi = np.nanmax([left_mean.max(), right_mean.max()])
    ax.plot([lo, hi], [lo, hi])

    ax.set_xlabel("Left (mean across mice)")
    ax.set_ylabel("Right (mean across mice)")
    ax.set_title(title)
    fig.tight_layout()
    fig.savefig(outpath, dpi=150)
    plt.close(fig)

def make_delta_hist(df, cols_L, cols_R, title, outpath):
    """Save a histogram of right-minus-left differences pooled over column pairs.

    ``cols_L`` and ``cols_R`` are paired positionally; for each pair whose two
    columns both exist in ``df`` the row-wise difference (R - L) is taken,
    non-finite values are dropped, and all differences are pooled into one
    40-bin histogram. If no pair is usable, the figure is saved without bars.

    Parameters
    ----------
    df : pandas.DataFrame
        Data table.
    cols_L, cols_R : list of str
        Position-matched left/right column names.
    title : str
        Figure title.
    outpath : str
        Destination image path (written at 150 dpi).
    """
    fig, ax = plt.subplots()

    pooled = []
    for left_name, right_name in zip(cols_L, cols_R):
        if left_name not in df.columns or right_name not in df.columns:
            continue
        diff = (df[right_name] - df[left_name]).to_numpy(dtype=float)
        pooled.append(diff[np.isfinite(diff)])

    if pooled:
        ax.hist(np.concatenate(pooled), bins=40)

    ax.set_xlabel("Delta (R - L)")
    ax.set_ylabel("Count")
    ax.set_title(title)
    fig.tight_layout()
    fig.savefig(outpath, dpi=150)
    plt.close(fig)

# One scatter plot and one delta histogram per genotype x condition group.
for genotype in ["WT","Shank3"]:
    for condition in ["PE","CT"]:
        sub = reg[(reg["genotype"]==genotype) & (reg["condition"]==condition)]
        # Collect L/R columns by base. cols_L / cols_R hold every mouse that has
        # that hemisphere (the scatter plot only uses per-hemisphere means).
        # paired_L / paired_R hold only mice with BOTH hemispheres and stay
        # index-aligned, so the histogram never subtracts columns belonging to
        # two different mice when one mouse is missing a hemisphere.
        bases = sorted(sub["base"].unique())
        cols_L = []; cols_R = []
        paired_L = []; paired_R = []
        for b in bases:
            cL = sub[(sub["base"]==b) & (sub["hemi"]=="L")]["col"]
            cR = sub[(sub["base"]==b) & (sub["hemi"]=="R")]["col"]
            has_L = len(cL)==1
            has_R = len(cR)==1
            if has_L: cols_L.append(cL.iloc[0])
            if has_R: cols_R.append(cR.iloc[0])
            if has_L and has_R:
                paired_L.append(cL.iloc[0])
                paired_R.append(cR.iloc[0])
        if cols_L and cols_R:
            make_scatter(df, cols_L, cols_R,
                         f"{genotype} {condition}: Left vs Right (region means over mice)",
                         str(Path(out_dir)/f"scatter_{genotype}_{condition}.png"))
            make_delta_hist(df, paired_L, paired_R,
                            f"{genotype} {condition}: Δ (R−L) distribution",
                            str(Path(out_dir)/f"delta_hist_{genotype}_{condition}.png"))


In [13]:


# ---- Effect sizes (Hedges’ g and Cohen’s d) per region ----
def effect_table_for(genotype, hemi):
    """Per-row PE-vs-CT effect sizes for one genotype and one hemisphere.

    Reads the notebook-level ``reg``, ``df`` and ``meta_cols``. For every row
    of ``df`` the values in the PE columns are compared with the values in the
    CT columns via ``cohens_d`` (PE is the first sample), and the result is
    converted with ``hedges_g_from_d``.

    Parameters
    ----------
    genotype : str
        Genotype label to select in ``reg``.
    hemi : str
        Hemisphere label to select in ``reg``.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``df``: the metadata columns followed by g, d, their
        CI bounds, the group sizes, means and SDs.
    """
    group = reg[(reg["genotype"] == genotype) & (reg["hemi"] == hemi)]
    pe_cols = group.loc[group["condition"] == "PE", "col"].tolist()
    ct_cols = group.loc[group["condition"] == "CT", "col"].tolist()
    present_meta = [k for k in meta_cols if k in df.columns]

    def _row_values(row_label, cols):
        # An empty column list yields an empty sample rather than an empty lookup.
        if not cols:
            return np.array([])
        return df.loc[row_label, cols].to_numpy(dtype=float)

    records = []
    for i in range(len(df)):
        record = {k: df.at[i, k] for k in present_meta}
        d, dlo, dhi, n1, n2, m1, m2, s1, s2 = cohens_d(
            _row_values(i, pe_cols), _row_values(i, ct_cols)
        )
        g, glo, ghi = hedges_g_from_d(d, n1, n2, dlo, dhi)
        record.update({
            "g": g, "g_lo": glo, "g_hi": ghi,
            "d": d, "d_lo": dlo, "d_hi": dhi,
            "n_PE": n1, "n_CT": n2,
            "mean_PE": m1, "mean_CT": m2,
            "sd_PE": s1, "sd_CT": s2,
        })
        records.append(record)
    return pd.DataFrame(records)

def effect_table_delta_for(genotype):
    """Per-row PE-vs-CT effect sizes on the hemispheric difference (R - L) for one genotype.

    Reads the notebook-level ``reg``, ``df`` and ``meta_cols``. Only mice that
    have both an L and an R column contribute; for each such mouse the row-wise
    difference R - L is taken, and for every row of ``df`` the PE differences
    are compared with the CT differences via ``cohens_d`` / ``hedges_g_from_d``.

    A genotype with no columns at all, or with only one hemisphere, yields a
    table of NaN effect sizes with zero group sizes instead of raising.

    Parameters
    ----------
    genotype : str
        Genotype label to select in ``reg``.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``df``: the metadata columns followed by g, d, their
        CI bounds, the group sizes, and the mean/SD of the deltas per condition.
    """
    sub = reg[reg["genotype"] == genotype]

    # Pair the first L and first R column of every (mouse, condition); mice
    # lacking either hemisphere are skipped. Groups are visited in sorted order.
    pairs = {"PE": [], "CT": []}
    for (_, condition), grp in sub.groupby(["base", "condition"], sort=True):
        if condition not in pairs:
            continue
        left = grp.loc[grp["hemi"] == "L", "col"]
        right = grp.loc[grp["hemi"] == "R", "col"]
        if len(left) > 0 and len(right) > 0:
            pairs[condition].append((left.iloc[0], right.iloc[0]))

    # One delta vector (one value per row of df) per paired mouse.
    deltas_pe = [(df[r] - df[l]).to_numpy(dtype=float) for l, r in pairs["PE"]]
    deltas_ct = [(df[r] - df[l]).to_numpy(dtype=float) for l, r in pairs["CT"]]

    present_meta = [k for k in meta_cols if k in df.columns]
    rows = []
    for i in range(len(df)):
        meta = {k: df.at[i, k] for k in present_meta}
        x = np.array([col[i] for col in deltas_pe], float) if deltas_pe else np.array([])
        y = np.array([col[i] for col in deltas_ct], float) if deltas_ct else np.array([])
        d, dlo, dhi, n1, n2, m1, m2, s1, s2 = cohens_d(x, y)
        g, glo, ghi = hedges_g_from_d(d, n1, n2, dlo, dhi)
        rows.append({**meta,
                     "g": g, "g_lo": glo, "g_hi": ghi,
                     "d": d, "d_lo": dlo, "d_hi": dhi,
                     "n_PE": n1, "n_CT": n2,
                     "mean_PE_delta": m1, "mean_CT_delta": m2,
                     "sd_PE_delta": s1, "sd_CT_delta": s2})
    return pd.DataFrame(rows)

# Effect-size tables: left, right and R-L delta for each genotype.
wt_L, wt_R, wt_D = (
    effect_table_for("WT","L"),
    effect_table_for("WT","R"),
    effect_table_delta_for("WT"),
)
sh_L, sh_R, sh_D = (
    effect_table_for("Shank3","L"),
    effect_table_for("Shank3","R"),
    effect_table_delta_for("Shank3"),
)

# Sheet name -> table, in the order the sheets are written. The normalization
# mode is part of every sheet name.
sheets = {
    f"WT_L_{normalization}": wt_L,
    f"WT_R_{normalization}": wt_R,
    f"WT_Delta_{normalization}": wt_D,
    f"Shank3_L_{normalization}": sh_L,
    f"Shank3_R_{normalization}": sh_R,
    f"Shank3_Delta_{normalization}": sh_D,
}

with pd.ExcelWriter(out_excel) as xw:
    for sheet, table in sheets.items():
        table.to_excel(xw, sheet_name=sheet, index=False)

print(f"Saved plots to: {out_dir}")
print(f"Saved effect sizes to: {out_excel}")

Saved plots to: Y:\public\projects\AnAl_20240405_Neuromod_PE\PE_mapping\processed_data\ES_plots_normalized
Saved effect sizes to: Y:\public\projects\AnAl_20240405_Neuromod_PE\PE_mapping\processed_data\ES_plots_normalized\effect_sizes_PE_vs_CT_by_hemi.xlsx
