In [1]:
# ============================================================
# v6_figures — Cell 1: Environment + Artifact Resolution (Render-only)
# ============================================================
# Fix:
#   - Your repo has multiple versions of the same artifact names
#   - We must *prefer v6* deterministically (results_v6 > artifacts_v6 > else)
#   - NO interactive prompts, NO manual path edits required
# ============================================================

import json
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# All paths in this notebook are resolved relative to the current working directory.
PROJECT_ROOT = Path.cwd()

# Default figure output directory; created up front so later saves cannot fail on it.
FIG_DIR = PROJECT_ROOT.joinpath("figures_v6")
FIG_DIR.mkdir(parents=True, exist_ok=True)

# ------------------------
# Resolution policy (deterministic)
# ------------------------
# Candidate artifact directories, highest priority first.
PREFERRED_DIRS = [
    PROJECT_ROOT.joinpath(dirname)
    for dirname in ("results_v6", "artifacts_v6", "outputs_v6")
]

def resolve_dir() -> Path:
    """Return the highest-priority artifact directory that exists.

    Walks PREFERRED_DIRS in order and returns the first entry that is an
    existing directory. If none exists, prints a warning and returns
    PROJECT_ROOT so that later per-file lookups can still run.
    """
    for candidate in PREFERRED_DIRS:
        # is_dir() is already False for a missing path, so no separate exists() check.
        if candidate.is_dir():
            return candidate
    # Fallback: stay at the project root, but say so instead of failing silently.
    tried = ", ".join(d.name for d in PREFERRED_DIRS)
    print(f"WARNING: none of the preferred artifact dirs exist ({tried}); falling back to {PROJECT_ROOT}")
    return PROJECT_ROOT

ARTIFACT_DIR = resolve_dir()

# ------------------------
# Required artifacts (must exist in ARTIFACT_DIR)
# ------------------------
# Logical artifact name (used as the lookup key by later cells) -> file name on disk.
REQUIRED = {
    "run_metadata": "run_metadata.json",
    "control_mean": "control_mean.npy",
    "control_q_lo": "control_q_lo.npy",
    "control_q_hi": "control_q_hi.npy",
    "control_iqr": "control_iqr.npy",
    "control_sd": "control_sd.npy",
    "pc1_dominance": "pc1_dominance.csv",
    "pc_scores": "pc_scores.csv",
    "alignment_table": "alignment_table.csv",
    "delta_table": "delta_table.csv",
    "exceedance_rates": "exceedance_rates.csv",
    "bin_residual_stats": "bin_residual_stats.csv",
    "pc1_collisions": "pc1_collisions.csv",
    "null_random_axis_abs_rhos": "null_random_axis_abs_rhos.npy",
    "null_shuffle_label_abs_rhos": "null_shuffle_label_abs_rhos.npy",
    "falsifiability_checklist": "falsifiability_checklist.csv",
}

# First attempt: every artifact is expected directly inside ARTIFACT_DIR.
ARTIFACTS = {k: (ARTIFACT_DIR / v) for k, v in REQUIRED.items()}
missing = [k for k, p in ARTIFACTS.items() if not p.exists()]

# If missing under chosen dir, try a secondary deterministic fallback:
# pick the *most v6-looking* directory among all matches (results_v6 > any path containing 'v6')
if missing:
    def best_match(fname: str) -> Path:
        """Return the best-ranked file named `fname` anywhere below PROJECT_ROOT.

        Raises:
            FileNotFoundError: if no regular file with that name exists below
                PROJECT_ROOT.
        """
        # rglob can also yield directories that happen to share the name; keep files only.
        matches = [p for p in PROJECT_ROOT.rglob(fname) if p.is_file()]
        if not matches:
            raise FileNotFoundError(f"Required artifact not found anywhere: {fname}")

        def score(p: Path) -> tuple:
            """Sort key for a candidate path: LOWER is better."""
            # Rank only the part of the path below PROJECT_ROOT. Ranking the
            # absolute path would let a "v6" or "results" in some parent folder
            # make every candidate tie on those criteria.
            s = p.relative_to(PROJECT_ROOT).as_posix().lower()
            return (
                0 if "results_v6" in s else 1,
                0 if "artifacts_v6" in s else 1,
                0 if "v6" in s else 1,
                0 if "results" in s else 1,
                len(s),  # shorter path wins
                s,       # final tie-break so the choice never depends on rglob order
            )

        return min(matches, key=score)

    for k in missing:
        fname = REQUIRED[k]
        ARTIFACTS[k] = best_match(fname)

# Final existence check
still_missing = [k for k, p in ARTIFACTS.items() if not p.exists()]
assert not still_missing, f"Missing required artifacts: {still_missing}"

# ------------------------
# Load metadata
# ------------------------
# Decode explicitly as UTF-8 (the JSON interchange encoding). Without it, open()
# uses the locale's preferred encoding, which is not UTF-8 on every platform and
# would mis-decode any non-ASCII metadata value.
with open(ARTIFACTS["run_metadata"], "r", encoding="utf-8") as f:
    RUN_META = json.load(f)

# ------------------------
# Matplotlib defaults (journal-safe)
# ------------------------
plt.rcParams.update({
    "figure.dpi": 120,
    "savefig.dpi": 300,
    "font.size": 10,
    "axes.titlesize": 11,
    "axes.labelsize": 10,
    "legend.fontsize": 9,
    "xtick.labelsize": 9,
    "ytick.labelsize": 9,
    "figure.constrained_layout.use": True,
})

# Summary of what was resolved, so the run log shows which files were used.
print("v6_figures initialized (render-only).")
print("Figure output dir:", FIG_DIR)
print("Artifact dir preference root:", ARTIFACT_DIR)
print("Resolved artifacts:")
for k, v in ARTIFACTS.items():
    print(f"  {k:30s} -> {v}")
print("Run metadata:", RUN_META)


v6_figures initialized (render-only).
Figure output dir: c:\Users\Bryan\Documents\CrunchDAO Obesity\figures_v6
Artifact dir preference root: c:\Users\Bryan\Documents\CrunchDAO Obesity\results_v6
Resolved artifacts:
  run_metadata                   -> c:\Users\Bryan\Documents\CrunchDAO Obesity\results_v6\run_metadata.json
  control_mean                   -> c:\Users\Bryan\Documents\CrunchDAO Obesity\results_v6\control_mean.npy
  control_q_lo                   -> c:\Users\Bryan\Documents\CrunchDAO Obesity\results_v6\control_q_lo.npy
  control_q_hi                   -> c:\Users\Bryan\Documents\CrunchDAO Obesity\results_v6\control_q_hi.npy
  control_iqr                    -> c:\Users\Bryan\Documents\CrunchDAO Obesity\results_v6\control_iqr.npy
  control_sd                     -> c:\Users\Bryan\Documents\CrunchDAO Obesity\results_v6\control_sd.npy
  pc1_dominance                  -> c:\Users\Bryan\Documents\CrunchDAO Obesity\results_v6\pc1_dominance.csv
  pc_scores                      -> c

In [2]:
# ============================================================
# v6_figures — Cell 2 (REPLACEMENT): Figure 1 (Control Envelope) [Render-only]
# ============================================================
# Fix:
#   - delta_table.csv is LONG-FORM (not wide), so headers are not gene names.
#   - We do NOT need gene names to render the control envelope.
#
# Output:
#   - fig_01_control_envelope_<runstamp>.pdf
#   - fig_01_control_envelope_<runstamp>.png
# ============================================================

import json
from pathlib import Path
from datetime import datetime

import numpy as np
import matplotlib.pyplot as plt

# --- Figures directory: a "figures" folder inside the resolved artifact directory ---
# Derived from ARTIFACT_DIR (set by the environment cell) rather than a
# machine-specific absolute path, so the notebook runs on any checkout.
FIG_DIR = ARTIFACT_DIR / "figures"
FIG_DIR.mkdir(parents=True, exist_ok=True)

# --- Artifact paths: reuse the files already resolved into ARTIFACTS ---
RUN_META_PATH   = ARTIFACTS["run_metadata"]
CONTROL_MEAN_FP = ARTIFACTS["control_mean"]
CONTROL_QLO_FP  = ARTIFACTS["control_q_lo"]
CONTROL_QHI_FP  = ARTIFACTS["control_q_hi"]
CONTROL_IQR_FP  = ARTIFACTS["control_iqr"]

for fp in [RUN_META_PATH, CONTROL_MEAN_FP, CONTROL_QLO_FP, CONTROL_QHI_FP, CONTROL_IQR_FP]:
    assert fp.exists(), f"Missing required file: {fp}"

# JSON is decoded as UTF-8 explicitly instead of relying on the locale default.
with open(RUN_META_PATH, "r", encoding="utf-8") as f:
    RUN_META = json.load(f)

# Compact, filename-safe stamp: drops ":", "-" and "Z" and turns "T" into "_".
# `or "unknown"` also covers a key that is present but null/empty, which would
# otherwise crash on .replace().
run_stamp = str(RUN_META.get("run_timestamp_utc") or "unknown")
run_stamp = run_stamp.replace(":", "").replace("-", "").replace("T", "_").replace("Z", "")

def savefig(fig, stem: str):
    """Save `fig` as a PDF and a PNG under FIG_DIR and print both paths.

    Each file is named `<stem>_<run_stamp>` plus its extension. Both files are
    written (tight bounding box) before either path is reported.
    """
    targets = (
        FIG_DIR / f"{stem}_{run_stamp}.pdf",
        FIG_DIR / f"{stem}_{run_stamp}.png",
    )
    for target in targets:
        fig.savefig(target, bbox_inches="tight")
    for target in targets:
        print(f"Saved: {target}")

# --- Load control summaries (frozen) ---
mu  = np.load(CONTROL_MEAN_FP)
qlo = np.load(CONTROL_QLO_FP)
qhi = np.load(CONTROL_QHI_FP)
iqr = np.load(CONTROL_IQR_FP)

assert mu.ndim == qlo.ndim == qhi.ndim == iqr.ndim == 1, "Control arrays must be 1D"
assert mu.shape == qlo.shape == qhi.shape == iqr.shape, "Control arrays shape mismatch"
n_genes = mu.shape[0]

# --- Deterministic selection: top-K genes by control IQR ---
# Clamp K to the number of genes so the x positions and the selected arrays
# always have the same length (with fewer than 30 genes an unclamped K makes
# errorbar() fail on a shape mismatch).
K = min(30, n_genes)
# A stable sort fixes the order of tied IQR values; reversing first and then
# slicing is the same "largest K, descending" selection and is also correct for K == 0.
top_idx = np.argsort(iqr, kind="stable")[::-1][:K]

sel_mu  = mu[top_idx]
sel_qlo = qlo[top_idx]
sel_qhi = qhi[top_idx]

# Use stable labels (rank + array index): gene names are not loaded in this cell.
labels = [f"g{rank+1:02d}:{idx}" for rank, idx in enumerate(top_idx)]

# --- Plot ---
fig = plt.figure(figsize=(7.5, 4.0))
ax = fig.add_subplot(111)

x = np.arange(K)

# Asymmetric error bars: lower = mean - qlo, upper = qhi - mean
# NOTE(review): errorbar() rejects negative yerr, so this presumes
# q_lo <= mean <= q_hi for every selected gene - confirm against the upstream export.
yerr = np.vstack([sel_mu - sel_qlo, sel_qhi - sel_mu])
ax.errorbar(x, sel_mu, yerr=yerr, fmt="o", capsize=3, linewidth=1)

ax.set_title("Control distribution envelope (top genes by control IQR)")
ax.set_xlabel("Gene (ranked by control IQR; label = rank:index)")
ax.set_ylabel("Control expression (quantile envelope around mean)")

ax.set_xticks(x)
ax.set_xticklabels(labels, rotation=60, ha="right")
ax.grid(True, axis="y", alpha=0.3)

# Provenance footer placed below the rotated tick labels (axes coordinates).
meta_line = (
    f"n_cells={RUN_META.get('n_cells')} | n_genes={RUN_META.get('n_genes')} | "
    f"control={RUN_META.get('control_label')} | seed={RUN_META.get('random_state')}"
)
ax.text(0.01, -0.28, meta_line, transform=ax.transAxes, fontsize=8, va="top")

savefig(fig, "fig_01_control_envelope")
plt.close(fig)


Saved: C:\Users\Bryan\Documents\CrunchDAO Obesity\results_v6\figures\fig_01_control_envelope_20260207_152953.pdf
Saved: C:\Users\Bryan\Documents\CrunchDAO Obesity\results_v6\figures\fig_01_control_envelope_20260207_152953.png


In [4]:
# ============================================================
# v6_figures — Cell 3 (REPLACEMENT): Figure 2 (PC1 Dominance via Decomposition) [Render-only]
# ============================================================
# What this figure shows:
#   - How much of each perturbation delta vector is captured by PC1 (parallel norm)
#   - How much remains orthogonal (residual norm)
#   - Distribution of fraction explained by PC1 across perturbations
#
# Why this is "dominance" (properly scoped):
#   - PC1 provides a dominant axis, but NOT necessarily sufficient
#
# Constraints:
#   - Load frozen artifact only
#   - No recomputation of PCA / bins / correlations
# ============================================================

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# ------------------------
# Load artifact (your schema)
# ------------------------
df = pd.read_csv(ARTIFACTS["pc1_dominance"]).drop(columns=["Unnamed: 0"], errors="ignore")

required = {
    "perturbation",
    "pc1_score",
    "delta_norm",
    "pc1_parallel_norm",
    "pc1_residual_norm",
    "fraction_explained_by_pc1",
}
assert required.issubset(df.columns), f"pc1_dominance.csv missing columns: {required - set(df.columns)}"

# Every numeric column must be free of NaN/inf before anything is drawn.
for c in ("pc1_score", "delta_norm", "pc1_parallel_norm", "pc1_residual_norm", "fraction_explained_by_pc1"):
    assert np.isfinite(df[c].values).all(), f"Non-finite values detected in column: {c}"

# ------------------------
# Plot: one figure, two side-by-side panels
# ------------------------
fig = plt.figure(figsize=(7.5, 4.5))
gs = fig.add_gridspec(1, 2, wspace=0.25)

parallel_norm = df["pc1_parallel_norm"]
residual_norm = df["pc1_residual_norm"]
frac = df["fraction_explained_by_pc1"]

# Panel A: norm along PC1 against the norm orthogonal to it, one point per row.
ax1 = fig.add_subplot(gs[0, 0])
ax1.scatter(parallel_norm, residual_norm, s=22, alpha=0.85)
ax1.set(
    xlabel="||Δ_parallel|| (along PC1)",
    ylabel="||Δ_residual|| (orthogonal to PC1)",
    title="PC1 vs orthogonal residual magnitude",
)
ax1.grid(True, axis="both", alpha=0.25)

# Diagonal from the origin to the larger of the two maxima, as a 1:1 reference.
mx = float(max(parallel_norm.max(), residual_norm.max()))
ax1.plot([0, mx], [0, mx], linewidth=1, alpha=0.6)

# Panel B: histogram of the per-row PC1 fraction.
ax2 = fig.add_subplot(gs[0, 1])
ax2.hist(frac.values, bins=20, edgecolor="black", linewidth=0.7)
ax2.set(
    xlabel="Fraction of ||Δ|| explained by PC1",
    ylabel="Number of perturbations",
    title="Distribution of PC1 explained fraction",
)
ax2.grid(True, axis="y", alpha=0.25)

# Summary statistics of the fraction column, printed under the panels.
mean_frac = float(frac.mean())
med_frac  = float(frac.median())
min_frac  = float(frac.min())
max_frac  = float(frac.max())

stats_line = f"mean={mean_frac:.3f} | median={med_frac:.3f} | min={min_frac:.3f} | max={max_frac:.3f}"
fig.suptitle("PC1 dominance as a geometric decomposition (render-only)", y=1.02, fontsize=11)
fig.text(0.5, -0.02, stats_line, ha="center", va="top", fontsize=9)

# ------------------------
# Metadata stamp (bottom-left corner of the figure)
# ------------------------
meta_line = (
    f"n_cells={RUN_META.get('n_cells')} | "
    f"n_perturbations={RUN_META.get('n_perturbations_excl_control')} | "
    f"control={RUN_META.get('control_label')} | "
    f"seed={RUN_META.get('random_state')}"
)
fig.text(0.01, 0.01, meta_line, fontsize=8, ha="left", va="bottom")

# ------------------------
# Save, then release the figure
# ------------------------
savefig(fig, "fig_02_pc1_dominance_decomposition")
plt.close(fig)


Saved: C:\Users\Bryan\Documents\CrunchDAO Obesity\results_v6\figures\fig_02_pc1_dominance_decomposition_20260207_152953.pdf
Saved: C:\Users\Bryan\Documents\CrunchDAO Obesity\results_v6\figures\fig_02_pc1_dominance_decomposition_20260207_152953.png


In [6]:
# ============================================================
# v6_figures — Cell 4 (REPLACEMENT): Figure 3 (Alignment Summary) [Render-only]
# ============================================================
# Purpose:
#   - Report PC1 alignment with programs as a summary table/plot
#   - Uses frozen Spearman rho and p-values already computed in v6
# Constraints:
#   - No recomputation
#   - alignment_table.csv is summary-form (program-level)
# ============================================================

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv(ARTIFACTS["alignment_table"])
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

required = {"program", "spearman_rho", "abs_rho", "p_value"}
assert required.issubset(df.columns), (
    f"alignment_table.csv must contain columns {required}, "
    f"found {set(df.columns)}"
)

# Sort by absolute rho descending
df = df.sort_values("abs_rho", ascending=False).reset_index(drop=True)

# Plot top-K programs (or all if small)
K = min(10, len(df))
sub = df.iloc[:K].copy()

fig = plt.figure(figsize=(6.5, 4.0))
ax = fig.add_subplot(111)

# Reverse so the strongest alignment ends up at the top of the horizontal bar chart.
programs = sub["program"].astype(str).tolist()[::-1]
rhos = sub["spearman_rho"].astype(float).to_numpy()[::-1]
pvals = sub["p_value"].astype(float).to_numpy()[::-1]

# Explicit numeric y positions: bar i and annotation i always line up, even if
# the same program name occurs twice (a categorical axis would merge duplicates).
ypos = np.arange(K)
ax.barh(ypos, rhos)
ax.set_yticks(ypos)
ax.set_yticklabels(programs)
ax.set_xlabel("Spearman ρ (PC1 vs program score)")
ax.set_title("Alignment of PC1 with program scores (summary)")

ax.grid(True, axis="x", alpha=0.25)

# Annotate values just beyond the tip of each bar: to the right of positive
# bars, to the left of negative ones, so the text never sits on top of the bar.
for i, (rho, pval) in enumerate(zip(rhos, pvals)):
    if rho >= 0:
        ax.text(rho, i, f"  ρ={rho:.3f}, p={pval:.1e}", va="center", ha="left", fontsize=8)
    else:
        ax.text(rho, i, f"ρ={rho:.3f}, p={pval:.1e}  ", va="center", ha="right", fontsize=8)

# Provenance stamp in the bottom-left corner of the figure.
meta_line = (
    f"n_cells={RUN_META.get('n_cells')} | "
    f"n_perturbations={RUN_META.get('n_perturbations_excl_control')} | "
    f"control={RUN_META.get('control_label')} | "
    f"seed={RUN_META.get('random_state')}"
)
fig.text(0.01, 0.01, meta_line, fontsize=8, ha="left", va="bottom")

savefig(fig, "fig_03_alignment_summary")
plt.close(fig)


Saved: C:\Users\Bryan\Documents\CrunchDAO Obesity\results_v6\figures\fig_03_alignment_summary_20260207_152953.pdf
Saved: C:\Users\Bryan\Documents\CrunchDAO Obesity\results_v6\figures\fig_03_alignment_summary_20260207_152953.png


In [7]:
# ============================================================
# v6_figures — Cell 5: Figure 4 (PC1 Insufficiency via Collision Pairs) [Render-only]
# ============================================================
# Purpose:
#   - Show "PC1-collision" pairs: similar PC1 score but divergent outcomes
#   - Use frozen pc1_collisions.csv (pair-level) only
# Notes:
#   - This cell makes ZERO assumptions about gene names or upstream objects
#   - Defensive schema handling: will pick sensible column names if they differ
# ============================================================

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# ------------------------
# Load artifact
# ------------------------
# Read the pair-level collision table; drop the index column left by an indexed CSV export, if any.
col = pd.read_csv(ARTIFACTS["pc1_collisions"]).drop(columns=["Unnamed: 0"], errors="ignore")

# ------------------------
# Resolve schema robustly (no guessing beyond common patterns)
# ------------------------
def pick_first(cols, candidates):
    """Return the first entry of `candidates` that is contained in `cols`.

    `candidates` is scanned in order, so earlier names take priority.
    Returns None when no candidate is present.
    """
    return next((name for name in candidates if name in cols), None)

cols = set(col.columns)

# Role -> column name, taking the first candidate spelling present in the file.
pc_sim_col = pick_first(cols, ["pc1_bin", "pc1_bin_id", "pc1_bin_index", "bin", "bin_id"])
pc_diff_col = pick_first(cols, ["pc1_diff", "abs_pc1_diff", "pc1_distance", "pc1_gap"])
outcome_diff_col = pick_first(cols, ["adipo_diff", "abs_adipo_diff", "outcome_diff", "abs_outcome_diff", "program_diff"])
resid_diff_col = pick_first(cols, ["residual_diff", "abs_residual_diff", "pc1_residual_diff", "abs_pc1_residual_diff"])
pair_id_col = pick_first(cols, ["pair_id", "pair", "collision_id"])

# At least one divergence metric (outcome or residual) is mandatory.
assert not (outcome_diff_col is None and resid_diff_col is None), (
    f"pc1_collisions.csv must contain an outcome or residual divergence column. "
    f"Found columns: {sorted(cols)}"
)

# x-axis metric: outcome divergence when available, residual divergence otherwise.
metric_x = resid_diff_col if outcome_diff_col is None else outcome_diff_col
# Colour metric: residual divergence, only when it is not already on the x-axis.
if outcome_diff_col is None or resid_diff_col is None:
    metric_c = None
else:
    metric_c = resid_diff_col

# Work on a float copy of the x metric and discard rows where it is NaN/inf.
col["_metric_x"] = col[metric_x].astype(float)
finite_rows = np.isfinite(col["_metric_x"].values)
col = col[finite_rows].copy()

# Only the TOP_N largest x-metric rows are drawn, for readability.
TOP_N = 60
col_top = col.sort_values("_metric_x", ascending=False).head(TOP_N).copy()

# ------------------------
# Plot
# ------------------------
fig = plt.figure(figsize=(7.5, 4.5))
ax = fig.add_subplot(111)

# Row 0 of col_top (largest metric) gets the highest y position.
y = np.arange(len(col_top))[::-1]
x = col_top["_metric_x"].values

if metric_c is None:
    ax.scatter(x, y, s=40, alpha=0.9)
else:
    cvals = col_top[metric_c].astype(float).values
    # Replace +/-inf with NaN in the colour values.
    cvals = np.where(np.isfinite(cvals), cvals, np.nan)
    sc = ax.scatter(x, y, c=cvals, s=40, alpha=0.9)
    cbar = fig.colorbar(sc, ax=ax, fraction=0.046, pad=0.04)
    cbar.set_label("Residual divergence (|Δ_residual| difference)")

# Axis labels and title
xlab = "Residual divergence" if outcome_diff_col is None else "Outcome divergence"
ax.set_xlabel(f"{xlab} among PC1-collision pairs")
ax.set_ylabel(f"Top collision pairs (n={len(col_top)})")
ax.set_title("PC1 insufficiency: collision pairs with divergent outcomes")


def _pair_label(r):
    """Build the compact y-tick label for one row from whichever id/bin/diff columns exist."""
    parts = []
    if pair_id_col is not None:
        parts.append(str(r[pair_id_col]))
    if pc_sim_col is not None:
        parts.append(f"bin={r[pc_sim_col]}")
    if pc_diff_col is not None and np.isfinite(r[pc_diff_col]):
        parts.append(f"|Δpc1|={float(r[pc_diff_col]):.3g}")
    # Joining an empty list yields "", i.e. no label for this row.
    return " | ".join(parts)


yticks = [_pair_label(r) for _, r in col_top.iterrows()]

ax.set_yticks(y)
if any(yticks):
    ax.set_yticklabels(yticks, fontsize=7)
else:
    # No optional columns present: keep the ticks but leave them unlabelled.
    ax.set_yticklabels([""] * len(y))

ax.grid(True, axis="x", alpha=0.25)

# ------------------------
# Metadata stamp (bottom-left corner of the figure)
# ------------------------
meta_line = (
    f"n_cells={RUN_META.get('n_cells')} | "
    f"n_perturbations={RUN_META.get('n_perturbations_excl_control')} | "
    f"control={RUN_META.get('control_label')} | "
    f"seed={RUN_META.get('random_state')}"
)
fig.text(0.01, 0.01, meta_line, fontsize=8, ha="left", va="bottom")

# ------------------------
# Save, then release the figure
# ------------------------
savefig(fig, "fig_04_pc1_collision_pairs")
plt.close(fig)


Saved: C:\Users\Bryan\Documents\CrunchDAO Obesity\results_v6\figures\fig_04_pc1_collision_pairs_20260207_152953.pdf
Saved: C:\Users\Bryan\Documents\CrunchDAO Obesity\results_v6\figures\fig_04_pc1_collision_pairs_20260207_152953.png


In [8]:
# ============================================================
# v6_figures — Cell 6: Figure 5 (Null Models) [Render-only]
# ============================================================
# Purpose:
#   - Show PC1 alignment is non-trivial vs (i) random-axis null and (ii) label-shuffle null
#   - Uses frozen null distributions (abs_rho) + observed adipo alignment from alignment_table.csv
# Constraints:
#   - No recomputation of nulls
#   - No recomputation of PCA
#   - Observed rho pulled from alignment_table.csv (program == 'adipo')
# ============================================================

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# ------------------------
# Load null distributions (frozen)
# ------------------------
rand_abs = np.load(ARTIFACTS["null_random_axis_abs_rhos"])
shuf_abs = np.load(ARTIFACTS["null_shuffle_label_abs_rhos"])

assert rand_abs.ndim == 1 and shuf_abs.ndim == 1, "Null arrays must be 1D"
# An empty null would make the empirical p-value and the histogram meaningless.
assert rand_abs.size > 0 and shuf_abs.size > 0, "Null arrays must be non-empty"
assert np.all(np.isfinite(rand_abs)), "Non-finite values in null_random_axis_abs_rhos"
assert np.all(np.isfinite(shuf_abs)), "Non-finite values in null_shuffle_label_abs_rhos"

# ------------------------
# Load observed alignment (frozen summary)
# ------------------------
align = pd.read_csv(ARTIFACTS["alignment_table"]).drop(columns=["Unnamed: 0"], errors="ignore")
req = {"program", "spearman_rho"}
assert req.issubset(align.columns), f"alignment_table.csv missing columns: {req - set(align.columns)}"

# Observed rho for adipogenic program
row = align.loc[align["program"].astype(str).str.lower() == "adipo"]
assert len(row) == 1, f"Expected exactly one row for program == 'adipo', found {len(row)}"
rho_obs = float(row["spearman_rho"].iloc[0])
abs_obs = abs(rho_obs)

# Empirical p-values (render-only summary; does not change any analysis).
# The +1 in numerator and denominator counts the observed statistic as one draw
# from the null, so a finite null sample can never report an impossible p = 0.
p_rand = float((np.count_nonzero(rand_abs >= abs_obs) + 1) / (rand_abs.size + 1))
p_shuf = float((np.count_nonzero(shuf_abs >= abs_obs) + 1) / (shuf_abs.size + 1))

# ------------------------
# Plot
# ------------------------
fig = plt.figure(figsize=(7.5, 4.0))
ax = fig.add_subplot(111)

# One set of bin edges spanning both nulls: with an integer `bins` each hist()
# call would bin over its own range, making the bar heights incomparable.
bins = np.histogram_bin_edges(np.concatenate([rand_abs, shuf_abs]), bins=30)
ax.hist(rand_abs, bins=bins, alpha=0.6, label=f"Random-axis null (n={len(rand_abs)})")
ax.hist(shuf_abs, bins=bins, alpha=0.6, label=f"Shuffled-label null (n={len(shuf_abs)})")

# Black, so the observed value is not drawn in the same default colour as the
# first histogram.
ax.axvline(abs_obs, linewidth=2, color="black")
ax.text(
    abs_obs, ax.get_ylim()[1] * 0.95,
    f"|ρ_obs|={abs_obs:.3f}\n"
    f"p_rand={p_rand:.2e}\n"
    f"p_shuf={p_shuf:.2e}",
    ha="left", va="top", fontsize=9
)

ax.set_xlabel("Absolute Spearman correlation |ρ|")
ax.set_ylabel("Count")
ax.set_title("Null model comparisons for PC1–adipogenic alignment")
ax.grid(True, axis="y", alpha=0.25)
ax.legend(frameon=False)

# ------------------------
# Metadata stamp
# ------------------------
meta_line = (
    f"n_cells={RUN_META.get('n_cells')} | "
    f"n_perturbations={RUN_META.get('n_perturbations_excl_control')} | "
    f"control={RUN_META.get('control_label')} | "
    f"seed={RUN_META.get('random_state')}"
)
fig.text(0.01, 0.01, meta_line, fontsize=8, ha="left", va="bottom")

# ------------------------
# Save
# ------------------------
savefig(fig, "fig_05_null_model_comparisons")
plt.close(fig)


Saved: C:\Users\Bryan\Documents\CrunchDAO Obesity\results_v6\figures\fig_05_null_model_comparisons_20260207_152953.pdf
Saved: C:\Users\Bryan\Documents\CrunchDAO Obesity\results_v6\figures\fig_05_null_model_comparisons_20260207_152953.png


In [9]:
# ============================================================
# v6_figures — Cell 7: Figure 6 (Distributional Grounding: Exceedance Rates) [Render-only]
# ============================================================
# Purpose:
#   - Show how often perturbations exceed the control envelope (distribution-first grounding)
#   - Uses frozen exceedance_rates.csv only
# Constraints:
#   - No recomputation of envelopes or exceedance
#   - Schema-robust: will find the exceedance column deterministically
# ============================================================

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv(ARTIFACTS["exceedance_rates"]).drop(columns=["Unnamed: 0"], errors="ignore")

# ------------------------
# Schema resolution
# ------------------------
# Column names of the loaded table, kept as a set for membership lookups.
cols = set(df.columns.tolist())

def pick_first(candidates):
    """Return the first name in `candidates` found in the module-level `cols` set.

    Candidates are checked in order; None is returned when none is present.
    """
    return next((name for name in candidates if name in cols), None)

pert_col = pick_first(["perturbation", "pert", "perturb", "target"])
exc_col  = pick_first([
    "exceed_rate_total", "exceedance_rate", "exceed_rate", "rate_exceed",
    "frac_exceed", "fraction_exceed", "exceed_frac"
])

assert pert_col is not None, f"Could not find perturbation column in exceedance_rates.csv. Columns: {sorted(cols)}"
assert exc_col  is not None, f"Could not find exceedance-rate column in exceedance_rates.csv. Columns: {sorted(cols)}"

# Force the rate column to float, keep only finite rows, then rank high-to-low.
df[exc_col] = df[exc_col].astype(float)
finite_rows = np.isfinite(df[exc_col].values)
df = (
    df[finite_rows]
    .copy()
    .sort_values(exc_col, ascending=False)
    .reset_index(drop=True)
)

# Only the K highest-ranked rows are drawn.
K = min(30, len(df))
sub = df.head(K).copy()

fig = plt.figure(figsize=(7.5, 4.5))
ax = fig.add_subplot(111)

positions = np.arange(K)
ax.bar(positions, sub[exc_col].values)
ax.set_title("Distributional grounding: perturbation exceedance over control envelope")
ax.set_xlabel(f"Top perturbations by exceedance rate (n={K})")
ax.set_ylabel("Exceedance rate (fraction of genes outside control envelope)")

# Tick labels are shown only when the longest name has at most 12 characters;
# longer names would clutter the axis, so ticks are removed entirely instead.
names = sub[pert_col].astype(str).tolist()
longest = max(map(len, names))
if longest > 12:
    ax.set_xticks([])
else:
    ax.set_xticks(positions)
    ax.set_xticklabels(names, rotation=60, ha="right", fontsize=8)

ax.grid(True, axis="y", alpha=0.25)

# Summary statistics over ALL finite rows (not just the plotted K).
rates = df[exc_col]
mean_exc = float(rates.mean())
med_exc  = float(rates.median())
max_exc  = float(rates.max())
fig.text(
    0.5, -0.02,
    f"mean={mean_exc:.3f} | median={med_exc:.3f} | max={max_exc:.3f}",
    ha="center", va="top", fontsize=9
)

# Provenance stamp in the bottom-left corner of the figure.
meta_line = (
    f"n_cells={RUN_META.get('n_cells')} | "
    f"n_perturbations={RUN_META.get('n_perturbations_excl_control')} | "
    f"control={RUN_META.get('control_label')} | "
    f"seed={RUN_META.get('random_state')}"
)
fig.text(0.01, 0.01, meta_line, fontsize=8, ha="left", va="bottom")

savefig(fig, "fig_06_exceedance_rates")
plt.close(fig)


Saved: C:\Users\Bryan\Documents\CrunchDAO Obesity\results_v6\figures\fig_06_exceedance_rates_20260207_152953.pdf
Saved: C:\Users\Bryan\Documents\CrunchDAO Obesity\results_v6\figures\fig_06_exceedance_rates_20260207_152953.png


In [11]:
# ============================================================
# v6_figures — Cell 8 (REPLACEMENT): Figure 7 (Bin-wise residual–outcome dependence) [Render-only]
# ============================================================
# Purpose:
#   - Show that, within PC1 bins, residual magnitude is outcome-dependent
#   - Visualize per-bin Spearman correlation between residual and adipogenic priming
# Inputs (frozen):
#   - bin_residual_stats.csv with columns:
#       pc1_bin, spearman_rho(residual,adipo), p_value, n_perts
# ============================================================

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv(ARTIFACTS["bin_residual_stats"]).drop(columns=["Unnamed: 0"], errors="ignore")

cols = set(df.columns)

# Fixed schema of the frozen per-bin statistics table.
bin_col = "pc1_bin"
rho_col = "spearman_rho(residual,adipo)"
p_col   = "p_value"
n_col   = "n_perts"

required = {bin_col, rho_col, p_col, n_col}
assert required.issubset(cols), f"bin_residual_stats.csv missing columns: {required - cols}"

# Coerce all four columns to numbers (unparseable entries become NaN), drop
# rows with any NaN among them, and order the rows by bin.
for name in (bin_col, rho_col, p_col, n_col):
    df[name] = pd.to_numeric(df[name], errors="coerce")

df = (
    df.dropna(subset=[bin_col, rho_col, p_col, n_col])
    .copy()
    .sort_values(bin_col)
    .reset_index(drop=True)
)

# ------------------------
# Plot
# ------------------------
fig = plt.figure(figsize=(7.5, 4.5))
ax = fig.add_subplot(111)

x = df[bin_col].values
y = df[rho_col].values

# Marker area grows linearly with n_perts relative to the largest bin (30 to 150).
n_perts = df[n_col].values
sizes = 30 + 120 * (n_perts / n_perts.max())

ax.scatter(x, y, s=sizes, alpha=0.9)
ax.axhline(0, linewidth=1, alpha=0.6)

ax.set_title("Outcome-dependence persists within PC1 bins (residual ↔ adipogenic priming)")
ax.set_xlabel("PC1 bin")
ax.set_ylabel("Spearman ρ(residual, adipo) within bin")

ax.grid(True, axis="y", alpha=0.25)

# Star every bin whose stored p-value is below 0.05 (no multiple-testing correction).
sig = df[df[p_col] < 0.05]
for bin_x, rho_y in zip(sig[bin_col], sig[rho_col]):
    ax.text(float(bin_x), float(rho_y), "*", ha="center", va="bottom", fontsize=12)

# Footer explaining the marker-size and star encodings.
fig.text(0.5, -0.02, "Point size ∝ number of perturbations in bin (n_perts); * p<0.05", ha="center", va="top", fontsize=9)

# Provenance stamp in the bottom-left corner of the figure.
meta_line = (
    f"n_cells={RUN_META.get('n_cells')} | "
    f"n_perturbations={RUN_META.get('n_perturbations_excl_control')} | "
    f"control={RUN_META.get('control_label')} | "
    f"seed={RUN_META.get('random_state')}"
)
fig.text(0.01, 0.01, meta_line, fontsize=8, ha="left", va="bottom")

savefig(fig, "fig_07_binwise_residual_adipo_dependence")
plt.close(fig)


Saved: C:\Users\Bryan\Documents\CrunchDAO Obesity\results_v6\figures\fig_07_binwise_residual_adipo_dependence_20260207_152953.pdf
Saved: C:\Users\Bryan\Documents\CrunchDAO Obesity\results_v6\figures\fig_07_binwise_residual_adipo_dependence_20260207_152953.png


In [12]:
# ============================================================
# v6_figures — Cell 9: Figure 8 (Falsifiability Checklist Summary) [Render-only]
# ============================================================
# Purpose:
#   - Convert falsifiability_checklist.csv into a simple, reviewer-friendly figure
#   - Visual: pass/fail (or status) per falsifiability item
# Inputs (frozen):
#   - falsifiability_checklist.csv
# Constraints:
#   - No recomputation of any tests
#   - Schema-robust: detects common column patterns
# ============================================================

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Read the checklist; drop the index column left by an indexed CSV export, if any.
df = pd.read_csv(ARTIFACTS["falsifiability_checklist"]).drop(columns=["Unnamed: 0"], errors="ignore")

# Column names of the loaded table, kept as a set for membership lookups.
cols = set(df.columns.tolist())

def pick_first(candidates):
    """Return the earliest entry of `candidates` that is in the module-level `cols`, else None."""
    present = [name for name in candidates if name in cols]
    return present[0] if present else None

# Resolve the item and status columns; within each list the first name present wins.
item_col, status_col = map(
    pick_first,
    (
        ["check", "item", "criterion", "test", "name"],
        ["status", "passed", "pass", "result", "verdict"],
    ),
)

assert item_col is not None, f"Could not find checklist item column. Columns: {sorted(cols)}"
assert status_col is not None, f"Could not find checklist status column. Columns: {sorted(cols)}"

# Raw status values, normalised to {PASS, FAIL, UNKNOWN} further down.
raw = df.loc[:, status_col]

def norm_status(x):
    """Normalise one raw checklist status value to "PASS", "FAIL" or "UNKNOWN".

    Handles missing values, real booleans, boolean-like numbers and strings
    ("1", "1.0", "yes", "met", ...) and free text. In free text a failure word
    always wins, and a pass word preceded anywhere by a negation ("did not
    pass") counts as FAIL rather than PASS.
    """
    import re  # local import so the function does not depend on another cell

    if pd.isna(x):
        return "UNKNOWN"
    # Real booleans (Python or NumPy) need no string parsing.
    if isinstance(x, (bool, np.bool_)):
        return "PASS" if bool(x) else "FAIL"

    s = str(x).strip().lower()

    # Exact tokens. "1.0"/"0.0" cover a 0/1 column that was read as float.
    if s in {"true", "pass", "passed", "yes", "y", "1", "1.0", "t", "ok", "met", "satisfied"}:
        return "PASS"
    if s in {"false", "fail", "failed", "no", "n", "0", "0.0", "f", "not met", "unmet", "unsatisfied"}:
        return "FAIL"

    # Free text: decide on whole words so that e.g. "bypass" is not read as "pass".
    words = re.findall(r"[a-z]+", s)
    negated = "n't" in s or any(w in {"not", "cannot"} for w in words)
    if any(w.startswith(("fail", "unsatisf", "unmet")) for w in words):
        return "FAIL"
    if any(w.startswith(("pass", "satisf")) or w in {"met", "ok"} for w in words):
        return "FAIL" if negated else "PASS"
    return "UNKNOWN"

df["_status"] = raw.map(norm_status)

# Rows are kept in file order.
items = df[item_col].astype(str).tolist()
statuses = df["_status"].tolist()

# Numeric x position for each status category.
val_map = {"FAIL": 0, "UNKNOWN": 1, "PASS": 2}
vals = df["_status"].map(val_map).to_numpy(dtype=float)

# Figure height grows with the number of checklist rows (never below 3.5 in).
fig_height = max(3.5, 0.35 * len(items) + 1.5)
fig = plt.figure(figsize=(7.5, fig_height))
ax = fig.add_subplot(111)

# First row of the file is drawn at the top.
y = np.arange(len(items))[::-1]

# Dot strip: the horizontal position alone encodes the status.
ax.scatter(vals, y, s=80, alpha=0.9)

ax.set_yticks(y)
ax.set_yticklabels(items, fontsize=9)

ax.set_xticks([0, 1, 2])
ax.set_xticklabels(["FAIL", "UNKNOWN", "PASS"])
ax.set_xlim(-0.4, 2.4)

ax.set_title("Falsifiability checklist (pre-specified conditions)")
ax.grid(True, axis="x", alpha=0.25)

# Per-category tallies for the footer.
status_counts = df["_status"].value_counts()
n_pass = int(status_counts.get("PASS", 0))
n_fail = int(status_counts.get("FAIL", 0))
n_unk  = int(status_counts.get("UNKNOWN", 0))

fig.text(
    0.5, -0.02,
    f"PASS={n_pass} | FAIL={n_fail} | UNKNOWN={n_unk}",
    ha="center", va="top", fontsize=9
)

# Provenance stamp (includes the run-level verdict) in the bottom-left corner.
meta_line = (
    f"n_cells={RUN_META.get('n_cells')} | "
    f"n_perturbations={RUN_META.get('n_perturbations_excl_control')} | "
    f"control={RUN_META.get('control_label')} | "
    f"seed={RUN_META.get('random_state')} | "
    f"verdict={RUN_META.get('verdict')}"
)
fig.text(0.01, 0.01, meta_line, fontsize=8, ha="left", va="bottom")

savefig(fig, "fig_08_falsifiability_checklist")
plt.close(fig)


Saved: C:\Users\Bryan\Documents\CrunchDAO Obesity\results_v6\figures\fig_08_falsifiability_checklist_20260207_152953.pdf
Saved: C:\Users\Bryan\Documents\CrunchDAO Obesity\results_v6\figures\fig_08_falsifiability_checklist_20260207_152953.png


In [13]:
# ============================================================
# v6_figures — Cell 10: Figure 9 (Falsifiability Checklist — FIXED status mapping) [Render-only]
# ============================================================
# Your current plot shows UNKNOWN=3 because the checklist likely stores status as something like:
#   "yes/no", "PASS/FAIL", "TRUE/FALSE", or a "met" column, etc.
# This cell:
#   - prints the detected status column + unique values
#   - applies a stronger normalization (handles PASS/FAIL strings and boolean-ish variants)
#   - regenerates the figure as fig_09_falsifiability_checklist_fixed
# ============================================================

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Read the falsifiability checklist artifact and discard an "Unnamed: 0" column
# if one is present (errors="ignore" makes the drop a no-op otherwise).
df = pd.read_csv(ARTIFACTS["falsifiability_checklist"]).drop(
    columns=["Unnamed: 0"], errors="ignore"
)

# Column names as a set; pick_first() tests candidate names against it.
cols = set(df.columns)

def pick_first(candidates):
    """Return the first entry of `candidates` that is a checklist column, else None.

    Membership is tested against the module-level `cols` set. Candidates are
    tried in the order given, so earlier names take priority.
    """
    return next((name for name in candidates if name in cols), None)

# Resolve the item and status columns from candidate names, in priority order.
item_col = pick_first(
    ("check", "item", "criterion", "test", "name", "statement")
)
status_col = pick_first(
    ("status", "passed", "pass", "result", "verdict", "met", "satisfied", "ok")
)

# Stop here if either column could not be identified; the message lists what exists.
assert item_col is not None, f"Could not find checklist item column. Columns: {sorted(cols)}"
assert status_col is not None, f"Could not find checklist status column. Columns: {sorted(cols)}"

# Diagnostics: show which columns were chosen and the distinct raw (non-null)
# status values, so an unexpected vocabulary is visible in the cell output.
raw_status_values = set(map(str, df[status_col].dropna().unique()))
print("Checklist columns:", sorted(cols))
print("Using item_col:", item_col)
print("Using status_col:", status_col)
print("Unique raw status values:", sorted(raw_status_values))

def norm_status(x):
    """Normalize a raw checklist status value to "PASS", "FAIL", or "UNKNOWN".

    Resolution order:
      1. Missing values (NaN / None)            -> "UNKNOWN"
      2. Real booleans (bool / np.bool_)        -> "PASS" if true else "FAIL"
      3. Exact match on the stripped, lower-cased string form
         (numeric-like, boolean-like, and pass/fail words)
      4. Substring fallback, negative markers first
      5. Anything else                          -> "UNKNOWN"

    Parameters
    ----------
    x : object
        A single cell value from the status column.

    Returns
    -------
    str
        One of "PASS", "FAIL", "UNKNOWN".
    """
    if pd.isna(x):
        return "UNKNOWN"

    # Real booleans short-circuit before any string handling.
    if isinstance(x, (bool, np.bool_)):
        return "PASS" if bool(x) else "FAIL"

    s = str(x).strip().lower()

    # Exact matches: numeric-like / boolean-like / explicit status words.
    if s in {"1", "1.0", "true", "t", "yes", "y",
             "pass", "passed", "ok", "met", "satisfied"}:
        return "PASS"
    if s in {"0", "0.0", "false", "f", "no", "n",
             "fail", "failed", "not met", "unsatisfied"}:
        return "FAIL"

    # Substring fallback. Negative markers MUST be tested before positive ones:
    # "not passed", "not ok", "not satisfied" and "unmet" all contain a positive
    # marker ("pass", "ok", "satisf", "met") and would otherwise be reported as PASS.
    # Ambiguous text therefore resolves conservatively to FAIL rather than PASS.
    if any(marker in s for marker in ("fail", "not", "unsatisf", "unmet")):
        return "FAIL"
    if any(marker in s for marker in ("pass", "met", "satisf", "ok")):
        return "PASS"

    return "UNKNOWN"

# Normalized status per checklist row, stored on the frame as "_status".
df["_status"] = df[status_col].map(norm_status)

# x-position of each status category on the strip plot.
status_to_x = {"FAIL": 0, "UNKNOWN": 1, "PASS": 2}

items = df[item_col].astype(str).tolist()
vals = np.array([status_to_x[status] for status in df["_status"]], dtype=float)

# Figure height grows with the number of checklist items (minimum 3.5 in).
fig_height = max(3.5, 0.35 * len(items) + 1.5)
fig, ax = plt.subplots(figsize=(7.5, fig_height))

# Reversed positions so the first checklist item is drawn at the top.
y = np.arange(len(items))[::-1]
ax.scatter(vals, y, s=120, alpha=0.9)

ax.set_yticks(y)
ax.set_yticklabels(items, fontsize=10)

ax.set_xlim(-0.4, 2.4)
ax.set_xticks([0, 1, 2])
ax.set_xticklabels(["FAIL", "UNKNOWN", "PASS"])

ax.grid(True, axis="x", alpha=0.25)
ax.set_title("Falsifiability checklist (pre-specified conditions) — status resolved")

# Per-category counts for the footer line.
status_counts = df["_status"].value_counts()
n_pass = int(status_counts.get("PASS", 0))
n_fail = int(status_counts.get("FAIL", 0))
n_unk  = int(status_counts.get("UNKNOWN", 0))

counts_line = f"PASS={n_pass} | FAIL={n_fail} | UNKNOWN={n_unk}"
fig.text(0.5, -0.02, counts_line, ha="center", va="top", fontsize=10)

# Run-metadata stamp in the lower-left corner of the figure.
meta_line = " | ".join([
    f"n_cells={RUN_META.get('n_cells')}",
    f"n_perturbations={RUN_META.get('n_perturbations_excl_control')}",
    f"control={RUN_META.get('control_label')}",
    f"seed={RUN_META.get('random_state')}",
    f"verdict={RUN_META.get('verdict')}",
])
fig.text(0.01, 0.01, meta_line, fontsize=8, ha="left", va="bottom")

savefig(fig, "fig_09_falsifiability_checklist_fixed")
plt.close(fig)


Checklist columns: ['check', 'expected_under_H0', 'observed', 'result']
Using item_col: check
Using status_col: result
Unique raw status values: ['VIOLATED']
Saved: C:\Users\Bryan\Documents\CrunchDAO Obesity\results_v6\figures\fig_09_falsifiability_checklist_fixed_20260207_152953.pdf
Saved: C:\Users\Bryan\Documents\CrunchDAO Obesity\results_v6\figures\fig_09_falsifiability_checklist_fixed_20260207_152953.png


In [14]:
# ============================================================
# v6_figures — Cell 11: Figure 10 (Falsifiability Checklist — handle "VIOLATED") [Render-only]
# ============================================================
# Your checklist uses result == "VIOLATED" (not PASS/FAIL).
# In v6 logic, "VIOLATED" means the H0 expectation was violated -> that is a PASS for falsification.
# This cell:
#   - maps {"VIOLATED" -> PASS, "SUPPORTED"/"HELD"/"NOT_VIOLATED" -> FAIL} with robust fallbacks
#   - regenerates as fig_10_falsifiability_checklist_passfail
# ============================================================

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Read the falsifiability checklist artifact and discard an "Unnamed: 0" column
# if one is present (errors="ignore" makes the drop a no-op otherwise).
df = pd.read_csv(ARTIFACTS["falsifiability_checklist"]).drop(
    columns=["Unnamed: 0"], errors="ignore"
)

# This cell reads exactly these two columns; stop early and name what is missing.
required = {"check", "result"}
assert required <= set(df.columns), f"falsifiability_checklist.csv missing columns: {required - set(df.columns)}"

def norm_status_from_result(x):
    """Map a checklist `result` value to "PASS", "FAIL", or "UNKNOWN" for falsification.

    Interpretation used by this notebook:
      - H0 expectation violated              -> "PASS" (supports the falsification claim)
      - H0 not violated / held / supported /
        consistent                           -> "FAIL" (H0 survived)
      - generic pass/fail or boolean-like
        strings                              -> mapped directly
      - missing or unrecognized              -> "UNKNOWN"

    Parameters
    ----------
    x : object
        A single cell value from the `result` column.

    Returns
    -------
    str
        One of "PASS", "FAIL", "UNKNOWN".
    """
    if pd.isna(x):
        return "UNKNOWN"
    s = str(x).strip().lower()

    # Treat "_" and "-" as spaces so "NOT_VIOLATED", "non-violated" and
    # "no_violation" are matched the same way as their spaced forms.
    flat = s.replace("_", " ").replace("-", " ")

    if "violat" in flat:
        # A negated violation must be tested BEFORE the plain "violat" match;
        # otherwise "NOT_VIOLATED" is reported as PASS (H0 violated), which is
        # the opposite of what it says.
        negations = ("not", "no violat", "non violat", "nonviolat", "unviolat")
        if any(marker in flat for marker in negations):
            return "FAIL"
        return "PASS"

    # H0 explicitly survived -> FAIL (for falsification).
    if ("held" in s) or ("support" in s) or ("consistent" in s):
        return "FAIL"

    # Fallbacks for generic pass/fail vocabularies.
    if s in {"pass", "passed", "true", "yes", "y", "1"}:
        return "PASS"
    if s in {"fail", "failed", "false", "no", "n", "0"}:
        return "FAIL"

    return "UNKNOWN"

# Falsification-oriented status per checklist row, stored on the frame as "_status".
df["_status"] = df["result"].map(norm_status_from_result)

# x-position of each status category on the strip plot.
status_to_x = {"FAIL": 0, "UNKNOWN": 1, "PASS": 2}

items = df["check"].astype(str).tolist()
vals = np.array([status_to_x[status] for status in df["_status"]], dtype=float)

# Figure height grows with the number of checklist items (minimum 3.5 in).
fig_height = max(3.5, 0.40 * len(items) + 1.5)
fig, ax = plt.subplots(figsize=(7.5, fig_height))

# Reversed positions so the first checklist item is drawn at the top.
y = np.arange(len(items))[::-1]
ax.scatter(vals, y, s=140, alpha=0.9)

ax.set_yticks(y)
ax.set_yticklabels(items, fontsize=11)

ax.set_xlim(-0.5, 2.5)
ax.set_xticks([0, 1, 2])
ax.set_xticklabels(["FAIL (H0 holds)", "UNKNOWN", "PASS (H0 violated)"])

ax.grid(True, axis="x", alpha=0.25)
ax.set_title("Falsifiability checklist (pre-specified conditions) — interpreted for falsification")

# Per-category counts for the footer line.
status_counts = df["_status"].value_counts()
n_pass = int(status_counts.get("PASS", 0))
n_fail = int(status_counts.get("FAIL", 0))
n_unk  = int(status_counts.get("UNKNOWN", 0))

counts_line = f"PASS={n_pass} | FAIL={n_fail} | UNKNOWN={n_unk}   (PASS means H0 expectation was violated)"
fig.text(0.5, -0.02, counts_line, ha="center", va="top", fontsize=10)

# Run-metadata stamp in the lower-left corner of the figure.
meta_line = " | ".join([
    f"n_cells={RUN_META.get('n_cells')}",
    f"n_perturbations={RUN_META.get('n_perturbations_excl_control')}",
    f"control={RUN_META.get('control_label')}",
    f"seed={RUN_META.get('random_state')}",
    f"verdict={RUN_META.get('verdict')}",
])
fig.text(0.01, 0.01, meta_line, fontsize=8, ha="left", va="bottom")

savefig(fig, "fig_10_falsifiability_checklist_passfail")
plt.close(fig)


Saved: C:\Users\Bryan\Documents\CrunchDAO Obesity\results_v6\figures\fig_10_falsifiability_checklist_passfail_20260207_152953.pdf
Saved: C:\Users\Bryan\Documents\CrunchDAO Obesity\results_v6\figures\fig_10_falsifiability_checklist_passfail_20260207_152953.png


In [15]:
# ============================================================
# v6_figures — Cell 12: Figure 11 (One-page figure manifest) [Render-only]
# ============================================================
# Purpose:
#   - Create a single "manifest" figure listing all saved figure files for the manuscript
#   - Prevents missing/forgotten panels when exporting to Overleaf / Word
# Inputs:
#   - FIG_DIR contents + RUN_META
# Output:
#   - fig_11_figure_manifest_<runstamp>.pdf/.png
# ============================================================

from pathlib import Path
import matplotlib.pyplot as plt

# Collect all figures generated in this run (match run_stamp)
# Run stamp derived from the run metadata by stripping ":" / "-" / "Z" and turning
# "T" into "_" (e.g. "2026-02-07T15:29:53Z" -> "20260207_152953").
# `or "unknown"` is used instead of a .get() default because the default only applies
# when the key is missing: a key that is present but null would return None and the
# chained .replace() calls would raise AttributeError. str() guards non-string values.
run_stamp = (
    str(RUN_META.get("run_timestamp_utc") or "unknown")
    .replace(":", "")
    .replace("-", "")
    .replace("T", "_")
    .replace("Z", "")
)

# Base name of the manifest figure itself (passed to savefig at the end of the cell).
manifest_stem = "fig_11_figure_manifest"


def _figure_pdf_names(pattern):
    """Return sorted PDF file names in FIG_DIR matching `pattern`, minus the manifest.

    The manifest PDF is written into FIG_DIR with the same run stamp, so without
    this exclusion a re-run of the cell would list one more entry than the first run.
    """
    return sorted(
        p.name
        for p in FIG_DIR.glob(pattern)
        if not p.name.startswith(manifest_stem)
    )


# Collect all figures generated in this run (match run_stamp)
fig_files = _figure_pdf_names(f"*_{run_stamp}.pdf")

# Fallback: if you saved without run_stamp suffix in some earlier cells
if not fig_files:
    fig_files = _figure_pdf_names("*.pdf")

# Text body of the manifest: header, run metadata, then a numbered file list.
lines = [
    "v6 figure manifest",
    f"run_timestamp_utc: {RUN_META.get('run_timestamp_utc')}",
    f"n_cells: {RUN_META.get('n_cells')} | n_genes: {RUN_META.get('n_genes')} | n_perts: {RUN_META.get('n_perturbations_excl_control')}",
    f"control: {RUN_META.get('control_label')} | seed: {RUN_META.get('random_state')} | verdict: {RUN_META.get('verdict')}",
    "",
    "Saved figures (PDF):",
]
if fig_files:
    lines.extend(f"{i:02d}. {f}" for i, f in enumerate(fig_files, 1))
else:
    lines.append("(none found)")

# Render as a simple text figure
fig = plt.figure(figsize=(8.5, 11.0))
ax = fig.add_subplot(111)
ax.axis("off")

ax.text(
    0.02, 0.98,
    "\n".join(lines),
    ha="left", va="top",
    fontsize=11,
    family="monospace"
)

savefig(fig, manifest_stem)
plt.close(fig)


Saved: C:\Users\Bryan\Documents\CrunchDAO Obesity\results_v6\figures\fig_11_figure_manifest_20260207_152953.pdf
Saved: C:\Users\Bryan\Documents\CrunchDAO Obesity\results_v6\figures\fig_11_figure_manifest_20260207_152953.png


In [16]:
# ============================================================
# v6_figures — Cell 13: Export a manuscript-ready figure bundle (ZIP) [Render-only]
# ============================================================
# Purpose:
#   - Package all figures from this run into a single zip for upload/sharing
# Output:
#   - figures_v6_<runstamp>.zip inside FIG_DIR
# ============================================================

import zipfile
from pathlib import Path

# Run stamp derived from the run metadata by stripping ":" / "-" / "Z" and turning
# "T" into "_" (e.g. "2026-02-07T15:29:53Z" -> "20260207_152953").
# `or "unknown"` is used instead of a .get() default because the default only applies
# when the key is missing: a key that is present but null would return None and the
# chained .replace() calls would raise AttributeError. str() guards non-string values.
run_stamp = (
    str(RUN_META.get("run_timestamp_utc") or "unknown")
    .replace(":", "")
    .replace("-", "")
    .replace("T", "_")
    .replace("Z", "")
)

# Figures belonging to this run, matched by the run-stamp suffix.
pdfs = sorted(FIG_DIR.glob(f"*_{run_stamp}.pdf"))
pngs = sorted(FIG_DIR.glob(f"*_{run_stamp}.png"))

# Fallback if some figures were saved without the runstamp suffix
if not pdfs and not pngs:
    pdfs = sorted(FIG_DIR.glob("*.pdf"))
    pngs = sorted(FIG_DIR.glob("*.png"))

zip_path = FIG_DIR / f"figures_v6_{run_stamp}.zip"

# Mode "w" overwrites any existing bundle for this run stamp; entries are stored
# flat (file name only), PDFs first and then PNGs.
with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as z:
    for p in pdfs + pngs:
        z.write(p, arcname=p.name)

print("Zipped figures to:", zip_path)
print("Included PDFs:", len(pdfs))
print("Included PNGs:", len(pngs))


Zipped figures to: C:\Users\Bryan\Documents\CrunchDAO Obesity\results_v6\figures\figures_v6_20260207_152953.zip
Included PDFs: 11
Included PNGs: 11
