# MaxQuant Mutant Peptide Channel Enrichment

For each detected mutant evidence row, this notebook parses the **Proteins**
column to find which mutation(s) the peptide is consistent with, looks up which
TMT channels carry that mutation (via per-sample FASTAs), then computes the
reporter-intensity (RI) enrichment ratio:

> **ratio = mean RI in channels that SHOULD have the mutation /
>            mean RI in channels that should NOT**

Mutant proteins in MaxQuant's `Proteins` column have the format
`{accession}-{swap}-{hash}` (e.g. `Q8N4T8-L70M-DC40`), produced by
`combine_plex_fastas.py` for Philosopher/TMT-Integrator compatibility.
This lets us parse `(accession, swap)` directly from that column.

Input: `combined/txt/evidence.txt` from a MaxQuant TMT11 search.

In [None]:
import os
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict

# ── CONFIG ────────────────────────────────────────────────────────────────────
# PLEX_ID selects the MaxQuant results directory and, further down, the rows of
# the TMT map belonging to this plex (compared against `run_metadata_id`).
PLEX_ID     = "01CPTAC_CCRCC_Proteome_JHU_20171007"
RESULTS_DIR = f"/scratch/leduc.an/AAS_Evo/MQ_SEARCH/results/{PLEX_ID}"
REPO_DIR    = "/home/leduc.an/AAS_Evo_project/AAS_Evo"
TMT_MAP     = f"{REPO_DIR}/metadata/PDC_meta/pdc_file_tmt_map.tsv"
GDC_META    = f"{REPO_DIR}/metadata/GDC_meta/gdc_meta_matched.tsv"
FASTA_DIR   = "/scratch/leduc.an/AAS_Evo/FASTA/per_sample"

EVIDENCE_FILE = os.path.join(RESULTS_DIR, "combined", "txt", "evidence.txt")
print(f"Evidence file: {EVIDENCE_FILE}")

# ── LOAD EVIDENCE ─────────────────────────────────────────────────────────────
# low_memory=False: infer each column's dtype from the whole file instead of
# chunk-by-chunk, which avoids mixed-type columns.
ev = pd.read_csv(EVIDENCE_FILE, sep="\t", low_memory=False)
print(f"Total evidence rows: {len(ev):,}")

# ── DETECT REPORTER INTENSITY COLUMNS ─────────────────────────────────────────
# TMT11 channel order matches isobaricLabels order in mqpar.xml:
# idx 1=126C, 2=127N, 3=127C, 4=128N, 5=128C, 6=129N, 7=129C,
#     8=130N, 9=130C, 10=131N, 11=131C
# These labels are the channel identifiers used throughout the notebook; the
# values of TMT_CHANNEL_MAP (defined in a later cell) use the same spelling.
RI_CHANNEL_ORDER = ["126C","127N","127C","128N","128C",
                    "129N","129C","130N","130C","131N","131C"]
# Channel label → 1-based index used as the numeric suffix of the RI columns.
CHANNEL_TO_IDX   = {ch: i + 1 for i, ch in enumerate(RI_CHANNEL_ORDER)}

def find_ri_cols(df):
    """Map each TMT channel label to its reporter-intensity column in *df*.

    Column names are tried as ``"<prefix> <idx>"`` with ``idx`` taken from
    ``CHANNEL_TO_IDX``. The corrected-intensity prefix is tried first, and the
    first prefix with at least one matching column wins (a partial set of
    channels is returned as-is).

    Returns:
        dict: ``{channel: column_name}``; empty when neither prefix matches,
        in which case up to 20 intensity-like column names are printed as a
        debugging hint.
    """
    for prefix in ("Reporter intensity corrected", "Reporter intensity"):
        found = {}
        for ch, idx in CHANNEL_TO_IDX.items():
            col = f"{prefix} {idx}"
            if col in df.columns:
                found[ch] = col
        if not found:
            continue
        print(f"Using prefix: '{prefix}'  ({len(found)} channels found)")
        return found

    # Nothing matched: show what the table does contain to aid diagnosis.
    candidates = [c for c in df.columns
                  if "reporter" in c.lower() or "intensity" in c.lower()]
    print("WARNING: no standard reporter intensity columns found. Intensity-like columns:")
    for name in candidates[:20]:
        print(f"  {name}")
    return {}

# Resolve the channel → column mapping once for the loaded evidence table.
# An empty dict means no reporter-intensity columns were recognised.
ri_col_map = find_ri_cols(ev)
print(f"RI column map: {ri_col_map}")

# ── PARSE MUTANT PROTEINS FROM Proteins COLUMN ────────────────────────────────
# A mutant entry has the form  accession-swap-hash  (e.g. Q8N4T8-L70M-DC40).
# A reference entry is a bare accession (e.g. P04637) and does not match.
_MUT_PAT = re.compile(r'^([A-Z][A-Z0-9]{4,9})-([A-Z][0-9]+[A-Z])-([A-Z0-9]{4})$')


def parse_mutant_proteins(proteins_str):
    """Extract the mutant identities listed in a MaxQuant ``Proteins`` field.

    The field is split on ``;`` and each whitespace-stripped entry is matched
    against ``_MUT_PAT``; entries that do not match are ignored.

    Returns:
        set: ``(accession, swap)`` tuples, empty for a missing value or when
        no entry is a mutant.
    """
    if pd.isna(proteins_str):
        return set()
    hits = (_MUT_PAT.match(token.strip())
            for token in str(proteins_str).split(";"))
    return {(hit.group(1), hit.group(2)) for hit in hits if hit}

# Annotate every evidence row with the set of (accession, swap) pairs parsed
# from its Proteins field; an empty set means no mutant entry was listed.
ev["_mut_pairs"] = ev["Proteins"].map(parse_mutant_proteins)
mut_mask = ev["_mut_pairs"].map(bool)   # True if any mutant protein found
mut_all  = ev[mut_mask].copy()

# Keep only mutant rows with at least one strictly positive reporter intensity
# (NaN counts as 0). If no RI columns were detected, all mutant rows are kept.
ri_cols = list(ri_col_map.values())
if ri_cols:
    nonzero_mask = (mut_all[ri_cols].fillna(0) > 0).any(axis=1)
    mut_ev = mut_all[nonzero_mask].copy()
else:
    mut_ev = mut_all.copy()

print(f"\nMutant evidence rows (any RI > 0): {len(mut_ev):,}  "
      f"(of {len(mut_all):,} mutant, {len(ev):,} total)")


In [None]:
# ── BUILD MUTATION → CHANNELS MAP (from per-sample FASTAs) ───────────────────
# Per-sample FASTAs use the original header format:
#   >mut|{accession}|{gene}|{swap}|{source}|{case_id}|{sample_type}
# which carries the patient identity needed to map mutation → TMT channel.

# Translates `tmt_channel` values from the TMT map file into the channel labels
# of RI_CHANNEL_ORDER. Note the unsuffixed names: "tmt_126" → "126C" and
# "tmt_131" → "131N". Values not listed here map to NaN.
TMT_CHANNEL_MAP = {
    "tmt_126":"126C", "tmt_127n":"127N", "tmt_127c":"127C",
    "tmt_128n":"128N","tmt_128c":"128C", "tmt_129n":"129N",
    "tmt_129c":"129C","tmt_130n":"130N", "tmt_130c":"130C",
    "tmt_131":"131N", "tmt_131c":"131C",
}

# Load the plex → channel map and the GDC sample metadata.
tmt = pd.read_csv(TMT_MAP, sep="\t")
gdc = pd.read_csv(GDC_META, sep="\t")

# The GDC UUID column is called 'gdc_file_id' or 'file_id' depending on which
# version of the metadata pipeline wrote the file. Normalise it to
# 'gdc_file_id' so the rest of the notebook can rely on one name.
uuid_col = "gdc_file_id"
if uuid_col not in gdc.columns:
    if "file_id" not in gdc.columns:
        raise ValueError(f"Cannot find UUID column in GDC metadata. "
                         f"Columns present: {list(gdc.columns)}")
    gdc = gdc.rename(columns={"file_id": "gdc_file_id"})
print(f"GDC UUID column: '{uuid_col}'")

# Channels of this plex, one row per (channel, case, sample_type).
plex_tmt = (tmt.loc[tmt["run_metadata_id"] == PLEX_ID,
                    ["tmt_channel", "case_submitter_id", "sample_type"]]
            .drop_duplicates())

# Drop reference/pooled channels and channels without a case ID.
# `.str.lower()` leaves missing values as NaN, which `.isin([... "nan"])` never
# matches, so missing IDs are first turned into the literal "nan". Otherwise
# they would survive the filter and, because pandas merges NaN keys with each
# other, could pick up GDC rows that also lack a case ID.
case_ids = plex_tmt["case_submitter_id"].fillna("nan").astype(str).str.lower()
# .copy() so the column assignment below writes to an independent frame rather
# than to a filtered view of `tmt`.
plex_tmt = plex_tmt[~case_ids.isin(["ref","reference","pooled","pool","nan"])].copy()
plex_tmt["channel"] = plex_tmt["tmt_channel"].map(TMT_CHANNEL_MAP)

# Diagnostic: check sample_type values align between TMT map and GDC metadata
# (compared case-insensitively here; the merge below compares the raw values).
tmt_sample_types = set(plex_tmt["sample_type"].dropna().str.lower().unique())
gdc_sample_types = set(gdc["sample_type"].dropna().str.lower().unique())
overlap = tmt_sample_types & gdc_sample_types
print(f"TMT map sample_type values:  {sorted(tmt_sample_types)}")
print(f"GDC meta sample_type values: {sorted(gdc_sample_types)}")
if not overlap:
    print("WARNING: No sample_type overlap — merge will produce all NaN UUIDs! "
          "Check normalization of sample_type in both files.")
else:
    print(f"sample_type overlap (OK):    {sorted(overlap)}")

# Left merge keeps every plex channel; channels without a GDC match get a NaN
# gdc_file_id. A (case, sample_type) pair with several GDC rows yields several
# rows here, so the counts printed below are merge rows, not unique channels.
plex_meta = plex_tmt.merge(
    gdc[["gdc_file_id","case_submitter_id","sample_type"]],
    on=["case_submitter_id","sample_type"], how="left")

n_matched = plex_meta["gdc_file_id"].notna().sum()
n_total   = len(plex_meta)
print(f"\nTMT channels in plex:        {n_total}")
print(f"  matched to GDC UUID:       {n_matched}")
print(f"  no GDC match (NaN UUID):   {n_total - n_matched}")

# Every patient channel of the plex, whether or not genomic data was found.
all_patient_channels = set(plex_meta["channel"].dropna().tolist())
print(f"All patient channels ({len(all_patient_channels)}): {sorted(all_patient_channels)}")

mutation_to_channels = defaultdict(set)   # (accession, swap) → {channels}
missing_fastas = []                       # UUIDs whose FASTA file was not found
# Only channels where a per-sample FASTA was found are included in the ratio
# universe. Channels with no GDC match or missing FASTA are excluded from both
# the numerator and denominator so they don't dilute the ratio computation.
channels_with_fastas = set()

for _, row in plex_meta.iterrows():
    uuid, channel = row["gdc_file_id"], row["channel"]
    if pd.isna(uuid) or pd.isna(channel):
        continue
    fasta_path = os.path.join(FASTA_DIR, f"{uuid}_mutant.fasta")
    if not os.path.exists(fasta_path):
        missing_fastas.append(uuid)
        continue
    channels_with_fastas.add(channel)
    with open(fasta_path) as f:
        for line in f:
            # Only header lines carry the mutation identity.
            if not line.startswith(">"):
                continue
            # Header fields: mut|accession|gene|swap|...  → key (accession, swap)
            parts = line[1:].strip().split("|")
            if len(parts) >= 4 and parts[0] == "mut":
                mutation_to_channels[(parts[1], parts[3])].add(channel)

print(f"Channels with per-sample FASTA ({len(channels_with_fastas)}): {sorted(channels_with_fastas)}")
n_excluded = len(all_patient_channels - channels_with_fastas)
if n_excluded:
    print(f"  Excluded from ratio universe (no FASTA): "
          f"{sorted(all_patient_channels - channels_with_fastas)}")
print(f"Unique (accession, swap) mutations mapped: {len(mutation_to_channels):,}")
print(f"Missing per-sample FASTAs:                 {len(missing_fastas)}")


In [None]:
# ── LOAD VAF PER (accession, swap, channel) ────────────────────────────────

MISSENSE  = "/scratch/leduc.an/AAS_Evo/VEP/all_missense_mutations.tsv"
REF_FASTA = "/scratch/leduc.an/AAS_Evo/SEQ_FILES/uniprot_human_canonical.fasta"

# Three-letter → one-letter codes for the 20 standard amino acids. Anything
# else (e.g. "Ter") is deliberately absent so that it is rejected below.
AA3TO1 = {
    "Ala":"A","Arg":"R","Asn":"N","Asp":"D","Cys":"C","Gln":"Q","Glu":"E",
    "Gly":"G","His":"H","Ile":"I","Leu":"L","Lys":"K","Met":"M","Phe":"F",
    "Pro":"P","Ser":"S","Thr":"T","Trp":"W","Tyr":"Y","Val":"V",
}

# Single-residue substitution: p.<Ref3><pos><Alt3>. The negative lookahead
# rejects strings where more letters follow the alternate residue, such as the
# frameshift "p.Arg175HisfsTer12", which would otherwise be read as R175H.
_HGVSP_MISSENSE = re.compile(r'p\.([A-Z][a-z]{2})(\d+)([A-Z][a-z]{2})(?![A-Za-z])')


def parse_hgvsp_to_swap(hgvsp):
    """Convert an HGVSp string into a one-letter swap such as ``"R175H"``.

    Args:
        hgvsp: HGVSp annotation, optionally prefixed by a protein ID
            (e.g. ``"ENSP00000269305.4:p.Arg175His"``). Any value is accepted;
            it is converted with ``str()`` first.

    Returns:
        The swap ``"<ref><position><alt>"``, or ``None`` when the string is
        not a plain substitution between two standard amino acids (stop gains,
        frameshifts, extensions and missing values all give ``None``).
    """
    m = _HGVSP_MISSENSE.search(str(hgvsp))
    if m is None:
        return None
    ref = AA3TO1.get(m.group(1))
    alt = AA3TO1.get(m.group(3))
    if ref and alt:
        return f"{ref}{m.group(2)}{alt}"
    return None

# Gene symbol → accession, read from the reference FASTA headers: the symbol is
# the `GN=` field and the accession is the second `|`-separated header field.
# If a symbol occurs in several headers, the last one wins.
gene_to_acc = {}
with open(REF_FASTA) as f:
    for line in f:
        if line.startswith(">"):
            m = re.search(r'GN=(\S+)', line)
            if m:
                gene_to_acc[m.group(1)] = line.split("|")[1]
print(f"Gene → accession entries: {len(gene_to_acc):,}")

# GDC UUID → TMT channel for this plex (rows lacking either value are dropped).
uuid_to_channel = plex_meta.dropna(subset=["gdc_file_id","channel"]) \
                            .set_index("gdc_file_id")["channel"].to_dict()

missense = pd.read_csv(MISSENSE, sep="\t", low_memory=False)
# `isin` on a dict tests membership among its keys, i.e. the plex UUIDs.
plex_missense = missense[missense["sample_id"].isin(uuid_to_channel)]
print(f"Missense rows for this plex: {len(plex_missense):,}")

# (accession, swap, channel) → VAF. Rows without a VAF, without a channel, or
# whose gene symbol / HGVSp cannot be resolved are skipped; a repeated key
# keeps the VAF of the last row seen.
mutation_channel_vaf = {}
for _, vrow in plex_missense.iterrows():
    vaf     = vrow.get("VAF", np.nan)
    channel = uuid_to_channel.get(vrow["sample_id"])
    if pd.isna(vaf) or not channel:
        continue
    acc  = gene_to_acc.get(str(vrow.get("SYMBOL", "")))
    swap = parse_hgvsp_to_swap(str(vrow.get("HGVSp", "")))
    if acc and swap:
        mutation_channel_vaf[(acc, swap, channel)] = float(vaf)

print(f"(accession, swap, channel) VAF entries: {len(mutation_channel_vaf):,}")


In [None]:
# ── COMPUTE CHANNEL ENRICHMENT RATIO PER MUTANT EVIDENCE ROW ─────────────────
# ratio     : mean RI(have channels) / mean RI(not-have channels)
# ratio_vaf : VAF-weighted mean RI(have channels) / mean RI(not-have channels)
#
# Universe = channels_with_fastas: only channels where we have genomic data
# (per-sample FASTA). Channels with no GDC match or missing FASTA are excluded
# from both numerator and denominator so they don't dilute the ratio.
# Reference/pooled channels are already excluded (filtered in cell 2).
#
# All-zero RI rows were already removed above.
# Channels with RI = 0 are included in the average (0s are valid signal).
# Channels with RI = NaN are excluded.

# The VAF cell is optional: if it was not run in this session, fall back to an
# empty table so that only the unweighted ratio is computed.
if "mutation_channel_vaf" not in dir():
    print("NOTE: VAF cell not run — ratio_vaf will be NaN for all rows.")
    mutation_channel_vaf = {}

print(f"Ratio universe — channels with FASTA ({len(channels_with_fastas)}): "
      f"{sorted(channels_with_fastas)}")

# First candidate column present in the evidence table is used as the
# precursor intensity. NOTE(review): "MS/MS count" is a count, not an
# intensity — confirm that this fallback is intended.
PRECURSOR_CANDIDATES = ["Intensity", "MS/MS count"]
precursor_col = next((c for c in PRECURSOR_CANDIDATES if c in mut_ev.columns), None)
if precursor_col:
    nonzero_prec = (mut_ev[precursor_col].fillna(0) > 0).sum()
    print(f"Precursor intensity: '{precursor_col}'  ({nonzero_prec:,} / {len(mut_ev):,} non-zero)")
else:
    print(f"WARNING: no precursor intensity column found. Checked: {PRECURSOR_CANDIDATES}")

# One dict per mutant evidence row for which both channel groups have RI data.
results = []
# Skip counters, reported after the loop.
n_no_mutations = n_no_channel_info = n_cant_split = 0

for _, row in mut_ev.iterrows():
    mutations = row["_mut_pairs"]
    if not mutations:
        n_no_mutations += 1
        continue

    # Channels expected to carry the peptide: union over every mutation the
    # row is consistent with.
    have_channels = set()
    for mut_key in mutations:
        have_channels |= mutation_to_channels.get(mut_key, set())
    # Restrict to channels where we have FASTA data (the ratio universe)
    have_channels     &= channels_with_fastas
    not_have_channels  = channels_with_fastas - have_channels

    # A ratio needs at least one channel on each side.
    if not have_channels or not not_have_channels:
        n_cant_split += 1
        continue

    # Keep the channel with each "have" intensity: it is needed for the
    # per-channel VAF lookup below. NaN intensities are dropped, zeros kept.
    have_ri = [(ch, row[ri_col_map[ch]])
               for ch in have_channels
               if ch in ri_col_map and pd.notna(row[ri_col_map[ch]])]
    not_ri  = [row[ri_col_map[ch]]
               for ch in not_have_channels
               if ch in ri_col_map and pd.notna(row[ri_col_map[ch]])]

    if not have_ri or not not_ri:
        n_no_channel_info += 1
        continue

    mean_not  = np.mean(not_ri)
    mean_have = np.mean([r for _, r in have_ri])
    ratio     = mean_have / mean_not if mean_not > 0 else np.nan

    # Pair each "have" intensity with the VAF of the first mutation that has
    # one recorded for that channel; channels without a VAF are left out.
    weighted_pairs = []
    for ch, ri_val in have_ri:
        vaf = None
        for acc, sw in mutations:
            v = mutation_channel_vaf.get((acc, sw, ch))
            if v is not None:
                vaf = v
                break
        if vaf is not None:
            weighted_pairs.append((ri_val, vaf))

    # np.average raises ZeroDivisionError when the weights sum to zero (for
    # example every recorded VAF is 0), so that case yields NaN instead.
    vaf_total = sum(w for _, w in weighted_pairs)
    if weighted_pairs and vaf_total > 0 and mean_not > 0:
        ratio_vaf = np.average([r for r, _ in weighted_pairs],
                               weights=[w for _, w in weighted_pairs]) / mean_not
    else:
        ratio_vaf = np.nan

    # Precursor intensity is recorded only when present and strictly positive.
    prec_val    = row.get(precursor_col) if precursor_col else np.nan
    prec_intens = float(prec_val) if (pd.notna(prec_val) and prec_val > 0) else np.nan

    results.append({
        "Peptide":             row.get("Sequence", ""),
        "n_have_ch":           len(have_channels),
        "n_not_have_ch":       len(not_have_channels),
        "mean_have_ri":        mean_have,
        "mean_not_ri":         mean_not,
        "ratio":               ratio,
        "ratio_vaf":           ratio_vaf,
        "precursor_intensity": prec_intens,
    })

# Explicit columns keep the frame well-formed when no row produced a result;
# a column-less empty frame would make dropna(subset=["ratio"]) raise KeyError.
ratios = pd.DataFrame(results, columns=[
    "Peptide", "n_have_ch", "n_not_have_ch", "mean_have_ri", "mean_not_ri",
    "ratio", "ratio_vaf", "precursor_intensity",
]).dropna(subset=["ratio"])
print(f"\nRows with computable ratio:         {len(ratios):,}  (of {len(mut_ev):,} mutant rows)")
print(f"  of which with VAF-weighted ratio: {ratios['ratio_vaf'].notna().sum():,}")
print(f"  with precursor intensity:         {ratios['precursor_intensity'].notna().sum():,}")
print(f"  Skipped — no mutation key:        {n_no_mutations}")
print(f"  Skipped — no channel split:       {n_cant_split}")
print(f"  Skipped — RI data missing:        {n_no_channel_info}")
print(f"\nUnweighted ratio summary:")
if ratios.empty:
    # Nothing to describe; say so instead of summarising an empty column.
    print("  (no rows with a computable ratio)")
else:
    print(ratios["ratio"].describe().round(2))


In [None]:
# ── PLOT: enrichment ratio histograms ─────────────────────────────────────────

# Ratios of exactly 0 cannot be placed on a log axis, so they are dropped from
# all three series.
r_unw  = ratios["ratio"].replace(0, np.nan).dropna()
r_vaf  = ratios["ratio_vaf"].replace(0, np.nan).dropna()
# "Channel-specific": rows whose mutation is expected in exactly one channel.
r_spec = ratios.loc[ratios["n_have_ch"] == 1, "ratio"].replace(0, np.nan).dropna()

# The first two panels share 60 log-spaced bin edges; the lower edge is
# floored at 1e-3.
# NOTE(review): only the bin edge is floored, not the data, so ratios below
# 1e-3 fall outside the bins and are not drawn although the panel title's n
# still counts them — confirm this is intended.
all_vals = pd.concat([r_unw, r_vaf]) if len(r_vaf) else r_unw
bins     = np.logspace(np.log10(all_vals.clip(lower=1e-3).min()),
                       np.log10(all_vals.max()), 60)
bins_spec = (np.logspace(np.log10(r_spec.clip(lower=1e-3).min()),
                         np.log10(r_spec.max()), 60)
             if len(r_spec) else bins)

med_unw  = r_unw.median()
med_vaf  = r_vaf.median()  if len(r_vaf)  else None
med_spec = r_spec.median() if len(r_spec) else None

fig, axes = plt.subplots(1, 3, figsize=(18, 4), sharey=False)
# (axis, data, median, bin edges, title, colour) for each panel
panels = [
    (axes[0], r_unw,  med_unw,  bins,      "All rows — Unweighted",                   "#4878d0"),
    (axes[1], r_vaf,  med_vaf,  bins,      "All rows — VAF-weighted",                 "#6acc65"),
    (axes[2], r_spec, med_spec, bins_spec, "Channel-specific (n_have=1)\nUnweighted", "#e07b39"),
]

for ax, data, med, b, title, color in panels:
    if len(data) == 0:
        ax.text(0.5, 0.5, "No data", ha="center", va="center", transform=ax.transAxes)
    else:
        ax.hist(data, bins=b, color=color, edgecolor="white", linewidth=0.4)
        ax.axvline(x=1,   color="grey",    linestyle="--", linewidth=1.2, label="ratio = 1")
        ax.axvline(x=med, color="#e74c3c", linestyle="-",  linewidth=1.5,
                   label=f"median = {med:.2f}")
        ax.set_xscale("log")
        ax.legend(fontsize=8)
    ax.set_xlabel("mean RI (have) / mean RI (not-have)  [log scale]")
    ax.set_ylabel("Number of mutant evidence rows")
    ax.set_title(f"{title}\n(n={len(data):,})")

fig.suptitle(f"Per-row channel enrichment — {PLEX_ID}", fontsize=11, y=1.02)
plt.tight_layout()
fig.savefig(os.path.join(RESULTS_DIR, "mutant_channel_enrichment.pdf"), bbox_inches="tight")
plt.show()

# Text summary of the three series; "—" marks a series with no data.
print(f"{'Metric':<32} {'Unweighted':>11} {'VAF-weighted':>13} {'n_have=1':>10}")
print("-" * 68)
for label, d1, d2, d3 in [
    ("Median ratio",
     f"{med_unw:.2f}", f"{med_vaf:.2f}" if med_vaf else "—", f"{med_spec:.2f}" if med_spec else "—"),
    ("% ratio > 1",
     f"{100*(r_unw>1).mean():.1f}%", f"{100*(r_vaf>1).mean():.1f}%" if len(r_vaf) else "—",
     f"{100*(r_spec>1).mean():.1f}%" if len(r_spec) else "—"),
    ("% ratio > 2",
     f"{100*(r_unw>2).mean():.1f}%", f"{100*(r_vaf>2).mean():.1f}%" if len(r_vaf) else "—",
     f"{100*(r_spec>2).mean():.1f}%" if len(r_spec) else "—"),
    ("% ratio > 5",
     f"{100*(r_unw>5).mean():.1f}%", f"{100*(r_vaf>5).mean():.1f}%" if len(r_vaf) else "—",
     f"{100*(r_spec>5).mean():.1f}%" if len(r_spec) else "—"),
    ("N rows", f"{len(r_unw):,}", f"{len(r_vaf):,}", f"{len(r_spec):,}"),
]:
    print(f"{label:<32} {d1:>11} {d2:>13} {d3:>10}")


In [None]:
# ── PER-MUTATION SUMMARY TABLE ─────────────────────────────────────────────────
# One row per detected mutation.

# Prefer PEP, fall back to Score; every column other than "Score" is treated as
# lower-is-better.
score_col          = "PEP" if "PEP" in ev.columns else ("Score" if "Score" in ev.columns else None)
score_lower_better = score_col != "Score"
print(f"Score column: '{score_col}'  (lower better: {score_lower_better})")

# Collected over ALL evidence rows (mutant and reference), keyed by the
# upper-cased, stripped peptide sequence.
pep_prec_intens = defaultdict(list)   # seq → [precursor intensities]
pep_scores_all  = defaultdict(list)   # seq → [score values]
for _, row in ev.iterrows():
    seq = str(row.get("Sequence", "")).upper().strip()
    if not seq: continue
    pi = row.get(precursor_col, np.nan) if precursor_col else np.nan
    sc = row.get(score_col, np.nan)     if score_col     else np.nan
    # Only strictly positive precursor intensities are recorded.
    if not pd.isna(pi) and float(pi) > 0:
        pep_prec_intens[seq].append(float(pi))
    if not pd.isna(sc):
        pep_scores_all[seq].append(float(sc))

# Every peptide sequence seen in the evidence table, normalised the same way.
all_detected_seqs = set(ev["Sequence"].str.upper().str.strip().dropna())

# Group evidence rows by (accession, swap)
# A row consistent with several mutations is listed under each of them.
mut_key_to_ev_rows = defaultdict(list)
for _, row in mut_ev.iterrows():
    for mut_key in row["_mut_pairs"]:
        mut_key_to_ev_rows[mut_key].append(row)

def derive_ref_candidates(mut_pep, swap):
    """List the reference peptides that *mut_pep* could have been mutated from.

    The swap (``<ref><position><alt>``, e.g. ``"L70M"``) gives the residue
    change but not its offset within the peptide, so the upper-cased peptide
    is scanned for the alternate residue and one candidate is produced per
    occurrence, with that single occurrence reverted to the reference residue.

    Returns:
        list: candidate sequences in left-to-right order; empty when *swap*
        is not of the expected form or the alternate residue does not occur.
    """
    parsed = re.match(r'^([A-Z])(\d+)([A-Z])$', str(swap))
    if parsed is None:
        return []
    ref_aa, _, mut_aa = parsed.groups()
    seq = str(mut_pep).upper()
    candidates = []
    for pos, residue in enumerate(seq):
        if residue == mut_aa:
            candidates.append(seq[:pos] + ref_aa + seq[pos + 1:])
    return candidates

mut_info = {}              # (accession, swap) → {mut_pep, ref_pep, ref_detected}
# ref peptide → mutations sharing it; used later to find other mutations at the
# same site.
ref_pep_to_mut_keys = defaultdict(set)

for (accession, swap), rows_list in mut_key_to_ev_rows.items():
    # Representative mutant peptide: the sequence with the most evidence rows.
    pep_counts = defaultdict(int)
    for r in rows_list:
        pep_counts[str(r.get("Sequence", "")).upper().strip()] += 1
    mut_pep = max(pep_counts, key=pep_counts.get) if pep_counts else None
    if not mut_pep: continue

    # Prefer a reference candidate that was actually detected; otherwise use
    # the first candidate, or None if there are none.
    ref_cands   = [rc.upper() for rc in derive_ref_candidates(mut_pep, swap)]
    ref_pep_det = next((rc for rc in ref_cands if rc in all_detected_seqs), None)
    ref_pep_use = ref_pep_det or (ref_cands[0] if ref_cands else None)

    mut_info[(accession, swap)] = {
        "mut_pep":      mut_pep,
        "ref_pep":      ref_pep_use,
        # None (not False) when no reference candidate could be derived.
        "ref_detected": (ref_pep_det is not None) if ref_cands else None,
    }
    if ref_pep_use:
        ref_pep_to_mut_keys[ref_pep_use].add((accession, swap))

# One summary row per mutation that is present in a per-sample FASTA and has
# at least one evidence row (i.e. an entry in mut_info).
summary_rows = []
for (accession, swap), channels in mutation_to_channels.items():
    info = mut_info.get((accession, swap))
    if not info:
        continue

    mut_pep = info["mut_pep"]
    ref_pep = info["ref_pep"]

    # Plex-level VAF: mean over the channels for which a VAF is recorded.
    vafs     = [mutation_channel_vaf[(accession, swap, ch)]
                for ch in channels if (accession, swap, ch) in mutation_channel_vaf]
    plex_vaf = np.mean(vafs) if vafs else np.nan

    # Other mutations that share this mutation's reference peptide.
    other_keys = set()
    if ref_pep:
        other_keys = ref_pep_to_mut_keys.get(ref_pep, set()) - {(accession, swap)}
    other_muts = {f"{o_acc}:{o_sw}" for o_acc, o_sw in other_keys}

    rows_list    = mut_key_to_ev_rows[(accession, swap)]
    mut_intens   = pep_prec_intens.get(mut_pep, [])
    ref_intens   = pep_prec_intens.get(ref_pep, []) if ref_pep else []

    # Count each distinct peptide once. Several mutation keys can share one
    # peptide (for example an evidence row listing more than one mutant
    # protein), and a peptide already counted as this mutation's mutant or
    # reference peptide must not be added again. Otherwise the denominator is
    # inflated and the site fractions are understated.
    other_peps  = {mut_info[k]["mut_pep"] for k in other_keys if k in mut_info}
    other_peps -= {mut_pep, ref_pep}
    other_intens = []
    for pep in sorted(other_peps):
        other_intens += pep_prec_intens.get(pep, [])

    total_site = sum(mut_intens) + sum(ref_intens) + sum(other_intens)
    mut_frac   = sum(mut_intens) / total_site if total_site > 0 else np.nan
    ref_frac   = sum(ref_intens) / total_site if total_site > 0 else np.nan

    scores  = pep_scores_all.get(mut_pep, [])
    best_sc = (min(scores) if score_lower_better else max(scores)) if scores else np.nan

    summary_rows.append({
        "accession":           accession,
        "swap":                swap,
        "n_channels":          len(channels),
        "plex_vaf":            plex_vaf,
        "n_evidence_rows":     len(rows_list),
        "mut_peptide":         mut_pep,
        "ref_peptide":         ref_pep,
        "ref_detected":        info["ref_detected"],
        "other_muts_at_site":  "; ".join(sorted(other_muts)) or "",
        "mut_site_fraction":   mut_frac,
        "ref_site_fraction":   ref_frac,
        "mean_prec_intensity": np.mean(mut_intens) if mut_intens else np.nan,
        score_col or "score":  best_sc,
    })

# Explicit columns keep the frame usable when no mutation qualified; a
# column-less empty frame would raise KeyError on the lookups below.
summary = pd.DataFrame(summary_rows, columns=[
    "accession", "swap", "n_channels", "plex_vaf", "n_evidence_rows",
    "mut_peptide", "ref_peptide", "ref_detected", "other_muts_at_site",
    "mut_site_fraction", "ref_site_fraction", "mean_prec_intensity",
    score_col or "score",
])
print(f"Per-mutation summary: {len(summary):,} mutations with evidence rows")
print(f"  ref_detected = True:  {summary['ref_detected'].eq(True).sum():,}")
print(f"  ref_detected = False: {summary['ref_detected'].eq(False).sum():,}")
print(f"\nMS-level allele fraction:")
if summary.empty:
    print("  (no mutations with evidence rows)")
else:
    print(summary["mut_site_fraction"].describe().round(3))
display(summary.sort_values("mut_site_fraction", ascending=False).head(20))


In [None]:
# ── DETECTION RATE BY VAF BIN ──────────────────────────────────────────────────

detected_keys  = set(mut_key_to_ev_rows.keys())
ref_det_lookup = {k: v["ref_detected"] for k, v in mut_info.items()}

# One row per mutation present in any per-sample FASTA of the plex, whether or
# not a peptide for it was detected.
all_mut_rows = []
for (accession, swap), channels in mutation_to_channels.items():
    vafs = [mutation_channel_vaf[(accession, swap, ch)]
            for ch in channels if (accession, swap, ch) in mutation_channel_vaf]
    all_mut_rows.append({
        "accession":    accession,
        "swap":         swap,
        "n_channels":   len(channels),
        "plex_vaf":     np.mean(vafs) if vafs else np.nan,
        "mut_detected": (accession, swap) in detected_keys,
        "ref_detected": ref_det_lookup.get((accession, swap)),
    })

all_mut_df = pd.DataFrame(all_mut_rows)
has_vaf    = all_mut_df.dropna(subset=["plex_vaf"]).copy()

VAF_EDGES  = [0, 0.10, 0.20, 0.30, 0.50, 1.001]
VAF_LABELS = ["0.00–0.10","0.10–0.20","0.20–0.30","0.30–0.50","0.50–1.00"]
# pd.cut builds right-closed bins and, by default, leaves out the lowest edge,
# so a VAF of exactly 0 would get no bin and vanish from the per-bin rows.
# include_lowest=True closes the first bin on the left; all other bin
# assignments are unchanged.
has_vaf["vaf_bin"] = pd.cut(has_vaf["plex_vaf"], bins=VAF_EDGES,
                            labels=VAF_LABELS, include_lowest=True)

print(f"All mutations in plex FASTA: {len(all_mut_df):,}  (with VAF: {len(has_vaf):,})")
print()
print(f"  {'VAF bin':<12} {'Total':>7}  {'Mut det':>8}  {'Det%':>6}  {'Ref det|mut':>12}  {'Ref%|mut':>9}")
print("  " + "-" * 60)

def _row(label, grp):
    n_tot   = len(grp)
    n_mdet  = grp["mut_detected"].sum()
    det_pct = f"{100*n_mdet/n_tot:.1f}%" if n_tot else "—"
    det_grp = grp[grp["mut_detected"]]
    n_rdet  = det_grp["ref_detected"].eq(True).sum()
    ref_pct = f"{100*n_rdet/len(det_grp):.1f}%" if len(det_grp) else "—"
    print(f"  {label:<12} {n_tot:>7,}  {n_mdet:>8,}  {det_pct:>6}  {n_rdet:>12,}  {ref_pct:>9}")

# One table line per VAF bin, then two totals: mutations with a VAF, and all
# mutations including those without one.
for b in VAF_LABELS:
    _row(b, has_vaf[has_vaf["vaf_bin"] == b])
print("  " + "-" * 60)
_row("ALL w/ VAF", has_vaf)
_row("ALL",        all_mut_df)


In [None]:
# ── PLOT: ratio by precursor intensity quintile ────────────────────────────────

# Only rows with a strictly positive precursor intensity can be log-binned.
r_prec = ratios.dropna(subset=["precursor_intensity"]).copy()
r_prec = r_prec[r_prec["precursor_intensity"] > 0]

if len(r_prec) < 10:
    print(f"Insufficient data: only {len(r_prec)} rows have precursor intensity > 0.")
else:
    # Quantile bins on log10 intensity; duplicates="drop" merges bins whose
    # edges coincide, so fewer than five bins may result.
    log_intens = np.log10(r_prec["precursor_intensity"])
    try:
        r_prec["intens_bin"] = pd.qcut(log_intens, q=5, duplicates="drop")
    except ValueError:
        r_prec["intens_bin"] = pd.qcut(log_intens, q=4, duplicates="drop")

    bin_order   = list(r_prec["intens_bin"].cat.categories)
    # (ratio column, panel title, colour, rows to plot) for each panel
    panel_specs = [
        ("ratio",     "All rows — Unweighted",                       "#4878d0", r_prec),
        ("ratio_vaf", "All rows — VAF-weighted",                     "#6acc65",
         r_prec[r_prec["ratio_vaf"].notna()]),
        ("ratio",     "Channel-specific (n_have=1)\nUnweighted",     "#e07b39",
         r_prec[r_prec["n_have_ch"] == 1]),
    ]

    fig, axes = plt.subplots(1, 3, figsize=(18, 5), sharey=False)
    for ax, (col, title, color, df) in zip(axes, panel_specs):
        data_by_bin = []
        tick_labels = []
        for b in bin_order:
            # Zero ratios cannot be shown on the log y-axis and are dropped.
            grp = df[df["intens_bin"] == b][col].replace(0, np.nan).dropna()
            # An empty bin is passed as a single NaN so that it still takes up
            # one x position.
            data_by_bin.append(grp.values if len(grp) else np.array([np.nan]))
            tick_labels.append(f"10^{b.left:.1f}–\n10^{b.right:.1f}\n(n={len(grp):,})")
        ax.boxplot(
            data_by_bin, patch_artist=True, showfliers=True,
            flierprops=dict(marker=".", markersize=2, alpha=0.25, color=color),
            medianprops=dict(color="#e74c3c", linewidth=2),
            boxprops=dict(facecolor=color, alpha=0.45),
            whiskerprops=dict(color="grey", linewidth=1),
            capprops=dict(color="grey", linewidth=1),
        )
        ax.axhline(y=1, color="grey", linestyle="--", linewidth=1.2, label="ratio = 1")
        ax.set_yscale("log")
        ax.set_xticks(range(1, len(bin_order) + 1))
        ax.set_xticklabels(tick_labels, fontsize=7.5)
        ax.set_xlabel(f"Precursor intensity ('{precursor_col}') — low → high", fontsize=9)
        ax.set_ylabel("mean RI (have) / mean RI (not-have)  [log scale]")
        # n in the title = non-zero, non-NaN ratios in this panel.
        ax.set_title(f"{title}\n(n = {df[col].replace(0,np.nan).dropna().__len__():,})")
        ax.legend(fontsize=8)

    fig.suptitle(
        f"Channel enrichment vs precursor intensity — {PLEX_ID}\n"
        "(Flat trend → background RI noise is NOT the main cause of diluted ratios)",
        fontsize=10, y=1.03)
    plt.tight_layout()
    fig.savefig(os.path.join(RESULTS_DIR, "mutant_channel_enrichment_intensity.pdf"),
                bbox_inches="tight")
    plt.show()
