# FragPipe Search Validation

Validates that mutant peptides are identified in the correct TMT channel.

**New FASTA header format** (Philosopher-compatible):
`>sp|{accession}-{swap}-{hash}|{gene}-mut ... GN={gene} PE=1 SV=1`

For each TMT channel, identifies mutant PSMs where that channel has the
highest reporter ion (RI) intensity (data-driven channel assignment), then
plots boxplots of RI across all channels for those PSMs.

If the pipeline is working correctly, the channel with the highest RI for
a given mutant peptide should strongly dominate over other channels — i.e.,
the red (focal) box should sit above the blue boxes.

In [None]:
import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import os

# ── CONFIG ────────────────────────────────────────────────────────────────────
# Plex to validate; the results and annotation paths below are derived from it.
PLEX_ID     = "01CPTAC_CCRCC_Proteome_JHU_20171007"   # change to target plex
RESULTS_DIR = f"/scratch/leduc.an/AAS_Evo/MS_SEARCH/results/{PLEX_ID}"
ANNOT_FILE  = f"/scratch/leduc.an/AAS_Evo/MS_SEARCH/annotations/{PLEX_ID}_annotation.tsv"

# FragPipe writes psm.tsv into an experiment folder named {PLEX_ID}_1/.
# Restricting the glob to *_1 avoids tmt-report/ and other output directories.
psm_matches = glob.glob(os.path.join(RESULTS_DIR, "*_1/psm.tsv"))
psm_matches.sort()
if len(psm_matches) == 0:
    raise FileNotFoundError(f"No *_1/psm.tsv found under {RESULTS_DIR}")
PSM_FILE = psm_matches[0]   # first match in sorted order
# ─────────────────────────────────────────────────────────────────────────────

# Fail early, naming the first required input that is absent on disk.
for required_path in (PSM_FILE, ANNOT_FILE):
    if os.path.exists(required_path):
        continue
    raise FileNotFoundError(f"Not found: {required_path}")

print(
    f"Plex:       {PLEX_ID}",
    f"PSM file:   {PSM_FILE}",
    f"Annotation: {ANNOT_FILE}",
    sep="\n",
)

In [None]:
# ── LOAD DATA ─────────────────────────────────────────────────────────────────
# Both inputs are tab-separated tables: the FragPipe PSM table and the
# per-plex sample annotation.
psm = pd.read_csv(PSM_FILE, sep="\t", low_memory=False)
annot = pd.read_csv(ANNOT_FILE, sep="\t")

# Quick shape/column overview so the following cells can be sanity-checked.
n_psm_rows, n_psm_cols = psm.shape
leading_cols = list(psm.columns[:20])
annot_table = annot.to_string(index=False)

print(f"Total PSMs:  {n_psm_rows:,}")
print(f"Columns ({n_psm_cols}): {leading_cols} ...")
print(f"\nAnnotation:\n{annot_table}")

In [None]:
# ── DETECT TMT INTENSITY COLUMNS ──────────────────────────────────────────────
# FragPipe TMT column names depend on the experiment layout.
# Known formats:
#   '126', '127N', ...                           (bare channel label)
#   '126 Intensity', '127N Intensity', ...        (label + Intensity)
#   'Intensity {experiment}_{channel}'            (e.g. Intensity 01CPTAC_..._1_127N)
# TMT channel labels, in the order used when detecting RI columns and when
# sorting / selecting per-channel output in later cells.
CHANNEL_ORDER = [
    "126", "127N", "127C", "128N", "128C",
    "129N", "129C", "130N", "130C", "131N", "131C"
]

def find_ri_cols(df, channel_order):
    """Map each TMT channel label to its reporter-ion intensity column.

    Channels are resolved in the order given by ``channel_order``; for each
    label the first rule that matches wins:

      1. a column named exactly like the label (e.g. ``'126'``);
      2. a column starting with ``'Intensity'`` and ending with ``'_<label>'``;
      3. a column starting with the label whose name contains ``'intensity'``
         (case-insensitive).

    When a rule matches several columns, the first one in ``df.columns`` order
    is used. Labels with no matching column are left out of the result.

    Returns:
        dict of ``{channel_label: column_name}`` in ``channel_order`` order.
    """
    mapping = {}
    for label in channel_order:
        if label in df.columns:
            # Rule 1: bare channel label.
            chosen = label
        else:
            # Rule 2: per-experiment 'Intensity ..._<label>' columns.
            matches = [name for name in df.columns
                       if name.startswith("Intensity")
                       and name.endswith(f"_{label}")]
            if not matches:
                # Rule 3: '<label> Intensity'-style columns.
                matches = [name for name in df.columns
                           if name.startswith(label)
                           and "intensity" in name.lower()]
            if not matches:
                continue
            chosen = matches[0]
        mapping[label] = chosen
    return mapping

# Resolve the RI columns once; later cells read ri_col_map.
ri_col_map = find_ri_cols(psm, CHANNEL_ORDER)
print(f"RI columns found ({len(ri_col_map)}):")

if ri_col_map:
    for label, column in ri_col_map.items():
        print(f"  {label:6s} -> {column}")
else:
    # Nothing matched: list intensity-like columns to help diagnose the layout.
    intensity_cols = [name for name in psm.columns if "intensity" in name.lower()]
    print("\nWARNING: no RI columns matched. Intensity-related columns in PSM:")
    for name in intensity_cols[:20]:
        print(f"  {name}")

In [None]:
# ── FILTER TO MUTANT PSMs ─────────────────────────────────────────────────────
# New FASTA format: >sp|{accession}-{swap}-{hash}|{gene}-mut ... GN={gene} ...
#
# FragPipe PSM columns:
#   Protein    = "sp|Q02952-K117E-A3F2|AKAP12-mut"  (raw FASTA entry)
#   Protein ID = "Q02952-K117E-A3F2"                 (accession field)
#   Entry Name = "AKAP12-mut"                         (entry name field)
#
# A PSM counts as mutant when its Entry Name ends with "-mut"
# (this excludes reference sp| and comp| entries).

entry_col = "Entry Name"
protid_col = "Protein ID"

if entry_col not in psm.columns:
    # Fallback: take the third pipe-delimited field of the Protein column and
    # keep only its first whitespace-separated token.
    third_field = psm["Protein"].str.split("|").str[2]
    psm[entry_col] = third_field.str.split().str[0]

# Rows with a missing entry name are treated as non-mutant (na=False).
is_mutant = psm[entry_col].str.endswith("-mut", na=False)
mut_psm = psm[is_mutant].copy()
print(f"Mutant PSMs: {len(mut_psm):,} of {len(psm):,} total")

def parse_mutation(protein_id):
    """Return the mutation code embedded in a mutant Protein ID.

    The ID is split on '-' and the second field is returned, e.g.
    'Q02952-K117E-A3F2' -> 'K117E'. IDs with fewer than three '-'-separated
    fields yield None. The argument is passed through ``str()`` first.
    """
    fields = str(protein_id).split("-")
    if len(fields) < 3:
        return None
    return fields[1]

def parse_accession(protein_id):
    """Return the text before the first '-' of a Protein ID.

    Example: 'Q02952-K117E-A3F2' -> 'Q02952'. The argument is passed through
    ``str()`` first, so an ID without '-' is returned unchanged as a string.
    """
    head, _sep, _tail = str(protein_id).partition("-")
    return head

# Annotate each mutant PSM with the mutation code and base accession parsed
# from its Protein ID; both columns are None when that column is absent.
if protid_col in mut_psm.columns:
    mut_psm["mutation"]  = mut_psm[protid_col].apply(parse_mutation)
    mut_psm["accession"] = mut_psm[protid_col].apply(parse_accession)
else:
    mut_psm["mutation"]  = None
    mut_psm["accession"] = None

# Gene = entry name without its trailing "-mut" marker. The pattern is anchored
# at the end so a "-mut" occurring inside the name itself is left intact.
mut_psm["gene"] = mut_psm[entry_col].str.replace(r"-mut$", "", regex=True)

print(f"Unique mutations:  {mut_psm['mutation'].nunique()}")
print(f"Unique genes:      {mut_psm['gene'].nunique()}")
print(f"\nTop mutations:\n{mut_psm['mutation'].value_counts().head(10)}")

In [None]:
# ── BUILD CHANNEL → PATIENT MAPPING ──────────────────────────────────────────
# annotation: sample, plex, channel, condition
# sample = "{case_id}_{sample_type}" or "Reference"
print(annot.to_string(index=False))

# Sample names (lower-cased) that denote a reference/pooled channel.
_NON_PATIENT_SAMPLES = ("reference", "ref", "pooled", "pool")

channel_to_patient = {}  # FragPipe channel label -> case_submitter_id
for raw_sample, raw_channel in zip(annot["sample"], annot["channel"]):
    sample_name = str(raw_sample)
    if sample_name.lower() in _NON_PATIENT_SAMPLES:
        continue
    # Channel labels are upper-cased ('126', '127N', ...); the patient ID is
    # the part of the sample name before the first underscore.
    channel_to_patient[str(raw_channel).upper()] = sample_name.split("_")[0]

# Print in CHANNEL_ORDER order; labels not in CHANNEL_ORDER sort last.
_channel_rank = {label: pos for pos, label in enumerate(CHANNEL_ORDER)}
print(f"\nChannel → patient mapping ({len(channel_to_patient)} channels):")
for label, patient_id in sorted(channel_to_patient.items(),
                                key=lambda item: _channel_rank.get(item[0], 99)):
    print(f"  {label:6s} -> {patient_id}")

In [None]:
# ── CELL A: LOAD PATIENT ↔ CHANNEL METADATA ───────────────────────────────────
# Builds plex_meta: channel → case_submitter_id → GDC UUID for this plex.
# Needed to know which per-sample FASTAs belong to which TMT channels.

REPO_DIR   = "/home/leduc.an/AAS_Evo_project/AAS_Evo"
TMT_MAP    = f"{REPO_DIR}/metadata/PDC_meta/pdc_file_tmt_map.tsv"
GDC_META   = f"{REPO_DIR}/metadata/GDC_Meta/gdc_meta_matched.tsv"
FASTA_DIR  = "/scratch/leduc.an/AAS_Evo/FASTA/per_sample"
PLEX_FASTA = f"/scratch/leduc.an/AAS_Evo/FASTA/per_plex/{PLEX_ID}.fasta"

# Metadata channel names → channel labels used elsewhere in this notebook.
TMT_CHANNEL_MAP = {
    "tmt_126":"126","tmt_127n":"127N","tmt_127c":"127C",
    "tmt_128n":"128N","tmt_128c":"128C","tmt_129n":"129N",
    "tmt_129c":"129C","tmt_130n":"130N","tmt_130c":"130C",
    "tmt_131":"131N","tmt_131c":"131C",
}

tmt = pd.read_csv(TMT_MAP, sep="\t")
gdc = pd.read_csv(GDC_META, sep="\t")

# Filter to this plex, drop reference/pooled channels
plex_tmt = (tmt[tmt["run_metadata_id"] == PLEX_ID]
            [["tmt_channel","case_submitter_id","sample_type"]].drop_duplicates())
# Cast to str before comparing: a missing case_submitter_id is a real NaN, which
# `.str.lower()` leaves as NaN and `isin([... "nan"])` would then never match.
case_ids = plex_tmt["case_submitter_id"].astype(str).str.strip().str.lower()
# .copy() so the column assignment below writes to an independent frame rather
# than to a filtered view of `tmt`.
plex_tmt = plex_tmt[~case_ids.isin(["ref","reference","pooled","pool","nan"])].copy()
# Channels missing from TMT_CHANNEL_MAP become NaN here.
plex_tmt["channel"] = plex_tmt["tmt_channel"].map(TMT_CHANNEL_MAP)

# Join GDC meta to get UUID (left join: rows without a GDC match are kept
# with a NaN gdc_file_id).
plex_meta = plex_tmt.merge(
    gdc[["gdc_file_id","case_submitter_id","sample_type"]],
    on=["case_submitter_id","sample_type"], how="left")

print(f"Patients in plex: {len(plex_meta)}")
print(plex_meta[["channel","case_submitter_id","gdc_file_id"]].to_string(index=False))

In [None]:
# ── CELL B: MUTATION → CHANNELS MAP FROM PER-SAMPLE FASTAs ───────────────────
# For each (accession, swap) mutation, collect which TMT channels (patients)
# carry it by scanning per-sample mutant FASTAs.
# Per-sample FASTAs use OLD header format: >mut|accession|gene|swap|source

from collections import defaultdict, Counter

mutation_to_channels = defaultdict(list)   # (accession, swap) → [channel, ...]
missing_fastas = []                        # UUIDs whose FASTA file is absent

for _, row in plex_meta.iterrows():
    uuid    = row["gdc_file_id"]
    channel = row["channel"]
    # Rows without a GDC match or without a mapped channel cannot be used.
    if pd.isna(uuid) or pd.isna(channel):
        continue
    fasta_path = os.path.join(FASTA_DIR, f"{uuid}_mutant.fasta")
    if not os.path.exists(fasta_path):
        missing_fastas.append(uuid)
        continue
    with open(fasta_path) as f:
        for line in f:
            if not line.startswith(">"):
                continue
            parts = line[1:].strip().split("|")
            if len(parts) >= 4 and parts[0] == "mut":   # old format
                carriers = mutation_to_channels[(parts[1], parts[3])]
                # Record each channel once per mutation: a header repeated in
                # one FASTA, or several files mapping to the same channel,
                # must not inflate the per-mutation patient count below.
                if channel not in carriers:
                    carriers.append(channel)

print(f"Unique mutations in plex:  {len(mutation_to_channels):,}")
print(f"Missing per-sample FASTAs: {len(missing_fastas)}")

# Histogram: number of carrying channels → number of mutations.
patient_counts = Counter(len(v) for v in mutation_to_channels.values())
print("\nN patients per mutation (how many patients share each mutation):")
for k in sorted(patient_counts):
    print(f"  {k} patient(s): {patient_counts[k]:,} mutations")

In [None]:
# ── CELL C: DERIVE REFERENCE TRYPTIC PEPTIDES & CHECK DETECTION ───────────────
#
# For each mutant tryptic peptide (e.g. "VVVDGGGK"):
#   1. Parse swap code (e.g. "A123D") → ref_aa="A", mut_aa="D"
#   2. For each occurrence of mut_aa in the mutant sequence, substitute ref_aa
#      at that single position → one candidate reference peptide per
#      occurrence (e.g. "VVVAGGGK")
#   3. Check if ANY candidate appears in the full PSM Peptide column
#      (reference peptide comes from a UniProt entry, not a mutant entry)
#
# Separation into "ref detected / not detected" is the key conditioning:
#   - ref NOT detected → the MS didn't even sample that tryptic region;
#     a missing mutant PSM tells us nothing.
#   - ref detected     → the region WAS sampled; a missing mutant is a real miss.

import re

def parse_swap(swap):
    """Split a swap code into its reference and mutant residues.

    'K117E' → ('K', 'E'). The code must be one uppercase letter, one or more
    digits, then one uppercase letter; anything else gives (None, None).
    The argument is passed through ``str()`` before matching.
    """
    hit = re.match(r'^([A-Z])(\d+)([A-Z])$', str(swap))
    if hit is None:
        return (None, None)
    ref_aa, _position, mut_aa = hit.groups()
    return (ref_aa, mut_aa)

def derive_ref_candidates(seq, ref_aa, mut_aa):
    """List candidate reference peptides for a mutant sequence.

    One candidate is produced per position where ``seq`` holds ``mut_aa``:
    that single residue is swapped for ``ref_aa`` and the rest of the sequence
    is kept. Candidates are returned in left-to-right position order; the
    list is empty when ``mut_aa`` does not occur in ``seq``.
    """
    candidates = []
    for pos, residue in enumerate(seq):
        if residue != mut_aa:
            continue
        candidates.append(seq[:pos] + ref_aa + seq[pos + 1:])
    return candidates

# Parse per-plex FASTA for mutant entries → (accession, swap) → sequence
mut_seqs = {}      # (accession, swap) → peptide sequence
current_key, buf = None, []
with open(PLEX_FASTA) as f:
    for line in f:
        line = line.strip()
        if line.startswith(">"):
            # A new header closes the previous record; store it if it was a
            # mutant entry with at least one sequence line.
            if current_key and buf:
                mut_seqs[current_key] = "".join(buf)
            buf, current_key = [], None
            parts = line[1:].split("|")
            # New format: sp|Q02952-K117E-A3F2|AKAP12-mut ...
            # The entry name is the first whitespace-separated token of the
            # third field; an empty/blank third field yields no tokens and is
            # skipped instead of raising IndexError.
            entry_tokens = parts[2].split() if len(parts) >= 3 else []
            if entry_tokens and entry_tokens[0].endswith("-mut"):
                pid_parts = parts[1].split("-")   # [accession, swap, hash]
                if len(pid_parts) >= 3:
                    current_key = (pid_parts[0], pid_parts[1])
        elif current_key:
            # Sequence line belonging to the current mutant entry.
            buf.append(line)
    # Flush the last record of the file.
    if current_key and buf:
        mut_seqs[current_key] = "".join(buf)

print(f"Mutant entries in per-plex FASTA: {len(mut_seqs):,}")

# Build full set of detected peptides from ALL PSMs (reference + mutant)
psm_peptides = set(psm["Peptide"].dropna().str.upper())
print(f"Unique peptides in PSM file:      {len(psm_peptides):,}")

# For each (accession, swap), is the corresponding reference tryptic peptide detected?
ref_peptide_detected = {}   # (accession, swap) → True / False / None (unparseable swap)
for (accession, swap), seq in mut_seqs.items():
    ref_aa, mut_aa = parse_swap(swap)
    if ref_aa is None:
        ref_peptide_detected[(accession, swap)] = None
        continue
    # Detected if any single-position back-substitution is among the PSM peptides.
    candidates = derive_ref_candidates(seq, ref_aa, mut_aa)
    ref_peptide_detected[(accession, swap)] = any(
        c.upper() in psm_peptides for c in candidates)

n_ref_yes  = sum(1 for v in ref_peptide_detected.values() if v is True)
n_ref_no   = sum(1 for v in ref_peptide_detected.values() if v is False)
n_ref_none = sum(1 for v in ref_peptide_detected.values() if v is None)
print(f"\nReference tryptic peptide detected:     {n_ref_yes:,}")
print(f"Reference tryptic peptide NOT detected: {n_ref_no:,}")
print(f"Swap code unparseable (skipped):        {n_ref_none:,}")

In [None]:
# ── CELL E: DIAGNOSTIC PLOTS ─────────────────────────────────────────────────
# NOTE: this cell reads `stats_d`, which is built in CELL D. CELL D is placed
# after this cell in the notebook, so run CELL D first.

# ── Plot 1: Overall recovery breakdown (stacked bar) ──────────────────────────
labels = ["ref detected", "ref NOT detected"]
ref_yes = stats_d[stats_d["ref_pep_detected"] == True]
ref_no  = stats_d[stats_d["ref_pep_detected"] == False]

fig1, ax1 = plt.subplots(figsize=(5, 4))
# One (n_detected, n_missed) pair per bar.
bars_data = [
    (ref_yes["mut_pep_detected"].sum(),      len(ref_yes) - ref_yes["mut_pep_detected"].sum()),
    (ref_no["mut_pep_detected"].sum(),       len(ref_no)  - ref_no["mut_pep_detected"].sum()),
]
x = np.arange(len(labels))
ax1.bar(x, [b[0] for b in bars_data], color="#4878d0", label="Mutant PSM detected")
ax1.bar(x, [b[1] for b in bars_data], bottom=[b[0] for b in bars_data],
        color="#d3d3d3", label="Mutant PSM NOT detected")
ax1.set_xticks(x)
ax1.set_xticklabels(labels)
ax1.set_ylabel("Number of mutations")
ax1.set_title(f"Mutant peptide recovery\n(conditioned on reference tryptic peptide)\n{PLEX_ID}")
ax1.legend(fontsize=8)
# Annotate each bar with the percentage of mutations that were detected.
for i, (n_det, n_miss) in enumerate(bars_data):
    total = n_det + n_miss
    if total > 0:
        ax1.text(i, total + total * 0.01, f"{100*n_det/total:.1f}%",
                 ha="center", va="bottom", fontsize=9, fontweight="bold")
plt.tight_layout()
fig1.savefig(os.path.join(RESULTS_DIR, "recovery_breakdown.pdf"), bbox_inches="tight")
plt.show()

# ── Plot 2: Detection rate vs N patients (main diagnostic) ────────────────────
#
# Blue = mutations where reference tryptic peptide WAS detected:
#        rate should RISE with N patients (more signal = easier to find)
# Red  = mutations where reference NOT detected:
#        rate should stay near 0 (sanity check — no reference means no context)

fig2, ax2 = plt.subplots(figsize=(8, 4))

# Fixed seed so the jittered scatter (and the saved PDF) is reproducible.
jitter_rng = np.random.default_rng(0)

for ref_status, color, label in [
    (True,  "#4878d0", "Reference peptide detected"),
    (False, "#d65f5f", "Reference peptide NOT detected"),
]:
    grp = stats_d[stats_d["ref_pep_detected"] == ref_status]
    if grp.empty:
        continue

    # Work on 0.0/1.0 values so mean/SEM are computed on a numeric dtype.
    detected = grp["mut_pep_detected"].astype(float)

    # Jittered scatter
    jitter = jitter_rng.uniform(-0.12, 0.12, len(grp))
    ax2.scatter(grp["n_patients"] + jitter, detected,
                color=color, alpha=0.25, s=12, zorder=2)

    # Binned mean ± 95% CI (1.96 × SEM; SEM is NaN for single-row bins → 0)
    binned = detected.groupby(grp["n_patients"])
    means  = binned.mean()
    sems   = binned.sem().fillna(0)
    ci95   = 1.96 * sems
    ax2.errorbar(means.index, means.values, yerr=ci95.values,
                 fmt="o-", color=color, capsize=4, linewidth=2,
                 markersize=7, label=f"{label}  (n={len(grp):,})", zorder=3)

ax2.set_xlabel("N patients in plex carrying this mutation")
ax2.set_ylabel("Mutant peptide detection rate")
ax2.set_ylim(-0.05, 1.05)
ax2.axhline(0, color="grey", linewidth=0.8, linestyle="--")
ax2.legend(fontsize=8)
ax2.set_title(f"Mutant peptide detection rate vs. mutation prevalence\n"
              f"{PLEX_ID}\n"
              f"Blue should rise with N patients if pipeline is working")
plt.tight_layout()
fig2.savefig(os.path.join(RESULTS_DIR, "recovery_vs_n_patients.pdf"), bbox_inches="tight")
plt.show()

# ── Plot 3: N mutations by patient count (sample size context) ────────────────
fig3, ax3 = plt.subplots(figsize=(6, 3))
counts = stats_d["n_patients"].value_counts().sort_index()
ax3.bar(counts.index, counts.values, color="#aec6cf", edgecolor="grey", linewidth=0.5)
ax3.set_xlabel("N patients in plex with mutation")
ax3.set_ylabel("Number of mutations")
ax3.set_title("Distribution of mutation prevalence across plex patients")
# Label each bar with its count.
for n_pat, n_mut in zip(counts.index, counts.values):
    ax3.text(n_pat, n_mut + 0.5, str(n_mut), ha="center", va="bottom", fontsize=8)
plt.tight_layout()
fig3.savefig(os.path.join(RESULTS_DIR, "mutation_patient_count_dist.pdf"), bbox_inches="tight")
plt.show()

print(f"\nSaved 3 plots to {RESULTS_DIR}")

In [None]:
# ── CELL D: PER-MUTATION DETECTION STATS ─────────────────────────────────────
# For every mutation found in the per-sample FASTAs of this plex, compute:
#   n_patients:       how many patients in this plex carry it
#   ref_pep_detected: is the reference tryptic peptide in the PSM file?
#   mut_pep_detected: is any mutant PSM found for this (accession, swap)?
#
# Key insight: (accession, swap) is the unique mutation ID — must use BOTH
# to avoid attributing different mutations in the same protein to each other.

# Set of (accession, mutation) pairs seen among the mutant PSMs.
mut_psm_set = set(zip(mut_psm["accession"].fillna(""),
                      mut_psm["mutation"].fillna("")))

rows_d = []
for (accession, swap), channels in mutation_to_channels.items():
    ref_det = ref_peptide_detected.get((accession, swap))   # True/False/None
    mut_det = (accession, swap) in mut_psm_set
    rows_d.append({
        "accession":        accession,
        "swap":             swap,
        "n_patients":       len(channels),
        "ref_pep_detected": ref_det,
        "mut_pep_detected": mut_det,
    })

# Explicit columns so the frame keeps its schema even when rows_d is empty.
stats_d = pd.DataFrame(rows_d, columns=[
    "accession", "swap", "n_patients", "ref_pep_detected", "mut_pep_detected"])

if stats_d.empty:
    # Nothing to aggregate; avoid grouping an empty frame and dividing by zero.
    detected_any = 0
    print("No mutations found for this plex — detection stats are empty.")
else:
    print("Detection breakdown by reference-peptide status:")
    print(stats_d.groupby("ref_pep_detected", dropna=False)["mut_pep_detected"]
                 .agg(n_mut=("sum"), n_total=("count"), detection_rate=("mean"))
                 .round(3))

    detected_any = (stats_d["mut_pep_detected"]).sum()
    print(f"\nMutations with ≥1 PSM detected: {detected_any:,} / {len(stats_d):,} "
          f"({100*detected_any/len(stats_d):.1f}%)")

In [None]:
# ── PLOT: RI DISTRIBUTION ACROSS CHANNELS FOR ALL MUTANT PSMs ────────────────
#
# Takes ALL mutant PSMs (no grouping) and shows the RI distribution in each
# TMT channel as a boxplot.
#
# Interpretation:
#   - If the pipeline is working: channels are UNEVEN. Each patient has a
#     different number of detectable mutations, so some channels accumulate
#     more mutant-peptide signal than others.
#   - If mutant PSMs are noise/false positives: all channels look the same
#     (uniform distribution across channels).
#
# Compare to the reference-protein boxplot (second figure) which should be
# roughly uniform — reference proteins are present in all samples equally.

# Channels (in CHANNEL_ORDER order) that have a patient assigned.
channels_ordered = [ch for ch in CHANNEL_ORDER if ch in channel_to_patient]
# Channel labels and their RI column names, in matching order.
ri_labels = list(ri_col_map.keys())
ri_cols   = list(ri_col_map.values())

# ── Figure 1: Mutant PSMs — all channels ──────────────────────────────────────
fig, ax = plt.subplots(figsize=(10, 4))

# One array per channel: zero intensities are turned into NaN and dropped
# before taking log10(RI + 1).
mut_data = [
    np.log10(mut_psm[col].replace(0, np.nan).dropna().values + 1)
    for col in ri_cols
]
# Red for channels that have a patient in channel_to_patient, light blue otherwise.
colors_mut = [
    "#e74c3c" if lbl in channel_to_patient else "#aec6cf"
    for lbl in ri_labels
]
bp = ax.boxplot(mut_data, patch_artist=True, widths=0.6,
                medianprops=dict(color="black", linewidth=1.5),
                flierprops=dict(marker=".", markersize=1.5, alpha=0.3))
for patch, color in zip(bp["boxes"], colors_mut):
    patch.set_facecolor(color)

# Tick positions 1..N, one per entry of mut_data.
ax.set_xticks(range(1, len(ri_labels) + 1))
ax.set_xticklabels(ri_labels, rotation=45, ha="right")
ax.set_ylabel("log10(RI + 1)")
ax.set_title(f"ALL mutant PSMs — RI by channel  (n={len(mut_psm):,})\n"
             f"{PLEX_ID}\n"
             r"Uneven medians across channels = patient-specific signal "
             r"(good). Uniform = noise.")
plt.tight_layout()
# plt.savefig acts on the current figure, i.e. the one created just above.
plt.savefig(os.path.join(RESULTS_DIR, "mutant_ri_all_channels.pdf"), bbox_inches="tight")
plt.show()

# ── Figure 2: Reference PSMs — same plot for comparison ───────────────────────
# Reference PSMs = entry name ends with neither "-mut" nor "-comp".
ref_psm = psm[~psm["Entry Name"].str.endswith("-mut", na=False)
              & ~psm["Entry Name"].str.endswith("-comp", na=False)].copy()

ref_data = [
    np.log10(ref_psm[col].replace(0, np.nan).dropna().values + 1)
    for col in ri_cols
]
fig2, ax2 = plt.subplots(figsize=(10, 4))
bp2 = ax2.boxplot(ref_data, patch_artist=True, widths=0.6,
                  medianprops=dict(color="black", linewidth=1.5),
                  flierprops=dict(marker=".", markersize=1.5, alpha=0.3))
for patch in bp2["boxes"]:
    patch.set_facecolor("#aec6cf")

ax2.set_xticks(range(1, len(ri_labels) + 1))
ax2.set_xticklabels(ri_labels, rotation=45, ha="right")
ax2.set_ylabel("log10(RI + 1)")
ax2.set_title(f"Reference PSMs — RI by channel  (n={len(ref_psm):,})\n"
              f"{PLEX_ID}\n"
              r"Should be roughly UNIFORM across channels (internal control).")
plt.tight_layout()
# As above, this saves the current figure (fig2 at this point).
plt.savefig(os.path.join(RESULTS_DIR, "reference_ri_all_channels.pdf"), bbox_inches="tight")
plt.show()

print(f"Mutant PSMs:    {len(mut_psm):,}")
print(f"Reference PSMs: {len(ref_psm):,}")

In [None]:
# ── SUMMARY TABLE ─────────────────────────────────────────────────────────────
# For each channel: n mutant PSMs assigned to it (by max-RI),
# median own-channel RI vs median across all other channels, and the ratio.
# A ratio >> 1 confirms that mutant peptides are enriched in the correct channel.

# Data-driven channel assignment: the channel with the highest non-zero RI.
# Computed here only if no earlier cell has already added the column.
if "max_channel" not in mut_psm.columns:
    _col_to_channel = {col: ch for ch, col in ri_col_map.items()}
    _ri_values = mut_psm[list(_col_to_channel)].replace(0, np.nan)
    # idxmax is only applied to rows that have at least one non-NaN intensity;
    # rows without any signal stay unassigned (NaN).
    _has_signal = _ri_values.notna().any(axis=1)
    _max_channel = pd.Series(np.nan, index=mut_psm.index, dtype=object)
    if _has_signal.any():
        _max_channel.loc[_has_signal] = (
            _ri_values[_has_signal].idxmax(axis=1).map(_col_to_channel))
    mut_psm["max_channel"] = _max_channel


def _int_or_nan(value):
    """Round to the nearest int; pass NaN/inf through as NaN instead of raising."""
    return int(round(value, 0)) if np.isfinite(value) else np.nan


rows = []
for focal_ch in channels_ordered:
    patient = channel_to_patient.get(focal_ch, "?")
    if focal_ch not in ri_col_map:
        continue
    own_col    = ri_col_map[focal_ch]
    other_cols = [ri_col_map[ch] for ch in ri_labels
                  if ch != focal_ch and ch in ri_col_map]

    # PSMs assigned to this channel; zeros count as missing for the medians.
    subset = mut_psm[mut_psm["max_channel"] == focal_ch].replace(0, np.nan)
    if subset.empty:
        continue

    own_median   = np.nanmedian(subset[own_col].values)
    other_median = np.nanmedian(subset[other_cols].values.flatten())
    # NaN when the other-channel median is missing or not positive.
    ratio = own_median / other_median if other_median > 0 else np.nan

    rows.append({
        "channel":      focal_ch,
        "patient":      patient,
        "n_mut_psms":   len(subset),
        "median_own":   _int_or_nan(own_median),
        "median_other": _int_or_nan(other_median),
        "own/other":    round(ratio, 2),
    })

# Explicit columns so the lookups below work even when no channel produced a row.
summary = pd.DataFrame(rows, columns=[
    "channel", "patient", "n_mut_psms", "median_own", "median_other", "own/other"])
print(summary.to_string(index=False))

# Overall enrichment score
total_mut     = len(mut_psm)
n_sample_ch   = len(channels_ordered)
random_expect = 1 / max(len(ri_labels), 1)
n_assigned    = int(summary["n_mut_psms"].sum()) if not summary.empty else 0
actual_frac   = n_assigned / total_mut if total_mut else 0
mean_ratio    = summary["own/other"].mean() if not summary.empty else np.nan

print(f"\nTotal mutant PSMs:          {total_mut:,}")
print(f"PSMs in a sample channel:   {n_assigned:,} "
      f"({100*actual_frac:.1f}%)")
print(f"Mean own/other ratio:       {mean_ratio:.2f}  "
      f"(>1 = signal concentrated in one channel — expected if working)")

In [None]:
# ── ENRICHMENT SCORE DISTRIBUTION ────────────────────────────────────────────
# For every mutant PSM: max_channel_RI / mean_other_channels_RI
# A score >> 1 means signal is concentrated in one channel (as expected).
# This doesn't depend on knowing which patient owns which channel.

# Zero intensities are treated as missing.
ri_matrix = mut_psm[ri_cols].replace(0, np.nan)
max_ri     = ri_matrix.max(axis=1)
mean_ri    = ri_matrix.mean(axis=1)           # mean of all channels
# mean of non-max channels: (sum - max) / (n - 1)
sum_ri     = ri_matrix.sum(axis=1)
n_nonnan   = ri_matrix.notna().sum(axis=1)
# clip(lower=1) avoids dividing by zero when only one channel has signal; the
# numerator is then 0, and such rows are dropped below via replace(0, NaN).
mean_other = (sum_ri - max_ri) / (n_nonnan - 1).clip(lower=1)

enrich = (max_ri / mean_other.replace(0, np.nan)).dropna()

# With no scorable PSMs, report NaN instead of dividing by zero below.
n_scored      = len(enrich)
median_enrich = enrich.median() if n_scored else np.nan

fig, ax = plt.subplots(figsize=(6, 4))
ax.hist(enrich.clip(upper=20), bins=60, color="#4878d0", edgecolor="white", linewidth=0.4)
ax.axvline(x=1, color="grey", linestyle="--", linewidth=1, label="ratio = 1 (no enrichment)")
if n_scored:
    ax.axvline(x=median_enrich, color="#e74c3c", linestyle="-", linewidth=1.5,
               label=f"median = {median_enrich:.1f}")
ax.set_xlabel("max-channel RI / mean-other-channels RI  (clipped at 20)")
ax.set_ylabel("Number of mutant PSMs")
ax.set_title(f"Per-PSM channel enrichment score\n{PLEX_ID}")
ax.legend(fontsize=9)
plt.tight_layout()
fig.savefig(os.path.join(RESULTS_DIR, "mutant_enrichment_dist.pdf"), bbox_inches="tight")
plt.show()

pct_enriched = 100 * (enrich > 2).sum() / n_scored if n_scored else np.nan
print(f"Median enrichment ratio: {median_enrich:.2f}")
print(f"PSMs with ratio > 2:     {pct_enriched:.1f}%  "
      f"(higher = more channel-specific signal)")