# FragPipe Mutant Peptide Channel Enrichment

For each detected mutant PSM, parses the **Mapped Proteins** column to find which
mutations the peptide is consistent with, looks up which TMT channels carry those
mutations (via per-sample FASTAs), then computes:

> **ratio = mean RI in channels that SHOULD have the mutation /
>            mean RI in channels that should NOT**

A ratio >> 1 confirms the peptide is concentrated in the expected patient channel(s).

In [None]:
import glob
import os
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict

# ── CONFIG ────────────────────────────────────────────────────────────────────
# Plex to analyse plus the locations of its search results, metadata tables
# and per-sample mutant FASTAs.
PLEX_ID     = "01CPTAC_CCRCC_Proteome_JHU_20171007"
RESULTS_DIR = f"/scratch/leduc.an/AAS_Evo/MS_SEARCH/results/{PLEX_ID}"
REPO_DIR    = "/home/leduc.an/AAS_Evo_project/AAS_Evo"
TMT_MAP     = f"{REPO_DIR}/metadata/PDC_meta/pdc_file_tmt_map.tsv"
GDC_META    = f"{REPO_DIR}/metadata/GDC_Meta/gdc_meta_matched.tsv"
FASTA_DIR   = "/scratch/leduc.an/AAS_Evo/FASTA/per_sample"

# psm.tsv lives in {PLEX_ID}_1/ subdirectory (FragPipe experiment folder)
psm_matches = sorted(glob.glob(os.path.join(RESULTS_DIR, "*_1/psm.tsv")))
if not psm_matches:
    raise FileNotFoundError(f"No *_1/psm.tsv found under {RESULTS_DIR}")
if len(psm_matches) > 1:
    # The glob accepts any folder ending in "_1", so more than one file can
    # match; report the choice instead of silently taking the first one.
    print(f"WARNING — {len(psm_matches)} psm.tsv files matched; using the first "
          f"(sorted). Ignored: {psm_matches[1:]}")
PSM_FILE = psm_matches[0]
print(f"PSM file: {PSM_FILE}")

# ── LOAD PSM ──────────────────────────────────────────────────────────────────
# low_memory=False makes pandas read the file in one pass rather than in
# chunks, so column dtypes are inferred from the whole column.
psm = pd.read_csv(PSM_FILE, sep="\t", low_memory=False)
print(f"Total PSMs: {len(psm):,}")

# ── DETECT TMT INTENSITY COLUMNS ──────────────────────────────────────────────
# Channel labels looked for among the PSM table columns, in reporting order.
CHANNEL_ORDER = ["126","127N","127C","128N","128C","129N","129C","130N","130C","131N","131C"]

def find_ri_cols(df, channel_order):
    """Map each TMT channel label to its reporter-ion column in ``df``.

    Three naming schemes are tried per channel, in this order:

    1. a column named exactly like the channel label;
    2. a column starting with ``Intensity`` and ending with ``_<channel>``
       (FragPipe per-experiment TMT style);
    3. a column starting with the channel label whose name contains
       ``intensity`` (case-insensitive).

    Channels with no matching column are left out of the result. The returned
    dict follows the order of ``channel_order``; when several columns match a
    scheme, the first one in ``df.columns`` order wins.
    """
    col_names = df.columns
    resolved = {}
    for label in channel_order:
        # Scheme 1: exact column name.
        if label in col_names:
            resolved[label] = label
            continue

        # Scheme 2: 'Intensity {experiment}_{channel}'.
        suffix = f"_{label}"
        hits = [name for name in col_names
                if name.startswith("Intensity") and name.endswith(suffix)]

        # Scheme 3: '{channel} Intensity' fallback, only if scheme 2 found nothing.
        if not hits:
            hits = [name for name in col_names
                    if name.startswith(label) and "intensity" in name.lower()]

        if hits:
            resolved[label] = hits[0]
    return resolved

# Resolve the reporter-ion column for every expected channel.
ri_col_map = find_ri_cols(psm, CHANNEL_ORDER)
print(f"RI columns found ({len(ri_col_map)}): {list(ri_col_map.keys())}")

if not ri_col_map:
    # Nothing matched: list up to 20 intensity-like columns to help diagnose
    # the naming scheme actually used in this psm.tsv.
    intensity_cols = [c for c in psm.columns if "intensity" in c.lower()]
    print("WARNING — no RI columns matched. Intensity-related columns found:")
    for c in intensity_cols[:20]:
        print(f"  {c}")

# ── FILTER: mutant PSMs with non-zero TMT signal ──────────────────────────────
# Mutant FASTA entries have Entry Name ending in '-mut'
mut_mask = psm["Entry Name"].str.endswith("-mut", na=False)
mut_all  = psm.loc[mut_mask].copy()

ri_cols = list(ri_col_map.values())
if not ri_cols:
    # Without RI columns there is nothing to filter on; keep every mutant PSM.
    mut_psm = mut_all.copy()
else:
    # Keep PSMs with a positive intensity in at least one channel
    # (missing values count as zero).
    nonzero_mask = mut_all[ri_cols].fillna(0).gt(0).any(axis=1)
    mut_psm = mut_all.loc[nonzero_mask].copy()

print(f"\nMutant PSMs (any RI > 0): {len(mut_psm):,}  "
      f"(of {len(mut_all):,} mutant, {len(psm):,} total)")

In [None]:
# ── BUILD MUTATION → CHANNELS MAP ────────────────────────────────────────────
# Links each (accession, swap) mutation to the TMT channels (patients) that carry it.
# Pipeline: TMT map → channel/case_id → GDC meta → UUID → per-sample FASTA → mutations.

# Translates the tmt_channel labels of the TMT map into the channel labels
# used in CHANNEL_ORDER / ri_col_map.
TMT_CHANNEL_MAP = {
    "tmt_126":"126",  "tmt_127n":"127N", "tmt_127c":"127C",
    "tmt_128n":"128N","tmt_128c":"128C", "tmt_129n":"129N",
    "tmt_129c":"129C","tmt_130n":"130N", "tmt_130c":"130C",
    "tmt_131":"131N", "tmt_131c":"131C",
}

tmt = pd.read_csv(TMT_MAP, sep="\t")
gdc = pd.read_csv(GDC_META, sep="\t")

# Filter to this plex; drop reference/pooled channels
plex_tmt = (tmt[tmt["run_metadata_id"] == PLEX_ID]
            [["tmt_channel","case_submitter_id","sample_type"]].drop_duplicates())
# astype(str) turns a missing ID into the string "nan" so the "nan" entry in
# the exclusion list really removes it. With .str.lower() alone a missing ID
# stays NaN, is never matched by isin(), and would later be joined against
# NaN keys in the GDC table (pandas merge matches null keys to each other).
case_ids = plex_tmt["case_submitter_id"].astype(str).str.lower()
# .copy() so that adding the "channel" column below writes to an independent
# frame rather than to a slice of `tmt`.
plex_tmt = plex_tmt[~case_ids.isin(["ref","reference","pooled","pool","nan"])].copy()
plex_tmt["channel"] = plex_tmt["tmt_channel"].map(TMT_CHANNEL_MAP)

# Labels missing from TMT_CHANNEL_MAP become NaN channels and are skipped
# everywhere below; report them so that the loss is visible.
unmapped_labels = sorted(
    plex_tmt.loc[plex_tmt["channel"].isna(), "tmt_channel"].astype(str).unique())
if unmapped_labels:
    print(f"WARNING — tmt_channel labels not in TMT_CHANNEL_MAP (ignored): {unmapped_labels}")

# Join GDC meta to get UUID for each patient channel
plex_meta = plex_tmt.merge(
    gdc[["gdc_file_id","case_submitter_id","sample_type"]],
    on=["case_submitter_id","sample_type"], how="left")

# All patient channels for this plex (used to compute "should NOT have" set)
all_patient_channels = set(plex_meta["channel"].dropna().tolist())
print(f"Patient channels ({len(all_patient_channels)}): {sorted(all_patient_channels)}")

# Scan per-sample FASTAs to map (accession, swap) → {channel, ...}
# Per-sample FASTAs use OLD header format: >mut|accession|gene|swap|source
mutation_to_channels = defaultdict(set)
missing_fastas = []
channels_with_fasta = set()

# The left join can repeat a (uuid, channel) pair; de-duplicate so each FASTA
# is read once and each missing file is reported once.
sample_channels = plex_meta[["gdc_file_id", "channel"]].dropna().drop_duplicates()

for uuid, channel in sample_channels.itertuples(index=False, name=None):
    fasta_path = os.path.join(FASTA_DIR, f"{uuid}_mutant.fasta")
    if not os.path.exists(fasta_path):
        missing_fastas.append(uuid)
        continue
    channels_with_fasta.add(channel)
    with open(fasta_path) as f:
        for line in f:
            if not line.startswith(">"):
                continue
            parts = line[1:].strip().split("|")
            if len(parts) >= 4 and parts[0] == "mut":   # >mut|acc|gene|swap|...
                mutation_to_channels[(parts[1], parts[3])].add(channel)

print(f"Unique (accession, swap) mutations mapped: {len(mutation_to_channels):,}")
print(f"Missing per-sample FASTAs:                 {len(missing_fastas)}")

# Channels without a scanned FASTA can never appear in mutation_to_channels,
# so any analysis using all_patient_channels treats them as lacking every
# mutation even though their status is unknown.
channels_without_fasta = all_patient_channels - channels_with_fasta
if channels_without_fasta:
    print(f"WARNING — patient channels with no per-sample FASTA scanned "
          f"(mutation status unknown): {sorted(channels_without_fasta)}")

In [None]:
# ── COMPUTE CHANNEL ENRICHMENT RATIO PER MUTANT PSM ─────────────────────────
# For each mutant PSM:
#   1. Parse 'Mapped Proteins' → all (accession, swap) mutations the peptide matches
#   2. Union the channels that SHOULD have the peptide (from mutation_to_channels)
#   3. Channels that should NOT have = all_patient_channels − have_channels
#   4. ratio = mean RI(have channels) / mean RI(not-have channels)
#      ratio >> 1  confirms signal is concentrated in the expected patient channel(s)

def parse_mapped_proteins(mapped_str):
    """Collect (accession, swap) pairs from a comma-separated protein list.

    Each entry is expected to look like ``db|accession-swap-hash|entry_name``.
    Entries with no ``|`` separator, or whose second field has fewer than
    three ``-``-separated parts, are ignored.

    Input:  'sp|P01889-S35A-B4B8|HLA-B-mut, sp|P10321-D33Y-2921|HLA-C-mut'
    Output: {('P01889', 'S35A'), ('P10321', 'D33Y')}

    A missing value (NaN / None) yields an empty set.
    """
    if pd.isna(mapped_str):
        return set()

    found = set()
    for raw_entry in str(mapped_str).split(","):
        fields = raw_entry.strip().split("|")
        if len(fields) < 2:
            continue
        # Second header field: accession-swap-hash
        accession, *rest = fields[1].split("-")
        if len(rest) >= 2:
            found.add((accession, rest[0]))
    return found

# Fixed column list so that an empty result set still produces a frame with a
# "ratio" column (pd.DataFrame([]) has no columns and dropna(subset=...) raises).
RESULT_COLUMNS = ["Peptide", "n_mutations", "n_have_ch", "n_not_have_ch",
                  "mean_have_ri", "mean_not_ri", "ratio"]

results = []
n_no_mutations = 0      # no (accession, swap) key could be parsed
n_no_channel_info = 0   # no usable RI value on one side of the split
n_cant_split = 0        # have / not-have set is empty
n_zero_background = 0   # not-have mean is 0, so the ratio is undefined

for _, row in mut_psm.iterrows():
    # FragPipe puts the assigned protein in 'Protein' and only the additional
    # mappings in 'Mapped Proteins', so both are parsed; otherwise the
    # mutation of the primary mutant entry itself would be ignored.
    # NOTE(review): assumes 'Protein' uses the same db|accession-swap-hash|name
    # header layout as the 'Mapped Proteins' entries — confirm on a psm.tsv.
    mutations = (parse_mapped_proteins(row.get("Protein", float("nan")))
                 | parse_mapped_proteins(row.get("Mapped Proteins", float("nan"))))
    if not mutations:
        n_no_mutations += 1
        continue

    # Union of the channels carrying any of the matching mutations,
    # restricted to this plex's patient channels.
    have_channels = set()
    for mut_key in mutations:
        have_channels |= mutation_to_channels.get(mut_key, set())
    have_channels &= all_patient_channels

    not_have_channels = all_patient_channels - have_channels

    if not have_channels or not not_have_channels:
        n_cant_split += 1
        continue

    # Only channels with a resolved RI column and a non-missing value count.
    have_ri = [row[ri_col_map[ch]] for ch in have_channels
               if ch in ri_col_map and pd.notna(row[ri_col_map[ch]])]
    not_ri  = [row[ri_col_map[ch]] for ch in not_have_channels
               if ch in ri_col_map and pd.notna(row[ri_col_map[ch]])]

    if not have_ri or not not_ri:
        n_no_channel_info += 1
        continue

    mean_have = np.mean(have_ri)
    mean_not  = np.mean(not_ri)
    if mean_not > 0:
        ratio = mean_have / mean_not
    else:
        # Undefined ratio: kept as NaN (dropped below) but counted so the
        # skip summary accounts for every PSM.
        ratio = np.nan
        n_zero_background += 1

    results.append({
        "Peptide":       row.get("Peptide", ""),
        "n_mutations":   len(mutations),
        "n_have_ch":     len(have_channels),
        "n_not_have_ch": len(not_have_channels),
        "mean_have_ri":  mean_have,
        "mean_not_ri":   mean_not,
        "ratio":         ratio,
    })

ratios = pd.DataFrame(results, columns=RESULT_COLUMNS).dropna(subset=["ratio"])
print(f"PSMs with computable ratio:   {len(ratios):,}  (of {len(mut_psm):,} mutant PSMs)")
print(f"  Skipped — no mutation key:  {n_no_mutations}")
print(f"  Skipped — no channel split: {n_cant_split}")
print(f"  Skipped — RI data missing:  {n_no_channel_info}")
print(f"  Skipped — zero background:  {n_zero_background}")
print(f"\nRatio summary:")
print(ratios["ratio"].describe().round(2))

In [None]:
# ── PLOT & SUMMARY ────────────────────────────────────────────────────────────
# Histogram of the per-PSM enrichment ratios on a log axis, then a text summary.

all_ratios = ratios["ratio"]
med   = all_ratios.median()
# A log axis cannot show 0, so zero ratios are left out of the histogram only;
# the summary statistics below still include them.
valid = all_ratios.replace(0, np.nan).dropna()

if valid.empty:
    # np.logspace would receive NaN bounds; there is nothing to draw.
    print("No positive ratios available — histogram skipped.")
else:
    lo, hi = valid.min(), valid.max()
    if lo == hi:
        # Widen a degenerate range so the bins have non-zero width.
        lo, hi = lo / 2, hi * 2
    bins = np.logspace(np.log10(lo), np.log10(hi), 60)

    fig, ax = plt.subplots(figsize=(7, 4))
    ax.hist(valid, bins=bins, color="#4878d0", edgecolor="white", linewidth=0.4)
    ax.axvline(x=1,   color="grey",    linestyle="--", linewidth=1.2, label="ratio = 1  (no enrichment)")
    if med > 0:
        # A median of 0 has no position on a log axis.
        ax.axvline(x=med, color="#e74c3c", linestyle="-",  linewidth=1.5, label=f"median = {med:.2f}")
    ax.set_xscale("log")
    ax.set_xlabel("mean RI (should-have channels) / mean RI (should-NOT channels)  [log scale]")
    ax.set_ylabel("Number of mutant PSMs")
    ax.set_title(f"Per-PSM channel enrichment — {PLEX_ID}\n"
                 f"ratio >> 1 confirms peptide is concentrated in the expected patient channel(s)")
    ax.legend(fontsize=9)
    plt.tight_layout()
    fig.savefig(os.path.join(RESULTS_DIR, "mutant_channel_enrichment.pdf"), bbox_inches="tight")
    plt.show()

# ── Summary ───────────────────────────────────────────────────────────────────
n_pat = len(all_patient_channels)
if all_ratios.empty:
    print("No computable ratios — nothing to summarise.")
else:
    # Fractions are taken over ALL computable ratios. Dropping the zero ratios
    # (no signal in the expected channels) would bias every percentage upward.
    n_zero = int((all_ratios == 0).sum())
    print(f"Median ratio:          {med:.2f}")
    print(f"PSMs with ratio > 1:   {100*(all_ratios>1).mean():.1f}%  (random baseline ~50%)")
    print(f"PSMs with ratio > 2:   {100*(all_ratios>2).mean():.1f}%")
    print(f"PSMs with ratio > 5:   {100*(all_ratios>5).mean():.1f}%")
    print(f"PSMs with ratio = 0:   {n_zero}  (no signal in expected channels; not on the log plot)")
print(f"\nContext: plex has {n_pat} patient channels.")
print(f"  A mutation in 1 patient → 1 have vs {n_pat-1} not-have channels.")
print(f"  The ratio compares per-channel means, so it does not scale with the channel count;")
print(f"  for a purely patient-specific peptide it is bounded only by background signal in the other channels.")
print(f"  Median well above 1 = signal concentrated in expected channels → pipeline working.")