# FragPipe Mutant Peptide Channel Enrichment

For each detected mutant PSM, parses the **Mapped Proteins** column to find which
mutations the peptide is consistent with, looks up which TMT channels carry those
mutations (via per-sample FASTAs), then computes:

> **ratio = mean RI in channels that SHOULD have the mutation /
>            mean RI in channels that should NOT**

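A ratio ≫ 1 indicates that the peptide's reporter-ion intensity (RI) is concentrated in the expected patient channel(s); a ratio close to 1 means the signal is not specific to those channels.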

In [None]:
import glob
import os
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict

# ── CONFIG ────────────────────────────────────────────────────────────────────
# PLEX_ID selects the plex; RESULTS_DIR is derived from it, and the two metadata
# tables are located relative to REPO_DIR.
PLEX_ID = "01CPTAC_CCRCC_Proteome_JHU_20171007"
REPO_DIR = "/home/leduc.an/AAS_Evo_project/AAS_Evo"
RESULTS_DIR = f"/scratch/leduc.an/AAS_Evo/MS_SEARCH/results/{PLEX_ID}"
FASTA_DIR = "/scratch/leduc.an/AAS_Evo/FASTA/per_sample"
TMT_MAP = f"{REPO_DIR}/metadata/PDC_meta/pdc_file_tmt_map.tsv"
GDC_META = f"{REPO_DIR}/metadata/GDC_Meta/gdc_meta_matched.tsv"

# psm.tsv lives in the {PLEX_ID}_1/ subdirectory (FragPipe experiment folder).
# Matches are sorted so that the file picked is deterministic; the first one wins.
psm_pattern = os.path.join(RESULTS_DIR, "*_1/psm.tsv")
psm_matches = sorted(glob.glob(psm_pattern))
PSM_FILE = psm_matches[0] if psm_matches else None
if PSM_FILE is None:
    raise FileNotFoundError(f"No *_1/psm.tsv found under {RESULTS_DIR}")
print(f"PSM file: {PSM_FILE}")

# ── LOAD PSM ──────────────────────────────────────────────────────────────────
psm = pd.read_csv(PSM_FILE, sep="\t", low_memory=False)
print(f"Total PSMs: {len(psm):,}")

# ── DETECT TMT INTENSITY COLUMNS ──────────────────────────────────────────────
# Channel labels to look for in the PSM table, in reporting order.
CHANNEL_ORDER = [
    "126",
    "127N", "127C",
    "128N", "128C",
    "129N", "129C",
    "130N", "130C",
    "131N", "131C",
]

def find_ri_cols(df, channel_order):
    """Map TMT channel labels to the columns of ``df`` that hold their intensities.

    For each label in ``channel_order`` the following naming schemes are tried
    in order, and the first column that fits is used:

    1. a column named exactly like the channel label;
    2. a column that starts with "Intensity" and ends with "_<label>";
    3. a column that starts with the label and contains "intensity"
       (case-insensitive).

    Returns a dict ``{channel label: column name}``; labels without any
    matching column are left out.
    """
    def first_match(predicate):
        # Scan all columns, then keep the earliest hit (None when nothing fits).
        hits = [name for name in df.columns if predicate(name)]
        return hits[0] if hits else None

    resolved = {}
    for label in channel_order:
        if label in df.columns:
            resolved[label] = label
            continue
        column = first_match(
            lambda name: name.startswith("Intensity") and name.endswith(f"_{label}")
        )
        if column is None:
            column = first_match(
                lambda name: name.startswith(label) and "intensity" in name.lower()
            )
        if column is not None:
            resolved[label] = column
    return resolved

# Resolve which PSM-table column carries the intensity of each TMT channel.
ri_col_map = find_ri_cols(psm, CHANNEL_ORDER)
print(f"RI columns found ({len(ri_col_map)}): {list(ri_col_map.keys())}")

if len(ri_col_map) == 0:
    # Nothing matched: list up to 20 intensity-like column names to help diagnose the naming.
    intensity_cols = [c for c in psm.columns if "intensity" in c.lower()]
    print("WARNING — no RI columns matched. Intensity-related columns found:")
    for c in intensity_cols[:20]:
        print(f"  {c}")

# ── FILTER: mutant PSMs with non-zero TMT signal ──────────────────────────────
# A PSM counts as mutant when its "Entry Name" ends with "-mut" (missing names count as non-mutant).
mut_mask = psm["Entry Name"].str.endswith("-mut", na=False)
mut_all = psm.loc[mut_mask].copy()

ri_cols = list(ri_col_map.values())
if not ri_cols:
    # No intensity columns available, so no signal filter can be applied.
    mut_psm = mut_all.copy()
else:
    # Keep PSMs with at least one strictly positive intensity; missing values count as 0.
    nonzero_mask = mut_all[ri_cols].fillna(0).gt(0).any(axis=1)
    mut_psm = mut_all.loc[nonzero_mask].copy()

print(f"\nMutant PSMs (any RI > 0): {len(mut_psm):,}  "
      f"(of {len(mut_all):,} mutant, {len(psm):,} total)")

In [None]:
# ── BUILD MUTATION → CHANNELS MAP ────────────────────────────────────────────
# Translates the tmt_channel labels of the TMT map into the channel labels used
# in CHANNEL_ORDER. Note that "tmt_131" (no n/c suffix) is mapped to "131N".
TMT_CHANNEL_MAP = {
    "tmt_126":"126",  "tmt_127n":"127N", "tmt_127c":"127C",
    "tmt_128n":"128N","tmt_128c":"128C", "tmt_129n":"129N",
    "tmt_129c":"129C","tmt_130n":"130N", "tmt_130c":"130C",
    "tmt_131":"131N", "tmt_131c":"131C",
}

tmt = pd.read_csv(TMT_MAP, sep="\t")
gdc = pd.read_csv(GDC_META, sep="\t")

# One row per (channel, case, sample type) for the plex under analysis.
plex_tmt = (tmt[tmt["run_metadata_id"] == PLEX_ID]
            [["tmt_channel","case_submitter_id","sample_type"]].drop_duplicates())

# Drop reference / pooled channels and channels without a case id.
# read_csv turns empty or "nan" cells into real NaN, which a plain
# `.str.lower().isin([... "nan"])` never matches, so missing ids are tested
# explicitly (isna) in addition to the string comparison.
case_ids = plex_tmt["case_submitter_id"]
is_non_patient = case_ids.isna() | case_ids.astype(str).str.lower().isin(
    ["ref","reference","pooled","pool","nan"])
# .copy() so that the column assignment below writes to an independent frame
# instead of a slice of `tmt` (avoids SettingWithCopyWarning).
plex_tmt = plex_tmt[~is_non_patient].copy()
plex_tmt["channel"] = plex_tmt["tmt_channel"].map(TMT_CHANNEL_MAP)

# Attach the GDC file id of each (case, sample type); a left join keeps channels
# without a matching GDC entry (their gdc_file_id is NaN and is skipped below).
plex_meta = plex_tmt.merge(
    gdc[["gdc_file_id","case_submitter_id","sample_type"]],
    on=["case_submitter_id","sample_type"], how="left")

all_patient_channels = set(plex_meta["channel"].dropna().tolist())
print(f"Patient channels ({len(all_patient_channels)}): {sorted(all_patient_channels)}")

# (accession, swap) → set of channels whose per-sample FASTA contains that mutation.
mutation_to_channels = defaultdict(set)
missing_fastas = []

for _, row in plex_meta.iterrows():
    uuid, channel = row["gdc_file_id"], row["channel"]
    if pd.isna(uuid) or pd.isna(channel):
        continue
    fasta_path = os.path.join(FASTA_DIR, f"{uuid}_mutant.fasta")
    if not os.path.exists(fasta_path):
        missing_fastas.append(uuid)
        continue
    with open(fasta_path) as f:
        for line in f:
            # Only header lines carry the mutation annotation.
            if not line.startswith(">"):
                continue
            # Headers of interest are pipe-delimited and start with "mut";
            # field 1 is used as the accession and field 3 as the swap.
            parts = line[1:].strip().split("|")
            if len(parts) >= 4 and parts[0] == "mut":
                mutation_to_channels[(parts[1], parts[3])].add(channel)

print(f"Unique (accession, swap) mutations mapped: {len(mutation_to_channels):,}")
print(f"Missing per-sample FASTAs:                 {len(missing_fastas)}")

In [None]:
# ── LOAD VAF PER (accession, swap, channel) FROM ALL-MISSENSE TABLE ──────────
# Uses the pre-consolidated VEP output. Filter to this plex's UUIDs — no
# per-file globbing needed.

MISSENSE  = "/scratch/leduc.an/AAS_Evo/VEP/all_missense_mutations.tsv"
REF_FASTA = "/scratch/leduc.an/AAS_Evo/SEQ_FILES/uniprot_human_canonical.fasta"

# Three-letter → one-letter codes for the 20 standard amino acids.
AA3TO1 = {
    "Ala":"A","Arg":"R","Asn":"N","Asp":"D","Cys":"C","Gln":"Q","Glu":"E",
    "Gly":"G","His":"H","Ile":"I","Leu":"L","Lys":"K","Met":"M","Phe":"F",
    "Pro":"P","Ser":"S","Thr":"T","Trp":"W","Tyr":"Y","Val":"V",
}

# "p.<Ref><pos><Alt>" with three-letter residues. The trailing negative
# lookahead rejects notations that merely start like a substitution, e.g. the
# frameshift "p.Arg97ProfsTer23", which would otherwise be read as R97P.
_HGVSP_SUBSTITUTION_RE = re.compile(
    r'p\.([A-Z][a-z]{2})(\d+)([A-Z][a-z]{2})(?![A-Za-z])')

def parse_hgvsp_to_swap(hgvsp):
    """Convert an HGVSp substitution to one-letter swap notation.

    'ENSP000.4:p.Leu888Pro' or 'p.Leu888Pro' → 'L888P'. The pattern is searched
    for anywhere in the string, so a transcript prefix is ignored.

    Args:
        hgvsp: HGVSp value; any object is accepted and converted with ``str``.

    Returns:
        The swap string, or None when no substitution is found, when either
        residue is not one of the 20 codes in ``AA3TO1`` (e.g. ``Ter``), or
        when the alternate residue is directly followed by further letters
        (frameshift / extension notation).
    """
    m = _HGVSP_SUBSTITUTION_RE.search(str(hgvsp))
    if m is None:
        return None
    ref = AA3TO1.get(m.group(1))
    alt = AA3TO1.get(m.group(3))
    if ref and alt:
        return f"{ref}{m.group(2)}{alt}"
    return None

# gene symbol → UniProt accession (from GN= field in reference FASTA).
# The accession is taken from the second pipe-delimited field of the header;
# when a gene symbol occurs in several headers the last one read is kept.
gene_to_acc = {}
with open(REF_FASTA) as ref_handle:
    for header in ref_handle:
        if not header.startswith(">"):
            continue
        gene_hit = re.search(r'GN=(\S+)', header)
        if gene_hit is None:
            continue
        gene_to_acc[gene_hit.group(1)] = header.split("|")[1]
print(f"Gene → accession entries: {len(gene_to_acc):,}")

# UUID → channel for this plex (rows lacking either value are ignored).
mapped_files = plex_meta.dropna(subset=["gdc_file_id","channel"])
uuid_to_channel = mapped_files.set_index("gdc_file_id")["channel"].to_dict()

# Load missense table, keep only rows whose sample_id is one of this plex's UUIDs.
missense = pd.read_csv(MISSENSE, sep="\t", low_memory=False)
in_plex = missense["sample_id"].isin(uuid_to_channel)
plex_missense = missense[in_plex]
print(f"Missense rows for this plex: {len(plex_missense):,}")

# Build (accession, swap, channel) → VAF. Rows are skipped when the VAF is
# missing, the sample has no channel, the gene symbol has no accession, or the
# HGVSp string cannot be parsed. Later rows overwrite earlier ones with the same key.
mutation_channel_vaf = {}
for _, variant in plex_missense.iterrows():
    vaf_value = variant.get("VAF", np.nan)
    tmt_channel = uuid_to_channel.get(variant["sample_id"])
    if pd.isna(vaf_value) or not tmt_channel:
        continue
    accession = gene_to_acc.get(str(variant.get("SYMBOL", "")))
    if not accession:
        continue
    swap_code = parse_hgvsp_to_swap(str(variant.get("HGVSp", "")))
    if not swap_code:
        continue
    mutation_channel_vaf[(accession, swap_code, tmt_channel)] = float(vaf_value)

print(f"(accession, swap, channel) VAF entries: {len(mutation_channel_vaf):,}")

In [None]:
# ── COMPUTE CHANNEL ENRICHMENT RATIO PER MUTANT PSM ─────────────────────────
# Two ratios are computed for every PSM:
#   ratio     : mean RI(have channels) / mean RI(not-have channels)
#   ratio_vaf : VAF-weighted mean RI(have channels) / mean RI(not-have channels)

# If the VAF table has not been built yet, fall back to an empty one so the
# code below still runs; every VAF lookup then misses.
if "mutation_channel_vaf" not in globals():
    mutation_channel_vaf = {}
    print("NOTE: VAF cell not run — ratio_vaf will be NaN for all PSMs.")

def parse_mapped_proteins(mapped_str):
    """Extract (accession, swap) pairs from a comma-separated protein list.

    'sp|P01889-S35A-B4B8|HLA-B-mut, ...' → {('P01889','S35A'), ...}

    Each entry is split on "|"; its second field is split on "-" and, when it
    has at least three parts, the first two are kept as the pair. Entries that
    do not fit this shape are ignored. A missing value yields an empty set.
    """
    if pd.isna(mapped_str):
        return set()
    entry_fields = (entry.strip().split("|") for entry in str(mapped_str).split(","))
    id_tokens = (fields[1].split("-") for fields in entry_fields if len(fields) >= 2)
    return {(tokens[0], tokens[1]) for tokens in id_tokens if len(tokens) >= 3}

# Columns of the per-PSM result table. Declared up front so the table keeps its
# columns (and dropna(subset=["ratio"]) keeps working) even when no PSM qualifies.
RESULT_COLUMNS = ["Peptide", "n_have_ch", "n_not_have_ch",
                  "mean_have_ri", "mean_not_ri", "ratio", "ratio_vaf"]

results = []
# Counters for the three reasons a PSM can be skipped (reported below).
n_no_mutations = n_no_channel_info = n_cant_split = 0

# NOTE(review): only the "Mapped Proteins" column is parsed. Confirm that it also
# lists the PSM's primary protein assignment; if it does not, mutations carried
# only by the primary protein are never considered here.
for _, row in mut_psm.iterrows():
    mutations = parse_mapped_proteins(row.get("Mapped Proteins", float("nan")))
    if not mutations:
        n_no_mutations += 1
        continue

    # Channels expected to carry the peptide: union over all mutations the PSM
    # is consistent with, restricted to patient channels of this plex.
    have_channels = set()
    for mut_key in mutations:
        have_channels |= mutation_to_channels.get(mut_key, set())
    have_channels &= all_patient_channels
    not_have_channels = all_patient_channels - have_channels

    # A ratio needs at least one channel on each side.
    if not have_channels or not not_have_channels:
        n_cant_split += 1
        continue

    # Collect non-missing intensities; channels without a resolved RI column are ignored.
    have_ri = [(ch, row[ri_col_map[ch]]) for ch in have_channels
               if ch in ri_col_map and pd.notna(row[ri_col_map[ch]])]
    not_ri  = [row[ri_col_map[ch]] for ch in not_have_channels
               if ch in ri_col_map and pd.notna(row[ri_col_map[ch]])]

    if not have_ri or not not_ri:
        n_no_channel_info += 1
        continue

    mean_not  = np.mean(not_ri)
    mean_have = np.mean([r for _, r in have_ri])
    # Undefined (NaN) when the background mean is not strictly positive.
    ratio     = mean_have / mean_not if mean_not > 0 else np.nan

    # VAF-weighted: weight each have-channel's RI by its mutation VAF. For a
    # channel, the first of the PSM's mutations with a VAF entry supplies the
    # weight; channels without any VAF entry are left out of the weighted mean.
    weighted_pairs = []
    for ch, ri_val in have_ri:
        vaf = None
        for acc, sw in mutations:
            v = mutation_channel_vaf.get((acc, sw, ch))
            if v is not None:
                vaf = v
                break
        if vaf is not None:
            weighted_pairs.append((ri_val, vaf))

    ratio_vaf = np.nan
    if weighted_pairs and mean_not > 0:
        ris_v, wts_v = zip(*weighted_pairs)
        # np.average raises ZeroDivisionError when the weights sum to zero
        # (e.g. every VAF is 0), so that case is reported as NaN instead.
        if sum(wts_v) > 0:
            ratio_vaf = np.average(ris_v, weights=wts_v) / mean_not

    results.append({
        "Peptide":       row.get("Peptide", ""),
        "n_have_ch":     len(have_channels),
        "n_not_have_ch": len(not_have_channels),
        "mean_have_ri":  mean_have,
        "mean_not_ri":   mean_not,
        "ratio":         ratio,
        "ratio_vaf":     ratio_vaf,
    })

# Explicit columns + float dtype keep the table well-formed when `results` is
# empty: without them the frame has no "ratio" column and dropna raises KeyError.
ratios = (pd.DataFrame(results, columns=RESULT_COLUMNS)
          .astype({"ratio": float, "ratio_vaf": float})
          .dropna(subset=["ratio"]))
print(f"PSMs with computable ratio:         {len(ratios):,}  (of {len(mut_psm):,} mutant PSMs)")
print(f"  of which with VAF-weighted ratio: {ratios['ratio_vaf'].notna().sum():,}")
print(f"  Skipped — no mutation key:        {n_no_mutations}")
print(f"  Skipped — no channel split:       {n_cant_split}")
print(f"  Skipped — RI data missing:        {n_no_channel_info}")
print(f"\nUnweighted ratio summary:")
print(ratios["ratio"].describe().round(2))
print(f"\nVAF-weighted ratio summary:")
print(ratios["ratio_vaf"].describe().round(2))

In [None]:
# ── PLOT: side-by-side unweighted vs VAF-weighted enrichment ratio ────────────

# Zeros are turned into NaN and dropped together with missing values before
# plotting on a log-scaled x axis.
r_unw = ratios["ratio"].replace(0, np.nan).dropna()
r_vaf = ratios["ratio_vaf"].replace(0, np.nan).dropna()

# One set of 60 log-spaced bin edges spanning both distributions, shared by both panels.
if len(r_vaf):
    all_vals = pd.concat([r_unw, r_vaf])
else:
    all_vals = r_unw
low_exponent = np.log10(all_vals.min())
high_exponent = np.log10(all_vals.max())
bins = np.logspace(low_exponent, high_exponent, 60)

med_unw = r_unw.median()
med_vaf = None
if len(r_vaf):
    med_vaf = r_vaf.median()

fig, axes = plt.subplots(1, 2, figsize=(13, 4), sharey=True)

panels = zip(
    axes,
    (r_unw, r_vaf),
    (med_unw, med_vaf),
    ("Unweighted", "VAF-weighted"),
    ("#4878d0", "#6acc65"),
)
for ax, data, med, title, color in panels:
    if len(data) > 0:
        ax.hist(data, bins=bins, color=color, edgecolor="white", linewidth=0.4)
        # Reference line at ratio = 1 and a marker for the median.
        ax.axvline(x=1,   color="grey",    linestyle="--", linewidth=1.2, label="ratio = 1")
        ax.axvline(x=med, color="#e74c3c", linestyle="-",  linewidth=1.5,
                   label=f"median = {med:.2f}")
        ax.set_xscale("log")
        ax.legend(fontsize=9)
    else:
        # Empty panel: show a placeholder message instead of a histogram.
        ax.text(0.5, 0.5, "No data\n(run VAF cell first)",
                ha="center", va="center", transform=ax.transAxes)
    ax.set_xlabel("mean RI (have) / mean RI (not-have)  [log scale]")
    ax.set_title(f"{title}  (n={len(data):,})")

axes[0].set_ylabel("Number of mutant PSMs")
fig.suptitle(f"Per-PSM channel enrichment — {PLEX_ID}\n"
             f"VAF-weighted upweights channels with higher allele frequency",
             fontsize=11)
plt.tight_layout()
pdf_path = os.path.join(RESULTS_DIR, "mutant_channel_enrichment.pdf")
fig.savefig(pdf_path, bbox_inches="tight")
plt.show()

# ── Summary ───────────────────────────────────────────────────────────────────
def _fmt_pct_above(values, cutoff):
    """Format the share of `values` strictly above `cutoff` as a percentage with one decimal."""
    return f"{100*(values>cutoff).mean():.1f}%"

n_pat = len(all_patient_channels)
print(f"{'Metric':<30} {'Unweighted':>12} {'VAF-weighted':>14}")
print("-" * 58)

# The VAF-weighted column shows "—" wherever no VAF-weighted ratios exist.
has_vaf = len(r_vaf) > 0
summary_rows = [
    ("Median ratio",          f"{med_unw:.2f}",          f"{med_vaf:.2f}" if med_vaf else "—"),
    ("% PSMs with ratio > 1", _fmt_pct_above(r_unw, 1),  _fmt_pct_above(r_vaf, 1) if has_vaf else "—"),
    ("% PSMs with ratio > 2", _fmt_pct_above(r_unw, 2),  _fmt_pct_above(r_vaf, 2) if has_vaf else "—"),
    ("% PSMs with ratio > 5", _fmt_pct_above(r_unw, 5),  _fmt_pct_above(r_vaf, 5) if has_vaf else "—"),
    ("N PSMs",                f"{len(r_unw):,}",         f"{len(r_vaf):,}"),
]
for label, s1, s2 in summary_rows:
    print(f"{label:<30} {s1:>12} {s2:>14}")
print(f"\nContext: plex has {n_pat} patient channels.")
print(f"  A mutation in 1 patient → 1 have vs {n_pat-1} not-have channels.")
print(f"  VAF-weighted median > unweighted = signal scales with allele frequency.")