# FragPipe Search Validation

Validates that mutant peptides (`mut|`) are identified in the correct TMT channel.

For each TMT channel (patient sample), plots a boxplot of reporter ion (RI) intensities
across all channels, restricted to mutant peptides attributed to that channel's patient.

If the pipeline is working correctly, each patient's mutant peptides should show
higher RI intensity in that patient's own TMT channel than in the other channels.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import os

# ── CONFIG ────────────────────────────────────────────────────────────────────
# Every path below is derived from PLEX_ID.
PLEX_ID      = "01CPTAC_CCRCC_Proteome_JHU_20171007"   # change to target plex
RESULTS_DIR  = f"/scratch/leduc.an/AAS_Evo/MS_SEARCH/results/{PLEX_ID}"
ANNOT_FILE   = f"/scratch/leduc.an/AAS_Evo/MS_SEARCH/annotations/{PLEX_ID}_annotation.tsv"
PSM_FILE     = os.path.join(RESULTS_DIR, "psm.tsv")
# ─────────────────────────────────────────────────────────────────────────────

# Stop early, naming the first required input (PSM table, then annotation)
# that does not exist on disk.
missing_inputs = [path for path in (PSM_FILE, ANNOT_FILE)
                  if not os.path.exists(path)]
if missing_inputs:
    raise FileNotFoundError(f"Not found: {missing_inputs[0]}")

# Echo the resolved configuration, one item per line.
print(
    f"Plex: {PLEX_ID}",
    f"PSM file:    {PSM_FILE}",
    f"Annotation:  {ANNOT_FILE}",
    sep="\n",
)

In [None]:
# ── LOAD DATA ─────────────────────────────────────────────────────────────────
# Both inputs are read as tab-separated tables.
psm = pd.read_csv(PSM_FILE, sep="\t", low_memory=False)
annot = pd.read_csv(ANNOT_FILE, sep="\t")

# Report the table size and preview the first 20 column names.
n_psm_rows, n_psm_cols = psm.shape
leading_columns = list(psm.columns[:20])
print(f"Total PSMs:  {n_psm_rows:,}")
print(f"Columns ({n_psm_cols}): {leading_columns} ...")

In [None]:
# ── DETECT TMT INTENSITY COLUMNS ──────────────────────────────────────────────
# FragPipe names them '126', '127N', '127C', ... or '126 Intensity', etc.
# Channel labels in display order: '126', then the N and C variants of
# 127 through 131 (11 labels in total).
CHANNEL_ORDER = ["126"] + [
    f"{nominal_mass}{variant}"
    for nominal_mass in range(127, 132)
    for variant in "NC"
]

def find_ri_cols(df, channel_order):
    """Map TMT channel labels to reporter-ion intensity columns of *df*.

    Each label in *channel_order* is resolved with this order of preference:

    1. a column whose name equals the label exactly (e.g. ``'126'``);
    2. the first column that starts with the label and contains
       "intensity" in any case (e.g. ``'126 Intensity'``);
    3. the first column that contains "intensity" and ends with the label
       preceded by a space or underscore (e.g. ``'Intensity 126'``).

    Labels with no matching column are left out of the result.

    Args:
        df: Table whose column labels are searched; row data is not read.
        channel_order: Iterable of channel label strings, in the order the
            result should follow.

    Returns:
        dict: ``{channel_label: column_name}`` in *channel_order* order.
    """
    # Computed once rather than per channel. Non-string labels are skipped so
    # the str methods below cannot raise on integer or tuple column labels.
    intensity_cols = [c for c in df.columns
                      if isinstance(c, str) and "intensity" in c.lower()]

    found = {}
    for ch in channel_order:
        # An exact match always wins over the looser name patterns.
        if ch in df.columns:
            found[ch] = ch
            continue

        match = next((c for c in intensity_cols if c.startswith(ch)), None)
        if match is None:
            # Require a delimiter before the label so that a name merely
            # ending in the same digits is not mistaken for this channel.
            suffixes = (" " + ch, "_" + ch)
            match = next((c for c in intensity_cols if c.endswith(suffixes)),
                         None)
        if match is not None:
            found[ch] = match
    return found  # {channel_label: column_name}

# Resolve the reporter-ion columns of the PSM table and list each
# channel label next to the column it was matched to.
ri_col_map = find_ri_cols(psm, CHANNEL_ORDER)
n_ri_found = len(ri_col_map)
print(f"RI columns found ({n_ri_found}):")
for channel_label, column_name in ri_col_map.items():
    print(f"  {channel_label:6s} -> {column_name}")

In [None]:
# ── FILTER TO MUTANT PSMs ─────────────────────────────────────────────────────
# Our headers: mut|accession|gene|swap|source|patient|sample_type
# Keep rows whose Protein value begins with "mut|"; rows with a missing
# Protein value are treated as non-mutant (na=False).
is_mutant = psm["Protein"].str.startswith("mut|", na=False)
mut_psm = psm.loc[is_mutant].copy()  # copy so a column can be added safely

n_mutant, n_total = len(mut_psm), len(psm)
print(f"Mutant PSMs: {n_mutant:,} of {n_total:,} total ")

def parse_patient(protein):
    """Return the sixth '|'-separated field of a protein header.

    *protein* is converted with ``str`` before splitting. Headers with
    fewer than seven fields yield ``None``.
    """
    fields = str(protein).split("|")
    if len(fields) < 7:
        return None
    return fields[5]

# Tag every mutant PSM with the patient parsed from its protein header,
# then show how many distinct patients appear and the ten most frequent.
mut_psm["patient"] = mut_psm["Protein"].map(parse_patient)
n_unique_patients = mut_psm["patient"].nunique()
patient_counts = mut_psm["patient"].value_counts()
print(f"Unique patients in mutant PSMs: {n_unique_patients}")
print(patient_counts.head(10))

In [None]:
# ── BUILD CHANNEL → PATIENT MAPPING ──────────────────────────────────────────
# annotation: sample, plex, channel, condition
# sample = "{case_id}_{sample_type}" or "Reference"
print(annot)

# Sample names (compared case-insensitively) that are skipped when
# building the mapping.
reference_names = ("reference", "ref", "pooled", "pool")


def _plex_position(item):
    """Sort key: index of the channel in CHANNEL_ORDER, or 99 if absent."""
    label = item[0]
    if label in CHANNEL_ORDER:
        return CHANNEL_ORDER.index(label)
    return 99


channel_to_patient = {}  # FragPipe channel label -> case_submitter_id
for _, annot_row in annot.iterrows():
    sample_name = str(annot_row["sample"])
    # Upper-case the label to normalise it to '126', '127N', etc.
    channel_label = str(annot_row["channel"]).upper()
    if sample_name.lower() not in reference_names:
        # The patient ID is the text before the first underscore.
        channel_to_patient[channel_label] = sample_name.split("_")[0]

print(f"\nChannel → patient mapping ({len(channel_to_patient)} channels):")
for channel_label, patient_id in sorted(channel_to_patient.items(),
                                        key=_plex_position):
    print(f"  {channel_label:6s} -> {patient_id}")

In [None]:
# ── PLOT: RI BOXPLOTS PER TMT CHANNEL ────────────────────────────────────────
#
# One subplot per sample channel (patient).
# Each subplot shows boxplots of RI intensities across ALL channels,
# restricted to mutant peptides belonging to that subplot's patient.
#
# Expected: the patient's own channel (highlighted in red) should show
# higher RI than the other channels if the mutant peptide is real.

channels_ordered = [ch for ch in CHANNEL_ORDER if ch in channel_to_patient]
n_plots = len(channels_ordered)
if n_plots == 0:
    # A zero-row subplot grid is rejected by matplotlib with an error that
    # does not mention the real cause, so fail here with the details.
    raise ValueError(
        "No annotated sample channel matches CHANNEL_ORDER; "
        f"annotation channels: {sorted(channel_to_patient)}"
    )
ncols = 5
nrows = int(np.ceil(n_plots / ncols))

# squeeze=False makes plt.subplots return a 2-D array of Axes for every
# grid shape, so it can be flattened directly.
fig, axes = plt.subplots(nrows, ncols,
                         figsize=(ncols * 3.5, nrows * 3.5),
                         sharey=False, squeeze=False)
axes = axes.flatten()

ri_labels = list(ri_col_map.keys())   # channel labels present in data
ri_cols   = list(ri_col_map.values()) # corresponding column names in psm

for idx, focal_ch in enumerate(channels_ordered):
    ax = axes[idx]
    focal_patient = channel_to_patient[focal_ch]

    # Mutant PSMs attributed to this patient, RI columns only
    subset = mut_psm[mut_psm["patient"] == focal_patient][ri_cols]

    if subset.empty:
        # Nothing to draw: leave a labelled placeholder panel.
        ax.text(0.5, 0.5, "No mut PSMs", ha="center", va="center",
                transform=ax.transAxes, fontsize=9, color="grey")
        ax.set_title(f"{focal_ch}\n{focal_patient}", fontsize=8)
        ax.set_xticks([])
        continue

    # Replace 0 with NaN (absent signal) so zeros are dropped below
    subset = subset.replace(0, np.nan)

    # One array of log10(RI + 1) values per RI column, NaNs removed
    data = [np.log10(subset[col].dropna().values + 1) for col in ri_cols]

    # Red for the focal patient's own channel, pale blue for all others
    colors = ["#e74c3c" if lbl == focal_ch else "#aec6cf"
              for lbl in ri_labels]

    bp = ax.boxplot(data, patch_artist=True, widths=0.6,
                    medianprops=dict(color="black", linewidth=1.5),
                    flierprops=dict(marker=".", markersize=2, alpha=0.4))
    for patch, color in zip(bp["boxes"], colors):
        patch.set_facecolor(color)

    # Boxes are drawn at x = 1..len(ri_labels); label them by channel
    ax.set_xticks(range(1, len(ri_labels) + 1))
    ax.set_xticklabels(ri_labels, rotation=45, ha="right", fontsize=7)
    ax.set_ylabel("log10(RI + 1)", fontsize=7)
    ax.set_title(f"{focal_ch}  ({focal_patient})\nn={len(subset)}",
                 fontsize=8)
    ax.yaxis.set_major_formatter(ticker.FormatStrFormatter("%.1f"))

# Hide unused axes
for ax in axes[n_plots:]:
    ax.set_visible(False)

fig.suptitle(f"Mutant Peptide RI Intensities by TMT Channel\n{PLEX_ID}",
             fontsize=11, fontweight="bold", y=1.01)
plt.tight_layout()

# Build the output path once so the message below reports exactly the
# file that was written. Save before show().
out_path = os.path.join(RESULTS_DIR, "mutant_ri_validation.pdf")
plt.savefig(out_path, bbox_inches="tight")
plt.show()
print(f"Saved: {out_path}")

In [None]:
# ── SUMMARY TABLE ─────────────────────────────────────────────────────────────
# For each channel: n mutant PSMs, median RI in own channel vs median across others

rows = []
for focal_ch in channels_ordered:
    focal_patient = channel_to_patient[focal_ch]
    if focal_ch not in ri_col_map:
        # No RI column for this channel, so there is nothing to compare.
        continue
    own_col = ri_col_map[focal_ch]
    other_cols = [ri_col_map[lbl] for lbl in ri_labels
                  if lbl in ri_col_map and lbl != focal_ch]

    # This patient's mutant PSMs, with zeros turned into NaN so the
    # NaN-aware medians below ignore them.
    patient_mask = mut_psm["patient"] == focal_patient
    subset = mut_psm[patient_mask].replace(0, np.nan)
    if subset.empty:
        continue

    own_values = subset[own_col].values
    pooled_other_values = subset[other_cols].values.flatten()
    own_median = np.nanmedian(own_values)
    other_median = np.nanmedian(pooled_other_values)

    # The ratio is only reported when the denominator is positive.
    if other_median > 0:
        ratio = own_median / other_median
    else:
        ratio = np.nan

    record = {
        "channel":      focal_ch,
        "patient":      focal_patient,
        "n_mut_psms":   len(subset),
        "median_own":   round(own_median, 0),
        "median_other": round(other_median, 0),
        "own/other":    round(ratio, 2),
    }
    rows.append(record)

summary = pd.DataFrame(rows)
print(summary.to_string(index=False))