# FragPipe Search Validation

Validates that mutant peptides are identified in the correct TMT channel.

**New FASTA header format** (Philosopher-compatible):
`>sp|{accession}-{swap}-{hash}|{gene}-mut ... GN={gene} PE=1 SV=1`

For each TMT channel, this notebook identifies the mutant PSMs in which that
channel has the highest reporter ion (RI) intensity (data-driven channel
assignment), and then draws boxplots of RI across all channels for those PSMs.

If the pipeline is working correctly, the channel with the highest RI for
a given mutant peptide should strongly dominate over other channels — i.e.,
the red (focal) box should sit above the blue boxes.

In [None]:
import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import os

# ── CONFIG ────────────────────────────────────────────────────────────────────
# Everything below is derived from PLEX_ID; edit only that line to switch plex.
PLEX_ID     = "01CPTAC_CCRCC_Proteome_JHU_20171007"   # change to target plex
RESULTS_DIR = f"/scratch/leduc.an/AAS_Evo/MS_SEARCH/results/{PLEX_ID}"
ANNOT_FILE  = f"/scratch/leduc.an/AAS_Evo/MS_SEARCH/annotations/{PLEX_ID}_annotation.tsv"

# psm.tsv is expected inside the FragPipe experiment folder {PLEX_ID}_1/.
# Restricting the glob to "*_1" keeps tmt-report/ and other output
# directories out of the candidate list.
psm_pattern = os.path.join(RESULTS_DIR, "*_1/psm.tsv")
psm_matches = sorted(glob.glob(psm_pattern))
if not psm_matches:
    raise FileNotFoundError(f"No *_1/psm.tsv found under {RESULTS_DIR}")
PSM_FILE = psm_matches[0]   # first hit in sorted order
# ─────────────────────────────────────────────────────────────────────────────

# Fail early, naming the first required input that is absent.
missing_inputs = [path for path in (PSM_FILE, ANNOT_FILE)
                  if not os.path.exists(path)]
if missing_inputs:
    raise FileNotFoundError(f"Not found: {missing_inputs[0]}")

print(f"Plex:       {PLEX_ID}")
print(f"PSM file:   {PSM_FILE}")
print(f"Annotation: {ANNOT_FILE}")

In [None]:
# ── LOAD DATA ─────────────────────────────────────────────────────────────────
# Both inputs are read as tab-separated tables.
tsv_options = {"sep": "\t"}
psm   = pd.read_csv(PSM_FILE, low_memory=False, **tsv_options)
annot = pd.read_csv(ANNOT_FILE, **tsv_options)

# Quick sanity report: row count, the first 20 column names, full annotation.
psm_columns = list(psm.columns)
annot_text = annot.to_string(index=False)
print(f"Total PSMs:  {len(psm):,}")
print(f"Columns ({len(psm_columns)}): {psm_columns[:20]} ...")
print(f"\nAnnotation:\n{annot_text}")

In [None]:
# ── DETECT TMT INTENSITY COLUMNS ──────────────────────────────────────────────
# FragPipe names them '126', '127N', '127C', ... or '126 Intensity', etc.
# Channel labels in the order used for lookups and display in this notebook.
CHANNEL_ORDER = [
    "126",
    "127N", "127C",
    "128N", "128C",
    "129N", "129C",
    "130N", "130C",
    "131N", "131C",
]

def find_ri_cols(df, channel_order):
    """Map channel labels to the reporter-ion intensity columns of *df*.

    For each label in *channel_order* a column named exactly like the label
    wins.  Otherwise the first column (in ``df.columns`` order) that starts
    with the label and contains "intensity" (case-insensitive) is used.
    Labels with no matching column are left out.

    Returns a dict ``{channel_label: column_name}`` whose keys follow the
    order of *channel_order*.
    """
    columns = df.columns
    mapping = {}
    for label in channel_order:
        if label in columns:
            # Exact column name takes priority over the prefixed style.
            mapping[label] = label
            continue
        first_match = next(
            (name for name in columns
             if name.startswith(label) and "intensity" in name.lower()),
            None,
        )
        if first_match is not None:
            mapping[label] = first_match
    return mapping

# Resolve the RI columns once; later cells reuse ri_col_map.
ri_col_map = find_ri_cols(psm, CHANNEL_ORDER)
n_ri_found = len(ri_col_map)
print(f"RI columns found ({n_ri_found}):")
for ch in ri_col_map:
    col = ri_col_map[ch]
    print(f"  {ch:6s} -> {col}")

In [None]:
# ── FILTER TO MUTANT PSMs ─────────────────────────────────────────────────────
# New FASTA format: >sp|{accession}-{swap}-{hash}|{gene}-mut ... GN={gene} ...
#
# FragPipe PSM columns:
#   Protein    = "sp|Q02952-K117E-A3F2|AKAP12-mut"  (raw FASTA entry)
#   Protein ID = "Q02952-K117E-A3F2"                 (accession field)
#   Entry Name = "AKAP12-mut"                         (entry name field)
#
# Filter: Entry Name ends with "-mut"  (excludes reference sp| and comp| entries)

entry_col = "Entry Name"
protid_col = "Protein ID"

if entry_col not in psm.columns:
    # No Entry Name column: rebuild it from the 3rd pipe-delimited field of
    # Protein, keeping only its first whitespace-separated token.
    third_field = psm["Protein"].str.split("|").str[2]
    psm[entry_col] = third_field.str.split().str[0]

# Rows with a missing Entry Name count as non-mutant (na=False).
is_mutant = psm[entry_col].str.endswith("-mut", na=False)
mut_psm = psm[is_mutant].copy()
print(f"Mutant PSMs: {len(mut_psm):,} of {len(psm):,} total")

def parse_mutation(protein_id):
    """Extract the mutation code from a mutant Protein ID.

    The ID is expected to look like '{accession}-{swap}-{hash}', e.g.
    'Q02952-K117E-A3F2' -> 'K117E'.  The fields are split from the right, so
    an accession that itself contains a hyphen (an isoform accession such as
    'P12345-2') does not shift the swap field:
    'P12345-2-K117E-A3F2' -> 'K117E'.

    Returns None when the ID has fewer than three hyphen-separated fields
    (this includes missing values, which stringify to e.g. 'nan').
    """
    # At most two splits from the right: [accession, swap, hash].
    parts = str(protein_id).rsplit("-", 2)
    return parts[1] if len(parts) == 3 else None

def parse_accession(protein_id):
    """Return the text before the first hyphen of a Protein ID.

    Example: 'Q02952-K117E-A3F2' -> 'Q02952'.  The input is stringified
    first; an ID without any hyphen is returned whole.
    """
    base, _, _ = str(protein_id).partition("-")
    return base

# Derive mutation / accession from Protein ID when that column exists;
# otherwise fill both columns with None so later cells still find them.
has_protein_id = protid_col in mut_psm.columns
if not has_protein_id:
    mut_psm["mutation"]  = None
    mut_psm["accession"] = None
else:
    protein_ids = mut_psm[protid_col]
    mut_psm["mutation"]  = protein_ids.apply(parse_mutation)
    mut_psm["accession"] = protein_ids.apply(parse_accession)

# Gene label = Entry Name with the literal "-mut" marker removed.
entry_names = mut_psm[entry_col]
mut_psm["gene"] = entry_names.str.replace("-mut", "", regex=False)

n_unique_mutations = mut_psm['mutation'].nunique()
n_unique_genes = mut_psm['gene'].nunique()
top_mutations = mut_psm['mutation'].value_counts().head(10)
print(f"Unique mutations:  {n_unique_mutations}")
print(f"Unique genes:      {n_unique_genes}")
print(f"\nTop mutations:\n{top_mutations}")

In [None]:
# ── BUILD CHANNEL → PATIENT MAPPING ──────────────────────────────────────────
# annotation: sample, plex, channel, condition
# sample = "{case_id}_{sample_type}" or "Reference"
print(annot.to_string(index=False))

# Sample names (lower-cased) that mark a non-patient channel and are skipped.
non_patient_samples = ("reference", "ref", "pooled", "pool")

channel_to_patient = {}  # FragPipe channel label -> case_submitter_id
for _, annot_row in annot.iterrows():
    sample_name = str(annot_row["sample"])
    label = str(annot_row["channel"]).upper()  # normalise to '126', '127N', etc.
    if sample_name.lower() not in non_patient_samples:
        # The patient id is everything before the first underscore.
        channel_to_patient[label] = sample_name.split("_")[0]

# Print in CHANNEL_ORDER; labels outside that list sort last (rank 99).
channel_rank = {label: pos for pos, label in enumerate(CHANNEL_ORDER)}
mapping_in_order = sorted(channel_to_patient.items(),
                          key=lambda item: channel_rank.get(item[0], 99))

print(f"\nChannel → patient mapping ({len(channel_to_patient)} channels):")
for ch, pt in mapping_in_order:
    print(f"  {ch:6s} -> {pt}")

In [None]:
# ── ASSIGN EACH MUTANT PSM TO ITS DOMINANT CHANNEL ───────────────────────────
#
# Patient info is no longer embedded in the FASTA header (Philosopher requires
# standard UniProt-style accessions).  Instead we use a data-driven approach:
# for each mutant PSM, find the TMT channel with the highest reporter ion (RI).
# If the search is working, each patient's mutation should light up strongly in
# exactly ONE channel (theirs) and be near-background in all others.

channels_ordered = [ch for ch in CHANNEL_ORDER if ch in channel_to_patient]
ri_labels = list(ri_col_map.keys())
ri_cols   = list(ri_col_map.values())

# argmax over zero columns fails with an opaque numpy message; say what is wrong.
if not ri_cols:
    raise ValueError(
        "No reporter-ion intensity columns were detected in the PSM table; "
        "cannot assign mutant PSMs to a dominant channel.")

# RI matrix for all mutant PSMs (0 → NaN = no signal)
ri_vals = mut_psm[ri_cols].copy().replace(0, np.nan)

# For each PSM, which channel has the highest RI?
ri_filled = ri_vals.fillna(0)
ri_array = ri_filled.to_numpy()
max_ch_idx = ri_array.argmax(axis=1)

# argmax returns index 0 for a row that is all zeros, which would silently
# credit PSMs without any reporter signal to the first channel.  Such PSMs
# get no dominant channel (None) so they drop out of every per-channel group.
has_signal = ri_array.max(axis=1) > 0
max_ch_col = [ri_labels[i] if ok else None           # e.g. "127N"
              for i, ok in zip(max_ch_idx, has_signal)]
mut_psm = mut_psm.copy()
mut_psm["max_channel"] = max_ch_col

n_no_signal = int((~has_signal).sum())

print(f"Total mutant PSMs: {len(mut_psm):,}")
if n_no_signal:
    print(f"Mutant PSMs with no RI signal (unassigned): {n_no_signal:,}")
print("Channel distribution of max-RI:")
print(mut_psm["max_channel"].value_counts().reindex(CHANNEL_ORDER).dropna().astype(int))

# ── PLOT ──────────────────────────────────────────────────────────────────────
#
# One panel per sample channel, laid out on a 5-column grid.
# A panel shows the RI distribution of EVERY channel, but only for the mutant
# PSMs whose highest RI is in the panel's own (focal) channel.
# Expected: the focal channel (red) should clearly dominate over the others.

ncols = 5
nrows = int(np.ceil(len(channels_ordered) / ncols))
panel_size = 3.5  # inches per panel, both directions

fig, axes = plt.subplots(nrows, ncols,
                         figsize=(ncols * panel_size, nrows * panel_size),
                         sharey=False)
axes = np.array(axes).flatten()

for ax, focal_ch in zip(axes, channels_ordered):
    patient = channel_to_patient.get(focal_ch, "?")

    # Mutant PSMs where this channel dominates; zeros become NaN (no signal).
    in_focal = mut_psm["max_channel"] == focal_ch
    subset = mut_psm.loc[in_focal, ri_cols].replace(0, np.nan)

    if subset.empty:
        # Nothing to draw: leave a labelled placeholder panel.
        ax.text(0.5, 0.5, "No mut PSMs", ha="center", va="center",
                transform=ax.transAxes, fontsize=9, color="grey")
        ax.set_title(f"{focal_ch}\n{patient}", fontsize=8)
        ax.set_xticks([])
        continue

    # One log10(RI + 1) sample per channel, NaNs dropped column by column.
    log_ri = []
    for col in ri_cols:
        observed = subset[col].dropna().values
        log_ri.append(np.log10(observed + 1))

    boxes = ax.boxplot(log_ri, patch_artist=True, widths=0.6,
                       medianprops=dict(color="black", linewidth=1.5),
                       flierprops=dict(marker=".", markersize=2, alpha=0.4))
    # Focal channel in red, every other channel in pale blue.
    for patch, lbl in zip(boxes["boxes"], ri_labels):
        patch.set_facecolor("#e74c3c" if lbl == focal_ch else "#aec6cf")

    ax.set_xticks(range(1, len(ri_labels) + 1))
    ax.set_xticklabels(ri_labels, rotation=45, ha="right", fontsize=7)
    ax.set_ylabel("log10(RI + 1)", fontsize=7)

    # Long patient ids are cut to 11 characters plus an ellipsis.
    if len(patient) > 12:
        short_pt = patient[:11] + "…"
    else:
        short_pt = patient
    ax.set_title(f"{focal_ch}  ({short_pt})\nn={len(subset)}", fontsize=8)
    ax.yaxis.set_major_formatter(ticker.FormatStrFormatter("%.1f"))

# Hide the grid cells left over after the last channel.
n_used = len(channels_ordered)
for ax in axes[n_used:]:
    ax.set_visible(False)

fig.suptitle(
    f"Mutant Peptide RI — PSMs grouped by dominant channel\n{PLEX_ID}",
    fontsize=11, fontweight="bold", y=1.01)
plt.tight_layout()
out_pdf = os.path.join(RESULTS_DIR, "mutant_ri_validation.pdf")
plt.savefig(out_pdf, bbox_inches="tight")
plt.show()
print(f"Saved: {out_pdf}")

In [None]:
# ── SUMMARY TABLE ─────────────────────────────────────────────────────────────
# For each channel: n mutant PSMs assigned to it (by max-RI),
# median own-channel RI vs median across all other channels, and the ratio.
# A ratio >> 1 confirms that mutant peptides are enriched in the correct channel.

def _rounded_int_or_nan(value):
    """Round to the nearest integer; return NaN (instead of raising) for NaN/inf."""
    return int(round(value, 0)) if np.isfinite(value) else np.nan

# Fixed column list so the table has its columns even when no row is produced.
summary_columns = ["channel", "patient", "n_mut_psms",
                   "median_own", "median_other", "own/other"]

rows = []
for focal_ch in channels_ordered:
    patient = channel_to_patient.get(focal_ch, "?")
    if focal_ch not in ri_col_map:
        continue
    own_col    = ri_col_map[focal_ch]
    other_cols = [ri_col_map[ch] for ch in ri_labels if ch != focal_ch]

    # Only the RI columns are needed; zeros mean "no signal" and become NaN.
    subset = mut_psm.loc[mut_psm["max_channel"] == focal_ch, ri_cols].replace(0, np.nan)
    if subset.empty:
        continue

    # pandas medians skip NaN and yield NaN for an all-NaN input.
    own_median   = subset[own_col].median()
    other_median = pd.Series(subset[other_cols].to_numpy(dtype=float).ravel()).median()
    # NaN > 0 is False, so a missing "other" median gives a NaN ratio.
    ratio = own_median / other_median if other_median > 0 else np.nan

    rows.append({
        "channel":      focal_ch,
        "patient":      patient,
        "n_mut_psms":   len(subset),
        "median_own":   _rounded_int_or_nan(own_median),
        "median_other": _rounded_int_or_nan(other_median),
        "own/other":    round(ratio, 2),
    })

summary = pd.DataFrame(rows, columns=summary_columns)
print(summary.to_string(index=False))

# Overall enrichment score
total_mut     = len(mut_psm)
n_sample_ch   = len(channels_ordered)
random_expect = 1 / max(len(ri_labels), 1)
n_assigned    = int(summary["n_mut_psms"].sum())
actual_frac   = n_assigned / total_mut if total_mut else 0
mean_ratio    = summary["own/other"].astype(float).mean()

print(f"\nTotal mutant PSMs:          {total_mut:,}")
print(f"PSMs in a sample channel:   {n_assigned:,} "
      f"({100*actual_frac:.1f}%)")
print(f"Mean own/other ratio:       {mean_ratio:.2f}  "
      f"(>1 = signal concentrated in one channel — expected if working)")

In [None]:
# ── ENRICHMENT SCORE DISTRIBUTION ────────────────────────────────────────────
# Per mutant PSM: RI of its strongest channel divided by the mean RI of its
# remaining channels.  A score >> 1 means signal is concentrated in one channel
# (as expected).
# This doesn't depend on knowing which patient owns which channel.

ri_matrix = mut_psm[ri_cols].replace(0, np.nan)   # 0 → NaN = no signal
max_ri    = ri_matrix.max(axis=1)
mean_ri   = ri_matrix.mean(axis=1)                 # mean of all channels
sum_ri    = ri_matrix.sum(axis=1)
n_nonnan  = ri_matrix.notna().sum(axis=1)

# mean of non-max channels: (sum - max) / (n - 1); the divisor is floored at 1
n_other    = (n_nonnan - 1).clip(lower=1)
mean_other = (sum_ri - max_ri) / n_other

# A zero denominator becomes NaN, and PSMs without a defined score are dropped.
nonzero_mean_other = mean_other.replace(0, np.nan)
enrich = max_ri.div(nonzero_mean_other).dropna()
enrich_median = enrich.median()

fig, ax = plt.subplots(figsize=(6, 4))
ax.hist(enrich.clip(upper=20), bins=60,
        color="#4878d0", edgecolor="white", linewidth=0.4)
ax.axvline(x=1, color="grey", linestyle="--", linewidth=1,
           label="ratio = 1 (no enrichment)")
ax.axvline(x=enrich_median, color="#e74c3c", linestyle="-", linewidth=1.5,
           label=f"median = {enrich_median:.1f}")
ax.set_xlabel("max-channel RI / mean-other-channels RI  (clipped at 20)")
ax.set_ylabel("Number of mutant PSMs")
ax.set_title(f"Per-PSM channel enrichment score\n{PLEX_ID}")
ax.legend(fontsize=9)
plt.tight_layout()
fig.savefig(os.path.join(RESULTS_DIR, "mutant_enrichment_dist.pdf"), bbox_inches="tight")
plt.show()

n_above_two = (enrich > 2).sum()
pct_enriched = 100 * n_above_two / len(enrich)
print(f"Median enrichment ratio: {enrich_median:.2f}")
print(f"PSMs with ratio > 2:     {pct_enriched:.1f}%  "
      f"(higher = more channel-specific signal)")