In [3]:
import dataclasses
from collections import Counter
from typing import Tuple, List, Dict, Union, Set, Iterator, Optional
from enum import Enum, auto
import time

from protein import trypsin
from measurement import read_mgf, PeptideMeasurement
from pyteomics import mass
from common import LYS, BSA


@dataclasses.dataclass
class Mod:
    description: str
    mass: float

    def __hash__(self):
        return (self.description, self.mass).__hash__()


@dataclasses.dataclass
class Residue:
    name: str
    mass: float

    def __init__(self, name: str, modifications=None):
        if modifications is None:
            modifications = []

        self.name = name
        self.mass = (
            mass.calculate_mass(sequence=name)
            - mass.calculate_mass(formula="H2O")
            + sum(m.mass for m in modifications)
        )


class Peptide:
    beginning: int
    end: int
    seq: str
    min_mass: float
    mid_mass: float
    max_mass: float

    _modifications: Dict[str, Tuple[Mod, int]]
    _residues: List[Residue]
    _residue_counts: Dict[str, int] = None
    _mass = None
    _minmass = None
    _maxmass = None

    def __init__(
        self,
        beginning: int,
        end: int,
        seq: str,
        modifications: Dict[str, Tuple[Mod, int]],
    ):
        self.beginning = beginning
        self.end = end
        self.seq = seq
        self._residues = [Residue(resname) for resname in seq]
        self._modifications = modifications

        zwitterion_mass = mass.calculate_mass(
            sequence=self.seq, ion_type="M", charge=0
        ) - mass.calculate_mass(formula="H2O")

        pos, neg = 0, 0
        for m, count in modifications.values():
            if m.mass < 0:
                neg += m.mass * count
            else:
                pos += m.mass * count

        self.min_mass = zwitterion_mass + neg
        self.mid_mass = zwitterion_mass
        self.max_mass = zwitterion_mass + pos

    def __getitem__(self, index: int):
        if self.beginning <= index < self.end:
            return self._residues[index - self.beginning]
        return None

    def __iter__(self):
        return range(self.beginning, self.end).__iter__()

    def __add__(self, other):
        if other.beginning != self.end:
            raise ValueError(
                f"Peptides can only be added when they are contiguous. Got {(self.beginning, self.end)} + {other.beginning, other.end} instead."
            )

        merged_mods = self._modifications
        for target, (mod, c2) in other._modifications.items():
            mod2, c1 = merged_mods.setdefault(target, (mod, 0))
            if mod != mod2:
                raise ValueError(
                    f"Peptides can only be added when they have compatible modifications. These two differ at {target}"
                )
            merged_mods[target] = mod, c1 + c2

        return Peptide(self.beginning, other.end, self.seq + other.seq, merged_mods)

    def count(self, amino_acid):
        if self._residue_counts is None:
            self._residue_counts = Counter(self.seq)
        return self._residue_counts[amino_acid]

    @property
    def modifications_anywhere(self) -> Iterator[Tuple[Mod, int]]:
        return (x for x in self._modifications.values())

    @property
    def cysteines(self) -> Iterator[int]:
        return (i for i, res in enumerate(self._residues) if res.name == "C")

    def __repr__(self):
        return f"Peptide(beginning={self.beginning}, end={self.end}, seq={self.seq}, modifications={self._modifications})"


def within_bounds(reference_mass, measured_mass, error_ppm: float = 10):
    return abs(reference_mass - measured_mass) <= err_margin(reference_mass, error_ppm)


def err_margin(reference_mass, error_ppm: float = 10):
    return (error_ppm / 1e6) * reference_mass


def compute_error(reference_mass, measured_mass):
    return 1e6 * abs(measured_mass - reference_mass) / reference_mass


# Pass None when you want to allow to skip a mod
def combine_modifications(
    modifications: List[List[Union[Mod, None]]],
    starting_mass: float,
    target_mass: float,
    error_ppm: float = 10,
) -> List[List[Mod]]:
    result = []

    def go(i: int, current_mass: float, selection: Tuple[Mod, ...] = ()):
        if i == len(modifications):
            if within_bounds(current_mass, target_mass, error_ppm):
                result.append(selection)
        else:
            for m in modifications[i]:
                if m is None:
                    go(i + 1, current_mass, selection)
                else:
                    go(i + 1, current_mass + m.mass, selection + (m,))

    go(0, current_mass=starting_mass)
    return list(set(result))


In [4]:
measurements = {m.scan: m for m in read_mgf("../data/mgf/190318_LYS_AT_50x_05.mgf")}
list(measurements.items())[:10]

[(3, <measurement.PeptideMeasurement at 0x1251ccee0>),
 (7, <measurement.PeptideMeasurement at 0x1251cce50>),
 (9, <measurement.PeptideMeasurement at 0x10fceb5e0>),
 (12, <measurement.PeptideMeasurement at 0x124fbe760>),
 (13, <measurement.PeptideMeasurement at 0x124fbe0d0>),
 (29, <measurement.PeptideMeasurement at 0x124fbe280>),
 (37, <measurement.PeptideMeasurement at 0x124fbe0a0>),
 (42, <measurement.PeptideMeasurement at 0x124fbe9a0>),
 (54, <measurement.PeptideMeasurement at 0x124fbe490>),
 (57, <measurement.PeptideMeasurement at 0x124fbebb0>)]

In [5]:
peptides = []
for b, e in trypsin(LYS):
    seq = LYS[b:e]
    met_ox = (Mod("met_ox", 15.9949), sum(aa == "M" for aa in seq))
    mods = {"M": met_ox} if "M" in seq else {}
    peptides.append(Peptide(b, e, seq, modifications=mods))

peptides

[Peptide(beginning=0, end=1, seq=K, modifications={}),
 Peptide(beginning=1, end=5, seq=VFGR, modifications={}),
 Peptide(beginning=5, end=13, seq=CELAAAMK, modifications={'M': (Mod(description='met_ox', mass=15.9949), 1)}),
 Peptide(beginning=13, end=14, seq=R, modifications={}),
 Peptide(beginning=14, end=21, seq=HGLDNYR, modifications={}),
 Peptide(beginning=21, end=33, seq=GYSLGNWVCAAK, modifications={}),
 Peptide(beginning=33, end=45, seq=FESNFNTQATNR, modifications={}),
 Peptide(beginning=45, end=61, seq=NTDGSTDYGILQINSR, modifications={}),
 Peptide(beginning=61, end=68, seq=WWCNDGR, modifications={}),
 Peptide(beginning=68, end=73, seq=TPGSR, modifications={}),
 Peptide(beginning=73, end=96, seq=NLCNIPCSALLSSDITASVNCAK, modifications={}),
 Peptide(beginning=96, end=97, seq=K, modifications={}),
 Peptide(beginning=97, end=112, seq=IVSDGNGMNAWVAWR, modifications={'M': (Mod(description='met_ox', mass=15.9949), 1)}),
 Peptide(beginning=112, end=114, seq=NR, modifications={}),
 Pepti

In [38]:

def match_precursors(
    peptides: List[Peptide],
    measurement: PeptideMeasurement,
    max_segments: int,
    alkylation_mass: float = 57.0214,
    error_ppm: int = 10,
) -> List[str]:
    target_mass = measurement.peptide_mass_estimate
    h2o = mass.calculate_mass(formula="H2O")
    h2 = mass.calculate_mass(formula="H2")

    result = []

    def go(
        i: int,
        segments_left: int,
        selection: Tuple[int, ...],
        min_mass: float = h2o,
        base_mass: float = h2o,
        max_mass: float = h2o,
        free_cys_count: int = 0,
        waiting_for_cys: bool = False,
    ) -> None:
        has_alkylated_cys = free_cys_count % 2 == 1
        min_realistic_mass = min_mass + alkylation_mass * has_alkylated_cys
        lower_bound = min_realistic_mass - err_margin(min_realistic_mass, error_ppm)

        if not waiting_for_cys:
            max_realistic_mass = max_mass + alkylation_mass * free_cys_count
            upper_bound = max_realistic_mass + err_margin(max_realistic_mass, error_ppm)

            if lower_bound <= target_mass <= upper_bound:
                ranges = list(zip(selection[::2], (selection + (i,))[1::2]))
                possible_mods: List[List[Mod]] = []

                for b, e in ranges:
                    for p in peptides[b:e]:
                        for m, count in p.modifications_anywhere:
                            possible_mods += [
                                [Mod(m.description, m.mass), None]
                            ] * count

                max_other_bonds = free_cys_count // 2
                for _ in range(max_other_bonds):
                    possible_mods.append(
                        [Mod("Alkylated Cys Pair", alkylation_mass * 2), None]
                    )

                if has_alkylated_cys:
                    # One Cys has to be alkylated, because it can't be in a bond
                    possible_mods.append([Mod("Alkylated Cys", alkylation_mass)])

                mod_combinations = combine_modifications(
                    possible_mods,
                    starting_mass=base_mass,
                    target_mass=target_mass,
                    error_ppm=error_ppm,
                )

                if mod_combinations:
                    segments = (
                        "".join(p.seq for p in peptides[b:e]) for b, e in ranges
                    )
                    seq = "+".join(segments)

                    for modifications in mod_combinations:
                        total_mass = base_mass + sum(m.mass for m in modifications)

                        alkylated_pairs = sum(
                            m.description == "Alkylated Cys Pair" for m in modifications
                        )
                        joining_bonds = (max_segments - segments_left) - 1
                        other_bonds = max_other_bonds - alkylated_pairs

                        result.append(
                            {
                                "sequence": seq,
                                "ranges": ranges,
                                "missed_cleavages": max(e - b for b, e in ranges) - 1,
                                "cys_bonds": other_bonds + joining_bonds,
                                "mass": total_mass,
                                "error": compute_error(total_mass, target_mass),
                                "mods": modifications,
                            }
                        )

                    return

        if i == len(peptides) or lower_bound > target_mass:
            # Either we're out of peptides to add
            # Or our mass is too high beyond repair
            return

        if (
            not waiting_for_cys
            and min(segments_left, free_cys_count) > 0
            and selection[-1] != i  # Can't end if we just started
        ):
            # End current run, begin the next one
            for beginning in range(i, len(peptides)):
                go(
                    beginning,
                    segments_left=segments_left - 1,
                    selection=selection + (i, beginning),
                    min_mass=min_mass + (h2o - h2),
                    base_mass=base_mass + (h2o - h2),
                    max_mass=max_mass + (h2o - h2),
                    free_cys_count=free_cys_count - 1,
                    waiting_for_cys=True,
                )

        # Add current peptide
        new_free_cys = peptides[i].count("C") - waiting_for_cys
        go(
            i + 1,
            segments_left=segments_left,
            selection=selection,
            min_mass=min_mass + peptides[i].min_mass,
            base_mass=base_mass + peptides[i].mid_mass,
            max_mass=max_mass + peptides[i].max_mass,
            free_cys_count=free_cys_count + max(new_free_cys, 0),
            waiting_for_cys=new_free_cys < 0,
        )

    for beginning in range(0, len(peptides)):
        go(beginning, segments_left=max_segments - 1, selection=(beginning,))

    return result


In [52]:
import tqdm

start_time = time.time()

with open("../out/precursor_matches.txt", "w") as file:
    for scan, measurement in tqdm.tqdm(list(measurements.items())):
        for match in sorted(
            match_precursors(
                peptides,
                measurement,
                max_segments=3,
                error_ppm=15,
            ),
            key=lambda m: m["sequence"],
        ):
            file.write(f"{scan}: {match}\n")

end_time = time.time()
print(f"This takes {end_time - start_time} seconds")


100%|██████████| 12479/12479 [00:33<00:00, 376.96it/s]

This takes 33.11788010597229 seconds





In [None]:
import pickle
from tqdm import tqdm

FILE_PATH = "../out/precursor_matches_lys_at_2_inter_bonds.pickle"

start_time = time.time()

with open(FILE_PATH, "wb") as f:
    for scan, measurement in measurements.items():
        for match in precursor_mass_matches(
            peptides,
            measurement,
            alkylation_mass=57.0214,
            max_inter_bonds=1,
            ppm_error=10,
        ):
            print(f"{scan}: {match}")
        # pickle.dump({"scan": scan, "matches": matches}, f)

end_time = time.time()

print(f"This takes {end_time - start_time} seconds")

In [211]:
frags = []

for m in measurements.values():
    frags.append(m.peptide_mass_estimate)

sorted(frags)[:10]

[701.407628744454,
 701.844091146798,
 702.409948080392,
 702.410009115548,
 703.375402181954,
 703.375524252266,
 703.376195638986,
 703.65720149836,
 705.380529135078,
 707.396703451486]

In [58]:
import pickle

with open("../out/precursor_matches_lys_at_2_inter_bonds.pickle", "rb") as f:
    while True:
        mch = pickle.load(f)
        scan = mch["measurement"].scan
        for m in mch["matches"]:
            print(f"{scan}: {m}")

845: {'sequence': 'CELAAAMK+GCR', 'ranges': [(2, 3), (16, 17)], 'cys_bonds': 1, 'mass': 1183.51476996307, 'error': 0.17497809442811452, 'mods': (Mod(description='met_ox', mass=15.9949),)}
846: {'sequence': 'RHGLDNYR', 'ranges': [(3, 5)], 'cys_bonds': 0, 'mass': 1029.51042528457, 'error': 0.046330920919352085, 'mods': ()}
848: {'sequence': 'CELAAAMK+GCR', 'ranges': [(2, 3), (16, 17)], 'cys_bonds': 1, 'mass': 1183.51476996307, 'error': 0.7012779318072832, 'mods': (Mod(description='met_ox', mass=15.9949),)}
849: {'sequence': 'CELAAAMK+CK', 'ranges': [(2, 3), (14, 15)], 'cys_bonds': 1, 'mass': 1098.4871582329, 'error': 0.45653075413820915, 'mods': (Mod(description='met_ox', mass=15.9949),)}
852: {'sequence': 'CELAAAMKR+CK', 'ranges': [(2, 4), (14, 15)], 'cys_bonds': 1, 'mass': 1238.5933692565, 'error': 0.363565438838716, 'mods': ()}
854: {'sequence': 'CELAAAMKR+GCR', 'ranges': [(2, 4), (16, 17)], 'cys_bonds': 1, 'mass': 1339.61588098667, 'error': 0.07791330145198574, 'mods': (Mod(descripti

EOFError: Ran out of input

In [56]:
from pyteomics import fasta

trypsin([r.sequence for r in fasta.read("../data/fasta/lysozyme.fasta")][0])

[(0, 2),
 (2, 19),
 (19, 23),
 (23, 31),
 (31, 32),
 (32, 39),
 (39, 51),
 (51, 63),
 (63, 79),
 (79, 86),
 (86, 91),
 (91, 114),
 (114, 115),
 (115, 130),
 (130, 132),
 (132, 134),
 (134, 143),
 (143, 146),
 (146, 147)]

In [39]:
sc = [sc for sc, m in measurements.items() if m.id == 7474]
match_precursors(
    peptides,
    measurements[sc[0]],
    alkylation_mass=57.0214,
    max_segments=2,
    error_ppm=1000,
)

[{'sequence': 'NTDGSTDYGILQINSR',
  'ranges': [(7, 8)],
  'missed_cleavages': 0,
  'cys_bonds': 0,
  'mass': 1752.82785579296,
  'error': 1.2535560139800361,
  'mods': ()}]