In [115]:
import dataclasses
from collections import Counter
from typing import Tuple, List, Dict, Union, Set, Iterator, Optional
from enum import Enum, auto
import time

from protein import trypsin
from measurement import read_mgf, PeptideMeasurement
from pyteomics import mass
from common import LYS, BSA


@dataclasses.dataclass
class AminoAcid:
    name: str
    # TODO: [i].mass should include C modifications
    mass: float


@dataclasses.dataclass
class Mod:
    description: str
    mass: float

    def __hash__(self):
        return (self.description, self.mass).__hash__()


@dataclasses.dataclass
class Modification:
    name: str
    mass: float
    count: int


class Peptide:
    beginning: int
    end: int
    seq: str

    _modifications: Dict[str, Tuple[Mod, int]]
    _amino_acids: List[AminoAcid]
    _aas = None
    _mass = None
    _minmass = None
    _maxmass = None

    def __init__(
        self,
        beginning: int,
        end: int,
        seq: str,
        modifications: Dict[str, Tuple[Mod, int]],
    ):
        self.beginning = beginning
        self.end = end
        self.seq = seq
        h2o = mass.calculate_mass(formula="H2O")
        self._amino_acids = [
            AminoAcid(aa, mass.calculate_mass(sequence=aa) - h2o) for aa in seq
        ]
        self._modifications = modifications

    def __getitem__(self, index: int):
        if self.beginning <= index < self.end:
            return self._amino_acids[index - self.beginning]
        return None

    def __iter__(self):
        return range(self.beginning, self.end).__iter__()

    def __add__(self, other):
        if other.beginning != self.end:
            raise ValueError(
                f"Peptides can only be added when they are contiguous. Got {(self.beginning, self.end)} + {other.beginning, other.end} instead."
            )

        merged_mods = self._modifications
        for target, (mod, c2) in other._modifications.items():
            mod2, c1 = merged_mods.setdefault(target, (mod, 0))
            if mod != mod2:
                raise ValueError(
                    f"Peptides can only be added when they have compatible modifications. These two differ at {target}"
                )
            merged_mods[target] = mod, c1 + c2

        return Peptide(self.beginning, other.end, self.seq + other.seq, merged_mods)

    def count(self, amino_acid):
        if self._aas is None:
            self._aas = Counter(self.seq)
        return self._aas[amino_acid]

    @property
    def zwitterion_mass(self):
        if self._mass is None:
            self._mass = mass.calculate_mass(
                sequence=self.seq, ion_type="M", charge=0
            ) - mass.calculate_mass(formula="H2O")
        return self._mass

    @property
    def min_mass(self):
        if self._minmass is None:
            neg = sum(
                m.mass * count for m, count in self.modifications_anywhere if m.mass < 0
            )
            self._minmass = self.zwitterion_mass + neg
        return self._minmass

    @property
    def max_mass(self):
        if self._maxmass is None:
            pos = sum(
                m.mass * count for m, count in self.modifications_anywhere if m.mass > 0
            )
            self._maxmass = self.zwitterion_mass + pos
        return self._maxmass

    @property
    def modifications_anywhere(self) -> Iterator[Tuple[Mod, int]]:
        return (x for x in self._modifications.values())

    def __repr__(self):
        return f"Peptide(beginning={self.beginning}, end={self.end}, seq={self.seq}, modifications={self._modifications})"


In [100]:
measurements = {m.scan: m for m in read_mgf("../data/mgf/190318_LYS_AT_50x_05.mgf")}
list(measurements.items())[:10]

[(3, <measurement.PeptideMeasurement at 0x121a5ee50>),
 (7, <measurement.PeptideMeasurement at 0x121a5eaf0>),
 (9, <measurement.PeptideMeasurement at 0x1245d89a0>),
 (12, <measurement.PeptideMeasurement at 0x12457fd00>),
 (13, <measurement.PeptideMeasurement at 0x12457fcd0>),
 (29, <measurement.PeptideMeasurement at 0x121d95190>),
 (37, <measurement.PeptideMeasurement at 0x121d95bb0>),
 (42, <measurement.PeptideMeasurement at 0x121d95f10>),
 (54, <measurement.PeptideMeasurement at 0x121d95ca0>),
 (57, <measurement.PeptideMeasurement at 0x121f7d9d0>)]

In [116]:
peptides = []
for b, e in trypsin(LYS):
    seq = LYS[b:e]
    met_ox = (Mod("met_ox", 15.9949), sum(aa == "M" for aa in seq))
    mods = {"M": met_ox} if "M" in seq else {}
    peptides.append(Peptide(b, e, seq, modifications=mods))

peptides

[Peptide(beginning=0, end=1, seq=K, modifications={}),
 Peptide(beginning=1, end=5, seq=VFGR, modifications={}),
 Peptide(beginning=5, end=13, seq=CELAAAMK, modifications={'M': (Mod(description='met_ox', mass=15.9949), 1)}),
 Peptide(beginning=13, end=14, seq=R, modifications={}),
 Peptide(beginning=14, end=21, seq=HGLDNYR, modifications={}),
 Peptide(beginning=21, end=33, seq=GYSLGNWVCAAK, modifications={}),
 Peptide(beginning=33, end=45, seq=FESNFNTQATNR, modifications={}),
 Peptide(beginning=45, end=61, seq=NTDGSTDYGILQINSR, modifications={}),
 Peptide(beginning=61, end=68, seq=WWCNDGR, modifications={}),
 Peptide(beginning=68, end=73, seq=TPGSR, modifications={}),
 Peptide(beginning=73, end=96, seq=NLCNIPCSALLSSDITASVNCAK, modifications={}),
 Peptide(beginning=96, end=97, seq=K, modifications={}),
 Peptide(beginning=97, end=112, seq=IVSDGNGMNAWVAWR, modifications={'M': (Mod(description='met_ox', mass=15.9949), 1)}),
 Peptide(beginning=112, end=114, seq=NR, modifications={}),
 Pepti

In [102]:

def within_bounds(reference_mass, measured_mass, ppm_error=10):
    return abs(reference_mass - measured_mass) <= err_margin(reference_mass, ppm_error)


def err_margin(reference_mass, ppm_error=10):
    return (ppm_error / 1e6) * reference_mass


def compute_error(reference_mass, measured_mass):
    return 1e6 * abs(measured_mass - reference_mass) / reference_mass


class State(Enum):
    BEFORE = auto()
    DURING = auto()


def set_tuple(t, i, x):
    return t[:i] + (x,) + t[i + 1 :]


# Pass None when you want to allow to skip a mod
def combine_modifications_2(
    modifications: List[List[Union[Mod, None]]],
    starting_mass: float,
    target_mass: float,
    ppm_error: float = 10,
) -> List[List[Mod]]:
    result = []

    def go(i, current, selection):
        if i == len(modifications):
            if within_bounds(current, target_mass, ppm_error):
                result.append(selection)
        else:
            for m in modifications[i]:
                if m is None:
                    go(i + 1, current, selection)
                else:
                    go(i + 1, current + m.mass, selection + (m,))

    go(0, current=starting_mass, selection=())
    return list(set(result))


# peptide_masses should be digestion peptides with H2O loss
# TODO: Přepsat na čitelnější verzi
def precursor_mass_matches(
    peptides: List[Peptide],
    measurement: PeptideMeasurement,
    alkylation_mass: float,
    max_inter_bonds: int,
    ppm_error: int = 10,
) -> List[str]:
    target = measurement.peptide_mass_estimate
    h2o = mass.calculate_mass(formula="H2O")
    h2 = mass.calculate_mass(formula="H2")

    result = []

    def go(
        i: int,
        current: float,
        min_raw_mass: float,
        max_raw_mass: float,
        selection: Tuple[int, ...],
        state: State,
        inter_bonds_left: int,
        cysteines_before: int,
        cysteines_now: int,
    ) -> None:
        # TODO: Those aren't really internal
        internal_cysteines = cysteines_before + cysteines_now

        # Non-bonded cysteines are alkylated, or modified in another way
        if cysteines_now >= 0:
            max_posibble_mass = max_raw_mass + alkylation_mass * internal_cysteines
            upper_bound = max_posibble_mass + err_margin(max_posibble_mass, ppm_error)

            if target <= upper_bound:
                has_alkylated_cys = internal_cysteines % 2 == 1
                min_possible_mass = min_raw_mass + alkylation_mass * has_alkylated_cys
                lower_bound = min_possible_mass - err_margin(
                    min_possible_mass, ppm_error
                )

                if lower_bound <= target:
                    ranges = list(zip(selection[::2], (selection + (i,))[1::2]))

                    possible_mods: List[List[Mod]] = []

                    for b, e in ranges:
                        for p in peptides[b:e]:
                            for m, count in p.modifications_anywhere:
                                possible_mods += [
                                    [Mod(m.description, m.mass), None]
                                ] * count

                    max_intra_bonds = internal_cysteines // 2
                    for _ in range(max_intra_bonds):
                        possible_mods.append(
                            [Mod("cys_pair_alk", alkylation_mass * 2), None]
                        )

                    seq = "+".join(
                        "".join(p.seq for p in peptides[b:e]) for b, e in ranges
                    )

                    if has_alkylated_cys:
                        # One Cys has to be alkylated, as it can't be in a bond
                        possible_mods.append([Mod("cys_alk", alkylation_mass)])

                    combinations = combine_modifications_2(
                        possible_mods,
                        starting_mass=current,
                        target_mass=target,
                        ppm_error=ppm_error,
                    )

                    for modifications in combinations:
                        total_mass = current + sum(m.mass for m in modifications)

                        alkylated_pairs = sum(
                            m.description == "cys_pair_alk" for m in modifications
                        )
                        intra_bonds = max_intra_bonds - alkylated_pairs
                        inter_bonds = max_inter_bonds - inter_bonds_left

                        result.append(
                            {
                                "sequence": seq,
                                "ranges": ranges,
                                "cysteine_bonds": intra_bonds + inter_bonds,
                                "inter_bonds": inter_bonds,
                                "intra_bonds": intra_bonds,
                                "mass": total_mass,
                                "error": compute_error(total_mass, target),
                                "mods": modifications,
                            }
                        )

        if (
            i == len(peptides)
            or min_raw_mass - err_margin(min_raw_mass, ppm_error) > target
        ):
            # Either we're out of peptides to add
            # Or we're too high and we'll never correct it
            return
        else:
            if state == State.BEFORE:
                # Don't start yet
                go(
                    i + 1,
                    current,
                    min_raw_mass,
                    max_raw_mass,
                    selection,
                    state.BEFORE,
                    inter_bonds_left,
                    cysteines_before,
                    cysteines_now,
                )
            elif (
                state == State.DURING and min(inter_bonds_left, internal_cysteines) > 0
            ):
                # End this run, begin next one
                go(
                    i,
                    current + h2o - h2,
                    min_raw_mass + h2o - h2,
                    max_raw_mass + h2o - h2,
                    selection + (i,),
                    state.BEFORE,
                    inter_bonds_left - 1,
                    internal_cysteines - 1,
                    -1,
                )

            # Take this one, and either begin or continue this run
            go(
                i + 1,
                current + peptides[i].zwitterion_mass,
                min_raw_mass + peptides[i].min_mass,
                max_raw_mass + peptides[i].max_mass,
                selection + (i,) if state == State.BEFORE else selection,
                State.DURING,
                inter_bonds_left,
                cysteines_before,
                cysteines_now + peptides[i].count("C"),
            )

    go(
        0,
        current=h2o,
        min_raw_mass=h2o,
        max_raw_mass=h2o,
        selection=(),
        state=State.BEFORE,
        inter_bonds_left=max_inter_bonds,
        cysteines_before=0,
        cysteines_now=0,
    )

    return result


In [31]:
import csv

FILE_PATH = "../out/precursor_matches_lys_at_2_inter_bonds.csv"

start_time = time.time()

with open(FILE_PATH, "w") as f:
    field_names = [
        "scan",
        "sequence",
        "mass",
        "error",
        "cysteine_bonds",
        "inter_bonds",
        "intra_bonds",
        "mods",
    ]
    writer = csv.DictWriter(f, fieldnames=field_names)
    writer.writeheader()

    for scan, measurement in measurements.items():
        for match in precursor_mass_matches(
            peptides,
            measurement,
            alkylation_mass=57.0214,
            max_inter_bonds=2,
            ppm_error=15,
        ):
            writer.writerow({"scan": scan} | match)
end_time = time.time()

print(f"This takes {end_time - start_time} seconds")

This takes 48.51588582992554 seconds


In [647]:
TAKE = 100

start_time = time.time()

for scan, measurement in list(measurements.items())[:TAKE]:
    for match in sorted(
        precursor_mass_matches(
            peptides,
            measurement,
            alkylation_mass=57.0214,
            max_inter_bonds=2,
            ppm_error=15,
        ),
        key=lambda m: m["sequence"],
    ):
        print(f"{scan}: {match}")

end_time = time.time()
print(f"This takes {end_time - start_time} seconds")

845: {'sequence': 'CELAAAMK+GCR', 'ranges': [(2, 3), (16, 17)], 'cysteine_bonds': 1, 'inter_bonds': 1, 'intra_bonds': 0, 'mass': 1183.51476996307, 'error': 0.17497809442811452, 'mods': (Mod(description='met_ox', mass=15.9949),)}
846: {'sequence': 'RHGLDNYR', 'ranges': [(3, 5)], 'cysteine_bonds': 0, 'inter_bonds': 0, 'intra_bonds': 0, 'mass': 1029.51042528457, 'error': 0.046330920919352085, 'mods': ()}
848: {'sequence': 'CELAAAMK+GCR', 'ranges': [(2, 3), (16, 17)], 'cysteine_bonds': 1, 'inter_bonds': 1, 'intra_bonds': 0, 'mass': 1183.51476996307, 'error': 0.7012779318072832, 'mods': (Mod(description='met_ox', mass=15.9949),)}
849: {'sequence': 'CELAAAMK+CK', 'ranges': [(2, 3), (14, 15)], 'cysteine_bonds': 1, 'inter_bonds': 1, 'intra_bonds': 0, 'mass': 1098.4871582329, 'error': 0.45653075413820915, 'mods': (Mod(description='met_ox', mass=15.9949),)}
852: {'sequence': 'CELAAAMKR+CK', 'ranges': [(2, 4), (14, 15)], 'cysteine_bonds': 1, 'inter_bonds': 1, 'intra_bonds': 0, 'mass': 1238.593369

In [103]:
# 7012, 7013

for sc in [10973]:
    print(sc)

    matches = precursor_mass_matches(
        peptides,
        measurements[sc],
        alkylation_mass=57.0214,
        max_inter_bonds=2,
        ppm_error=15,
    )
    for m in matches:
        print(m)

10973
{'sequence': 'NTDGSTDYGILQINSRWWCNDGR+TPGSRNLCNIPCSALLSSDITASVNCAKK+GTDVQAWIRGCRL', 'ranges': [(7, 9), (9, 12), (15, 18)], 'cysteine_bonds': 2, 'inter_bonds': 2, 'intra_bonds': 0, 'mass': 7159.396725080231, 'error': 0.48568250074936176, 'mods': (Mod(description='cys_alk', mass=57.0214),)}
{'sequence': 'NTDGSTDYGILQINSRWWCNDGRTPGSR+NLCNIPCSALLSSDITASVNCAKK+GTDVQAWIRGCRL', 'ranges': [(7, 10), (10, 12), (15, 18)], 'cysteine_bonds': 2, 'inter_bonds': 2, 'intra_bonds': 0, 'mass': 7159.396725080231, 'error': 0.48568250074936176, 'mods': (Mod(description='cys_alk', mass=57.0214),)}
{'sequence': 'NTDGSTDYGILQINSRWWCNDGRTPGSRNLCNIPCSALLSSDITASVNCAK+K+GTDVQAWIRGCRL', 'ranges': [(7, 11), (11, 12), (15, 18)], 'cysteine_bonds': 2, 'inter_bonds': 2, 'intra_bonds': 0, 'mass': 7159.396725080232, 'error': 0.4856825008763968, 'mods': (Mod(description='cys_alk', mass=57.0214),)}
{'sequence': 'NTDGSTDYGILQINSRWWCNDGRTPGSRNLCNIPCSALLSSDITASVNCAKK+GTDVQAWIR+GCRL', 'ranges': [(7, 12), (15, 16), (16, 18

In [85]:
start_time = time.time()

TAKE = 100

for scan, measurement in list(measurements.items())[:TAKE]:
    for match in sorted(
        precursor_mass_matches(
            peptides,
            measurement,
            alkylation_mass=57.0214,
            max_inter_bonds=2,
            ppm_error=15,
        ),
        key=lambda m: m["sequence"],
    ):
        seq = match["sequence"]
        mods = match["mods"]
        print(f"{scan}: {seq}")

end_time = time.time()
print(f"This takes {end_time - start_time} seconds")


845: CELAAAMK+GCR
846: RHGLDNYR
848: CELAAAMK+GCR
849: CELAAAMK+CK
852: CELAAAMKR+CK
854: CELAAAMKR+GCR
858: NRCK+GCR
889: CELAAAMK+CK
This takes 0.0611879825592041 seconds


In [657]:

class MultiP:
    _segments: List[Peptide]
    _disulfide_bond: Dict[int, int] = {}
    _modifications: Dict[str, Tuple[Mod, int]]

    def __init__(
        self,
        segments: List[Peptide],
        disulfide_bonds: List[Tuple[int, int]],
        modifications: Dict[str, Tuple[Mod, int]],
    ):
        self._segments = sorted(segments, key=lambda s: s.beginning)
        self.segments = len(segments)
        for c1, c2 in disulfide_bonds:
            self._disulfide_bond[c1] = c2
            self._disulfide_bond[c2] = c1
        self._modifications = modifications

    def __getitem__(self, index: int) -> AminoAcid:
        for segment in self._segments:
            residue = segment[index]
            if residue is not None:
                return residue

    def __iter__(self):
        for segment in self._segments:
            for i in segment:
                yield i

    def count(self, residue: str) -> int:
        return sum(s.count(residue) for s in self._segments)

    def bond_partner(self, residue: int) -> Optional[int]:
        return self._disulfide_bond.get(residue, None)

    def modification_on(self, residue: str) -> (Mod, int):
        return self._modifications[residue]

    def can_be_modified(self, residue: int) -> bool:
        return self[residue].name in self._modifications

    def segment(self, residue: int) -> int:
        for i, s in enumerate(self._segments):
            if s.beginning <= residue <= s.end:
                return i

    def segment_beginning(self, segment: int) -> int:
        return self._segments[segment].beginning

    def segment_end(self, segment: int) -> int:
        return self._segments[segment].end


In [740]:
# TODO: Target should be with fixed charge (i.e. try all charges in a loop)
# TODO: Peptide should have fixed cysteine bonds (i.e. try all cysteine combinations in a loop)

# TODO: Optimize using binary search
def sort_into(x, xs: Tuple):
    return tuple(sorted(xs + (x,)))


OH = mass.calculate_mass(formula="OH")
PROTON = mass.calculate_mass(formula="H")
H2 = mass.calculate_mass(formula="H2")
H2O = mass.calculate_mass(formula="H2O")
NH3 = mass.calculate_mass(formula="NH3")
SULPHUR = mass.calculate_mass(formula="S")
Y_ION_MOD = PROTON
B_ION_MOD = -PROTON


def fragments(target_mass, peptide: MultiP, allowed_breaks, ppm_error=10):

    result = []

    def go_run(
        i: int,
        min_end: int,
        max_end: int,
        current_mass: float,
        breaks_left: int,
        new_runs: Tuple[int, ...],
        old_runs: Tuple[int, ...],
        unbroken_cysteines: Tuple[int, ...],
        broken_cysteines: Tuple[int, ...],
        neutral_losses_count: int,
        max_i_per_segment: Dict[int, int],
        fragment_start: int,
        modded_residues: Dict[str, int],
    ):
        # TODO: Add support for negative-mass modifications
        # TODO: Add neutral losses
        # TODO: Tighten the lower bound
        # - add must-have modifications
        # - properly count Cys modification
        min_possible_mass = (
            current_mass
            + B_ION_MOD
            # + (len(broken_cysteines) // 2) * (-H2)
            + (len(broken_cysteines)) * (-H2 - SULPHUR)
        )
        lower_bound = min_possible_mass - err_margin(min_possible_mass, ppm_error)
        if lower_bound > target_mass:
            # Too heavy, beyond repair, end the whole branch
            return

        if i >= max_end:
            if i > max_end:
                raise AssertionError("This should never happen, i > max_end")

            # We can't grow any longer, end the run
            go(
                i,
                max_i_per_segment,
                current_mass + OH,
                breaks_left,
                broken_cysteines,
                unbroken_cysteines,
                neutral_losses_count,
                new_runs,
                old_runs + (i,),
                fragment_start,
                modded_residues,
            )
            return
        else:
            residue = peptide[i]

            # This residue is (was) part of a disulfide bond
            if residue.name == "C" and ((j := peptide.bond_partner(i)) is not None):
                if j in broken_cysteines:
                    # This Cys (i) has a broken partner, so it has to be broken, too
                    go_run(
                        i + 1,
                        min_end,
                        max_end,
                        current_mass + residue.mass,
                        breaks_left,  # Already added when we were breaking j
                        new_runs,
                        old_runs,
                        broken_cysteines + (i,),
                        unbroken_cysteines,
                        neutral_losses_count,
                        max_i_per_segment,
                        fragment_start,
                        modded_residues,
                    )
                elif j in unbroken_cysteines:
                    # We have already seen this bond
                    # We can't break it, and neither can we jump through it
                    # So, just add the Cys (i) and go on

                    go_run(
                        i + 1,
                        min_end,
                        max_end,
                        current_mass + residue.mass,
                        breaks_left,
                        new_runs,
                        old_runs,
                        broken_cysteines,
                        unbroken_cysteines + (i,),
                        neutral_losses_count,
                        max_i_per_segment,
                        fragment_start,
                        modded_residues,
                    )
                else:
                    # We haven't seen this Cys (i) nor its bond partner yet
                    if breaks_left > 0:
                        # Break the bond
                        go_run(
                            i + 1,
                            min_end,
                            max_end,
                            current_mass + residue.mass,
                            breaks_left - 1,
                            new_runs,
                            old_runs,
                            broken_cysteines + (i,),
                            unbroken_cysteines,
                            neutral_losses_count,
                            max_i_per_segment,
                            fragment_start,
                            modded_residues,
                        )

                    if j > fragment_start:
                        # Keep the bond, add new run
                        go_run(
                            i + 1,
                            min_end,
                            max_end,
                            # Subtract H2 for the bond
                            current_mass + residue.mass - H2,
                            breaks_left,
                            sort_into(j, new_runs),
                            old_runs,
                            broken_cysteines,
                            unbroken_cysteines + (i,),
                            neutral_losses_count,
                            max_i_per_segment,
                            fragment_start,
                            modded_residues,
                        )

            else:
                # Add current residue, continue the run
                if peptide.can_be_modified(i):
                    new_modded_residues = modded_residues.copy()
                    new_modded_residues.setdefault(residue.name, 0)
                    new_modded_residues[residue.name] += 1
                else:
                    new_modded_residues = modded_residues
                go_run(
                    i + 1,
                    min_end,
                    max_end,
                    current_mass + residue.mass,
                    breaks_left,
                    new_runs,
                    old_runs,
                    broken_cysteines,
                    unbroken_cysteines,
                    neutral_losses_count,
                    max_i_per_segment,
                    fragment_start,
                    new_modded_residues,
                )

            # Break this run and end it
            if i >= min_end and breaks_left > 0:
                # End the run
                go(
                    i,
                    max_i_per_segment,
                    current_mass + B_ION_MOD,
                    breaks_left - 1,
                    broken_cysteines,
                    unbroken_cysteines,
                    neutral_losses_count + 1,
                    new_runs,
                    old_runs + (i,),
                    fragment_start,
                    modded_residues,
                )
                return

    def go(
        max_i: int,
        max_i_per_segment: Dict[int, int],
        current_mass: float,
        breaks_left: int,
        broken_cysteines: Tuple[int, ...],
        unbroken_cysteines,
        neutral_losses_count: int,
        runs: Tuple[int, ...],
        old_runs: Tuple[int, ...],
        fragment_start: int,
        modded_residues: Dict[str, int],
    ):
        if len(runs) == 0:
            potential_mods = []

            final_mass = current_mass

            for res, seen in modded_residues.items():
                peptide_mod, must_have = peptide.modification_on(res)

                minimum_mods = max(must_have - (peptide.count(res) - seen), 0)
                final_mass += minimum_mods * peptide_mod.mass

                # How many can I have
                maximum_mods = min(must_have, seen)
                # Optional mods
                for _ in range(maximum_mods - minimum_mods):
                    potential_mods.append([None, peptide_mod])

            for _ in range(neutral_losses_count):
                # MAYBE: Make this more granular? Or ditch this altogether
                potential_mods.append(
                    [
                        Mod("–H2O neutral loss", -H2O),
                        Mod("–NH3 neutral loss", -NH3),
                        None,
                    ]
                )

            for c in broken_cysteines:
                symmetric = (
                    (j := peptide.bond_partner(c)) is not None
                ) and j in broken_cysteines

                # Symmetry breaking
                if symmetric and c > j:
                    continue

                if symmetric:
                    potential_mods.append([Mod("-SSH + () or -SH + =S", -H2)])
                else:
                    potential_mods.append(
                        [
                            Mod("–SSH", SULPHUR),
                            Mod("– ()", -(SULPHUR + H2)),
                            Mod("=S", -H2),
                            Mod("–SH", 0),
                        ]
                    )

            ranges = list(zip(old_runs[::2], old_runs[1::2]))

            seq = []
            for b, e in ranges:
                s = ""
                for k in range(b, e):
                    s += peptide[k].name
                seq.append(s)
            seq = "+".join(seq)

            # if abs(current_mass - target_mass) < 100:
            #     print(
            #         ranges,
            #         seq,
            #         potential_mods,
            #         current_mass - target_mass,
            #         final_mass - current_mass,
            #     )

            combinations = combine_modifications_2(
                potential_mods,
                starting_mass=final_mass,
                target_mass=target_mass,
                ppm_error=ppm_error,
            )

            for modifications in combinations:
                total_mass = final_mass + sum(m.mass for m in modifications)
                result.append(
                    {
                        "seq": seq,
                        "ranges": ranges,
                        "mass": total_mass,
                        "error": compute_error(total_mass, target_mass),
                        "mods": modifications,
                    }
                )

            return

        segment = peptide.segment(max_i)
        new_max_i_per_segment = max_i_per_segment.copy()
        new_max_i_per_segment[segment] = max_i

        cys = runs[0]
        current_segment = peptide.segment(cys)
        current_segment_max_i = new_max_i_per_segment[current_segment]

        beg_start = max(
            peptide.segment_beginning(segment), current_segment_max_i, fragment_start
        )
        beg_end = cys

        end_start = cys + 1
        end_end = peptide.segment_end(current_segment)

        at_segment_start = current_segment_max_i == peptide.segment_beginning(segment)
        shift_optim = (
            not at_segment_start
            and current_segment_max_i != cys
            and not beg_start == fragment_start
        )

        # TODO: Add back shift optim
        for b in range(beg_start + 0, beg_end + 1):
            is_break = b > beg_start or (
                b == fragment_start and b > peptide.segment_beginning(current_segment)
            )

            if not is_break:
                go_run(
                    b,
                    end_start,
                    end_end,
                    current_mass + PROTON,
                    breaks_left,
                    runs[1:],
                    old_runs + (b,),
                    broken_cysteines,
                    unbroken_cysteines,
                    neutral_losses_count,
                    new_max_i_per_segment,
                    fragment_start,
                    modded_residues,
                )

            if is_break and breaks_left > 0:
                go_run(
                    b,
                    end_start,
                    end_end,
                    current_mass + Y_ION_MOD,
                    breaks_left - 1,
                    runs[1:],
                    old_runs + (b,),
                    broken_cysteines,
                    unbroken_cysteines,
                    neutral_losses_count + 1,
                    new_max_i_per_segment,
                    fragment_start,
                    modded_residues,
                )

    for b in peptide:
        go(
            0,
            {s: peptide.segment_beginning(s) for s in range(peptide.segments)},
            0,
            allowed_breaks,
            (),
            (),
            0,
            (b,),
            (),
            b,
            modded_residues={},
        )

    return result


In [744]:

def test_fragment_matching(
    seqs, bonds, target, allowed_breaks=2, charges=3, offset=10, ppm_error=15
):
    segments = []
    beg = 0
    for seq in seqs:
        segments.append(Peptide(beg, beg + len(seq), seq, modifications={}))
        beg = beg + len(seq) + offset

    peptide = MultiP(segments, bonds, modifications={})

    for ch in range(1, charges + 1):
        print("Charge", ch)
        frags = fragments(
            target * ch - PROTON * ch, peptide, allowed_breaks, ppm_error=ppm_error
        )
        for f in frags:
            print(f)


In [745]:
SEQ1 = "NTDGSTDYGILQINSRWWCNDGR"
SEQ2 = "NTDGSTDYGILQINSRWWCNDGR"

In [747]:
test_fragment_matching(
    [SEQ1, SEQ2],
    bonds=[(18, 30 + 18)],
    target=mass.calculate_mass("N", ion_type="b", charge=1),
    allowed_breaks=1,
)


Charge 1
{'seq': 'N', 'ranges': [(0, 1)], 'mass': 114.04292744114001, 'error': 4.8101650170743415, 'mods': ()}
{'seq': 'N', 'ranges': [(33, 34)], 'mass': 114.04292744114001, 'error': 4.8101650170743415, 'mods': ()}
Charge 2
Charge 3


In [749]:
test_fragment_matching(
    [SEQ1, SEQ2],
    bonds=[(18, 30 + 18)],
    target=mass.calculate_mass("NTDGSTDYGILQINSRWWCNDGR", ion_type="M", charge=1),
)

Charge 1
{'seq': 'NTDGSTDYGILQINSRWWCNDGR', 'ranges': [(0, 23)], 'mass': 2670.1881116865293, 'error': 0.2054406941369252, 'mods': ()}
{'seq': 'NTDGSTDYGILQINSRWWCNDGR', 'ranges': [(33, 56)], 'mass': 2670.1881116865293, 'error': 0.2054406941369252, 'mods': ()}
Charge 2
Charge 3


In [765]:

test_fragment_matching(
    [SEQ1, SEQ2],
    bonds=[(18, 30 + 18)],
    target=mass.calculate_mass("NTDGSTDYGILQINSRWWCNDGR", ion_type="M", charge=2),
)

# [[(0, 23)], [(30, 53)]]

Charge 1
Charge 2
{'seq': 'NTDGSTDYGILQINSRWWCNDGR', 'ranges': [(0, 23)], 'mass': 2670.1881116865293, 'error': 0.4108813886144611, 'mods': ()}
{'seq': 'NTDGSTDYGILQINSRWWCNDGR', 'ranges': [(33, 56)], 'mass': 2670.1881116865293, 'error': 0.4108813886144611, 'mods': ()}
Charge 3


In [756]:
test_fragment_matching(
    [SEQ1, SEQ2],
    bonds=[(18, 30 + 18)],
    target=mass.calculate_mass("NSRWWCNDGR", ion_type="y", charge=2),
)

Charge 1
Charge 2
{'seq': 'NSRWWCNDGR', 'ranges': [(13, 23)], 'mass': 1292.54688744628, 'error': 0.8488129989940549, 'mods': ()}
{'seq': 'NSRWWCNDGR', 'ranges': [(46, 56)], 'mass': 1292.54688744628, 'error': 0.8488129989940549, 'mods': ()}
Charge 3


In [764]:
test_fragment_matching(
    [SEQ1, SEQ2],
    bonds=[(18, 30 + 18)],
    target=mass.calculate_mass("NTDGSTDYGILQI", ion_type="b", charge=1),
)

Charge 1
{'seq': 'NTDGSTDYGILQI', 'ranges': [(0, 13)], 'mass': 1377.6412242402498, 'error': 0.39819169925977427, 'mods': ()}
{'seq': 'TDGSTDYGILQIN', 'ranges': [(1, 14)], 'mass': 1377.6412242402498, 'error': 0.39819169925977427, 'mods': ()}
{'seq': 'NTDGSTDYGILQI', 'ranges': [(33, 46)], 'mass': 1377.6412242402498, 'error': 0.39819169925977427, 'mods': ()}
{'seq': 'TDGSTDYGILQIN', 'ranges': [(34, 47)], 'mass': 1377.6412242402498, 'error': 0.39819169925977427, 'mods': ()}
Charge 2
Charge 3


In [760]:
target = (
    mass.calculate_mass("NTDGSTDYGILQINSRWWCNDGR", ion_type="M", charge=1)
    + mass.calculate_mass("TDGSTDYGILQINSRWWCNDGR", ion_type="y", charge=0)
    - H2
)

test_fragment_matching([SEQ1, SEQ2], bonds=[(18, 30 + 18)], target=target)

Charge 1
{'seq': 'NTDGSTDYGILQINSRWWCNDGR+TDGSTDYGILQINSRWWCNDGR', 'ranges': [(0, 23), (34, 56)], 'mass': 5224.317645867778, 'error': 0.10500228632842597, 'mods': ()}
{'seq': 'TDGSTDYGILQINSRWWCNDGR+NTDGSTDYGILQINSRWWCNDGR', 'ranges': [(1, 23), (33, 56)], 'mass': 5224.317645867778, 'error': 0.10500228632842597, 'mods': (Mod(description='–SH', mass=0),)}
Charge 2
Charge 3


In [761]:
# TODO: Symmetric cases are missing? And Cys mods on bonds
target = (
    mass.calculate_mass("NTDGSTDYGILQINSRWWC", ion_type="b", charge=1)
    + mass.calculate_mass("CNDGR", ion_type="y", charge=0)
    - H2
)

test_fragment_matching([SEQ1, SEQ2], bonds=[(18, 30 + 18)], target=target)

Charge 1
{'seq': 'INSRWWCNDGR+INSRWWCNDGR', 'ranges': [(12, 23), (45, 56)], 'mass': 2771.20947335114, 'error': 10.239395329911941, 'mods': (Mod(description='–H2O neutral loss', mass=-18.0105646837), Mod(description='–H2O neutral loss', mass=-18.0105646837), Mod(description='=S', mass=-2.01565006414))}
{'seq': 'CNDGR+NTDGSTDYGILQINSRWWC', 'ranges': [(18, 23), (33, 52)], 'mass': 2771.1816464071, 'error': 0.19795356984520335, 'mods': (Mod(description='–SH', mass=0),)}
{'seq': 'C+NTDGSTDYGILQINSRWWCNDGR', 'ranges': [(18, 19), (33, 56)], 'mass': 2771.1816464070994, 'error': 0.19795356968110467, 'mods': (Mod(description='–SH', mass=0),)}
Charge 2
Charge 3


In [762]:

target = (
    mass.calculate_mass("ILQINSRWWC", ion_type="b", charge=1)
    + mass.calculate_mass("NSRWWCNDG", ion_type="b", charge=0)
    - H2
)

test_fragment_matching([SEQ1, SEQ2], bonds=[(18, 30 + 18)], target=target)

Charge 1
Charge 2
{'seq': 'NTDGSTDYGILQINSRWWCNDGR+GSTDYGILQINSRWWCNDG', 'ranges': [(0, 23), (36, 55)], 'mass': 4832.115698604099, 'error': 6.544047612485566, 'mods': (Mod(description='=S', mass=-2.01565006414),)}
{'seq': 'NTDGSTDYGILQINSRWWCNDG+GSTDYGILQINSRWWCNDGR', 'ranges': [(0, 22), (36, 56)], 'mass': 4832.115698604099, 'error': 6.544047612485566, 'mods': (Mod(description='=S', mass=-2.01565006414),)}
Charge 3


In [763]:
target = (
    mass.calculate_mass("NTDGSTDYGILQINSRWWCNDGR", ion_type="M")
    + mass.calculate_mass("RWWCNDGR", ion_type="b")
    - H2
) / 3 + PROTON

test_fragment_matching(
    [SEQ1, SEQ2],
    bonds=[(18, 30 + 18)],
    target=target,
)

Charge 1
Charge 2
Charge 3
{'seq': 'NTDGSTDYGILQINSRWWCNDGR+QINSRWWC', 'ranges': [(0, 23), (44, 52)], 'mass': 3741.6589806582388, 'error': 6.722183611275158, 'mods': ()}
{'seq': 'NTDGSTDYGILQINSRWWCNDGR+RWWCNDGR', 'ranges': [(0, 23), (48, 56)], 'mass': 3741.633828539558, 'error': 4.861484279063855e-10, 'mods': (Mod(description='–H2O neutral loss', mass=-18.0105646837),)}
{'seq': 'NTDGSTDYGILQINSRWWC+QINSRWWCNDGR', 'ranges': [(0, 19), (44, 56)], 'mass': 3741.658980658239, 'error': 6.7221836113966935, 'mods': ()}
{'seq': 'GSTDYGILQINSRWWCNDGR+INSRWWCNDGR', 'ranges': [(3, 23), (45, 56)], 'mass': 3741.6048369103987, 'error': 7.748447638105533, 'mods': (Mod(description='–NH3 neutral loss', mass=-17.02654910101), Mod(description='–NH3 neutral loss', mass=-17.02654910101), Mod(description='–SSH', mass=31.972071))}
{'seq': 'GSTDYGILQINSRWWCNDGR+INSRWWCNDGR', 'ranges': [(3, 23), (45, 56)], 'mass': 3741.6702140482785, 'error': 9.724402910191374, 'mods': (Mod(description='=S', mass=-2.01565006414

In [None]:
# MultiP generation

{
    "sequence": "CELAAAMK+GCR",
    "ranges": [(2, 3), (16, 17)],
    "cysteine_bonds": 1,
    "inter_bonds": 1,
    "intra_bonds": 0,
    "mass": 1183.51476996307,
    "error": 0.17497809442811452,
    "mods": (Mod(description="met_ox", mass=15.9949),),
}


def gen_multip(peptides, d) -> List[MultiP]:
    segments = []
    for b, e in d["ranges"]:
        segment = None
        for p in peptides[b:e]:
            if segment is None:
                segment = p
            else:
                segment += p
        segments.append(segment)

    inter_bonds = d["inter_bonds"]
    intra_bonds = d["intra_bonds"]
    total_bonds = inter_bonds + intra_bonds

    if inter_bonds < len(segments) - 1:
        raise ValueError("There's more segments than bonds between them.")

    result = []
    blocked = []
    for fr, to in zip(segments, segments[1:]):
        for c1 in fr.cysteines:
            if c1 not in blocked:
                for c2 in to.cysteines:
                    if c2 not in blocked:
                        pass

    return MultiP(segments, bonds)