In [1]:
import dataclasses
from collections import Counter
from typing import Tuple, List, Dict
from enum import Enum, auto
import time

from protein import trypsin
from measurement import read_mgf, PeptideMeasurement
from pyteomics import mass
from common import LYS, BSA


@dataclasses.dataclass
class Modification:
    """A named mass shift together with the number of sites that may carry it."""

    # Label used when reporting a match, e.g. "met_ox".
    name: str
    # Mass added per occurrence; a negative value represents a loss.
    mass: float
    # How many occurrences are available on the owning peptide.
    count: int


@dataclasses.dataclass
class Peptide:
    """A digestion peptide: its position in the protein, sequence and possible modifications.

    The derived quantities (amino-acid composition and the three masses) are
    computed on first access and cached on the instance; they are not
    refreshed if ``seq`` or ``modifications`` are changed afterwards.
    """

    beginning: int
    end: int
    seq: str
    modifications: List[Modification]

    # Cache slots. They carry no annotation, so the dataclass machinery does not
    # turn them into fields (they are absent from __init__, repr and equality).
    _aas = None
    _mass = None
    _minmass = None
    _maxmass = None

    def count(self, amino_acid):
        """Return how many times ``amino_acid`` occurs in ``seq``."""
        composition = self._aas
        if composition is None:
            composition = self._aas = Counter(self.seq)
        return composition[amino_acid]

    @property
    def zwitterion_mass(self):
        """Mass of the neutral sequence with the mass of one H2O subtracted."""
        if self._mass is not None:
            return self._mass
        neutral = mass.calculate_mass(sequence=self.seq, ion_type="M", charge=0)
        water = mass.calculate_mass(formula="H2O")
        self._mass = neutral - water
        return self._mass

    @property
    def min_mass(self):
        """Lowest reachable mass: every mass-decreasing modification fully applied."""
        if self._minmass is not None:
            return self._minmass
        losses = sum(m.mass * m.count for m in self.modifications if m.mass < 0)
        self._minmass = self.zwitterion_mass + losses
        return self._minmass

    @property
    def max_mass(self):
        """Highest reachable mass: every mass-increasing modification fully applied."""
        if self._maxmass is not None:
            return self._maxmass
        gains = sum(m.mass * m.count for m in self.modifications if m.mass > 0)
        self._maxmass = self.zwitterion_mass + gains
        return self._maxmass


In [2]:
# Index every measurement returned by read_mgf by its `scan` attribute so that
# later cells can look a single one up directly (e.g. `measurements[9241]`).
measurements = {m.scan: m for m in read_mgf("../data/mgf/190318_LYS_AT_50x_05.mgf")}

In [3]:
# Mass shift attached to the "met_ox" modification (15.9949 is the
# monoisotopic mass of one oxygen atom).
MET_OX_MASS = 15.9949

# Build one Peptide per (begin, end) pair yielded by trypsin(LYS).
peptides = []
for b, e in trypsin(LYS):
    seq = LYS[b:e]
    # Each methionine in the peptide is one possible oxidation site.
    met_ox = Modification("met_ox", MET_OX_MASS, seq.count("M"))
    peptides.append(Peptide(b, e, seq, [met_ox]))

peptides

[Peptide(beginning=0, end=1, seq='K', modifications=[Modification(name='met_ox', mass=15.9949, count=0)]),
 Peptide(beginning=1, end=5, seq='VFGR', modifications=[Modification(name='met_ox', mass=15.9949, count=0)]),
 Peptide(beginning=5, end=13, seq='CELAAAMK', modifications=[Modification(name='met_ox', mass=15.9949, count=1)]),
 Peptide(beginning=13, end=14, seq='R', modifications=[Modification(name='met_ox', mass=15.9949, count=0)]),
 Peptide(beginning=14, end=21, seq='HGLDNYR', modifications=[Modification(name='met_ox', mass=15.9949, count=0)]),
 Peptide(beginning=21, end=33, seq='GYSLGNWVCAAK', modifications=[Modification(name='met_ox', mass=15.9949, count=0)]),
 Peptide(beginning=33, end=45, seq='FESNFNTQATNR', modifications=[Modification(name='met_ox', mass=15.9949, count=0)]),
 Peptide(beginning=45, end=61, seq='NTDGSTDYGILQINSR', modifications=[Modification(name='met_ox', mass=15.9949, count=0)]),
 Peptide(beginning=61, end=68, seq='WWCNDGR', modifications=[Modification(name='

In [72]:

def within_bounds(reference_mass, measured_mass, ppm_error=10):
    """Tell whether ``measured_mass`` lies within ``ppm_error`` ppm of ``reference_mass``.

    The tolerance is relative to ``reference_mass`` (see ``err_margin``) and the
    comparison is inclusive.
    """
    deviation = abs(reference_mass - measured_mass)
    tolerance = err_margin(reference_mass, ppm_error)
    return deviation <= tolerance


def err_margin(reference_mass, ppm_error=10):
    """Return the absolute tolerance that ``ppm_error`` parts-per-million of ``reference_mass`` amounts to."""
    fraction = ppm_error / 1e6
    return fraction * reference_mass


def compute_error(reference_mass, measured_mass):
    """Return the absolute deviation of ``measured_mass`` from ``reference_mass`` in ppm.

    The deviation is expressed relative to ``reference_mass``, so a zero
    reference raises ``ZeroDivisionError``.
    """
    deviation = abs(measured_mass - reference_mass)
    return 1e6 * deviation / reference_mass


class State(Enum):
    """Where the peptide walk in ``precursor_mass_matches`` currently stands."""

    # Not inside a run: either nothing has been selected yet or the previous
    # run has just been closed.
    BEFORE = auto()
    # Inside a run of consecutive peptides that is still being extended.
    DURING = auto()


def set_tuple(t, i, x):
    """Return a copy of tuple ``t`` with the element at index ``i`` replaced by ``x``.

    Negative indices count from the end, as with ordinary indexing. An index
    at or beyond ``len(t)`` appends ``x`` (a consequence of slicing), which is
    the behaviour this helper has always had for non-negative indices.

    Raises:
        IndexError: if ``i`` is negative and reaches before the start of ``t``.
    """
    if i < 0:
        # Without normalisation the slices below would be t[:i] and t[i + 1:],
        # which for i == -1 glues the whole tuple back on after x.
        i += len(t)
        if i < 0:
            raise IndexError("tuple index out of range")
    return t[:i] + (x,) + t[i + 1 :]


def combine_modifications(
    modifications: List[Tuple[str, float]],
    starting_mass: float,
    target_mass: float,
    ppm_error: float = 10,
) -> List[Tuple[Tuple[str, float], ...]]:
    """Find every subset of ``modifications`` that brings ``starting_mass`` onto ``target_mass``.

    Each entry of ``modifications`` is an optional ``(name, mass)`` shift that is
    either applied or skipped, in list order. A subset is accepted when the
    resulting mass passes ``within_bounds`` against ``target_mass``.

    Args:
        modifications: optional ``(name, mass)`` shifts; repeated entries are
            allowed and yield identical selections that are reported once.
        starting_mass: mass before any optional shift is applied.
        target_mass: mass to hit.
        ppm_error: tolerance passed to ``within_bounds``.

    Returns:
        The distinct accepted selections, each a tuple of the ``(name, mass)``
        pairs that were applied, in first-found order.
    """
    result = []

    def go(i, current, selection):
        # Checked at every depth, so a selection that already matches is
        # recorded even if later entries are all skipped.
        if within_bounds(current, target_mass, ppm_error):
            result.append(selection)

        if i == len(modifications):
            return

        # `delta` rather than `mass`, so the imported `mass` module is not shadowed.
        name, delta = modifications[i]
        go(i + 1, current + delta, selection + ((name, delta),))
        go(i + 1, current, selection)

    go(0, current=starting_mass, selection=())
    # dict.fromkeys removes duplicates like set() but keeps first-seen order,
    # so the output no longer depends on string hash randomisation.
    return list(dict.fromkeys(result))


# peptide_masses should be digestion peptides with H2O loss
# TODO: Rewrite into a more readable version
def precursor_mass_matches(
    peptides: List[Peptide],
    measurement: PeptideMeasurement,
    cysteine_mod: float,
    jumps: int,
    ppm_error: float = 10,
) -> List[Dict[str, object]]:
    """Find peptide combinations whose mass matches the measured precursor.

    Walks ``peptides`` in order and builds selections made of runs of
    consecutive peptides. Up to ``jumps`` times a run may be closed and a new
    one started later; each such jump adds ``H2O - H2`` to the mass. For every
    selection whose reachable mass range covers the target, the optional
    modifications are enumerated with ``combine_modifications``.

    Args:
        peptides: digestion peptides, in sequence order.
        measurement: its ``peptide_mass_estimate`` is the target mass.
        cysteine_mod: mass added per modified cysteine.
        jumps: maximum number of times a run may be closed and another started.
        ppm_error: tolerance in ppm used for pruning and for matching.

    Returns:
        One dict per match with the keys ``"sequence"`` (runs joined by ``+``),
        ``"cysteine_mods"``, ``"mass"``, ``"error"`` (ppm, from
        ``compute_error``) and ``"mods"`` (the applied ``(name, mass)`` pairs).
    """
    target = measurement.peptide_mass_estimate
    h2o = mass.calculate_mass(formula="H2O")
    h2 = mass.calculate_mass(formula="H2")

    result = []

    # TODO: Optimize with dynamic programming if possible
    def go(
        i: int,
        current: float,
        selection: Tuple[int, ...],
        state: State,
        jumps: int,
        cysteines_before: int,
        cysteines_now: int,
        min_mod_bound: float,
        max_mod_bound: float,
    ) -> None:
        # `selection` is a flat tuple of run boundaries (start, end, start, ...);
        # `current` is the unmodified mass of everything selected so far, and
        # the two bounds bracket what the optional modifications can reach.
        cysteines = cysteines_before + cysteines_now

        # Non-bonded cysteines are alkylated, or modified in another way
        # `cysteines_now` is set to -1 right after a jump, so nothing is
        # reported until the new run has contributed at least one cysteine.
        if cysteines_now >= 0:
            max_with_c = max_mod_bound + cysteine_mod * cysteines
            highest = max_with_c + err_margin(max_with_c, ppm_error)

            if target <= highest:
                # With an odd number of cysteines one modification is mandatory.
                must_have_cysteine = cysteine_mod * (cysteines % 2 == 1)
                min_with_c = min_mod_bound + must_have_cysteine
                lowest = min_with_c - err_margin(min_with_c, ppm_error)
                if lowest <= target:
                    # Pair up boundaries; `i` closes a run that is still open.
                    ranges = list(zip(selection[::2], (selection + (i,))[1::2]))

                    mod_array = []

                    for b, e in ranges:
                        for p in peptides[b:e]:
                            for m in p.modifications:
                                mod_array += [(m.name, m.mass)] * m.count

                    # The remaining cysteines are offered as optional pairs.
                    cys_mod = "cys_alk", cysteine_mod * 2
                    mod_array += [cys_mod] * (cysteines // 2)

                    combinations = combine_modifications(
                        mod_array,
                        starting_mass=current + must_have_cysteine,
                        target_mass=target,
                        ppm_error=ppm_error,
                    )

                    seq = "+".join(
                        "".join(p.seq for p in peptides[b:e]) for b, e in ranges
                    )

                    for c in combinations:
                        total_mass = current + must_have_cysteine + sum(m for _, m in c)
                        result.append(
                            {
                                "sequence": seq,
                                "cysteine_mods": (cysteines % 2 == 1)
                                + sum(2 for n, _ in c if n == "cys_alk"),
                                "mass": total_mass,
                                "error": compute_error(total_mass, target),
                                "mods": c,
                            }
                        )

        if (
            i == len(peptides)
            or min_mod_bound - err_margin(min_mod_bound, ppm_error) > target
        ):
            # Either we're out of peptides to add
            # Or we're too high and we'll never correct it
            return
        else:
            # Enum members are referenced through the class (State.BEFORE);
            # reaching them through the `state` instance is discouraged.
            if state == State.BEFORE:
                # Don't start yet
                go(
                    i + 1,
                    current,
                    selection,
                    State.BEFORE,
                    jumps,
                    cysteines_before,
                    cysteines_now,
                    min_mod_bound,
                    max_mod_bound,
                )
            elif state == State.DURING and min(jumps, cysteines) > 0:
                # End this run, begin next one
                go(
                    i,
                    current + h2o - h2,  # +H2O for peptide ends, –H2 for the S-S bond
                    selection + (i,),
                    State.BEFORE,
                    jumps - 1,
                    cysteines - 1,
                    -1,
                    min_mod_bound + h2o - h2,
                    max_mod_bound + h2o - h2,
                )

            # Take this one, and either begin or continue this run
            go(
                i + 1,
                current + peptides[i].zwitterion_mass,
                selection + (i,) if state == State.BEFORE else selection,
                State.DURING,
                jumps,
                cysteines_before,
                cysteines_now + peptides[i].count("C"),
                min_mod_bound + peptides[i].min_mass,
                max_mod_bound + peptides[i].max_mass,
            )

    go(
        0,
        current=h2o,
        selection=(),
        state=State.BEFORE,
        jumps=jumps,
        cysteines_before=0,
        cysteines_now=0,
        min_mod_bound=h2o,
        max_mod_bound=h2o,
    )

    return result


In [576]:
import csv

# NOTE(review): the file name says "bsa", but `peptides` is built from the LYS
# digest and `measurements` from a LYS .mgf file -- confirm the intended name.
FILE_PATH = "../out/precursor_matches_bsa_at_2_bonds.csv"

# perf_counter is a monotonic clock meant for measuring elapsed time.
start_time = time.perf_counter()

# newline="" is what the csv module asks for: it writes its own line endings,
# and without it some platforms add an extra blank line after every row.
with open(FILE_PATH, "w", newline="", encoding="utf-8") as f:
    field_names = ["scan", "sequence", "mass", "error", "cysteine_mods", "mods"]
    writer = csv.DictWriter(f, fieldnames=field_names)
    writer.writeheader()

    # One CSV row per match, tagged with the scan it belongs to.
    for scan, measurement in measurements.items():
        for match in precursor_mass_matches(
            peptides, measurement, cysteine_mod=57.0214, jumps=2, ppm_error=15
        ):
            writer.writerow({"scan": scan} | match)
end_time = time.perf_counter()

print(f"This takes {end_time - start_time} seconds")

This takes 2095.511372089386 seconds


In [78]:
# Print the complete match dict for every scan, sorted by sequence within a scan.
# perf_counter is a monotonic clock meant for measuring elapsed time.
start_time = time.perf_counter()

for scan, measurement in measurements.items():
    for match in sorted(
        precursor_mass_matches(
            peptides, measurement, cysteine_mod=57.0214, jumps=1, ppm_error=15
        ),
        key=lambda m: m["sequence"],
    ):
        print(f"{scan}: {match}")

end_time = time.perf_counter()
print(f"This takes {end_time - start_time} seconds")

845: {'sequence': 'CELAAAMK+GCR', 'cysteine_mods': 0, 'mass': 1183.51476996307, 'error': 0.17497809442811452, 'mods': (('met_ox', 15.9949),)}
846: {'sequence': 'RHGLDNYR', 'cysteine_mods': 0, 'mass': 1029.51042528457, 'error': 0.046330920919352085, 'mods': ()}
848: {'sequence': 'CELAAAMK+GCR', 'cysteine_mods': 0, 'mass': 1183.51476996307, 'error': 0.7012779318072832, 'mods': (('met_ox', 15.9949),)}
849: {'sequence': 'CELAAAMK+CK', 'cysteine_mods': 0, 'mass': 1098.4871582329, 'error': 0.45653075413820915, 'mods': (('met_ox', 15.9949),)}
852: {'sequence': 'CELAAAMKR+CK', 'cysteine_mods': 0, 'mass': 1238.5933692565, 'error': 0.363565438838716, 'mods': ()}
854: {'sequence': 'CELAAAMKR+GCR', 'cysteine_mods': 0, 'mass': 1339.61588098667, 'error': 0.07791330145198574, 'mods': (('met_ox', 15.9949),)}
858: {'sequence': 'NRCK+GCR', 'cysteine_mods': 0, 'mass': 851.3854250955899, 'error': 3.210571780469647, 'mods': ()}
889: {'sequence': 'CELAAAMK+CK', 'cysteine_mods': 0, 'mass': 1082.4922582329, '

In [73]:
# 7012, 7013
# NOTE(review): the numbers above are presumably other scan ids worth
# inspecting -- confirm; only scan 9241 is actually looked up below.

# Run the matcher for one scan and display the raw result list.
matches = precursor_mass_matches(
    peptides, measurements[9241], cysteine_mod=57.0214, jumps=1, ppm_error=15
)

matches

[{'sequence': 'VFGRCELAAAMKRHGLDNYRGYSLGNWVCAAK+NRCKGTDVQAWIR',
  'cysteine_mods': 1,
  'mass': 5172.527916262248,
  'error': 8.658728661960046,
  'mods': (('met_ox', 15.9949),)}]

In [77]:
# Print only the matched sequence for every scan, sorted by sequence within a scan.
# perf_counter is a monotonic clock meant for measuring elapsed time.
start_time = time.perf_counter()

for scan, measurement in measurements.items():
    for match in sorted(
        precursor_mass_matches(
            peptides, measurement, cysteine_mod=57.0214, jumps=1, ppm_error=15
        ),
        key=lambda m: m["sequence"],
    ):
        seq = match["sequence"]
        print(f"{scan}: {seq}")

end_time = time.perf_counter()
print(f"This takes {end_time - start_time} seconds")

845: CELAAAMK+GCR
846: RHGLDNYR
848: CELAAAMK+GCR
849: CELAAAMK+CK
852: CELAAAMKR+CK
854: CELAAAMKR+GCR
858: NRCK+GCR
889: CELAAAMK+CK
890: CELAAAMKR+GCR
892: CELAAAMK+CK
893: CELAAAMKR
914: CELAAAMKR
917: HGLDNYR
956: CELAAAMK+GCR
957: CELAAAMK+GCR
964: CELAAAMK+GCR
974: GYSLGNWVCAAK
1007: CELAAAMKR+GCR
1015: HGLDNYR
1033: CELAAAMK+GCR
1036: CELAAAMK+GCR
1038: CELAAAMK+GCR
1057: CELAAAMKR+GCRL
1070: HGLDNYR
1072: CELAAAMKR+GCRL
1086: CELAAAMK+GCR
1093: CELAAAMKR
1097: CELAAAMK
1100: CELAAAMKR
1109: CELAAAMKR+GCRL
1122: HGLDNYR
1128: CELAAAMKR
1138: CELAAAMK+GCR
1157: CELAAAMK
1162: CELAAAMKR+GCRL
1169: CELAAAMKR+GCRL
1171: CELAAAMKR+GCRL
1187: HGLDNYR
1200: CELAAAMKR
1207: CELAAAMK+GCR
1216: CELAAAMK+NRCK
1223: CELAAAMK+GCRL
1224: CELAAAMK
1242: CELAAAMKR+GCRL
1259: HGLDNYR
1265: CELAAAMK+GCRL
1269: CELAAAMKR
1276: CELAAAMK+GCR
1281: CELAAAMKR+GCRL
1287: CELAAAMK+GCRL
1296: CELAAAMK
1303: WWCNDGR+CK
1310: CELAAAMKR+GCRL
1324: WWCNDGR+CK
1332: CELAAAMKR
1335: HGLDNYR
1336: CELAAAMK+GCR

In [None]:

@dataclasses.dataclass
class Start:
    """One run that ``match_fragments`` still has to walk."""

    # Index at which the run starts.
    beginning: int
    # Direction flag; match_fragments adds it to the index as the step and
    # tests it with `> 0` to choose the ion-type correction.
    forward: bool
    # Index at which the run has to stop.
    end: int


# Do a (None, 0) when you want to allow a skip
def combine_modifications_2(
    modifications: List[List[Tuple[str, float]]],
    starting_mass: float,
    target_mass: float,
    ppm_error: float = 10,
) -> List[Tuple[Tuple[str, float], ...]]:
    """Pick one alternative per slot so that the total mass lands on ``target_mass``.

    ``modifications`` is a list of slots; each slot lists the mutually
    exclusive ``(name, mass)`` alternatives for it. Exactly one alternative is
    taken from every slot that is visited, so a slot that may be left
    unmodified has to offer a ``(None, 0)`` alternative.

    Args:
        modifications: slots of alternative ``(name, mass)`` shifts.
        starting_mass: mass before any shift is applied.
        target_mass: mass to hit.
        ppm_error: tolerance passed to ``within_bounds``.

    Returns:
        The distinct accepted selections in first-found order. A selection is
        recorded as soon as the running mass matches, so it may cover only a
        leading part of the slots.
    """
    result = []

    def go(i, current, selection):
        if within_bounds(current, target_mass, ppm_error):
            result.append(selection)

        if i == len(modifications):
            return

        # `delta` rather than `mass`, so the imported `mass` module is not shadowed.
        for name, delta in modifications[i]:
            go(i + 1, current + delta, selection + ((name, delta),))

    go(0, current=starting_mass, selection=())
    # dict.fromkeys removes duplicates like set() but keeps first-seen order,
    # so the output no longer depends on string hash randomisation.
    return list(dict.fromkeys(result))


class Pep:
    """Empty placeholder class; it defines no attributes or behaviour yet."""


# TODO: Peptide should have fixed cysteine bonds
# TODO: Add min_mass counting
# Charges should be solved outside in a simple loop
def match_fragments(target, peptide, breaks, ppm_error=10):
    """Unfinished draft: search for fragments of ``peptide`` whose mass matches ``target``.

    The function cannot run yet. It relies on a ``peptide`` interface that is
    only described in TODOs (indexing, ``mod``, ``mod_count``, ``runs_around``,
    ``is_break``, ``run_from``), it still contains ``...`` placeholders, and
    the driver loop at the bottom does not pass the arguments ``go`` requires.
    Those parts are left as they were and marked with NOTE(review).

    Args:
        target: mass to match.
        peptide: object describing the (possibly cross-linked) peptide.
        breaks: number of bond breaks that may be spent.
        ppm_error: tolerance in ppm passed to ``within_bounds``.

    Returns:
        The list of selections collected by the recursive search.
    """

    result = []
    b_ion_mod = -mass.calculate_mass(formula="OH")
    y_ion_mod = 0

    h2o = mass.calculate_mass(formula="H2O")
    nh3 = mass.calculate_mass(formula="NH3")
    sulphur = mass.calculate_mass(formula="S")
    h2 = mass.calculate_mass(formula="H2")

    # TODO: Add peptide modded amino acids

    def go(
        i,
        current,
        starts: Tuple[Start, ...],
        selection: Tuple[int, ...],
        breaks_left: int,
        broken_cysteines: Tuple[int, ...],
        neutral_losses_count: int,
        modded_amino_acids: Dict[str, int],
    ):
        if breaks_left == 0:
            # TODO: Sum rest, take care that we properly (don't) add cysteine mods
            # NOTE(review): still a placeholder. `current + sum_rest` raises
            # TypeError while sum_rest is `...`, and the recursive call passes
            # breaks_left=0 again, so it would re-enter this branch.
            sum_rest = ...
            go(
                i,
                current + sum_rest,
                (),
                selection + tuple(n for s in starts for n in [s.beginning, s.end]),
                0,
                broken_cysteines,
                neutral_losses_count,
                ...,  # TODO Sum rest of modded aas
            )

        if within_bounds(current, target, ppm_error):
            # Normalise every (start, end) pair so the lower index comes first.
            ranges = [
                (x, y) if y > x else (y, x)
                for x, y in zip(selection[::2], (selection + (i,))[1::2])
            ]

            # One entry per slot; each slot lists its mutually exclusive
            # alternatives, the shape combine_modifications_2 expects.
            mod_array = []

            # NOTE(review): final_mass is accumulated but not used yet; the
            # combination search below still starts from `current`.
            final_mass = current

            # .items() is required: iterating the dict itself yields only keys.
            for aa, count in modded_amino_acids.items():
                minimum_mods = max(
                    peptide.mod_count(aa) - (peptide.count(aa) - count), 0
                )
                final_mass += minimum_mods * peptide.mod(aa).mass

                # How many can I have
                maximum_mods = min(peptide.mod_count(aa), count)
                # Optional mods
                # NOTE(review): peptide.mod(aa) is used with `.mass` above, so
                # it is presumably not a (name, mass) pair -- confirm before use.
                for _ in range(maximum_mods - minimum_mods):
                    mod_array.append([(None, 0), peptide.mod(aa)])

            for _ in range(neutral_losses_count):
                # MAYBE: Make this more granular?
                # Each loss is independently optional, hence one skippable slot each.
                mod_array.append([(None, 0), ("–H2O neutrall loss", -h2o)])
                mod_array.append([(None, 0), ("–NH3 neutrall loss", -nh3)])

            for c in broken_cysteines:
                # The parentheses matter: without them `j` is bound to the
                # result of `... is not None` (a bool), not to the partner.
                symmetric = (
                    j := peptide[c].bond_partner
                ) is not None and j in broken_cysteines

                # Symmetry breaking
                # Handle each broken pair once, from its lower-indexed cysteine.
                if symmetric and c > j:
                    continue

                if symmetric:
                    mod_array.append([("-SSH + / or -SH + =S", -h2)])
                else:
                    mod_array.append(
                        [
                            ("-SSH", sulphur),
                            ("- /", -(sulphur + h2)),
                            ("=S", -h2),
                            ("-SH", 0),
                        ]
                    )

            # mod_array holds slots of alternatives, so the slot-based
            # combiner is the one that can consume it.
            combinations = combine_modifications_2(
                mod_array,
                starting_mass=current,
                target_mass=target,
                ppm_error=ppm_error,
            )

            # TODO: Build and return the result

            result.append(selection)

        # Nowhere to go next
        if len(starts) == 0 or len(starts) == 1 and i == starts[0].end:
            return

        if i == starts[0].beginning:
            selection += (i,)

        # We have to end this run
        if i == starts[0].end:
            go(
                starts[1].beginning,
                current,
                starts[1:],
                selection + (i,),
                breaks_left,
                broken_cysteines,
                neutral_losses_count,
                modded_amino_acids,
            )
        else:
            # TODO: Add peptide[i] with __eq__ and with mass and is_bonded
            # TODO: [i].mass should include C modifications
            aa = peptide[i].name

            # Count this residue when its amino acid is one of the tracked
            # ones; otherwise carry the counters on unchanged. Written as a
            # statement because `a | b if c else {}` parses as
            # `(a | b) if c else {}` and would wipe the counters.
            if aa in peptide.modded_amino_acids:
                modded_next = modded_amino_acids | {
                    aa: modded_amino_acids.get(aa, 0) + 1
                }
            else:
                modded_next = modded_amino_acids

            # Parenthesised so that `j` receives the partner, not a bool.
            if aa == "C" and (j := peptide[i].bond_partner) is not None:

                if i < j:
                    # TODO: Add two runs to the peptide, each to one side
                    next_runs = peptide.runs_around(j)

                    # continue, don't break the SS bond
                    go(
                        i + starts[0].forward,
                        current + peptide[i].mass + peptide[j].mass,
                        starts[1:] + next_runs,
                        selection + (j, j),
                        breaks_left,
                        broken_cysteines,
                        neutral_losses_count,
                        modded_next,
                    )

                # Continue, break the SS bond
                go(
                    i + starts[0].forward,
                    current + peptide[i].mass,
                    starts[1:],
                    selection,
                    breaks_left - 1,
                    broken_cysteines + (i,),
                    neutral_losses_count,
                    modded_next,
                )
            else:
                # Go forward, adding another amino acid

                go(
                    i + starts[0].forward,
                    current + peptide[i].mass,
                    starts,
                    selection,
                    breaks_left,
                    broken_cysteines,
                    neutral_losses_count,
                    modded_next,
                )

            # Break the peptide bond, start next run
            # NOTE(review): starts[1] does not exist when this is the last
            # run -- decide what breaking the bond should do in that case.
            go(
                starts[1].beginning,
                # Parenthesised: otherwise the backward case would replace the
                # whole running mass with y_ion_mod.
                current + (b_ion_mod if starts[0].forward > 0 else y_ion_mod),
                starts[1:],
                selection + (i,),
                breaks_left - 1,
                broken_cysteines,
                neutral_losses_count + 1,
                modded_amino_acids,
            )

    # TODO: Add positions to the peptide
    # TODO: Optimize so that we aren't too close to the end
    # TODO: add charges
    for beginning in peptide:
        # TODO: Add is_break to the peptide
        # NOTE(review): `breaks` is decremented cumulatively across
        # iterations and `modifier` is not used yet -- confirm the intent.
        modifier = 0
        if peptide.is_break(beginning):
            breaks -= 1
            modifier = y_ion_mod

        # TODO: Add runs_from to peptide, returning the whole possible fragment
        # NOTE(review): go() takes eight arguments; this call is a stub.
        go(peptide.run_from(beginning))

    return result


In [108]:
# Sanity check: the mass computed for "SH2" minus the masses of "S" and "H2"
# should be zero; the displayed value (about -2.7e-15) is floating-point
# rounding noise.
mass.calculate_mass(formula="SH2") - mass.calculate_mass(
    formula="S"
) - mass.calculate_mass(formula="H2")

-2.6645352591003757e-15