In [333]:
import dataclasses
from collections import Counter
from typing import Tuple, List

from protein import trypsin
from measurement import read_mgf, PeptideMeasurement
from pyteomics import mass


@dataclasses.dataclass
class Modification:
    """A named mass shift together with the number of times it is counted."""

    # Label of the modification, e.g. "met_ox".
    name: str
    # Mass shift contributed by one occurrence; Peptide treats negative values
    # as losses and positive values as gains when computing its mass bounds.
    mass: float
    # Number of occurrences; Peptide multiplies it by `mass` to get the total shift.
    count: int


@dataclasses.dataclass
class Peptide:
    """A stretch of a parent sequence plus the modifications it may carry.

    ``beginning`` and ``end`` are stored next to the residue string ``seq``;
    in this notebook they are the slice bounds used to cut ``seq`` out of the
    parent sequence. Derived values (residue counts and masses) are computed
    on first use and cached on the instance.
    """

    beginning: int
    end: int
    seq: str
    modifications: List[Modification]

    # Lazy caches. They carry no annotation, so the dataclass machinery does
    # not turn them into fields; the first access stores the value on the instance.
    _aas = None
    _mass = None
    _minmass = None
    _maxmass = None

    def count(self, amino_acid):
        """Return how many times ``amino_acid`` occurs in ``seq``."""
        residue_counts = self._aas
        if residue_counts is None:
            residue_counts = self._aas = Counter(self.seq)
        return residue_counts[amino_acid]

    @property
    def zwitterion_mass(self):
        """Mass of ``seq`` (ion type "M", charge 0) minus the mass of one H2O."""
        if self._mass is not None:
            return self._mass
        sequence_mass = mass.calculate_mass(sequence=self.seq, ion_type="M", charge=0)
        water_mass = mass.calculate_mass(formula="H2O")
        self._mass = sequence_mass - water_mass
        return self._mass

    @property
    def min_mass(self):
        """``zwitterion_mass`` plus every negative modification shift (mass * count)."""
        if self._minmass is not None:
            return self._minmass
        losses = [m.mass * m.count for m in self.modifications if m.mass < 0]
        self._minmass = self.zwitterion_mass + sum(losses)
        return self._minmass

    @property
    def max_mass(self):
        """``zwitterion_mass`` plus every positive modification shift (mass * count)."""
        if self._maxmass is not None:
            return self._maxmass
        gains = [m.mass * m.count for m in self.modifications if m.mass > 0]
        self._maxmass = self.zwitterion_mass + sum(gains)
        return self._maxmass


In [21]:
# Parent protein sequence (one-letter residue codes) that gets digested below.
LYS = "KVFGRCELAAAMKRHGLDNYRGYSLGNWVCAAKFESNFNTQATNRNTDGSTDYGILQINSRWWCNDGRTPGSRNLCNIPCSALLSSDITASVNCAKKIVSDGNGMNAWVAWRNRCKGTDVQAWIRGCRL"
# Index every measurement read from the MGF file by its scan identifier.
measurements = dict(
    (m.scan, m) for m in read_mgf("../data/mgf/190318_LYS_AT_50x_05.mgf")
)

In [334]:
# Cut LYS at the positions reported by trypsin() and give every peptide a
# methionine-oxidation modification counted once per "M" residue.
peptides = []
for b, e in trypsin(LYS):
    seq = LYS[b:e]
    met_count = seq.count("M")
    met_ox = Modification(name="met_ox", mass=15.9949, count=met_count)
    peptides.append(Peptide(beginning=b, end=e, seq=seq, modifications=[met_ox]))

peptides

[Peptide(beginning=0, end=1, seq='K', modifications=[Modification(name='met_ox', mass=15.9949, count=0)]),
 Peptide(beginning=1, end=5, seq='VFGR', modifications=[Modification(name='met_ox', mass=15.9949, count=0)]),
 Peptide(beginning=5, end=13, seq='CELAAAMK', modifications=[Modification(name='met_ox', mass=15.9949, count=1)]),
 Peptide(beginning=13, end=14, seq='R', modifications=[Modification(name='met_ox', mass=15.9949, count=0)]),
 Peptide(beginning=14, end=21, seq='HGLDNYR', modifications=[Modification(name='met_ox', mass=15.9949, count=0)]),
 Peptide(beginning=21, end=33, seq='GYSLGNWVCAAK', modifications=[Modification(name='met_ox', mass=15.9949, count=0)]),
 Peptide(beginning=33, end=45, seq='FESNFNTQATNR', modifications=[Modification(name='met_ox', mass=15.9949, count=0)]),
 Peptide(beginning=45, end=61, seq='NTDGSTDYGILQINSR', modifications=[Modification(name='met_ox', mass=15.9949, count=0)]),
 Peptide(beginning=61, end=68, seq='WWCNDGR', modifications=[Modification(name='

In [184]:

def within_bounds(reference_mass, measured_mass, ppm_error=10):
    """Tell whether ``measured_mass`` lies within the ppm tolerance of ``reference_mass``.

    The tolerance is ``err_margin(reference_mass, ppm_error)`` and the
    comparison is inclusive on both sides.
    """
    deviation = abs(reference_mass - measured_mass)
    tolerance = err_margin(reference_mass, ppm_error)
    return deviation <= tolerance


def err_margin(reference_mass, ppm_error=10):
    """Return the absolute tolerance that ``ppm_error`` parts per million of ``reference_mass`` amounts to."""
    relative_tolerance = ppm_error / 1e6
    return relative_tolerance * reference_mass


# `peptides` should be the digestion peptides; each zwitterion_mass already has one H2O removed
def precursor_matches(peptides, measurement, ppm_error=10):
    """Find runs of consecutive peptides whose combined mass matches a measurement.

    A sliding window ``peptides[b:e + 1]`` is moved over the digest. The mass
    of a window is the sum of its ``zwitterion_mass`` values plus one H2O.

    Args:
        peptides: Sequence of objects exposing ``zwitterion_mass``.
        measurement: Object exposing ``peptide_mass_estimate``.
        ppm_error: Tolerance in parts per million of the window mass.

    Returns:
        A list of dicts with keys ``"beginning"`` and ``"end"`` (slice bounds
        into ``peptides``), ``"mc"`` (``end - beginning - 1``, i.e. the number
        of extra peptides in the run) and ``"mods"`` (always an empty list;
        modifications are not searched yet).
    """
    measured_mass = measurement.peptide_mass_estimate

    result = []
    b = 0
    reference_mass = mass.calculate_mass(formula="H2O")

    for e in range(len(peptides)):
        reference_mass += peptides[e].zwitterion_mass

        # Drop peptides from the front until the lower tolerance bound is no
        # longer above the measurement. This shift is permanent: later windows
        # only get heavier. `b <= e` keeps the window from subtracting peptides
        # it never added when the measurement is below the mass of water.
        while (
            b <= e
            and reference_mass - err_margin(reference_mass, ppm_error) > measured_mass
        ):
            reference_mass -= peptides[b].zwitterion_mass
            b += 1

        # With `e` fixed, probe every start from `b` onwards on local copies so
        # that the window state for the next `e` stays untouched.
        probe_b = b
        probe_mass = reference_mass
        while (
            probe_b <= e
            and measured_mass < probe_mass + err_margin(probe_mass, ppm_error)
        ):
            if within_bounds(probe_mass, measured_mass, ppm_error):
                result.append(
                    {"beginning": probe_b, "end": e + 1, "mc": e - probe_b, "mods": []}
                )
            probe_mass -= peptides[probe_b].zwitterion_mass
            probe_b += 1

    return result


In [125]:
# For every scan, print each run of consecutive peptides that matches its precursor mass.
for scan, measurement in measurements.items():
    matches = precursor_matches(peptides, measurement, ppm_error=15)
    for match in matches:
        first, last = match["beginning"], match["end"]
        peptide = "".join(p.seq for p in peptides[first:last])
        print(f"{scan}: {peptide}")

846: RHGLDNYR
917: HGLDNYR
1015: HGLDNYR
1070: HGLDNYR
1122: HGLDNYR
1187: HGLDNYR
1259: HGLDNYR
1335: HGLDNYR
1417: HGLDNYR
1474: FESNFNTQATNR
1482: VFGRCELAAAMKR
1505: HGLDNYR
1557: CELAAAMKR
1604: HGLDNYR
1693: HGLDNYR
1783: HGLDNYR
1790: GTDVQAWIRGCR
1902: HGLDNYR
1997: FESNFNTQATNR
2022: FESNFNTQATNR
2030: HGLDNYR
2107: FESNFNTQATNR
2136: FESNFNTQATNR
2149: HGLDNYR
2206: FESNFNTQATNR
2281: FESNFNTQATNR
2284: HGLDNYR
2312: CKGTDVQAWIRGCR
2323: FESNFNTQATNR
2392: FESNFNTQATNR
2393: HGLDNYR
2471: NTDGSTDYGILQINSR
2493: FESNFNTQATNR
2530: FESNFNTQATNR
2532: HGLDNYR
2635: FESNFNTQATNR
2654: CELAAAMKR
2656: FESNFNTQATNR
2658: HGLDNYR
2770: FESNFNTQATNR
2773: HGLDNYR
2774: FESNFNTQATNR
2909: FESNFNTQATNR
2912: HGLDNYR
2914: FESNFNTQATNR
3032: FESNFNTQATNR
3036: HGLDNYR
3037: FESNFNTQATNR
3104: NTDGSTDYGILQINSR
3175: FESNFNTQATNR
3210: FESNFNTQATNR
3211: HGLDNYR
3303: FESNFNTQATNR
3329: WWCNDGR
3338: HGLDNYR
3342: FESNFNTQATNR
3417: FESNFNTQATNR
3456: HGLDNYR
3463: FESNFNTQATNR
3529: FESN

In [323]:

# adapted from: https://stackoverflow.com/questions/23087820/python-subset-sum
def subset(array, num, abs_error):
    """Return every distinct sub-sequence of ``array`` summing to ``num`` within ``abs_error``.

    Args:
        array: Iterable of numbers; elements keep their original order in the results.
        num: Target sum.
        abs_error: Largest accepted absolute difference between a sum and ``num``.

    Returns:
        A list of tuples (duplicates removed, order of the list unspecified).
        Empty when nothing matches or ``array`` is empty.
    """
    items = tuple(array)
    # Once the remainder falls below -abs_error, adding non-negative values can
    # never bring a sum back into tolerance, so that branch can be abandoned.
    can_prune = all(value >= 0 for value in items)
    found = set()

    def find(index, remaining, path):
        if index == len(items):
            return
        if can_prune and remaining < -abs_error:
            return
        head = items[index]
        extended = path + (head,)
        if abs(head - remaining) <= abs_error:
            found.add(extended)
        # Always explore both branches: stopping at the first hit would hide
        # solutions that skip `head` (e.g. (1, 2) for [3, 1, 2] and target 3).
        find(index + 1, remaining - head, extended)
        find(index + 1, remaining, path)

    find(0, num, ())

    return list(found)


def mod_sum(target, modifications, abs_error):
    """Find how many times each modification must be applied to reach ``target``.

    Args:
        target: Mass difference to explain.
        modifications: Sequence of objects with ``mass`` (shift per occurrence)
            and ``count`` (maximum number of occurrences). They are not modified.
        abs_error: Largest accepted absolute difference between the summed
            shifts and ``target``.

    Returns:
        A list of count vectors; entry ``k`` of a vector is the number of times
        ``modifications[k]`` is used. Empty when no combination fits.
    """
    masses = tuple(m.mass for m in modifications)
    limits = tuple(max(m.count, 0) for m in modifications)
    # With only non-negative shifts the remainder can never grow again, so a
    # remainder below -abs_error ends that branch.
    can_prune = all(shift >= 0 for shift in masses)

    results = []
    counts = [0] * len(masses)

    def helper(index, remaining):
        if index == len(masses):
            if abs(remaining) <= abs_error:
                results.append(counts[:])
            return
        for used in range(limits[index] + 1):
            left = remaining - used * masses[index]
            if can_prune and left < -abs_error:
                break
            counts[index] = used
            helper(index + 1, left)
        counts[index] = 0

    helper(0, target)
    return results


# Expand each modification into `count` copies of its mass, then list every
# combination of those masses that adds up to exactly 10.
mods = [
    Modification(name="A", mass=5, count=1),
    Modification(name="B", mass=1, count=7),
    Modification(name="B", mass=3, count=2),
]
array = []
for m in mods:
    array.extend([m.mass] * m.count)
subset(array, 10, 0)

[(1, 1, 1, 1, 3, 3),
 (5, 1, 1, 3),
 (5, 1, 1, 1, 1, 1),
 (1, 1, 1, 1, 1, 1, 1, 3)]

In [340]:

from enum import Enum, auto


class State(Enum):
    """Phase of the recursive search in precursor_matches_2."""

    # No peptide has been taken yet for the current run; peptides may still be skipped.
    BEFORE = auto()
    # A run is in progress; the next peptide either extends it or the run is ended.
    DURING = auto()


# `peptides` should be the digestion peptides; each zwitterion_mass already has one H2O removed
def precursor_matches_2(
    peptides: List[Peptide],
    measurement: PeptideMeasurement,
    cysteine_mod: float,
    jumps: int,
    ppm_error: int = 10,
) -> List[dict]:
    """Match a measurement against peptide runs that may be joined by S-S bonds.

    The search walks through ``peptides`` and builds up to ``jumps + 1`` runs
    of consecutive peptides. Ending a run and starting another one models a
    disulfide bond: it adds one H2O (ends of the new chain), removes H2, and
    uses up one cysteine in each of the two runs. Every remaining cysteine
    contributes ``cysteine_mod`` to the mass.

    Args:
        peptides: Digestion peptides in sequence order.
        measurement: Object exposing ``peptide_mass_estimate``.
        cysteine_mod: Mass added for every cysteine not used in a bond.
        jumps: Maximum number of bonds (run changes) allowed.
        ppm_error: Tolerance in parts per million.

    Returns:
        A list of dicts with keys ``"sequence"`` (runs separated by ``"+"``),
        ``"cysteine_mods"`` (number of modified cysteines) and ``"mass"``
        (theoretical mass that matched).
    """
    target = measurement.peptide_mass_estimate
    h2o = mass.calculate_mass(formula="H2O")
    h2 = mass.calculate_mass(formula="H2")

    result = []

    # TODO: Add mods
    # TODO: Optimize path building
    # TODO: Optimize for positive numbers
    # TODO: Optimize with dynamic programming if possible
    def go(
        i: int,
        current: float,
        path: Tuple[int, ...],
        breaks: Tuple[int, ...],
        state: State,
        jumps: int,
        cysteines_before: int,
        cysteines_now: int,
        min_mod_bound: float,
        max_mod_bound: float,
    ) -> None:
        # Non-bonded cysteines are alkylated, or modified in another way
        cysteines = cysteines_before + cysteines_now
        final_mass = current + cysteine_mod * cysteines

        # NOTE: min_mod_bound / max_mod_bound accumulate the per-peptide
        # min_mass / max_mass for the "Add mods" TODO above; they are carried
        # along but not consulted yet.

        # cysteines_now is -1 right after a bond until the new run supplies the
        # bonded cysteine, so a negative value means the bond is not closed.
        # `path` must be non-empty: a target close to the mass of water would
        # otherwise match before any peptide has been taken.
        if path and cysteines_now >= 0 and within_bounds(final_mass, target, ppm_error):
            pieces = []
            for position, index in enumerate(path):
                # Mark every run change, including one between neighbouring
                # peptides, which would otherwise read like a missed cleavage.
                if position in breaks:
                    pieces.append("+")
                pieces.append(peptides[index].seq)
            seq = "".join(pieces)

            result.append(
                {"sequence": seq, "cysteine_mods": cysteines, "mass": final_mass}
            )
        elif i == len(peptides) or current > target:
            # Either we're out of peptides to add
            # Or we're too high and we'll never correct it
            return
        else:
            if state == State.BEFORE:
                # Don't start yet
                go(
                    i + 1,
                    current,
                    path,
                    breaks,
                    State.BEFORE,
                    jumps,
                    cysteines_before,
                    cysteines_now,
                    min_mod_bound,
                    max_mod_bound,
                )
            elif state == State.DURING and min(jumps, cysteines) > 0:
                # End this run, begin next one
                go(
                    i,
                    current + h2o - h2,  # +H2O for peptide ends, -H2 for the S-S bond
                    path,
                    breaks + (len(path),),  # the next peptide taken opens a new run
                    State.BEFORE,
                    jumps - 1,
                    cysteines - 1,  # one cysteine of the finished run is bonded
                    -1,  # the new run owes the other bonded cysteine
                    min_mod_bound,
                    max_mod_bound,
                )

            # Take this one, and either begin or continue this run
            go(
                i + 1,
                current + peptides[i].zwitterion_mass,
                path + (i,),
                breaks,
                State.DURING,
                jumps,
                cysteines_before,
                cysteines_now + peptides[i].count("C"),
                min_mod_bound + peptides[i].min_mass,
                max_mod_bound + peptides[i].max_mass,
            )

    go(
        0,
        current=h2o,
        path=(),
        breaks=(),
        state=State.BEFORE,
        jumps=jumps,
        cysteines_before=0,
        cysteines_now=0,
        min_mod_bound=0,
        max_mod_bound=0,
    )

    return result


In [215]:
import time

# Time the linear matcher over all scans. perf_counter() is monotonic and
# high-resolution, unlike the wall clock, which can jump while the loop runs.
start_time = time.perf_counter()
for scan, measurement in measurements.items():
    for match in precursor_matches(peptides, measurement, ppm_error=15):
        peptide = "".join(p.seq for p in peptides[match["beginning"] : match["end"]])
        print(f"{scan}: {peptide}")
end_time = time.perf_counter()

print(f"This takes {end_time - start_time} seconds")

846: RHGLDNYR
917: HGLDNYR
1015: HGLDNYR
1070: HGLDNYR
1122: HGLDNYR
1187: HGLDNYR
1259: HGLDNYR
1335: HGLDNYR
1417: HGLDNYR
1474: FESNFNTQATNR
1482: VFGRCELAAAMKR
1505: HGLDNYR
1557: CELAAAMKR
1604: HGLDNYR
1693: HGLDNYR
1783: HGLDNYR
1790: GTDVQAWIRGCR
1902: HGLDNYR
1997: FESNFNTQATNR
2022: FESNFNTQATNR
2030: HGLDNYR
2107: FESNFNTQATNR
2136: FESNFNTQATNR
2149: HGLDNYR
2206: FESNFNTQATNR
2281: FESNFNTQATNR
2284: HGLDNYR
2312: CKGTDVQAWIRGCR
2323: FESNFNTQATNR
2392: FESNFNTQATNR
2393: HGLDNYR
2471: NTDGSTDYGILQINSR
2493: FESNFNTQATNR
2530: FESNFNTQATNR
2532: HGLDNYR
2635: FESNFNTQATNR
2654: CELAAAMKR
2656: FESNFNTQATNR
2658: HGLDNYR
2770: FESNFNTQATNR
2773: HGLDNYR
2774: FESNFNTQATNR
2909: FESNFNTQATNR
2912: HGLDNYR
2914: FESNFNTQATNR
3032: FESNFNTQATNR
3036: HGLDNYR
3037: FESNFNTQATNR
3104: NTDGSTDYGILQINSR
3175: FESNFNTQATNR
3210: FESNFNTQATNR
3211: HGLDNYR
3303: FESNFNTQATNR
3329: WWCNDGR
3338: HGLDNYR
3342: FESNFNTQATNR
3417: FESNFNTQATNR
3456: HGLDNYR
3463: FESNFNTQATNR
3529: FESN

In [341]:
# Time the disulfide-aware matcher over all scans (one bond allowed, 57.0214
# added per non-bonded cysteine). perf_counter() is monotonic and
# high-resolution, unlike the wall clock, which can jump while the loop runs.
start_time = time.perf_counter()
for scan, measurement in measurements.items():
    for match in precursor_matches_2(
        peptides, measurement, cysteine_mod=57.0214, jumps=1, ppm_error=15
    ):
        seq = match["sequence"]
        print(f"{scan}: {seq}")
end_time = time.perf_counter()

print(f"This takes {end_time - start_time} seconds")

846: RHGLDNYR
852: CELAAAMKR+CK
858: NRCK+GCR
889: CELAAAMK+CK
890: CELAAAMKR+GCR
892: CELAAAMK+CK
917: HGLDNYR
956: CELAAAMK+GCR
964: CELAAAMK+GCR
974: GYSLGNWVCAAK
1007: CELAAAMKR+GCR
1015: HGLDNYR
1033: CELAAAMK+GCR
1038: CELAAAMK+GCR
1070: HGLDNYR
1086: CELAAAMK+GCR
1122: HGLDNYR
1138: CELAAAMK+GCR
1162: CELAAAMKR+GCRL
1171: CELAAAMKR+GCRL
1187: HGLDNYR
1207: CELAAAMK+GCR
1216: CELAAAMK+NRCK
1259: HGLDNYR
1276: CELAAAMK+GCR
1303: WWCNDGR+CK
1324: WWCNDGR+CK
1335: HGLDNYR
1336: CELAAAMK+GCR
1376: WWCNDGR+CK
1403: CELAAAMKR
1417: HGLDNYR
1419: CELAAAMK+GCR
1447: WWCNDGRTPGSR+CK
1460: WWCNDGR+CK
1465: WWCNDGR+GCR
1474: FESNFNTQATNR
1478: CELAAAMKR+GCRL
1491: CELAAAMKR
1505: HGLDNYR
1506: CELAAAMK+GCR
1539: CELAAAMKR+GCRL
1560: WWCNDGR+GCR
1570: WWCNDGR+CK
1579: CELAAAMKR+GCRL
1604: HGLDNYR
1613: CELAAAMK+GCR
1641: CELAAAMKR+GCRL
1660: WWCNDGR+GCR
1670: WWCNDGR+CK
1672: CELAAAMKR+GCRL
1693: HGLDNYR
1703: CELAAAMK+GCR
1704: CELAAAMK+GCRL
1724: CELAAAMKR+GCRL
1741: WWCNDGR+GCR
1750: WWCN

In [301]:
# Show the full match records (sequence, modified cysteines, mass) for a few chosen scans.
for sc in (852, 956, 2603):
    scan_matches = precursor_matches_2(
        peptides, measurements[sc], cysteine_mod=57.0214, jumps=1, ppm_error=15
    )
    for p in scan_matches:
        print(p)

{'sequence': 'CELAAAMKR+CK', 'cysteine_mods': 0, 'mass': 1238.5933692565}
{'sequence': 'CELAAAMK+GCR', 'cysteine_mods': 0, 'mass': 1167.51986996307}
{'sequence': 'KVFGRCELAAAMKR+GCRL', 'cysteine_mods': 0, 'mass': 2024.0594105479502}


In [332]:
# Quick check that Counter tallies repeated characters: "D" occurs twice.
c = Counter()
c.update("ABCDED")
c["D"]

2