In [132]:
import dataclasses
from typing import Tuple, List

from protein import trypsin
from measurement import read_mgf
from pyteomics import mass


@dataclasses.dataclass
class Modification:
    """A kind of modification that a peptide may carry.

    Instances are plain records; the dataclass decorator generates the
    constructor, repr and equality from the three fields below.
    """

    # Identifier of the modification (e.g. "met_oxidation" where peptides are built).
    name: str
    # Mass contributed by one occurrence of the modification
    # (presumably in Da, like the pyteomics masses used alongside it - TODO confirm).
    mass: float
    # How many times the modification is counted for the owning peptide; where
    # peptides are built this is the number of residues that could carry it.
    count: int


@dataclasses.dataclass
class Peptide:
    """A contiguous stretch of a protein sequence, e.g. one digestion product.

    Attributes are documented next to the fields; ``mass`` is a read-only
    property computed on every access from ``seq``.
    """

    # Start index of the peptide within the parent sequence (inclusive).
    beginning: int
    # End index of the peptide within the parent sequence (exclusive, slice-style).
    end: int
    # Amino-acid sequence of the peptide, i.e. parent[beginning:end].
    seq: str
    # Modifications this peptide may carry; an empty list when there are none.
    modifications: List[Modification]

    @property
    def mass(self):
        """Mass of the unmodified, uncharged peptide as computed by pyteomics.

        The getter previously declared a ``charge=0`` parameter, but a property
        is read as ``peptide.mass`` and can never receive an argument, so the
        parameter was dead. The charge is now passed explicitly as 0.
        """
        return mass.calculate_mass(sequence=self.seq, ion_type="M", charge=0)


In [21]:
# Protein sequence that is digested and matched against the measurements below.
LYS = "KVFGRCELAAAMKRHGLDNYRGYSLGNWVCAAKFESNFNTQATNRNTDGSTDYGILQINSRWWCNDGRTPGSRNLCNIPCSALLSSDITASVNCAKKIVSDGNGMNAWVAWRNRCKGTDVQAWIRGCRL"
# Index every record read from the MGF file by its scan attribute; if two
# records share a scan, the later one wins.
measurements = dict(
    (m.scan, m) for m in read_mgf("../data/mgf/190318_LYS_AT_50x_05.mgf")
)

In [133]:
# Build one Peptide per (begin, end) index pair produced by trypsin(LYS).
# A peptide containing at least one "M" gets a single met_oxidation entry
# whose count is the number of "M" residues; all others get no modifications.
peptides = []
for b, e in trypsin(LYS):
    seq = LYS[b:e]
    met_count = seq.count("M")
    met_ox = Modification("met_oxidation", 15.9949, met_count)
    peptides.append(Peptide(b, e, seq, [met_ox] if met_count > 0 else []))

peptides

[Peptide(beginning=0, end=1, seq='K', modifications=[]),
 Peptide(beginning=1, end=5, seq='VFGR', modifications=[]),
 Peptide(beginning=5, end=13, seq='CELAAAMK', modifications=[Modification(name='met_oxidation', mass=15.9949, count=1)]),
 Peptide(beginning=13, end=14, seq='R', modifications=[]),
 Peptide(beginning=14, end=21, seq='HGLDNYR', modifications=[]),
 Peptide(beginning=21, end=33, seq='GYSLGNWVCAAK', modifications=[]),
 Peptide(beginning=33, end=45, seq='FESNFNTQATNR', modifications=[]),
 Peptide(beginning=45, end=61, seq='NTDGSTDYGILQINSR', modifications=[]),
 Peptide(beginning=61, end=68, seq='WWCNDGR', modifications=[]),
 Peptide(beginning=68, end=73, seq='TPGSR', modifications=[]),
 Peptide(beginning=73, end=96, seq='NLCNIPCSALLSSDITASVNCAK', modifications=[]),
 Peptide(beginning=96, end=97, seq='K', modifications=[]),
 Peptide(beginning=97, end=112, seq='IVSDGNGMNAWVAWR', modifications=[Modification(name='met_oxidation', mass=15.9949, count=1)]),
 Peptide(beginning=112, 

In [134]:

def within_bounds(reference_mass, measured_mass, ppm_error=10):
    """Tell whether measured_mass agrees with reference_mass within a ppm tolerance.

    The allowed absolute deviation is err_margin(reference_mass, ppm_error);
    a deviation exactly equal to that margin still counts as a match.
    """
    tolerance = err_margin(reference_mass, ppm_error)
    deviation = abs(reference_mass - measured_mass)
    return deviation <= tolerance


def err_margin(reference_mass, ppm_error=10):
    """Convert a relative tolerance in parts per million into an absolute one.

    Returns (ppm_error / 1e6) * reference_mass, i.e. the absolute deviation
    that corresponds to ppm_error parts per million of reference_mass.
    """
    relative_tolerance = ppm_error / 1e6
    return relative_tolerance * reference_mass


# TODO Rewrite to loops


def precursor_matches(peptides, measurement, ppm_error=10):
    """Find runs of consecutive peptides whose joined mass matches a measurement.

    A window ``peptides[b : e + 1]`` is slid over the digest. The mass of the
    joined window is one water plus the sum of the water-stripped peptide
    masses, because every peptide's own mass already contains one H2O that is
    lost when peptides are concatenated.

    Args:
        peptides: sequence of objects exposing ``.mass`` (mass of the peptide
            on its own, water included) and ``.modifications`` (objects with
            ``.name`` and ``.count``), in sequence order.
        measurement: object exposing ``.peptide_mass_estimate``.
        ppm_error: tolerance in parts per million of the reference mass.

    Returns:
        A list of dicts, one per matching window, with keys ``"beginning"``
        and ``"end"`` (slice-style indices into ``peptides``), ``"mc"`` (number
        of joined peptides minus one) and ``"mods"`` (currently always empty).
    """
    measured_mass = measurement.peptide_mass_estimate
    water = mass.calculate_mass(formula="H2O")

    result = []
    b = 0
    reference_mass = water
    # Available modification counts inside the current window, keyed by name
    # (Modification instances themselves are not hashable).
    # TODO: feed this tally into a subset sum with tolerance to explain the
    # remaining mass difference; results carry no modifications until then.
    modifications = {}

    for e in range(len(peptides)):
        # Grow the window on the right by peptide e.
        reference_mass += peptides[e].mass - water
        for mod in peptides[e].modifications:
            modifications[mod.name] = modifications.get(mod.name, 0) + mod.count

        # Drop peptides on the left while even the lower tolerance bound of
        # the window is heavier than the measurement. Each step advances b so
        # that a different peptide is removed every time.
        while (
            b <= e
            and reference_mass - err_margin(reference_mass, ppm_error) > measured_mass
        ):
            for mod in peptides[b].modifications:
                modifications[mod.name] -= mod.count
            reference_mass -= peptides[b].mass - water
            b += 1

        # With e fixed, probe every left edge that can still reach the
        # measurement. Probing works on copies, so the window state (b, mass
        # and modification tally) is untouched for the next value of e.
        probe_b = b
        probe_mass = reference_mass
        while (
            probe_b <= e
            and measured_mass <= probe_mass + err_margin(probe_mass, ppm_error)
        ):
            if within_bounds(probe_mass, measured_mass, ppm_error):
                result.append(
                    {"beginning": probe_b, "end": e + 1, "mc": e - probe_b, "mods": []}
                )
            probe_mass -= peptides[probe_b].mass - water
            probe_b += 1

    return result


In [125]:
# Water-stripped mass of every digestion peptide. Peptide.mass is a property,
# so it is read as an attribute; calling it would try to call a float.
PEPTIDE_MASSES = [p.mass - mass.calculate_mass(formula="H2O") for p in peptides]

# precursor_matches reads .mass and .modifications from each entry, so it is
# given the Peptide objects themselves rather than the list of floats above.
for scan, measurement in measurements.items():
    for match in precursor_matches(peptides, measurement, ppm_error=15):
        peptide = "".join(p.seq for p in peptides[match["beginning"] : match["end"]])
        print(f"{scan}: {peptide}")

846: RHGLDNYR
917: HGLDNYR
1015: HGLDNYR
1070: HGLDNYR
1122: HGLDNYR
1187: HGLDNYR
1259: HGLDNYR
1335: HGLDNYR
1417: HGLDNYR
1474: FESNFNTQATNR
1482: VFGRCELAAAMKR
1505: HGLDNYR
1557: CELAAAMKR
1604: HGLDNYR
1693: HGLDNYR
1783: HGLDNYR
1790: GTDVQAWIRGCR
1902: HGLDNYR
1997: FESNFNTQATNR
2022: FESNFNTQATNR
2030: HGLDNYR
2107: FESNFNTQATNR
2136: FESNFNTQATNR
2149: HGLDNYR
2206: FESNFNTQATNR
2281: FESNFNTQATNR
2284: HGLDNYR
2312: CKGTDVQAWIRGCR
2323: FESNFNTQATNR
2392: FESNFNTQATNR
2393: HGLDNYR
2471: NTDGSTDYGILQINSR
2493: FESNFNTQATNR
2530: FESNFNTQATNR
2532: HGLDNYR
2635: FESNFNTQATNR
2654: CELAAAMKR
2656: FESNFNTQATNR
2658: HGLDNYR
2770: FESNFNTQATNR
2773: HGLDNYR
2774: FESNFNTQATNR
2909: FESNFNTQATNR
2912: HGLDNYR
2914: FESNFNTQATNR
3032: FESNFNTQATNR
3036: HGLDNYR
3037: FESNFNTQATNR
3104: NTDGSTDYGILQINSR
3175: FESNFNTQATNR
3210: FESNFNTQATNR
3211: HGLDNYR
3303: FESNFNTQATNR
3329: WWCNDGR
3338: HGLDNYR
3342: FESNFNTQATNR
3417: FESNFNTQATNR
3456: HGLDNYR
3463: FESNFNTQATNR
3529: FESN

In [147]:

# source: https://stackoverflow.com/questions/23087820/python-subset-sum
def subset(array, num, abs_error):
    """Return every distinct sub-sequence of array that sums to num.

    A sub-sequence keeps the order of ``array`` and matches when its sum is
    within ``abs_error`` of ``num``. The empty selection is never reported.

    Args:
        array: sequence of numbers to choose from (duplicates allowed).
        num: target sum.
        abs_error: allowed absolute deviation from ``num``.

    Returns:
        A list of tuples without duplicates, in order of first discovery.
        The search is exhaustive, so the cost grows as 2 ** len(array).
    """
    result = []

    def find(index, remaining, path):
        if index == len(array):
            return
        candidate = array[index]
        extended = path + (candidate,)
        if abs(candidate - remaining) <= abs_error:
            result.append(extended)
        # Keep exploring after a hit: stopping here would hide other valid
        # selections, both those that skip this element and those that add
        # further elements while staying inside the tolerance.
        find(index + 1, remaining - candidate, extended)
        find(index + 1, remaining, path)

    find(0, num, ())

    # dict.fromkeys removes duplicates while keeping a reproducible order.
    return list(dict.fromkeys(result))


def mod_sum(target, modifications, abs_error):
    """List the ways a mass difference can be explained by modifications.

    Args:
        target: mass to be explained.
        modifications: sequence of objects with ``.mass`` (mass of a single
            occurrence) and ``.count`` (maximum number of occurrences).
        abs_error: allowed absolute deviation from ``target``.

    Returns:
        A list of count vectors. Each vector is a list with one integer per
        entry of ``modifications`` (same order) saying how many times that
        modification is used; the masses of a vector add up to ``target``
        within ``abs_error``. The all-zero vector is included when ``target``
        itself is within ``abs_error`` of zero. The inputs are not modified.
    """
    results = []
    counts = [0] * len(modifications)

    def helper(index, remaining):
        # Every modification has been assigned a count: accept or reject.
        if index == len(modifications):
            if abs(remaining) <= abs_error:
                results.append(counts[:])
            return

        mod = modifications[index]
        # Try each admissible number of occurrences of this modification.
        # Branches are not pruned on the sign of ``remaining`` so that
        # modifications with a negative mass are handled as well.
        for used in range(mod.count + 1):
            counts[index] = used
            helper(index + 1, remaining - used * mod.mass)
        counts[index] = 0

    helper(0, target)
    return results


# Small worked example: expand every modification into one list entry per
# allowed occurrence, then look for selections that add up to exactly 10.
mods = [
    Modification(name="A", mass=5, count=1),
    Modification(name="B", mass=1, count=7),
    Modification(name="B", mass=3, count=2),
]
array = []
for m in mods:
    array.extend([m.mass] * m.count)
subset(array, 10, 0)

[(1, 1, 1, 1, 3, 3),
 (5, 1, 1, 3),
 (5, 1, 1, 1, 1, 1),
 (1, 1, 1, 1, 1, 1, 1, 3)]