In [23]:
from typing import Optional, List, Tuple, Dict

import numpy as np
import pandas as pd
from pepfrag import Peptide, IonType, ModSite

from src.fragment_matching import (
    write_matched_fragments,
)
from src.model.fragment import Fragment
from src.model.modification import IAA_ALKYLATION, CYS_BOND
from src.model.precursor import Precursor
from src.model.scan import Scan
from src.precursor_matching import write_matched_precursors
from src.utilities.constants import LIP_BONDS, PROTON, OVA_BONDS
from src.utilities.dataloading import cleave_protein
from pyteomics import fasta
import re

from random import randrange, randint, choices, shuffle
from pyteomics.mass import calculate_mass

In [24]:
PROTEIN = "OVA"
CYS_BONDS = OVA_BONDS
tryptides = cleave_protein(PROTEIN)
TRYPTIDE_COUNT = len(tryptides)
seq_path = f"../data/fasta/{PROTEIN}.fasta"
SEQUENCE = [r.sequence for r in fasta.read(seq_path)][0]
SEQUENCE

'GSIGAASMEFCFDVFKELKVHHANENIFYCPIAIMSALAMVYLGAKDSTRTQINKVVRFDKLPGFGDSIEAQCGTSVNVHSSLRDILNQITKPNDVYSFSLASRLYAEERYPILPEYLQCVKELYRGGLEPINFQTAADQARELINSWVESQTNGIIRNVLQPSSVDSQTAMVLVNAIVFKGLWEKAFKDEDTQAMPFRVTEQESKPVQMMYQIGLFRVASMASEKMKILELPFASGTMSMLVLLPDEVSGLEQLESIINFEKLTEWTSSNVMEERKIKVYLPRMKMEEKYNLTSVLMAMGITDVFSSSANLSGISSAESLKISQAVHAAHAEINEAGREVVGSAEAGVDAASVSEEFRADHPFLFCIKHIATNAVLFFGRCVSP'

In [25]:

def intersects(t, u):
    x, y = t
    a, b = u
    return not (x >= b or y <= a)


def dedup(xs):
    return list(dict(tp) for tp in set(tuple(p.items()) for p in (xs)))


def connected_cys_count(prec):
    return sum(res == "C" for res in prec.sequence) - prec.alkylation_count


def generate_precursors(base_count=10000):
    bondless_precursors: List[Dict] = []
    raw_bonded_material: List[Dict] = []
    for _ in range(0, base_count):
        x = randrange(0, TRYPTIDE_COUNT)
        y = randrange(x, min(TRYPTIDE_COUNT, x + 5))
        b, e = min(x, y), max(x, y)

        if b < e:
            charge = randint(1, 5)
            sequence = "".join(t.sequence for t in tryptides[b:e])
            alkylations = sum(res == "C" for res in sequence)

            cys_overlap = [i for i in [7, 10] if i in range(b, e)]
            if cys_overlap:
                alkylations -= len(cys_overlap)

            mass = calculate_mass(sequence) + alkylations * IAA_ALKYLATION.mass
            prec: Dict = {
                "charge": charge,
                "precursor": Precursor(
                    sequence=sequence,
                    mass=mass,
                    mz=mass / charge + PROTON,
                    segments=[(b, e)],
                    residue_ranges=[(tryptides[b].beginning, tryptides[e - 1].end)],
                    cys_bond_count=0,
                    alkylation_count=alkylations,
                    modifications=[],
                    error_ppm=0,
                ),
            }

            if cys_overlap:
                raw_bonded_material.append(prec)
            else:
                bondless_precursors.append(prec)

    bondless_precursors = dedup(bondless_precursors)
    raw_bonded_material = dedup(raw_bonded_material)

    bonded_precursors = []
    for i, s in enumerate(raw_bonded_material):
        prec: Precursor = s["precursor"]
        for t in raw_bonded_material[i:]:
            qrec: Precursor = t["precursor"]
            if not intersects(prec.segments[0], qrec.segments[0]):
                charge = randint(1, 5)
                ps = sorted([prec, qrec], key=lambda p: p.segments[0][0])
                mass = prec.mass + qrec.mass + CYS_BOND.mass
                joined = Precursor(
                    sequence=ps[0].sequence + "+" + ps[1].sequence,
                    mass=mass,
                    mz=mass / charge + PROTON,
                    segments=ps[0].segments + ps[1].segments,
                    residue_ranges=ps[0].residue_ranges + ps[1].residue_ranges,
                    cys_bond_count=1,
                    alkylation_count=prec.alkylation_count + qrec.alkylation_count,
                    modifications=ps[0].modifications + ps[1].modifications,
                    error_ppm=0,
                )
                bonded_precursors.append({"charge": charge, "precursor": joined})

    bonded_precursors = dedup(bonded_precursors)

    bonded_unipeptides = []
    for s in raw_bonded_material:
        p: Precursor = s["precursor"]
        if connected_cys_count(p) == 2:
            charge = s["charge"]
            bonded_unipeptides.append(
                {
                    "charge": charge,
                    "precursor": Precursor(
                        p.sequence,
                        p.mass + CYS_BOND.mass,
                        (p.mass + CYS_BOND.mass) / charge + PROTON,
                        p.segments,
                        p.residue_ranges,
                        p.cys_bond_count,
                        p.alkylation_count,
                        p.modifications,
                        p.error_ppm,
                    ),
                }
            )

    bonded_unipeptides = dedup(bonded_unipeptides)

    return bondless_precursors + bonded_precursors + bonded_unipeptides


In [26]:
precursors = generate_precursors(5000)
precursors[:3]

[{'charge': 3,
  'precursor': Precursor(sequence='KIK', mass=387.28455468883, mz=130.10267659501332, segments=[(23, 25)], residue_ranges=[(276, 279)], cys_bond_count=0, alkylation_count=0, modifications=[], error_ppm=0)},
 {'charge': 1,
  'precursor': Precursor(sequence='LTEWTSSNVMEERK', mass=1708.80903492468, mz=1709.81685995675, segments=[(22, 24)], residue_ranges=[(263, 277)], cys_bond_count=0, alkylation_count=0, modifications=[], error_ppm=0)},
 {'charge': 2,
  'precursor': Precursor(sequence='MEEK', mass=535.2311987866301, mz=268.62342442538505, segments=[(27, 28)], residue_ranges=[(286, 290)], cys_bond_count=0, alkylation_count=0, modifications=[], error_ppm=0)}]

In [27]:

def valid_frags(frags, cys, length):
    def ok(frag):
        if "b" in frag[1]:
            return frag[2] > cys
        else:
            return frag[2] >= (length - cys)

    return [f for f in frags if ok(f)]


def charge_from_code(code):
    match = re.match(r".*\[(\d+)?\+]$", code)
    if match.group(1) is None:
        return 1
    else:
        return int(match.group(1))


def safe_frags(seq, frags, cysteines):
    b, e = seq
    safe = []
    unsafe = []
    for f in frags:
        mass, code, i = f
        if "b" in code:
            if all(c < b or b + i < c for c in cysteines):
                safe.append(f)
                continue
        else:
            if all(c > e or b + i > c for c in cysteines):
                safe.append(f)
                continue
        unsafe.append(f)

    return safe, unsafe


def build_safe_fragment(id, sequence, residue_range, charge, mz, break_count):
    return Fragment(
        id=id,
        sequence=sequence,
        residue_ranges=residue_range,
        intensity=10,
        intensity_ratio=1 / 10,
        target_mass=(mz - PROTON) * charge,
        mass=(mz - PROTON) * charge,
        target_mz=mz,
        mz=mz,
        charge=charge,
        break_count=break_count,
        error_ppm=0,
        modifications=[IAA_ALKYLATION for res in sequence if res == "C"],
        connected_bonds=[],
        disconnected_cys=[],
    )


def fragment_sequence(seq, frag, residue_range):
    _, code, i = frag
    sequence = seq[:i] if "b" in code else seq[-i:]
    b, e = residue_range
    frag_residue_range = (b, b + i) if "b" in code else (e - i, e)

    return sequence, frag_residue_range


def safe_frags_to_fragments(frags, prec_sequence, prec_residue_range, precursor):
    fragments = []
    for id, frag in enumerate(frags):
        mz, code, i = frag
        frag_charge = charge_from_code(code)
        frag_sequence, frag_residue_range = fragment_sequence(
            prec_sequence, frag, prec_residue_range
        )
        fragment = build_safe_fragment(
            id,
            frag_sequence,
            [frag_residue_range],
            frag_charge,
            mz,
            prec_residue_range != frag_residue_range,
        )

        fragments.append(
            {"fragment": fragment, "precursor": precursor, "var_bonds": []}
        )
    return fragments


def generate_fragments(peptide: Dict, count=50):
    max_charge = peptide["charge"]
    precursor: Precursor = peptide["precursor"]

    if connected_cys_count(precursor) == 0:
        pep = Peptide(
            precursor.sequence,
            charge=max_charge,
            modifications=[
                ModSite(IAA_ALKYLATION.mass, i + 1, IAA_ALKYLATION.description)
                for i, res in enumerate(precursor.sequence)
                if res == "C"
            ],
        )
        frags = pep.fragment(
            ion_types={IonType.y: [], IonType.b: [], IonType.precursor: []}
        )
        frags = list(sorted(list(set(choices(frags, k=count)))))

        return safe_frags_to_fragments(
            frags, precursor.sequence, precursor.residue_ranges[0], precursor
        )
    elif len(precursor.segments) == 2:
        ps, qs = precursor.sequence.split("+")
        psi = list(zip(range(*precursor.residue_ranges[0]), ps))
        qsi = list(zip(range(*precursor.residue_ranges[1]), qs))

        p_frags = Peptide(
            ps,
            charge=max_charge,
            modifications=[
                ModSite(IAA_ALKYLATION.mass, ri + 1, IAA_ALKYLATION.description)
                for ri, (ai, res) in enumerate(psi)
                if res == "C" and ai not in (72, 119)
            ],
        ).fragment(
            ion_types={
                IonType.y: [],
                IonType.b: [],
            }
        )

        q_frags = Peptide(
            qs,
            charge=max_charge,
            modifications=[
                ModSite(IAA_ALKYLATION.mass, ri + 1, IAA_ALKYLATION.description)
                for ri, (ai, res) in enumerate(qsi)
                if res == "C" and ai not in (72, 119)
            ],
        ).fragment(
            ion_types={
                IonType.y: [],
                IonType.b: [],
            }
        )

        psafe, pcys = safe_frags(
            precursor.residue_ranges[0],
            list(sorted(list(set(choices(p_frags, k=count))))),
            [72, 119],
        )
        qsafe, qcys = safe_frags(
            precursor.residue_ranges[1],
            list(sorted(list(set(choices(q_frags, k=count))))),
            [72, 119],
        )

        result = safe_frags_to_fragments(
            psafe, ps, precursor.residue_ranges[0], precursor
        ) + safe_frags_to_fragments(qsafe, qs, precursor.residue_ranges[1], precursor)

        shuffle(pcys)
        shuffle(qcys)

        p_fragments = [
            fr["fragment"]
            for fr in safe_frags_to_fragments(
                pcys, ps, precursor.residue_ranges[0], precursor
            )
        ]
        q_fragments = [
            fr["fragment"]
            for fr in safe_frags_to_fragments(
                qcys, qs, precursor.residue_ranges[1], precursor
            )
        ]

        for pf, qf in choices(list(zip(p_fragments, q_fragments)), k=count):
            total_charge = pf.charge + qf.charge

            if total_charge > max_charge:
                continue

            total_mz = pf.mz + qf.mz + (CYS_BOND.mass / total_charge)

            fragment = Fragment(
                0,
                sequence=pf.sequence + "+" + qf.sequence,
                residue_ranges=pf.residue_ranges + qf.residue_ranges,
                intensity=10,
                intensity_ratio=1,
                mass=(total_mz - PROTON) * total_charge,
                target_mass=(total_mz - PROTON) * total_charge,
                mz=total_mz,
                target_mz=total_mz,
                charge=total_charge,
                break_count=pf.break_count + qf.break_count,
                error_ppm=0,
                modifications=qf.modifications + pf.modifications,
                connected_bonds=tuple([(72, 119)]),
                disconnected_cys=tuple([]),
            )

            result.append(
                {"fragment": fragment, "precursor": precursor, "var_bonds": [(72, 119)]}
            )
        return result
    else:
        ps = precursor.sequence
        psi = list(zip(range(*precursor.residue_ranges[0]), ps))

        p_frags = Peptide(
            ps,
            charge=max_charge,
            modifications=[
                ModSite(IAA_ALKYLATION.mass, ri + 1, IAA_ALKYLATION.description)
                for ri, (ai, res) in enumerate(psi)
                if res == "C" and ai not in (72, 119)
            ],
        ).fragment(ion_types={IonType.y: [], IonType.b: [], IonType.precursor: []})
        p_frags = list(p_frags)

        print(p_frags)
        psafe, pcys = safe_frags(
            precursor.residue_ranges[0],
            list(sorted(list(set(choices(p_frags, k=count * 2))))),
            [72, 119],
        )

        result = []

        p_fragments = [
            fr["fragment"]
            for fr in safe_frags_to_fragments(
                [pf for pf in pcys if "b" in pf[1]],
                ps,
                precursor.residue_ranges[0],
                precursor,
            )
        ]
        q_fragments = [
            fr["fragment"]
            for fr in safe_frags_to_fragments(
                [qf for qf in pcys if "y" in qf[1]],
                ps,
                precursor.residue_ranges[0],
                precursor,
            )
        ]

        print(pcys, p_fragments)

        for pf, qf in choices(list(zip(p_fragments, q_fragments)), k=count):
            total_charge = pf.charge + qf.charge

            if total_charge > max_charge:
                continue

            pr, qr = pf.residue_ranges[0], qf.residue_ranges[0]
            if intersects(pr, qr):
                continue

            total_mz = pf.mz + qf.mz + (CYS_BOND.mass / total_charge)

            fragment = Fragment(
                0,
                sequence=pf.sequence + "+" + qf.sequence,
                residue_ranges=pf.residue_ranges + qf.residue_ranges,
                intensity=10,
                intensity_ratio=1,
                mass=(total_mz - PROTON) * total_charge,
                target_mass=(total_mz - PROTON) * total_charge,
                mz=total_mz,
                target_mz=total_mz,
                charge=total_charge,
                break_count=2 if pr[1] != qr[0] else 1,
                error_ppm=0,
                modifications=qf.modifications + pf.modifications,
                connected_bonds=tuple([(72, 119)]),
                disconnected_cys=tuple([]),
            )

            result.append(
                {"fragment": fragment, "precursor": precursor, "var_bonds": [(72, 119)]}
            )
        return result


In [28]:
generate_fragments(precursors[859])

[(147.11280416609898, 'y1[+]', 1), (114.091340445409, 'b1[+]', 1), (246.18121808016897, 'y2[+]', 2), (211.144104295019, 'b2[+]', 2), (349.19040303971894, 'y3[+]', 3), (268.16556801570897, 'b3[+]', 3), (477.24898054555894, 'y4[+]', 4), (415.23398192977896, 'b4[+]', 4), (590.333044524089, 'y5[+]', 5), (472.25544565046897, 'b5[+]', 5), (753.396373057729, 'y6[+]', 6), (587.2823886747591, 'b6[+]', 6), (882.438966146479, 'y7[+]', 7), (674.3144170794791, 'b7[+]', 7), (979.4917299960889, 'y8[+]', 8), (787.3984810580091, 'b8[+]', 8), (1092.5757939746188, 'y9[+]', 9), (916.4410741467591, 'b9[+]', 9), (1205.6598579531487, 'y10[+]', 10), (987.4781879319091, 'b10[+]', 10), (1302.7126218027588, 'y11[+]', 11), (1115.536765437749, 'b11[+]', 11), (1465.7759503363989, 'y12[+]', 12), (1218.5459503972988, 'b12[+]', 12), (1621.8770613604488, 'y13[+]', 13), (1275.5674141179888, 'b13[+]', 13), (1750.9196544491988, 'y14[+]', 14), (1376.6150925871689, 'b14[+]', 14), (1879.9622475379488, 'y15[+]', 15), (1463.64

[{'fragment': Fragment(id=0, sequence='LPGFGDSIEAQCGTSVNVHSSLRDILNQITKP+ERYPILPEYLQCVK', residue_ranges=[(61, 93), (108, 122)], intensity=10, intensity_ratio=1, mass=10317.297350647033, target_mass=10317.297350647033, mz=3440.1069419144146, target_mz=3440.1069419144146, charge=3, break_count=2, error_ppm=0, modifications=[Modification(description='Cys Alkylation (IAA)', mass=57.0214), Modification(description='Cys Alkylation (IAA)', mass=57.0214)], connected_bonds=((72, 119),), disconnected_cys=()),
  'precursor': Precursor(sequence='LPGFGDSIEAQCGTSVNVHSSLRDILNQITKPNDVYSFSLASRLYAEERYPILPEYLQCVK', mass=6841.416016443381, mz=1369.2910283207461, segments=[(7, 11)], residue_ranges=[(61, 122)], cys_bond_count=0, alkylation_count=0, modifications=[], error_ppm=0),
  'var_bonds': [(72, 119)]},
 {'fragment': Fragment(id=0, sequence='LPGFGDSIEAQCGTS+PEYLQCVK', residue_ranges=[(61, 76), (114, 122)], intensity=10, intensity_ratio=1, mass=4884.259857651052, target_mass=4884.259857651052, mz=1222.0

In [29]:
print(f"In total we have {len(precursors)} precursors")

scans: List[Scan] = []
fragment_records = []
precursor_records = []

for i, prec in enumerate(precursors):
    p: Precursor = prec["precursor"]
    frags = generate_fragments(prec)

    fragment_objects: List[Fragment] = [f["fragment"] for f in frags]
    scan = Scan(
        nth_in_order=i,
        id=i,
        time=i,
        charge=prec["charge"],
        prec_mz=p.mz,
        prec_intensity=100,
        prec_mass=p.mass,
        fragments_mz=np.array(sorted([f.mz for f in fragment_objects])),
        fragments_intensity=np.array([f.intensity for f in fragment_objects]),
        threshold=0,
    )
    scans.append(scan)

    precursor_records.append(scan.to_dict() | p.to_dict())

    fragment_records += [
        scan.to_dict()
        | fr["precursor"].to_dict()
        | {"var_bonds": fr["var_bonds"]}
        | fr["fragment"].to_dict()
        for fr in frags
    ]


In total we have 862 precursors
[(147.11280416609898, 'y1[+]', 1), (114.091340445409, 'b1[+]', 1), (246.18121808016897, 'y2[+]', 2), (211.144104295019, 'b2[+]', 2), (349.19040303971894, 'y3[+]', 3), (268.16556801570897, 'b3[+]', 3), (477.24898054555894, 'y4[+]', 4), (415.23398192977896, 'b4[+]', 4), (590.333044524089, 'y5[+]', 5), (472.25544565046897, 'b5[+]', 5), (753.396373057729, 'y6[+]', 6), (587.2823886747591, 'b6[+]', 6), (882.438966146479, 'y7[+]', 7), (674.3144170794791, 'b7[+]', 7), (979.4917299960889, 'y8[+]', 8), (787.3984810580091, 'b8[+]', 8), (1092.5757939746188, 'y9[+]', 9), (916.4410741467591, 'b9[+]', 9), (1205.6598579531487, 'y10[+]', 10), (987.4781879319091, 'b10[+]', 10), (1302.7126218027588, 'y11[+]', 11), (1115.536765437749, 'b11[+]', 11), (1465.7759503363989, 'y12[+]', 12), (1218.5459503972988, 'b12[+]', 12), (1621.8770613604488, 'y13[+]', 13), (1275.5674141179888, 'b13[+]', 13), (1750.9196544491988, 'y14[+]', 14), (1376.6150925871689, 'b14[+]', 14), (1879.962247

In [30]:
ideal_precursor_df = pd.DataFrame(precursor_records)

ideal_precursor_df.to_csv(
    "../out/csv/precursor_matches_TEST_OVA_segments=3_error=50ppm_ideal.csv",
    index=False,
)

ideal_precursor_df

Unnamed: 0,scan_id,scan_nth_in_order,scan_time,scan_total_intensity,prec_charge,prec_sequence,prec_segment_count,prec_tryptide_ranges,prec_residue_ranges,prec_max_mc_count,prec_mc,prec_cys_bond_count,prec_mass,prec_mz,prec_error,prec_alkylation_count,prec_mods
0,0,0,0,70.000,3,KIK,1,"[(23, 25)]","[(276, 279)]",1,[1],0,387.284555,130.102677,0,0,[]
1,1,1,1,230.000,1,LTEWTSSNVMEERK,1,"[(22, 24)]","[(263, 277)]",1,[1],0,1708.809035,1709.816860,0,0,[]
2,2,2,2,100.000,2,MEEK,1,"[(27, 28)]","[(286, 290)]",0,[0],0,535.231199,268.623424,0,0,[]
3,3,3,3,140.000,2,VYLPR,1,"[(25, 26)]","[(279, 284)]",0,[0],0,646.380246,324.197948,0,0,[]
4,4,4,4,70.000,5,IK,1,"[(24, 25)]","[(277, 279)]",0,[0],0,259.189592,52.845743,0,0,[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
857,857,857,857,50.000,2,LPGFGDSIEAQCGTSVNVHSSLRDILNQITKPNDVYSFSLASRLYA...,1,"[(7, 11)]","[(61, 122)]",3,[3],0,6841.416016,3421.715833,0,0,[]
858,858,858,858,80.000,4,LPGFGDSIEAQCGTSVNVHSSLRDILNQITKPNDVYSFSLASRLYA...,1,"[(7, 11)]","[(61, 122)]",3,[3],0,6841.416016,1711.361829,0,0,[]
859,859,859,859,50.000,5,LPGFGDSIEAQCGTSVNVHSSLRDILNQITKPNDVYSFSLASRLYA...,1,"[(7, 11)]","[(61, 122)]",3,[3],0,6841.416016,1369.291028,0,0,[]
860,860,860,860,0.001,1,LPGFGDSIEAQCGTSVNVHSSLRDILNQITKPNDVYSFSLASRLYA...,1,"[(7, 11)]","[(61, 122)]",3,[3],0,6841.416016,6842.423841,0,0,[]


In [31]:
ideal_fragment_df = pd.DataFrame(fragment_records)

ideal_fragment_df.to_csv(
    "../out/csv/fragment_matches_TEST_OVA_segments=3_error=50ppm_ideal.csv",
    index=False,
)

ideal_fragment_df

Unnamed: 0,scan_id,scan_nth_in_order,scan_time,scan_total_intensity,prec_charge,prec_sequence,prec_segment_count,prec_tryptide_ranges,prec_residue_ranges,prec_max_mc_count,...,frag_break_count,frag_error_ppm,frag_mods,frag_connected_bonds,frag_disconnected_cys,frag_interesting_disconnected_cys,frag_intensity,frag_intensity_ratio,target_mass,target_mz
0,0,0,0,70,3,KIK,1,"[(23, 25)]","[(276, 279)]",1,...,True,0,[],[],[],[],10,0.1,128.094414,129.102239
1,0,0,0,70,3,KIK,1,"[(23, 25)]","[(276, 279)]",1,...,False,0,[],[],[],[],10,0.1,387.282909,130.102128
2,0,0,0,70,3,KIK,1,"[(23, 25)]","[(276, 279)]",1,...,True,0,[],[],[],[],10,0.1,146.104979,147.112804
3,0,0,0,70,3,KIK,1,"[(23, 25)]","[(276, 279)]",1,...,False,0,[],[],[],[],10,0.1,387.283458,194.649554
4,0,0,0,70,3,KIK,1,"[(23, 25)]","[(276, 279)]",1,...,True,0,[],[],[],[],10,0.1,241.178478,242.186303
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35808,861,861,861,50,3,LPGFGDSIEAQCGTSVNVHSSLRDILNQITKPNDVYSFSLASRLYA...,1,"[(7, 11)]","[(61, 122)]",3,...,2,0,"[Cys Alkylation (IAA), Cys Alkylation (IAA)]","((72, 119),)",(),[],10,1.0,12203.773261,4068.932245
35809,861,861,861,50,3,LPGFGDSIEAQCGTSVNVHSSLRDILNQITKPNDVYSFSLASRLYA...,1,"[(7, 11)]","[(61, 122)]",3,...,2,0,"[Cys Alkylation (IAA), Cys Alkylation (IAA)]","((72, 119),)",(),[],10,1.0,12203.773261,4068.932245
35810,861,861,861,50,3,LPGFGDSIEAQCGTSVNVHSSLRDILNQITKPNDVYSFSLASRLYA...,1,"[(7, 11)]","[(61, 122)]",3,...,2,0,"[Cys Alkylation (IAA), Cys Alkylation (IAA)]","((72, 119),)",(),[],10,1.0,11177.026498,3726.683325
35811,861,861,861,50,3,LPGFGDSIEAQCGTSVNVHSSLRDILNQITKPNDVYSFSLASRLYA...,1,"[(7, 11)]","[(61, 122)]",3,...,2,0,"[Cys Alkylation (IAA), Cys Alkylation (IAA)]","((72, 119),)",(),[],10,1.0,12778.248211,4260.423895


In [32]:
precursor_matches = write_matched_precursors(
    tryptides,
    scans,
    "../out/precursor_matches/TEST_OVA_AT_segments=3_error=50ppm.pickle",
    max_segments=3,
    error_ppm=50,
)

precursor_match_records = []
for pm in precursor_matches:
    precursor_match_records.append(pm["scan"].to_dict() | pm["precursor"].to_dict())

prec_df = pd.DataFrame(precursor_match_records)

prec_df.to_csv(
    "../out/csv/precursor_matches_TEST_OVA_segments=3_error=50ppm.csv", index=False
)

prec_df

Looking for matches...
Saving the matches to ../out/precursor_matches/TEST_OVA_AT_segments=3_error=50ppm.pickle


Unnamed: 0,scan_id,scan_nth_in_order,scan_time,scan_total_intensity,prec_charge,prec_sequence,prec_segment_count,prec_tryptide_ranges,prec_residue_ranges,prec_max_mc_count,prec_mc,prec_cys_bond_count,prec_mass,prec_mz,prec_error,prec_alkylation_count,prec_mods
0,0,0,0,70.000,3,KIK,1,"[(23, 25)]","[(276, 279)]",1,[1],0,390.308030,130.102677,0.0,0,[]
1,1,1,1,230.000,1,LTEWTSSNVMEERK,1,"[(22, 24)]","[(263, 277)]",1,[1],0,1709.816860,1709.816860,0.0,0,[]
2,2,2,2,100.000,2,MEEK,1,"[(27, 28)]","[(286, 290)]",0,[0],0,537.246849,268.623424,0.0,0,[]
3,3,3,3,140.000,2,VYLPR,1,"[(25, 26)]","[(279, 284)]",0,[0],0,648.395896,324.197948,0.0,0,[]
4,4,4,4,70.000,5,IK,1,"[(24, 25)]","[(277, 279)]",0,[0],0,264.228717,52.845743,0.0,0,[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1282,857,857,857,50.000,2,LPGFGDSIEAQCGTSVNVHSSLRDILNQITKPNDVYSFSLASRLYA...,1,"[(7, 11)]","[(61, 122)]",3,[3],1,6843.431667,3421.715833,0.0,0,[Disulphide Bond (–H2)]
1283,858,858,858,80.000,4,LPGFGDSIEAQCGTSVNVHSSLRDILNQITKPNDVYSFSLASRLYA...,1,"[(7, 11)]","[(61, 122)]",3,[3],1,6845.447317,1711.361829,0.0,0,[Disulphide Bond (–H2)]
1284,859,859,859,50.000,5,LPGFGDSIEAQCGTSVNVHSSLRDILNQITKPNDVYSFSLASRLYA...,1,"[(7, 11)]","[(61, 122)]",3,[3],1,6846.455142,1369.291028,0.0,0,[Disulphide Bond (–H2)]
1285,860,860,860,0.001,1,LPGFGDSIEAQCGTSVNVHSSLRDILNQITKPNDVYSFSLASRLYA...,1,"[(7, 11)]","[(61, 122)]",3,[3],1,6842.423841,6842.423841,0.0,0,[Disulphide Bond (–H2)]


In [33]:
fragment_matches = write_matched_fragments(
    precursor_matches=precursor_matches,
    tryptides=tryptides,
    output_path="../out/fragment_matches/TEST_OVA_AT_segments=3_error=50ppm.pickle",
    max_allowed_breaks=2,
    error_ppm=50,
)

fragment_match_records = []
for fm in fragment_matches:
    fragment_match_records.append(
        fm["scan"].to_dict()
        | fm["precursor"].to_dict()
        | fm["variant"].to_dict()
        | (fm["fragment"].to_dict() if fm["fragment"] is not None else {})
        | {"prec_variant_count": fm["variant_count"]}
    )

frag_df = pd.DataFrame(fragment_match_records)

frag_df.to_csv(
    "../out/csv/fragment_matches_TEST_OVA_segments=3_error=50ppm.csv", index=False
)

frag_df

  0%|          | 0/1287 [00:00<?, ?it/s]

Writing the matched fragments to ../out/fragment_matches/TEST_OVA_AT_segments=3_error=50ppm.pickle


100%|██████████| 1287/1287 [04:11<00:00,  5.12it/s]


Unnamed: 0,scan_id,scan_nth_in_order,scan_time,scan_total_intensity,prec_charge,prec_sequence,prec_segment_count,prec_tryptide_ranges,prec_residue_ranges,prec_max_mc_count,...,frag_error_ppm,frag_mods,frag_connected_bonds,frag_disconnected_cys,frag_interesting_disconnected_cys,frag_intensity,frag_intensity_ratio,target_mass,target_mz,prec_variant_count
0,0,0,0,70.0,3,KIK,1,"[(23, 25)]","[(276, 279)]",1,...,4.282479,[],[],[],[],10.0,0.142857,128.094414,129.102239,1
1,0,0,0,70.0,3,KIK,1,"[(23, 25)]","[(276, 279)]",1,...,2.274504,[],[],[],[],10.0,0.142857,241.178478,242.186303,1
2,0,0,0,70.0,3,KIK,1,"[(23, 25)]","[(276, 279)]",1,...,4.249308,[],[],[],[],10.0,0.142857,387.282909,130.102128,1
3,0,0,0,70.0,3,KIK,1,"[(23, 25)]","[(276, 279)]",1,...,2.832869,[],[],[],[],10.0,0.142857,387.283458,194.649554,1
4,0,0,0,70.0,3,KIK,1,"[(23, 25)]","[(276, 279)]",1,...,1.416429,[],[],[],[],10.0,0.142857,387.284006,388.291831,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
177697,861,861,861,50.0,3,LPGFGDSIEAQCGTSVNVHSSLRDILNQITKPNDVYSFSLASRLYA...,1,"[(7, 11)]","[(61, 122)]",3,...,15.471227,"[-H2O (neutral loss), -NH3 (neutral loss)]","[(72, 119)]",[],[],10.0,0.200000,4067.924420,4068.932245,1
177698,861,861,861,50.0,3,LPGFGDSIEAQCGTSVNVHSSLRDILNQITKPNDVYSFSLASRLYA...,1,"[(7, 11)]","[(61, 122)]",3,...,15.471227,"[-H2O (neutral loss), -NH3 (neutral loss)]","[(72, 119)]",[],[],10.0,0.200000,4067.924420,4068.932245,1
177699,861,861,861,50.0,3,LPGFGDSIEAQCGTSVNVHSSLRDILNQITKPNDVYSFSLASRLYA...,1,"[(7, 11)]","[(61, 122)]",3,...,38.668231,"[-H2O (neutral loss), -H2O (neutral loss)]",[],[],[],10.0,0.200000,4067.924420,4068.932245,1
177700,861,861,861,50.0,3,LPGFGDSIEAQCGTSVNVHSSLRDILNQITKPNDVYSFSLASRLYA...,1,"[(7, 11)]","[(61, 122)]",3,...,38.668231,"[-H2O (neutral loss), -H2O (neutral loss)]",[],[],[],10.0,0.200000,4067.924420,4068.932245,1


In [35]:

import importlib
import fragment_matching

importlib.reload(fragment_matching)


<module 'fragment_matching' from '/Users/eugen/code/bp/bp-code/src/fragment_matching.py'>

In [39]:
scans[1].fragments_mz

array([ 114.09134045,  215.13901891,  303.21391519,  344.181612  ,
        432.25650828,  530.26092495,  561.29910137,  631.30860342,
        791.40800037,  805.37266023,  905.45092781,  919.41558767,
        992.48295622, 1018.48400159, 1079.51498462, 1180.56266309,
       1278.56707977, 1366.64197604, 1407.60967285, 1495.68456913,
       1563.71078388, 1596.7322476 , 1709.81631158])

In [76]:
fragment_matching._fragments_matching_targets(
    [
        fragment_matching.Target(
            0, (454.22962482186 - PROTON), 454.22962482186, 10, 3, 1
        )
    ],
    precursors[0]["precursor"].variants(tryptides)[0],
    2,
    5,
)

[Fragment(id=0, sequence='GGLEP', residue_ranges=[(126, 131)], intensity=10, intensity_ratio=1, mass=456.2458234513, target_mass=453.22179978979, mz=152.08194115043332, target_mz=454.22962482186, charge=3, break_count=1, error_ppm=1.210366836464799, modifications=(), connected_bonds=[], disconnected_cys=[]),
 Fragment(id=0, sequence='EPIN', residue_ranges=[(129, 133)], intensity=10, intensity_ratio=1, mass=456.24582345130005, target_mass=453.22179978979, mz=152.08194115043335, target_mz=454.22962482186, charge=3, break_count=2, error_ppm=1.2103668365902196, modifications=(), connected_bonds=[], disconnected_cys=[])]

In [71]:
scans[0].fragments_mz

array([ 454.22962483,  632.3402379 ,  681.35661625,  907.50361483,
        956.48360766, 1057.53128613, 1128.56839992, 1403.80454787,
       1442.69103423, 1669.82925904, 1746.99751085, 1919.0823031 ,
       2047.14088061, 2134.17290901, 2226.11493594, 2619.38508661,
       2956.44354027, 2959.59614201, 3070.48646771, 3127.50793143,
       3240.59199541, 3509.77717041, 3512.92977215, 3835.97257575,
       3856.07895557, 3964.03115325, 4148.11594551, 4558.37625977,
       4664.33393676, 4967.4592141 , 5066.52762802, 5069.68809184,
       5140.72520563, 5268.78378313, 5383.81072616, 5454.84783994,
       5463.76014714, 5626.9326322 , 5822.98103894, 5902.05962362,
       5951.07600196, 6525.38749968])

In [44]:
calculate_mass("LTEWTSSNVMEE", ion_type="b", charge=1)

1407.60967267015

In [41]:

mss = fragment_matching.write_matched_fragments(
    [precursors[1] | {"scan": scans[1]}],
    tryptides,
    "../out/fragment_matches/doesn_matter_just_testing.pickle",
    2,
    5,
)

a = set(
    [
        (m["fragment"].sequence, m["fragment"].target_mass, m["fragment"].modifications)
        for m in mss
    ]
)

mss

100%|██████████| 1/1 [00:00<00:00, 187.40it/s]

Writing the matched fragments to ../out/fragment_matches/doesn_matter_just_testing.pickle





[{'scan': <src.model.scan.Scan at 0x1176288b0>,
  'precursor': Precursor(sequence='LTEWTSSNVMEERK', mass=1708.80903492468, mz=1709.81685995675, segments=[(22, 24)], residue_ranges=[(263, 277)], cys_bond_count=0, alkylation_count=0, modifications=[], error_ppm=0),
  'variant': <src.model.variant.Variant at 0x1259431c0>,
  'variant_count': 1,
  'fragment': Fragment(id=0, sequence='L', residue_ranges=[(263, 264)], intensity=10, intensity_ratio=0.043478260869565216, mass=114.09188900920002, target_mass=113.083515413339, mz=114.09188900920002, target_mz=114.091340445409, charge=1, break_count=1, error_ppm=4.850938069620759, modifications=(), connected_bonds=[], disconnected_cys=[])},
 {'scan': <src.model.scan.Scan at 0x1176288b0>,
  'precursor': Precursor(sequence='LTEWTSSNVMEERK', mass=1708.80903492468, mz=1709.81685995675, segments=[(22, 24)], residue_ranges=[(263, 277)], cys_bond_count=0, alkylation_count=0, modifications=[], error_ppm=0),
  'variant': <src.model.variant.Variant at 0x125