In [1]:
from typing import Optional, List, Tuple, Dict

import numpy as np
import pandas as pd
from pepfrag import Peptide, IonType, ModSite

from src.fragment_matching import (
    write_matched_fragments,
)
from src.model.fragment import Fragment
from src.model.modification import IAA_ALKYLATION, CYS_BOND
from src.model.precursor import Precursor
from src.model.scan import Scan
from src.model.variant import Variant
from src.precursor_matching import write_matched_precursors
from src.utilities.constants import LIP_BONDS, PROTON, OVA_BONDS
from src.utilities.dataloading import cleave_protein
from pyteomics import fasta
import re

from random import randrange, randint, choices, shuffle
from pyteomics.mass import calculate_mass

In [2]:
PROTEIN = "OVA"
CYS_BONDS = OVA_BONDS
tryptides = cleave_protein(PROTEIN)
TRYPTIDE_COUNT = len(tryptides)
seq_path = f"../data/fasta/{PROTEIN}.fasta"
SEQUENCE = [r.sequence for r in fasta.read(seq_path)][0]
SEQUENCE

'GSIGAASMEFCFDVFKELKVHHANENIFYCPIAIMSALAMVYLGAKDSTRTQINKVVRFDKLPGFGDSIEAQCGTSVNVHSSLRDILNQITKPNDVYSFSLASRLYAEERYPILPEYLQCVKELYRGGLEPINFQTAADQARELINSWVESQTNGIIRNVLQPSSVDSQTAMVLVNAIVFKGLWEKAFKDEDTQAMPFRVTEQESKPVQMMYQIGLFRVASMASEKMKILELPFASGTMSMLVLLPDEVSGLEQLESIINFEKLTEWTSSNVMEERKIKVYLPRMKMEEKYNLTSVLMAMGITDVFSSSANLSGISSAESLKISQAVHAAHAEINEAGREVVGSAEAGVDAASVSEEFRADHPFLFCIKHIATNAVLFFGRCVSP'

In [3]:

def intersects(t, u):
    x, y = t
    a, b = u
    return not (x >= b or y <= a)


def dedup(xs):
    return list(dict(tp) for tp in set(tuple(p.items()) for p in (xs)))


def connected_cys_count(prec):
    return sum(res == "C" for res in prec.sequence) - prec.alkylation_count


def generate_precursors(base_count=10000):
    bondless_precursors: List[Dict] = []
    raw_bonded_material: List[Dict] = []
    for _ in range(0, base_count):
        x = randrange(0, TRYPTIDE_COUNT)
        y = randrange(x, min(TRYPTIDE_COUNT, x + 5))
        b, e = min(x, y), max(x, y)

        if b < e:
            charge = randint(1, 5)
            sequence = "".join(t.sequence for t in tryptides[b:e])
            alkylations = sum(res == "C" for res in sequence)

            cys_overlap = [i for i in [7, 10] if i in range(b, e)]
            if cys_overlap:
                alkylations -= len(cys_overlap)

            mass = calculate_mass(sequence) + alkylations * IAA_ALKYLATION.mass
            prec: Dict = {
                "charge": charge,
                "precursor": Precursor(
                    sequence=sequence,
                    mass=mass,
                    mz=mass / charge + PROTON,
                    segments=[(b, e)],
                    residue_ranges=[(tryptides[b].beginning, tryptides[e - 1].end)],
                    cys_bond_count=0,
                    alkylation_count=alkylations,
                    modifications=[],
                    error_ppm=0,
                ),
            }

            if cys_overlap:
                raw_bonded_material.append(prec)
            else:
                bondless_precursors.append(prec)

    bondless_precursors = dedup(bondless_precursors)
    raw_bonded_material = dedup(raw_bonded_material)

    bonded_precursors = []
    for i, s in enumerate(raw_bonded_material):
        prec: Precursor = s["precursor"]
        for t in raw_bonded_material[i:]:
            qrec: Precursor = t["precursor"]
            if not intersects(prec.segments[0], qrec.segments[0]):
                charge = randint(1, 5)
                ps = sorted([prec, qrec], key=lambda p: p.segments[0][0])
                mass = prec.mass + qrec.mass + CYS_BOND.mass
                joined = Precursor(
                    sequence=ps[0].sequence + "+" + ps[1].sequence,
                    mass=mass,
                    mz=mass / charge + PROTON,
                    segments=ps[0].segments + ps[1].segments,
                    residue_ranges=ps[0].residue_ranges + ps[1].residue_ranges,
                    cys_bond_count=1,
                    alkylation_count=prec.alkylation_count + qrec.alkylation_count,
                    modifications=ps[0].modifications + ps[1].modifications,
                    error_ppm=0,
                )
                bonded_precursors.append({"charge": charge, "precursor": joined})

    bonded_precursors = dedup(bonded_precursors)

    bonded_unipeptides = []
    for s in raw_bonded_material:
        p: Precursor = s["precursor"]
        if connected_cys_count(p) == 2:
            charge = s["charge"]
            bonded_unipeptides.append(
                {
                    "charge": charge,
                    "precursor": Precursor(
                        p.sequence,
                        p.mass + CYS_BOND.mass,
                        (p.mass + CYS_BOND.mass) / charge + PROTON,
                        p.segments,
                        p.residue_ranges,
                        p.cys_bond_count,
                        p.alkylation_count,
                        p.modifications,
                        p.error_ppm,
                    ),
                }
            )

    bonded_unipeptides = dedup(bonded_unipeptides)

    return bondless_precursors + bonded_precursors + bonded_unipeptides


In [4]:
precursors = generate_precursors(5000)
precursors[:3]

[{'charge': 2,
  'precursor': Precursor(sequence='NVLQPSSVDSQTAMVLVNAIVFKGLWEK', mass=3072.6317927857103, mz=1537.3237214249252, segments=[(14, 16)], residue_ranges=[(158, 186)], cys_bond_count=0, alkylation_count=0, modifications=[], error_ppm=0)},
 {'charge': 5,
  'precursor': Precursor(sequence='GGLEPINFQTAADQAR', mass=1686.83254725058, mz=338.37433448218604, segments=[(12, 13)], residue_ranges=[(126, 142)], cys_bond_count=0, alkylation_count=0, modifications=[], error_ppm=0)},
 {'charge': 3,
  'precursor': Precursor(sequence='VASMASEKMKILELPFASGTMSMLVLLPDEVSGLEQLESIINFEK', mass=4924.523754598979, mz=1642.5157432317299, segments=[(19, 22)], residue_ranges=[(218, 263)], cys_bond_count=0, alkylation_count=0, modifications=[], error_ppm=0)}]

In [5]:

def valid_frags(frags, cys, length):
    def ok(frag):
        if "b" in frag[1]:
            return frag[2] > cys
        else:
            return frag[2] >= (length - cys)

    return [f for f in frags if ok(f)]


def charge_from_code(code):
    match = re.match(r".*\[(\d+)?\+]$", code)
    if match.group(1) is None:
        return 1
    else:
        return int(match.group(1))


def safe_frags(seq, frags, cysteines):
    b, e = seq
    safe = []
    unsafe = []
    for f in frags:
        mass, code, i = f
        if "b" in code:
            if not any(b <= c < b + i for c in cysteines):
                safe.append(f)
                continue
        else:
            if not any(e - i <= c < e for c in cysteines):
                safe.append(f)
                continue
        unsafe.append(f)

    return safe, unsafe


def build_safe_fragment(id, sequence, residue_range, charge, mz, break_count):
    return Fragment(
        id=id,
        sequence=sequence,
        residue_ranges=residue_range,
        intensity=10,
        intensity_ratio=1 / 10,
        target_mass=(mz - PROTON) * charge,
        mass=(mz - PROTON) * charge,
        target_mz=mz,
        mz=mz,
        charge=charge,
        break_count=break_count,
        error_ppm=0,
        modifications=[IAA_ALKYLATION for res in sequence if res == "C"],
        connected_bonds=[],
        disconnected_cys=[],
    )


def fragment_sequence(seq, frag, residue_range):
    _, code, i = frag
    sequence = seq[:i] if "b" in code else seq[-i:]
    b, e = residue_range
    frag_residue_range = (b, b + i) if "b" in code else (e - i, e)

    return sequence, frag_residue_range


def safe_frags_to_fragments(frags, prec_sequence, prec_residue_range, precursor):
    fragments = []
    for id, frag in enumerate(frags):
        mz, code, i = frag
        frag_charge = charge_from_code(code)
        frag_sequence, frag_residue_range = fragment_sequence(
            prec_sequence, frag, prec_residue_range
        )
        fragment = build_safe_fragment(
            id,
            frag_sequence,
            [frag_residue_range],
            frag_charge,
            mz,
            prec_residue_range != frag_residue_range,
        )

        fragments.append(
            {"fragment": fragment, "precursor": precursor, "var_bonds": []}
        )
    return fragments


def generate_fragments(peptide: Dict, count=50):
    max_charge = peptide["charge"]
    precursor: Precursor = peptide["precursor"]

    if connected_cys_count(precursor) == 0:
        pep = Peptide(
            precursor.sequence,
            charge=max_charge,
            modifications=[
                ModSite(IAA_ALKYLATION.mass, i + 1, IAA_ALKYLATION.description)
                for i, res in enumerate(precursor.sequence)
                if res == "C"
            ],
        )
        frags = pep.fragment(
            ion_types={IonType.y: [], IonType.b: [], IonType.precursor: []}
        )
        frags = list(sorted(list(set(choices(frags, k=count)))))

        return safe_frags_to_fragments(
            frags, precursor.sequence, precursor.residue_ranges[0], precursor
        )
    elif len(precursor.segments) == 2:
        ps, qs = precursor.sequence.split("+")
        psi = list(zip(range(*precursor.residue_ranges[0]), ps))
        qsi = list(zip(range(*precursor.residue_ranges[1]), qs))

        p_frags = Peptide(
            ps,
            charge=1,
            modifications=[
                ModSite(IAA_ALKYLATION.mass, ri + 1, IAA_ALKYLATION.description)
                for ri, (ai, res) in enumerate(psi)
                if res == "C" and ai not in (72, 119)
            ],
        ).fragment(
            ion_types={
                IonType.y: [],
                IonType.b: [],
            }
        )

        q_frags = Peptide(
            qs,
            charge=1,
            modifications=[
                ModSite(IAA_ALKYLATION.mass, ri + 1, IAA_ALKYLATION.description)
                for ri, (ai, res) in enumerate(qsi)
                if res == "C" and ai not in (72, 119)
            ],
        ).fragment(
            ion_types={
                IonType.y: [],
                IonType.b: [],
            }
        )

        psafe, pcys = safe_frags(
            precursor.residue_ranges[0],
            list(sorted(list(set(choices(p_frags, k=count))))),
            [72, 119],
        )
        qsafe, qcys = safe_frags(
            precursor.residue_ranges[1],
            list(sorted(list(set(choices(q_frags, k=count))))),
            [72, 119],
        )

        result = safe_frags_to_fragments(
            psafe, ps, precursor.residue_ranges[0], precursor
        ) + safe_frags_to_fragments(qsafe, qs, precursor.residue_ranges[1], precursor)

        shuffle(pcys)
        shuffle(qcys)

        p_fragments = [
            fr["fragment"]
            for fr in safe_frags_to_fragments(
                pcys, ps, precursor.residue_ranges[0], precursor
            )
        ]
        q_fragments = [
            fr["fragment"]
            for fr in safe_frags_to_fragments(
                qcys, qs, precursor.residue_ranges[1], precursor
            )
        ]

        for pf, qf in choices(list(zip(p_fragments, q_fragments)), k=count):
            total_charge = randint(1, max_charge)
            total_mass = pf.mz + qf.mz + CYS_BOND.mass - 2 * PROTON

            if "C" not in pf.sequence or "C" not in qf.sequence:
                continue

            fragment = Fragment(
                0,
                sequence=pf.sequence + "+" + qf.sequence,
                residue_ranges=pf.residue_ranges + qf.residue_ranges,
                intensity=10,
                intensity_ratio=1,
                mass=total_mass,
                target_mass=total_mass,
                mz=total_mass / total_charge + PROTON,
                target_mz=total_mass / total_charge + PROTON,
                charge=total_charge,
                break_count=pf.break_count + qf.break_count,
                error_ppm=0,
                modifications=qf.modifications + pf.modifications,
                connected_bonds=tuple([(72, 119)]),
                disconnected_cys=tuple([]),
            )

            result.append(
                {"fragment": fragment, "precursor": precursor, "var_bonds": [(72, 119)]}
            )
        return result
    else:
        ps = precursor.sequence
        psi = list(zip(range(*precursor.residue_ranges[0]), ps))

        p_frags = Peptide(
            ps,
            charge=1,
            modifications=[
                ModSite(IAA_ALKYLATION.mass, ri + 1, IAA_ALKYLATION.description)
                for ri, (ai, res) in enumerate(psi)
                if res == "C" and ai not in (72, 119)
            ],
        ).fragment(ion_types={IonType.y: [], IonType.b: [], IonType.precursor: []})
        p_frags = list(p_frags)

        psafe, pcys = safe_frags(
            precursor.residue_ranges[0],
            list(sorted(list(set(choices(p_frags, k=count * 2))))),
            [72, 119],
        )

        result = []

        p_fragments = [
            fr["fragment"]
            for fr in safe_frags_to_fragments(
                [pf for pf in pcys if "b" in pf[1]],
                ps,
                precursor.residue_ranges[0],
                precursor,
            )
        ]
        q_fragments = [
            fr["fragment"]
            for fr in safe_frags_to_fragments(
                [qf for qf in pcys if "y" in qf[1]],
                ps,
                precursor.residue_ranges[0],
                precursor,
            )
        ]

        for pf, qf in choices(list(zip(p_fragments, q_fragments)), k=count):
            if "C" not in pf.sequence or "C" not in qf.sequence:
                continue
            total_charge = randint(1, max_charge)

            pr, qr = pf.residue_ranges[0], qf.residue_ranges[0]
            if intersects(pr, qr):
                continue

            total_mass = pf.mz + qf.mz + CYS_BOND.mass - 2 * PROTON

            fragment = Fragment(
                0,
                sequence=pf.sequence + "+" + qf.sequence,
                residue_ranges=pf.residue_ranges + qf.residue_ranges,
                intensity=10,
                intensity_ratio=1,
                mass=total_mass,
                target_mass=total_mass,
                mz=total_mass / total_charge + PROTON,
                target_mz=total_mass / total_charge + PROTON,
                charge=total_charge,
                break_count=2 if pr[1] != qr[0] else 1,
                error_ppm=0,
                modifications=qf.modifications + pf.modifications,
                connected_bonds=tuple([(72, 119)]),
                disconnected_cys=tuple([]),
            )

            result.append(
                {"fragment": fragment, "precursor": precursor, "var_bonds": [(72, 119)]}
            )
        return result


In [6]:

print(f"In total we have {len(precursors)} precursors")

scans: List[Scan] = []
fragment_records = []
precursor_records = []

for i, prec in enumerate(precursors):
    p: Precursor = prec["precursor"]
    frags = generate_fragments(prec)

    fragment_objects: List[Fragment] = [f["fragment"] for f in frags]
    scan = Scan(
        nth_in_order=i,
        id=i,
        time=i,
        charge=prec["charge"],
        prec_mz=p.mz,
        prec_intensity=100,
        prec_mass=p.mass,
        fragments_mz=np.array(sorted([f.mz for f in fragment_objects])),
        fragments_intensity=np.array([f.intensity for f in fragment_objects]),
        threshold=0,
    )
    scans.append(scan)

    precursor_records.append(scan.to_dict() | p.to_dict())

    fragment_records += [
        scan.to_dict()
        | fr["precursor"].to_dict()
        | {"var_bonds": fr["var_bonds"]}
        | fr["fragment"].to_dict()
        for fr in frags
    ]


In total we have 863 precursors


In [7]:
ideal_precursor_df = pd.DataFrame(precursor_records)

ideal_precursor_df.to_csv(
    "../out/csv/precursor_matches_TEST_OVA_segments=3_error=50ppm_ideal.csv",
    index=False,
)

ideal_precursor_df

Unnamed: 0,scan_id,scan_nth_in_order,scan_time,scan_total_intensity,prec_charge,prec_sequence,prec_segment_count,prec_tryptide_ranges,prec_residue_ranges,prec_max_mc_count,prec_mc,prec_cys_bond_count,prec_mass,prec_mz,prec_error,prec_alkylation_count,prec_mods
0,0,0,0,400,2,NVLQPSSVDSQTAMVLVNAIVFKGLWEK,1,"[(14, 16)]","[(158, 186)]",1,[1],0,3072.631793,1537.323721,0,0,[]
1,1,1,1,390,5,GGLEPINFQTAADQAR,1,"[(12, 13)]","[(126, 142)]",0,[0],0,1686.832547,338.374334,0,0,[]
2,2,2,2,450,3,VASMASEKMKILELPFASGTMSMLVLLPDEVSGLEQLESIINFEK,1,"[(19, 22)]","[(218, 263)]",2,[2],0,4924.523755,1642.515743,0,0,[]
3,3,3,3,480,5,ELYRGGLEPINFQTAADQARELINSWVESQTNGIIRNVLQPSSVDS...,1,"[(11, 15)]","[(122, 181)]",3,[3],0,6529.370387,1306.881902,0,0,[]
4,4,4,4,470,5,MKMEEKYNLTSVLMAMGITDVFSSSANLSGISSAESLK,1,"[(26, 29)]","[(284, 322)]",2,[2],0,4068.976778,814.803181,0,0,[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
858,858,858,858,280,3,LPGFGDSIEAQCGTSVNVHSSLRDILNQITKPNDVYSFSLASRLYA...,1,"[(7, 11)]","[(61, 122)]",3,[3],0,6841.416016,2281.479831,0,0,[]
859,859,859,859,250,1,LPGFGDSIEAQCGTSVNVHSSLRDILNQITKPNDVYSFSLASRLYA...,1,"[(7, 11)]","[(61, 122)]",3,[3],0,6841.416016,6842.423841,0,0,[]
860,860,860,860,210,5,LPGFGDSIEAQCGTSVNVHSSLRDILNQITKPNDVYSFSLASRLYA...,1,"[(7, 11)]","[(61, 122)]",3,[3],0,6841.416016,1369.291028,0,0,[]
861,861,861,861,160,2,LPGFGDSIEAQCGTSVNVHSSLRDILNQITKPNDVYSFSLASRLYA...,1,"[(7, 11)]","[(61, 122)]",3,[3],0,6841.416016,3421.715833,0,0,[]


In [8]:
ideal_fragment_df = pd.DataFrame(fragment_records)

ideal_fragment_df.to_csv(
    "../out/csv/fragment_matches_TEST_OVA_segments=3_error=50ppm_ideal.csv",
    index=False,
)

ideal_fragment_df

Unnamed: 0,scan_id,scan_nth_in_order,scan_time,scan_total_intensity,prec_charge,prec_sequence,prec_segment_count,prec_tryptide_ranges,prec_residue_ranges,prec_max_mc_count,...,frag_break_count,frag_error_ppm,frag_mods,frag_connected_bonds,frag_disconnected_cys,frag_interesting_disconnected_cys,frag_intensity,frag_intensity_ratio,target_mass,target_mz
0,0,0,0,400,2,NVLQPSSVDSQTAMVLVNAIVFKGLWEK,1,"[(14, 16)]","[(158, 186)]",1,...,True,0,[],[],[],[],10,0.1,114.042379,115.050204
1,0,0,0,400,2,NVLQPSSVDSQTAMVLVNAIVFKGLWEK,1,"[(14, 16)]","[(158, 186)]",1,...,True,0,[],[],[],[],10,0.1,275.147572,276.155397
2,0,0,0,400,2,NVLQPSSVDSQTAMVLVNAIVFKGLWEK,1,"[(14, 16)]","[(158, 186)]",1,...,True,0,[],[],[],[],10,0.1,551.305650,276.660650
3,0,0,0,400,2,NVLQPSSVDSQTAMVLVNAIVFKGLWEK,1,"[(14, 16)]","[(158, 186)]",1,...,True,0,[],[],[],[],10,0.1,574.310401,288.163025
4,0,0,0,400,2,NVLQPSSVDSQTAMVLVNAIVFKGLWEK,1,"[(14, 16)]","[(158, 186)]",1,...,True,0,[],[],[],[],10,0.1,631.331864,316.673757
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45028,862,862,862,270,4,LPGFGDSIEAQCGTSVNVHSSLRDILNQITKPNDVYSFSLASRLYA...,1,"[(7, 11)]","[(61, 122)]",3,...,2,0,"[Cys Alkylation (IAA), Cys Alkylation (IAA)]","((72, 119),)",(),[],10,1.0,5008.447670,1670.490382
45029,862,862,862,270,4,LPGFGDSIEAQCGTSVNVHSSLRDILNQITKPNDVYSFSLASRLYA...,1,"[(7, 11)]","[(61, 122)]",3,...,2,0,"[Cys Alkylation (IAA), Cys Alkylation (IAA)]","((72, 119),)",(),[],10,1.0,4379.134381,1095.791420
45030,862,862,862,270,4,LPGFGDSIEAQCGTSVNVHSSLRDILNQITKPNDVYSFSLASRLYA...,1,"[(7, 11)]","[(61, 122)]",3,...,2,0,"[Cys Alkylation (IAA), Cys Alkylation (IAA)]","((72, 119),)",(),[],10,1.0,2652.218892,885.080789
45031,862,862,862,270,4,LPGFGDSIEAQCGTSVNVHSSLRDILNQITKPNDVYSFSLASRLYA...,1,"[(7, 11)]","[(61, 122)]",3,...,2,0,"[Cys Alkylation (IAA), Cys Alkylation (IAA)]","((72, 119),)",(),[],10,1.0,5347.699862,1337.932790


In [9]:
precursor_matches = write_matched_precursors(
    tryptides,
    scans,
    "../out/precursor_matches/TEST_OVA_AT_segments=3_error=50ppm.pickle",
    max_segments=3,
    error_ppm=5,
)

precursor_match_records = []
for pm in precursor_matches:
    precursor_match_records.append(pm["scan"].to_dict() | pm["precursor"].to_dict())

prec_df = pd.DataFrame(precursor_match_records)

prec_df.to_csv(
    "../out/csv/precursor_matches_TEST_OVA_segments=3_error=50ppm.csv", index=False
)

prec_df

Looking for matches...
Saving the matches to ../out/precursor_matches/TEST_OVA_AT_segments=3_error=50ppm.pickle


Unnamed: 0,scan_id,scan_nth_in_order,scan_time,scan_total_intensity,prec_charge,prec_sequence,prec_segment_count,prec_tryptide_ranges,prec_residue_ranges,prec_max_mc_count,prec_mc,prec_cys_bond_count,prec_mass,prec_mz,prec_error,prec_alkylation_count,prec_mods
0,0,0,0,400,2,NVLQPSSVDSQTAMVLVNAIVFKGLWEK,1,"[(14, 16)]","[(158, 186)]",1,[1],0,3074.647443,1537.323721,0.000000e+00,0,[]
1,1,1,1,390,5,GGLEPINFQTAADQAR,1,"[(12, 13)]","[(126, 142)]",0,[0],0,1691.871672,338.374334,0.000000e+00,0,[]
2,2,2,2,450,3,VASMASEKMKILELPFASGTMSMLVLLPDEVSGLEQLESIINFEK,1,"[(19, 22)]","[(218, 263)]",2,[2],0,4927.547230,1642.515743,1.846868e-10,0,[]
3,3,3,3,480,5,ELYRGGLEPINFQTAADQARELINSWVESQTNGIIRNVLQPSSVDS...,1,"[(11, 15)]","[(122, 181)]",3,[3],0,6534.409512,1306.881902,0.000000e+00,0,[]
4,4,4,4,470,5,MKMEEKYNLTSVLMAMGITDVFSSSANLSGISSAESLK,1,"[(26, 29)]","[(284, 322)]",2,[2],0,4074.015903,814.803181,2.235193e-10,0,[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1155,858,858,858,280,3,LPGFGDSIEAQCGTSVNVHSSLRDILNQITKPNDVYSFSLASRLYA...,1,"[(7, 11)]","[(61, 122)]",3,[3],1,6844.439492,2281.479831,0.000000e+00,0,[Disulphide Bond (–H2)]
1156,859,859,859,250,1,LPGFGDSIEAQCGTSVNVHSSLRDILNQITKPNDVYSFSLASRLYA...,1,"[(7, 11)]","[(61, 122)]",3,[3],1,6842.423841,6842.423841,0.000000e+00,0,[Disulphide Bond (–H2)]
1157,860,860,860,210,5,LPGFGDSIEAQCGTSVNVHSSLRDILNQITKPNDVYSFSLASRLYA...,1,"[(7, 11)]","[(61, 122)]",3,[3],1,6846.455142,1369.291028,0.000000e+00,0,[Disulphide Bond (–H2)]
1158,861,861,861,160,2,LPGFGDSIEAQCGTSVNVHSSLRDILNQITKPNDVYSFSLASRLYA...,1,"[(7, 11)]","[(61, 122)]",3,[3],1,6843.431667,3421.715833,0.000000e+00,0,[Disulphide Bond (–H2)]


In [10]:
fragment_matches = write_matched_fragments(
    precursor_matches=precursor_matches,
    tryptides=tryptides,
    output_path="../out/fragment_matches/TEST_OVA_AT_segments=3_breaks=2_error=5ppm.pickle",
    max_allowed_breaks=2,
    error_ppm=5,
)

fragment_match_records = []
for fm in fragment_matches:
    fragment_match_records.append(
        fm["scan"].to_dict()
        | fm["precursor"].to_dict()
        | fm["variant"].to_dict()
        | (fm["fragment"].to_dict() if fm["fragment"] is not None else {})
        | {"prec_variant_count": fm["variant_count"]}
    )

frag_df = pd.DataFrame(fragment_match_records)

frag_df.to_csv(
    "../out/csv/fragment_matches_TEST_OVA_segments=3_error=50ppm.csv", index=False
)

frag_df

  0%|          | 0/1160 [00:00<?, ?it/s]

Writing the matched fragments to ../out/fragment_matches/TEST_OVA_AT_segments=3_breaks=2_error=5ppm.pickle


100%|██████████| 1160/1160 [03:35<00:00,  5.39it/s]


Unnamed: 0,scan_id,scan_nth_in_order,scan_time,scan_total_intensity,prec_charge,prec_sequence,prec_segment_count,prec_tryptide_ranges,prec_residue_ranges,prec_max_mc_count,...,frag_error_ppm,frag_mods,frag_connected_bonds,frag_disconnected_cys,frag_interesting_disconnected_cys,frag_intensity,frag_intensity_ratio,target_mass,target_mz,prec_variant_count
0,0,0,0,400,2,NVLQPSSVDSQTAMVLVNAIVFKGLWEK,1,"[(14, 16)]","[(158, 186)]",1,...,4.810162,[],[],[],[],10,0.025000,114.042379,115.050204,1
1,0,0,0,400,2,NVLQPSSVDSQTAMVLVNAIVFKGLWEK,1,"[(14, 16)]","[(158, 186)]",1,...,1.681699,[],[],[],[],10,0.025000,326.194857,327.202682,1
2,0,0,0,400,2,NVLQPSSVDSQTAMVLVNAIVFKGLWEK,1,"[(14, 16)]","[(158, 186)]",1,...,1.990047,[],[],[],[],10,0.025000,551.305650,276.660650,1
3,0,0,0,400,2,NVLQPSSVDSQTAMVLVNAIVFKGLWEK,1,"[(14, 16)]","[(158, 186)]",1,...,1.718720,[],[],[],[],10,0.025000,638.337678,320.176664,1
4,0,0,0,400,2,NVLQPSSVDSQTAMVLVNAIVFKGLWEK,1,"[(14, 16)]","[(158, 186)]",1,...,1.512503,[],[],[],[],10,0.025000,725.369706,363.692678,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105760,862,862,862,270,4,LPGFGDSIEAQCGTSVNVHSSLRDILNQITKPNDVYSFSLASRLYA...,1,"[(7, 11)]","[(61, 122)]",3,...,0.160308,[],"[(72, 119)]",[],[],10,0.037037,6841.414920,6842.422745,1
105761,862,862,862,270,4,LPGFGDSIEAQCGTSVNVHSSLRDILNQITKPNDVYSFSLASRLYA...,1,"[(7, 11)]","[(61, 122)]",3,...,0.160308,[],"[(72, 119)]",[],[],10,0.037037,6841.414920,6842.422745,1
105762,862,862,862,270,4,LPGFGDSIEAQCGTSVNVHSSLRDILNQITKPNDVYSFSLASRLYA...,1,"[(7, 11)]","[(61, 122)]",3,...,0.160308,"[Cys 72, Cys 119: R–SSH + ()–R or R–SH + S=R]",[],"[72, 119]","[72, 119]",10,0.037037,6841.414920,1711.361555,1
105763,862,862,862,270,4,LPGFGDSIEAQCGTSVNVHSSLRDILNQITKPNDVYSFSLASRLYA...,1,"[(7, 11)]","[(61, 122)]",3,...,0.160308,"[Cys 72, Cys 119: R–SSH + ()–R or R–SH + S=R]",[],"[72, 119]","[72, 119]",10,0.037037,6841.414920,6842.422745,1


In [16]:

import importlib
import fragment_matching

importlib.reload(fragment_matching)


<module 'fragment_matching' from '/Users/eugen/code/bp/bp-code/src/fragment_matching.py'>

In [None]:

var = Precursor(
    "VVRFDKLPGFGDSIEAQCGTSVNVHSSLR+DILNQITKPNDVYSFSLASRLYAEERYPILPEYLQCVK",
    7604.863 - PROTON,
    7604.863,
    [(0, 8), (8, 11)],
    [(55, 84), (84, 122)],
    1,
    0,
    [],
    0,
).variants(tryptides)[0]
var.bonds

In [70]:

calculate_mass("VRFDKLPGFGDSIEA", charge=1, ion_type="b")

1632.8380328122

In [73]:
fragment_matching._fragments_matching_targets(
    [
        fragment_matching.Target(
            0, (1632.8380328122 - PROTON), 1632.8380328122, 10, 3, 1
        )
    ],
    var,
    2,
    5,
)

[Fragment(id=0, sequence='VRFDKLPGFGDSIEA', residue_ranges=[(56, 71)], intensity=10, intensity_ratio=1, mass=1634.8542314416397, target_mass=1631.83020778013, mz=544.9514104805465, target_mz=1632.8380328122, charge=3, break_count=2, error_ppm=0.3361655598719734, modifications=(), connected_bonds=[], disconnected_cys=[]),
 Fragment(id=0, sequence='SLASRLYAEERYPI', residue_ranges=[(99, 113)], intensity=10, intensity_ratio=1, mass=1634.8542314416397, target_mass=1631.83020778013, mz=544.9514104805465, target_mz=1632.8380328122, charge=3, break_count=2, error_ppm=0.3361655598719734, modifications=(Modification(description='-NH3 (neutral loss)', mass=-17.02654910101),), connected_bonds=[], disconnected_cys=[])]

In [48]:
scans[1].fragments_mz

array([ 114.09134045,  215.13901891,  303.21391519,  344.181612  ,
        432.25650828,  530.26092495,  561.29910137,  631.30860342,
        791.40800037,  805.37266023,  905.45092781,  919.41558767,
        992.48295622, 1018.48400159, 1079.51498462, 1180.56266309,
       1278.56707977, 1366.64197604, 1407.60967285, 1495.68456913,
       1563.71078388, 1596.7322476 , 1709.81631158])

In [14]:
(calculate_mass("DILNQITKPNDVYSFSLASRLYAEERYPILPE", ion_type="b", charge=1))

3736.92210328783

In [17]:

mss = fragment_matching.write_matched_fragments(
    [precursors[647] | {"scan": scans[647]}],
    tryptides,
    "../out/fragment_matches/doesn_matter_just_testing.pickle",
    2,
    5,
)


[mss]

  0%|          | 0/1 [00:00<?, ?it/s]

Writing the matched fragments to ../out/fragment_matches/doesn_matter_just_testing.pickle


100%|██████████| 1/1 [00:00<00:00,  3.06it/s]


[{'scan': <src.model.scan.Scan at 0x119eeba00>,
  'precursor': Precursor(sequence='VVRFDKLPGFGDSIEAQCGTSVNVHSSLR+DILNQITKPNDVYSFSLASRLYAEERYPILPEYLQCVK', mass=7603.854839927481, mz=7604.862664959552, segments=[(5, 8), (8, 11)], residue_ranges=[(55, 84), (84, 122)], cys_bond_count=1, alkylation_count=0, modifications=[], error_ppm=0),
  'variant': <src.model.variant.Variant at 0x12f361e50>,
  'variant_count': 1,
  'fragment': Fragment(id=9, sequence='VVRF', residue_ranges=[(55, 59)], intensity=10, intensity_ratio=0.011764705882352941, mass=502.31417779463993, target_mass=501.305804201069, mz=502.31417779463993, target_mz=502.313629233139, charge=1, break_count=1, error_ppm=1.0942640122106182, modifications=(), connected_bonds=[], disconnected_cys=[])},
 {'scan': <src.model.scan.Scan at 0x119eeba00>,
  'precursor': Precursor(sequence='VVRFDKLPGFGDSIEAQCGTSVNVHSSLR+DILNQITKPNDVYSFSLASRLYAEERYPILPEYLQCVK', mass=7603.854839927481, mz=7604.862664959552, segments=[(5, 8), (8, 11)], residue_ra