In [108]:
import pandas as pd

from src.model.scan import read_mgf
from src.utilities.dataloading import load_precursor_matches, cleave_protein

from pymzid.read_mzid import Mzid

In [109]:
cleave_protein("LYS")

[Peptide(beginning=0, end=1, seq=K, modifications={}),
 Peptide(beginning=1, end=5, seq=VFGR, modifications={}),
 Peptide(beginning=5, end=13, seq=CELAAAMK, modifications={'M': (Modification(description='Met Oxidation', mass=15.9949), 1)}),
 Peptide(beginning=13, end=14, seq=R, modifications={}),
 Peptide(beginning=14, end=21, seq=HGLDNYR, modifications={}),
 Peptide(beginning=21, end=33, seq=GYSLGNWVCAAK, modifications={}),
 Peptide(beginning=33, end=45, seq=FESNFNTQATNR, modifications={}),
 Peptide(beginning=45, end=61, seq=NTDGSTDYGILQINSR, modifications={}),
 Peptide(beginning=61, end=68, seq=WWCNDGR, modifications={}),
 Peptide(beginning=68, end=73, seq=TPGSR, modifications={}),
 Peptide(beginning=73, end=96, seq=NLCNIPCSALLSSDITASVNCAK, modifications={}),
 Peptide(beginning=96, end=97, seq=K, modifications={}),
 Peptide(beginning=97, end=112, seq=IVSDGNGMNAWVAWR, modifications={'M': (Modification(description='Met Oxidation', mass=15.9949), 1)}),
 Peptide(beginning=112, end=114, s

In [102]:
protein = "LIP"
kind = "AT"

data_path = f"../data/mgf/190318_{protein}_{kind}_50x_05.mgf"

print(f"Loading scans from {data_path}...")
scans = list(read_mgf(data_path))

print(len(scans))

Loading scans from ../data/mgf/190318_LIP_AT_50x_05.mgf...
13579


In [103]:
precursor_matches = load_precursor_matches(protein, kind, 3, 10, None)
len(precursor_matches)

Loading precursors from ../out/precursor_matches/LIP_AT_segments=3_error=10ppm.pickle...


67

In [104]:
len([pm for pm in precursor_matches if pm["precursor"].cys_bond_count == 0])


12

In [None]:

precursor_matches_df = pd.DataFrame(
    [(pm["scan"].to_dict() | pm["precursor"].to_dict()) for pm in precursor_matches]
)
precursor_matches_df

In [11]:
precursor_matches_df = precursor_matches_df[
    [
        "scan_id",
        "scan_nth_in_order",
        "prec_mz",
        "prec_sequence",
        "prec_mass",
        "prec_error",
        "prec_mods",
        "prec_cys_bond_count",
        "prec_max_mc_count",
    ]
]

precursor_matches_df

Unnamed: 0,scan_id,scan_nth_in_order,prec_mz,prec_sequence,prec_mass,prec_error,prec_mods,prec_cys_bond_count,prec_max_mc_count
0,2414,1541,504.207861,WDCVQCQK,1008.415722,1.261947,[Disulphide Bond (–H2)],1,0
1,2576,1696,504.207861,WDCVQCQK,1008.415722,0.898065,[Disulphide Bond (–H2)],1,0
2,2993,2092,563.601507,SVVPGNKWDCVQCQK,1690.804521,0.966294,[Disulphide Bond (–H2)],1,1
3,3298,2383,877.417790,SGTSNVQICTSEIETK,1754.835580,2.780792,[],0,0
4,3948,3001,456.271644,TIYLVFR,912.543289,4.440225,[],0,0
...,...,...,...,...,...,...,...,...,...
62,14468,12866,1375.322771,SVVPGNKWDCVQCQKWVPDGK+SGTSNVQICTSEIETK,4125.968312,8.603668,[],1,2
63,14545,12923,1375.322771,SVVPGNKWDCVQCQKWVPDGK+SGTSNVQICTSEIETK,4125.968312,9.047781,[],1,2
64,14616,12977,1375.322771,SVVPGNKWDCVQCQKWVPDGK+SGTSNVQICTSEIETK,4125.968312,9.314249,[],1,2
65,14627,12986,1031.744034,SVVPGNKWDCVQCQKWVPDGK+SGTSNVQICTSEIETK,4126.976137,8.038085,[],1,2


In [106]:
mgf_id = Mzid("../data/mgf/190318_LIP_AT_50x_05.mzid")
mgf_id.read_psm()
msgf_matches_df = mgf_id.psm_df

msgf_matches_df

Reading peptide spectrum matches:  96%|█████████▌| 22/23 [00:00<00:00, 41360.24it/s]


Unnamed: 0,sir_id,spectrum_id,pe_id,sii_id,z,mz,calc_mz,pep_id,pass_threshold,rank,MS-GF:RawScore,MS-GF:DeNovoScore,MS-GF:SpecEValue,MS-GF:EValue
0,SIR_4205,index=4204,PepEv_103_TIYLVFR_103,SII_4205_1,2,456.2720947265625,456.2710876464844,Pep_TIYLVFR,True,1,72,72,6.009203e-10,1.7847334e-07
1,SIR_1237,index=1236,PepEv_74_WVPDGK_74,SII_1237_1,2,351.68731689453125,351.1844787597656,Pep_WVPDGK,True,1,59,59,1.0067963e-09,2.990185e-07
2,SIR_1096,index=1095,PepEv_74_WVPDGK_74,SII_1096_1,2,351.68701171875,351.1844787597656,Pep_WVPDGK,True,1,64,65,1.2406985e-09,3.6848746e-07
3,SIR_4358,index=4357,PepEv_103_TIYLVFR_103,SII_4358_1,2,456.271484375,456.2710876464844,Pep_TIYLVFR,True,1,51,55,1.5642085e-09,4.6456992e-07
4,SIR_6003,index=6002,PepEv_66_WDCVQCQK_66,SII_6003_1,2,562.7702026367188,562.2665405273438,Pep_WDCVQCQK,True,1,-12,12,3.449692e-07,0.00010245586
5,SIR_4986,index=4985,PepEv_103_TIYLVFR_103,SII_4986_1,2,456.7746887207031,456.2710876464844,Pep_TIYLVFR,True,1,-17,4,2.7344875e-06,0.0008121428
6,SIR_1417,index=1416,PepEv_66_WDCVQCQK_66,SII_1417_1,2,562.269775390625,562.2665405273438,Pep_WDCVQCQK,True,1,-14,29,6.9427133e-06,0.002061986
7,SIR_108,index=107,PepEv_255_SGTSNVQICTSEIETK_255,SII_108_1,3,585.629638671875,585.2905883789062,Pep_SGTSNVQICTSEIETK,True,1,-33,14,1.1430891e-05,0.0033949746
8,SIR_2899,index=2898,PepEv_66_WDCVQCQK_66,SII_2899_1,2,562.2631225585938,562.2665405273438,Pep_WDCVQCQK,True,1,-15,61,1.7894687e-05,0.005314722
9,SIR_1419,index=1418,PepEv_74_WVPDGK_74,SII_1419_1,2,351.682861328125,351.1844787597656,Pep_WVPDGK,True,1,16,65,1.7998183e-05,0.0053454605


In [20]:
msgf_matches_df["spectrum_id"] = [
    int(s.removeprefix("index=")) for s in msgf_matches_df["spectrum_id"]
]
msgf_matches_df["pep_id"] = [s.removeprefix("Pep_") for s in msgf_matches_df["pep_id"]]
msgf_matches_df["their_error_ppm"] = [
    compute_error(float(reference), float(measured))
    for reference, measured in zip(msgf_matches_df["calc_mz"], msgf_matches_df["mz"])
]

msgf_matches_df = msgf_matches_df[
    ["spectrum_id", "calc_mz", "pep_id", "their_error_ppm"]
]
msgf_matches_df = msgf_matches_df.rename(
    columns={"pep_id": "their_sequence", "calc_mz": "their_mz"}
)
msgf_matches_df = msgf_matches_df.set_index("spectrum_id")
msgf_matches_df

NameError: name 'compute_error' is not defined

In [None]:
precursor_matches_df.join(msgf_matches_df, how="outer").to_csv(
    "../out/my_vs_their_matches_rat.csv"
)