In [1]:
import numpy as np
import pandas as pd
from Download_UnimodDB import *
import os, re

In [2]:
# Build the Unimod lookup that the later cells pass to getPTMmass / PTM_string_to_masses_list.
# NOTE(review): Download_Unimod_Dict is not defined in this notebook; presumably it is
# provided by the star import of Download_UnimodDB -- confirm.
unimod = Download_Unimod_Dict()
# Show the lookup as a table for a visual check (orient='index' turns each key into a row).
pd.DataFrame.from_dict(unimod, orient='index')

Unnamed: 0,residues,mono_mass
1,TSCYHRK,42.0106
2,,-0.9840
3,K,226.0776
4,CUMYTSEDHK,57.0215
5,YTSMCRK,43.0058
...,...,...
2120,K,84.0575
2121,K,98.0732
2122,K,237.1771
2123,K,312.2213


In [3]:
# Spot check of a single lookup: entry 35 of the unimod table.
# NOTE(review): presumably returns that entry's mono_mass value -- confirm against Download_UnimodDB.
getPTMmass(35, unimod)

15.9949

-----

In [4]:
def import_pep_IDs(PATH, filtering=False, drop_contaminants=True, q_threshold=0.01):
    """Load peptide identifications from a CSV file and optionally filter them.

    Parameters
    ----------
    PATH : str, path-like or file-like
        Passed unchanged to ``pd.read_csv``. The file must contain every column
        listed in ``usecols`` below.
    filtering : {False, 'global', 'groupwalk', 'custom'}, default False
        Which q-value column to threshold on: ``'global'`` uses ``q.value``,
        ``'groupwalk'`` uses ``group_qval`` and ``'custom'`` uses ``custom_q``.
        Any falsy value disables q-value filtering.
    drop_contaminants : bool, default True
        When true, rows whose ``isCanonical`` equals ``'Contam'`` are removed.
    q_threshold : float, default 0.01
        Rows are kept only when the selected q-value is strictly below this value.

    Returns
    -------
    pandas.DataFrame
        The (filtered) table with two added columns:
        ``modified_peptide`` (``matched_peptide + '|' + modifications``) and
        ``modifications_noRT`` (``modifications`` with every parenthesised
        group removed).

    Raises
    ------
    ValueError
        If ``filtering`` is truthy but not one of the supported modes. Previously
        this case only printed a message and returned ``filtering`` itself, which
        made callers fail later with an unrelated error.
    """
    # Each supported filtering mode and the q-value column it thresholds on.
    qvalue_column_by_mode = {
        'global': 'q.value',
        'groupwalk': 'group_qval',
        'custom': 'custom_q',
    }

    # Validate before reading so a typo fails fast instead of after loading the file.
    if filtering:
        if not isinstance(filtering, str) or filtering not in qvalue_column_by_mode:
            raise ValueError(
                f'Error! Filtering = {filtering!r}; expected False or one of '
                f'{sorted(qvalue_column_by_mode)}'
            )

    df = pd.read_csv(PATH, usecols=['spectrum_title', 'scan', 'spectrum_file',
                                    'matched_peptide', 'database_peptide',
                                    'modifications', 'leadprot', 'database',
                                    'precursor_mass',
                                    'isCanonical', 'isModified',
                                    'q.value',
                                    'group_qval',
                                    'custom_q'])

    if filtering:
        df = df[df[qvalue_column_by_mode[filtering]] < q_threshold]

    if drop_contaminants:
        df = df[df['isCanonical'] != 'Contam']

    # One explicit copy after all row filters, so the column assignments below
    # write to an independent frame rather than to a slice of the loaded one.
    df = df.copy(deep=True)

    df['modified_peptide'] = df['matched_peptide'] + '|' + df['modifications']
    # to remove retention times in parentheses
    df['modifications_noRT'] = df['modifications'].str.replace(
        re.compile(r'\(.+?\)'), '', regex=True
    )

    return df

In [5]:
# Root folder holding the run-ionbot outputs. The original local path remains the
# default, but it can be overridden with the IONBOT_WORKING_FOLDER environment
# variable so the notebook also runs on machines without this exact directory.
working_folder = os.environ.get("IONBOT_WORKING_FOLDER",
                                "C:/Users/Enrico/OneDrive - UGent/run-ionbot")
# Exactly one dataset is active at a time; the alternatives are kept for reference.
# dataset_name = "PXD002057.v0.11.4"
dataset_name = "PXD005833.v0.11.4"
# dataset_name = "PXD014258.v0.11.4"

# Each search database has its own sub-folder named '<dataset>-<suffix>'.
canonical_path = os.path.join(working_folder, dataset_name, f'{dataset_name}-canon',    "group-walk-output.csv")
trembl_path    = os.path.join(working_folder, dataset_name, f'{dataset_name}-trembl',   "group-walk-output.csv")
openprot_path  = os.path.join(working_folder, dataset_name, f'{dataset_name}-openprot', "group-walk-output.csv")

# Loaded with the defaults: no q-value filtering, contaminants dropped.
canonical = import_pep_IDs(canonical_path)
# NOTE: `filtering` is not defined in the cells shown here; define it before re-enabling these loads.
# trembl    = import_pep_IDs(trembl_path,    filtering=filtering)
# openprot  = import_pep_IDs(openprot_path,  filtering=filtering)

In [11]:
def PTM_string_to_masses_list(ptm_string, unimod):
    """Parse a modification string into ``(position, residue, mass)`` tuples.

    The expected format is ``'<pos>|[<id>]<name>[<residue>]'`` entries joined by
    ``'||'``, with alternative interpretations of the same peptide joined by
    ``'_or_'``, e.g. ``'18|[4]Carbamidomethyl[C]||6|[11]Met->Hsl[M]'``.

    Parameters
    ----------
    ptm_string : str
        Modification string, or the literal ``'Unmodified'``.
    unimod :
        Lookup object forwarded unchanged to ``getPTMmass``.

    Returns
    -------
    str or list of tuple
        ``'Unmodified'`` when the input is exactly that string; otherwise one
        ``(int position, str residue, getPTMmass(id, unimod))`` tuple per
        modification of the first alternative, in input order.

    Notes
    -----
    Entries are collected in a list rather than a dict keyed by position, so
    two modifications reported on the same position are both kept (a
    position-keyed dict silently discarded all but the last one).
    """
    if ptm_string == 'Unmodified':
        return 'Unmodified'

    # if a peptide has 2 or more possible modified forms, take only 1
    first_form = ptm_string.split('_or_')[0]

    parsed = []
    for entry in first_form.split('||'):
        position, descriptor = entry.split('|')
        # '[4]Carbamidomethyl[C]' -> ['', '4', 'Carbamidomethyl', 'C', '']
        fields = re.split(r'[\[\]]', descriptor)
        ptm_id, residue = int(fields[1]), fields[3]
        parsed.append((int(position), residue, getPTMmass(ptm_id, unimod)))
    return parsed
    
# Worked example: two alternative interpretations joined by '_or_'.
PTM_string_to_masses_list(
    '18|[4]Carbamidomethyl[C]||6|[11]Met->Hsl[M]'
    '_or_18|[4]Carbamidomethyl[C]||6|[526]Dethiomethyl[M]',
    unimod,
)

[(18, 'C', 57.0215), (6, 'M', -48.0034)]

In [12]:
# Parse every retention-time-free modification string into its list of
# (position, residue, mass) tuples; `unimod` is forwarded as the extra argument.
canonical['modifications_masses'] = canonical['modifications_noRT'].apply(
    PTM_string_to_masses_list, args=(unimod,)
)
canonical

Unnamed: 0,spectrum_title,scan,spectrum_file,precursor_mass,database_peptide,matched_peptide,modifications,database,leadprot,q.value,isCanonical,isModified,custom_q,group_qval,modified_peptide,modifications_noRT,modifications_masses
0,AM19:controllerType=0 controllerNumber=1 scan=...,27002,AM19.mgf,2764.334692,LQVEPAVDTSGVQCYGPGIEGQGVFR,LQVEPAVDTSGVQCYGPGIEGQGVFR,14|[4]Carbamidomethyl[C],T,P21333,0.000000,Canonical,Expected,0.000000,0.000065,LQVEPAVDTSGVQCYGPGIEGQGVFR|14|[4]Carbamidometh...,14|[4]Carbamidomethyl[C],"[(14, C, 57.0215)]"
1,AM18:controllerType=0 controllerNumber=1 scan=...,34573,AM18.mgf,2949.534155,VGAGAPVYMAAVLEYLTAEILELAGNAAR,VGAGAPVYMAAVLEYLTAEILELAGNAAR,9|[35]Hydroxylation[M],T,Q16777,0.000000,Canonical,Expected,0.000000,0.000065,VGAGAPVYMAAVLEYLTAEILELAGNAAR|9|[35]Hydroxylat...,9|[35]Hydroxylation[M],"[(9, M, 15.9949)]"
2,AM19:controllerType=0 controllerNumber=1 scan=...,25990,AM19.mgf,2277.089330,GGMGSGGLATGIAGGLAGMGGIQNEK,GGMGSGGLATGIAGGLAGMGGIQNEK,3|[35]Hydroxylation[M],T,P05783,0.000000,Canonical,Expected,0.000000,0.000065,GGMGSGGLATGIAGGLAGMGGIQNEK|3|[35]Hydroxylation[M],3|[35]Hydroxylation[M],"[(3, M, 15.9949)]"
3,AM12:controllerType=0 controllerNumber=1 scan=...,14435,AM12.mgf,2205.887914,NMGGPYGGGNYGPGGSGGSGGYGGR,NMGGPYGGGNYGPGGSGGSGGYGGR,2|[35]Hydroxylation[M],T,P22626,0.000000,Canonical,Expected,0.000000,0.000065,NMGGPYGGGNYGPGGSGGSGGYGGR|2|[35]Hydroxylation[M],2|[35]Hydroxylation[M],"[(2, M, 15.9949)]"
4,AM16:controllerType=0 controllerNumber=1 scan=...,31938,AM16.mgf,2758.432950,YVEPIEDVPCGNIVGLVGVDQFLVK,YVEPIEDVPCGNIVGLVGVDQFLVK,10|[4]Carbamidomethyl[C],T,P13639,0.000000,Canonical,Expected,0.000000,0.000065,YVEPIEDVPCGNIVGLVGVDQFLVK|10|[4]Carbamidomethy...,10|[4]Carbamidomethyl[C],"[(10, C, 57.0215)]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
301922,AM13:controllerType=0 controllerNumber=1 scan=...,7405,AM13.mgf,708.124304,SGGSLKR,SGGSLKR,Unmodified,D,Q15784,0.235957,Canonical,Unmodified,0.073203,0.236040,SGGSLKR|Unmodified,Unmodified,Unmodified
301923,AM9:controllerType=0 controllerNumber=1 scan=3...,34421,AM9.mgf,1194.672826,PGGKSYEAYPK,PGGKSYEAYPK,Unmodified,D,Q6UXH8,0.236000,Canonical,Unmodified,0.073210,0.236062,PGGKSYEAYPK|Unmodified,Unmodified,Unmodified
301924,AM11:controllerType=0 controllerNumber=1 scan=...,10962,AM11.mgf,718.367529,QHGAGVR,QHGAGVR,Unmodified,T,Q86X02,0.236018,Canonical,Unmodified,0.073210,0.236062,QHGAGVR|Unmodified,Unmodified,Unmodified
301925,AM19:controllerType=0 controllerNumber=1 scan=...,7328,AM19.mgf,670.306616,KGAAGLR,KGAAGLR,Unmodified,T,Q8IZP6,0.236063,Canonical,Unmodified,0.073210,0.236076,KGAAGLR|Unmodified,Unmodified,Unmodified
