In [1]:
#default_exp reader.maxquant_reader

In [2]:
#hide
import pandas as pd
df = pd.DataFrame({'a':['1,2;3,4;5,6','3,4;5,6']})
def parse_str(s):
    return ';'.join([item.split(',')[0] for item in s.split(';')]), ';'.join([item.split(',')[1] for item in s.split(';')])
df['b'], df['c'] = zip(*df.a.apply(parse_str))
df

Unnamed: 0,a,b,c
0,"1,2;3,4;5,6",1;3;5,2;4;6
1,"3,4;5,6",3;5,4;6


In [3]:
#export
import pandas as pd
import numba
from alphadeep.reader.psm_reader import PSMReaderBase

@numba.njit
def parse_mq(
    modseq, 
    fixed_C=True
):
    PeptideModSeq = modseq.strip('_')
    mod_list = []
    site_list = []
    if PeptideModSeq.startswith('('):
        site_list.append('0')
        site_end = PeptideModSeq.find(')')+1
        mod_list.append(PeptideModSeq[:site_end])
        PeptideModSeq = PeptideModSeq[site_end:]
    site = PeptideModSeq.find('(')
    while site != -1:
        site_end = PeptideModSeq.find(')',site+1)+1
        if site_end < len(PeptideModSeq) and PeptideModSeq[site_end] == ')': 
            site_end += 1
        site_list.append(str(site+1)) 
        mod_list.append(PeptideModSeq[site-1:site_end])
        PeptideModSeq = PeptideModSeq[:site] + PeptideModSeq[site_end:]
        site = PeptideModSeq.find('(', site)
    if fixed_C:
        site = PeptideModSeq.find('C')
        while site != -1:
            site_list.append(str(site+1))
            mod_list.append('C(Carbamidomethyl (C))')
            site = PeptideModSeq.find('C',site+1)
    return ';'.join(mod_list), ';'.join(site_list)

class MaxQuantReader(PSMReaderBase):
    def __init__(self, 
        frag_types=['b','y','b-modloss','y-modloss'], 
        max_frag_charge=2
    ):
        super().__init__(frag_types, max_frag_charge)
        
        self.modification_convert_dict = {}
        self.modification_convert_dict['(Acetyl (Protein N-term))'] = 'Acetyl@Protein N-term'
        self.modification_convert_dict['C(Carbamidomethyl (C))'] = 'Carbamidomethyl@C'
        self.modification_convert_dict['M(Oxidation (M))'] = 'Oxidation@M'
        self.modification_convert_dict['S(Phospho (S))'] = 'Phospho@S'
        self.modification_convert_dict['T(Phospho (T))'] = 'Phospho@T'
        self.modification_convert_dict['Y(Phospho (Y))'] = 'Phospho@Y'
        self.modification_convert_dict['S(Phospho (ST))'] = 'Phospho@S'
        self.modification_convert_dict['T(Phospho (ST))'] = 'Phospho@T'
        self.modification_convert_dict['S(Phospho (STY))'] = 'Phospho@S'
        self.modification_convert_dict['T(Phospho (STY))'] = 'Phospho@T'
        self.modification_convert_dict['Y(Phospho (STY))'] = 'Phospho@Y'
        self.modification_convert_dict['K(GlyGly (K))'] = 'GlyGly@K'
        self.modification_convert_dict['(ac)'] = 'Acetyl@Protein N-term'
        self.modification_convert_dict['M(ox)'] = 'Oxidation@M'
        self.modification_convert_dict['S(ph)'] = 'Phospho@S'
        self.modification_convert_dict['T(ph)'] = 'Phospho@T'
        self.modification_convert_dict['Y(ph)'] = 'Phospho@Y'
        self.modification_convert_dict['K(gl)'] = 'GlyGly@K'

    def _load_file(self, filename):
        df = pd.read_csv(filename, sep='\t')
        df = df[(df['Reverse']!='+')&(~pd.isna(df['Retention time']))]
        psm_df = pd.DataFrame()
        psm_df['sequence'] = df['Sequence']
        psm_df['nAA'] = psm_df.sequence.str.len()
        psm_df['mods'], psm_df['mod_sites'] = zip(*df['Modified sequence'].apply(parse_mq))
        psm_df['charge'] = df['Charge']
        psm_df['RT'] = df['Retention time']*60
        if 'Scan number' in df.columns:
            # msms.txt
            psm_df['scan'] = df['Scan number']
        else:
            # evidence.txt
            psm_df['scan'] = df['MS/MS scan number']
        if 'K0' in df.columns:
            psm_df['mobility'] = 1/df['K0']
        else:
            psm_df['mobility'] = pd.NA
        if 'CCS' in df.columns:
            psm_df['CCS'] = df['CCS']
        else:
            psm_df['CCS'] = pd.NA
        psm_df['raw_name'] = df['Raw file']
        psm_df['score'] = df['Score']
        psm_df['proteins'] = df['Proteins']
        if 'Gene Names' in df.columns:
            psm_df['genes'] = df['Gene Names']
        elif 'Gene names' in df.columns:
            psm_df['genes'] = df['Gene names']
        else:
            psm_df['genes'] = ''
        self._psm_df = psm_df

In [5]:
#hide
filename = '/Users/zengwenfeng/Workspace/Data/HeLa_500ng/share/txt/msms.txt'
mq_reader = MaxQuantReader()
mq_reader.load(filename)
mq_reader.psm_df[mq_reader.psm_df.mods.str.contains('Oxidation')]

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,sequence,nAA,mods,mod_sites,charge,RT,scan,mobility,raw_name,score,proteins,genes
38,AAALEAMK,8,Oxidation@M,8,2,936.48,10442,,20190402_QX1_SeVW_MA_HeLa_500ng_LC11,89.550,sp|P31948|STIP1_HUMAN,
46,AAAPAPEEEMDECEQALAAEPK,22,Oxidation@M;Carbamidomethyl@C,11;13,2,3164.64,58999,,20190402_QX1_SeVW_MA_HeLa_500ng_LC11,153.230,sp|P26641|EF1G_HUMAN,
47,AAAPAPEEEMDECEQALAAEPK,22,Oxidation@M;Carbamidomethyl@C,11;13,3,3164.76,59002,,20190402_QX1_SeVW_MA_HeLa_500ng_LC11,93.851,sp|P26641|EF1G_HUMAN,
48,AAAPAPEEEMDECEQALAAEPK,22,Oxidation@M;Carbamidomethyl@C,11;13,2,3195.00,59650,,20190402_QX1_SeVW_MA_HeLa_500ng_LC11,202.320,sp|P26641|EF1G_HUMAN,
49,AAAPAPEEEMDECEQALAAEPK,22,Oxidation@M;Carbamidomethyl@C,11;13,3,3195.06,59651,,20190402_QX1_SeVW_MA_HeLa_500ng_LC11,124.690,sp|P26641|EF1G_HUMAN,
...,...,...,...,...,...,...,...,...,...,...,...,...
50243,YVMTTTTLER,10,Oxidation@M,4,2,2031.06,34495,,20190402_QX1_SeVW_MA_HeLa_500ng_LC11,84.213,sp|P05198|IF2A_HUMAN,
50279,YWDLMNLSEK,10,Oxidation@M,6,2,4328.10,83722,,20190402_QX1_SeVW_MA_HeLa_500ng_LC11,84.615,sp|Q9BZE4|NOG1_HUMAN,
50297,YYAVNFPMR,9,Oxidation@M,9,2,3565.50,67576,,20190402_QX1_SeVW_MA_HeLa_500ng_LC11,66.087,sp|Q92769|HDAC2_HUMAN,
50344,YYTSASGDEMVSLK,14,Oxidation@M,11,2,2662.74,48198,,20190402_QX1_SeVW_MA_HeLa_500ng_LC11,83.652,sp|P07900|HS90A_HUMAN,
