PXD002057 = "Parental SKBR3 and AZD8931-resistant SKBR3-AZDRc cell pellets (1 mg protein equivalent), **in biological triplicate** *(not fractions)*"

PXD014258 = "One of the three lanes from each gel is cut into five fractions, thus producing 15 fractions in total for all the three cell lines **(3 cell lines × 5 fractions)**. The tryptic peptides from all the 15 fractions were subjected to LC–MS/MS analyses using Q Exactive HF—Orbitrap mass spectrometer."

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
import pyteomics.auxiliary as aux
import seaborn as sns
import os, re, subprocess
from pyteomics import mass

In [2]:
def trunc(x,l=10000):
    """Cap a sliceable object at ``l`` items.

    Returns the first ``l`` items of ``x`` when it is longer than ``l``;
    otherwise ``x`` itself is handed back unchanged (no copy is made).
    """
    return x[:l] if len(x) > l else x

In [3]:
def classify_leadprot(x):
    """Map a protein identifier string to one of four class labels.

    Checks are applied in priority order:
      * 'Contam'   - the identifier contains 'CONTAMINANT' (case-insensitive)
      * 'Target'   - the identifier ends with '_p_target'
      * 'NonCanon' - the identifier starts with 'II_' or 'IP_'
      * 'Canon'    - anything else (the original note here reads
                     "Ensembl is canonical")
    """
    if 'CONTAMINANT' in x.upper():
        label = 'Contam'
    elif x.endswith('_p_target'):
        label = 'Target'
    elif x.startswith(('II_', 'IP_')):
        label = 'NonCanon'
    else:
        label = 'Canon'
    return label

def is_peptide_canonical(x):
    '''Collapse a list of protein classes (x) into a single peptide-level label.

    The first class found, in the order 'Contam' > 'Target' > 'Canon', wins;
    if none of those is present (including an empty x) the result is 'NonCanon'.
    '''
    for label in ('Contam', 'Target', 'Canon'):
        if any(protein_class == label for protein_class in x):
            return label
    return 'NonCanon'

def classifiy_mods(row):
    """Label a row as 'Unmodified', 'Unexpected' or 'Expected'.

    ``row`` must expose ``modifications`` and ``unexpected_modification``
    attributes. A row whose ``modifications`` equals 'Unmodified' is
    'Unmodified'; otherwise it is 'Unexpected' when
    ``unexpected_modification`` has more than one element/character, and
    'Expected' in every other case.
    """
    if row.modifications == 'Unmodified':
        return 'Unmodified'
    return 'Unexpected' if len(row.unexpected_modification) > 1 else 'Expected'

In [4]:
def computeFDR(targets_decoys):
    """Return the decoy/target count ratio of a label Series.

    Parameters
    ----------
    targets_decoys : pandas.Series
        Labels where 'T' marks a target and 'D' marks a decoy.

    Returns
    -------
    The number of 'D' entries divided by the number of 'T' entries, or 0 when
    either label does not occur at all (this includes an empty Series).
    """
    counts = targets_decoys.value_counts()
    try:
        return counts['D'] / counts['T']
    except KeyError:
        # A missing 'D' or 'T' label is the only expected failure; any other
        # exception is a real error and must not be reported as "FDR = 0".
        return 0

def compute_optimal_cutoff(data_subgroup, maxiter=20):
    """Bisect a q-value cutoff whose surviving rows give a ``computeFDR`` of 1.00%.

    Rows with ``q-value`` strictly below the cutoff are kept and their
    ``database`` labels are passed to ``computeFDR``. Starting from a cutoff
    of 1, the cutoff is bisected between 0 and 1 (rounded to 4 decimals)
    until the ratio formats as exactly '1.00%' or ``maxiter`` bisection steps
    have been made.

    Parameters
    ----------
    data_subgroup : pandas.DataFrame
        Must contain the columns ``q-value`` and ``database``.
    maxiter : int
        Maximum number of bisection steps.

    Returns
    -------
    pandas.DataFrame
        A deep copy of ``data_subgroup`` with an extra boolean column
        ``custom_filter_pass`` (True where ``q-value`` < final cutoff).
        The input frame itself is left untouched.
    """
    def fdr_below(threshold):
        # computeFDR over the rows whose q-value is strictly below the threshold
        kept_labels = data_subgroup[data_subgroup['q-value'] < threshold].database
        return computeFDR(kept_labels)

    upperlimit, lowerlimit, cutoff = 1, 0, 1
    global_fdr = fdr_below(cutoff)

    iterations = 0
    # The comparison is done on the 2-decimal percentage string, so any value
    # that formats as 1.00% counts as converged.
    while f"{global_fdr:.2%}" != '1.00%' and iterations < maxiter:
        if global_fdr > .01:
            upperlimit = cutoff
        else:
            lowerlimit = cutoff
        cutoff = np.round((upperlimit + lowerlimit) / 2, 4)
        global_fdr = np.round(fdr_below(cutoff), 4)
        iterations += 1

    # Copy first, then annotate: writing the column on the argument would
    # mutate the caller's frame (and warn when that frame is a slice).
    result = data_subgroup.copy(deep=True)
    result['custom_filter_pass'] = result['q-value'] < cutoff
    return result

In [5]:
def custom_subgroup_filter(data_):
    """Compute q-values separately inside every (isCanonical, isModified) subgroup.

    ``data_`` is grouped on the ``isCanonical`` and ``isModified`` columns.
    Each group is passed to pyteomics' ``target_decoy.qvalues`` (scored on
    ``psm_score``, decoys being the rows whose ``database`` equals 'D'), with
    the q-values stored under the label ``custom_q``. The per-group results
    are concatenated into one frame with a fresh index.
    """
    def subgroup_qvalues(group):
        return aux.target_decoy.qvalues(group, key='psm_score', reverse=True,
                                        is_decoy=group.database=='D', formula=1,
                                        full_output=True, q_label='custom_q')

    per_group = [subgroup_qvalues(group)
                 for _, group in data_.groupby(['isCanonical','isModified'])]
    return pd.concat(per_group, ignore_index=True)

In [6]:
# Folder holding the FDRbench-<search>.csv.gz tables.
FDRbench_path = "D:/entrapment_db"
# Root folder holding one sub-folder per dataset.
working_folder = "C:/Users/Enrico/OneDrive - UGent/run-ionbot"
# working_folder = "D:/run-ionbot"

# Dataset folder names: every accession for 'entrap-closed', then every
# accession for 'entrapment'.
PXDs = [
    f"{accession}-{run_kind}"
    for run_kind in ('entrap-closed', 'entrapment')
    for accession in ('PXD002057', 'PXD005833', 'PXD014258')
]
# Search types run on each dataset.
SEARCHES = ['Pep-Canon', 'Pep-Trembl', 'Pep-Open']

# Every (dataset, search) combination.
DATASETS = pd.MultiIndex.from_product([PXDs, SEARCHES])
DATASETS

MultiIndex([('PXD002057-entrap-closed',  'Pep-Canon'),
            ('PXD002057-entrap-closed', 'Pep-Trembl'),
            ('PXD002057-entrap-closed',   'Pep-Open'),
            ('PXD005833-entrap-closed',  'Pep-Canon'),
            ('PXD005833-entrap-closed', 'Pep-Trembl'),
            ('PXD005833-entrap-closed',   'Pep-Open'),
            ('PXD014258-entrap-closed',  'Pep-Canon'),
            ('PXD014258-entrap-closed', 'Pep-Trembl'),
            ('PXD014258-entrap-closed',   'Pep-Open'),
            (   'PXD002057-entrapment',  'Pep-Canon'),
            (   'PXD002057-entrapment', 'Pep-Trembl'),
            (   'PXD002057-entrapment',   'Pep-Open'),
            (   'PXD005833-entrapment',  'Pep-Canon'),
            (   'PXD005833-entrapment', 'Pep-Trembl'),
            (   'PXD005833-entrapment',   'Pep-Open'),
            (   'PXD014258-entrapment',  'Pep-Canon'),
            (   'PXD014258-entrapment', 'Pep-Trembl'),
            (   'PXD014258-entrapment',   'Pep-Open')],
         

In [7]:
# For every search type, collect [dataset_name, DirEntry] pairs for each sample
# sub-folder found under <working_folder>/<dataset>/<dataset>-<search>.
folders = {search: [] for search in SEARCHES}
for search, found in folders.items():
    for dataset_name in PXDs:
        search_root = os.path.join(working_folder, dataset_name, f"{dataset_name}-{search}")
        for fld in os.scandir(search_root):
            # skip hidden entries and anything that is not a directory
            if fld.name.startswith('.') or not os.path.isdir(fld.path):
                continue
            found.append([dataset_name, fld])
# folders

In [8]:
# For every sample folder: load the ionbot PSMs, recompute global q-values,
# map the lead protein onto the FDRbench table, classify each PSM
# (isCanonical / isModified), compute subgroup q-values, and run the GroupWalk
# R script on the result.
for search in SEARCHES:
    # The FDRbench table depends only on the search type, so it is loaded at
    # most once per search (when the first sample needs it) instead of being
    # re-read from disk for every sample folder.
    FDRbench = None
    for i,(dataset_name,sample_fld) in enumerate(folders[search]):
        print(1+i,sample_fld)
        data = pd.read_csv(os.path.join(sample_fld.path,'ionbot.first.csv'))
        # Replace the q-value/PEP columns from the file with q-values recomputed from psm_score
        data.drop(columns=['q-value','PEP'], inplace=True)
        data = aux.target_decoy.qvalues(data, key='psm_score', reverse=True, is_decoy=data.database=='D',
                                        q_label='q-value', formula=1, full_output=True)
        # data.sort_values('database', ascending=False, inplace=True)
        data.unexpected_modification = data.unexpected_modification.fillna('')
        # from "combine-ionbot-results"
        # Extract the lead protein: cut at the first '||', then keep what sits
        # between the last '((' and the following '))'.
        data['leadprot'] = data.proteins.apply(lambda x: re.sub(r'\|\|.*','',x))
        data['leadprot'] = data.leadprot.apply(lambda x: re.sub(r'.*\(\(','',x))
        data['leadprot'] = data.leadprot.apply(lambda x: re.sub(r'\)\).*','',x))

        if FDRbench is None:
            FDRbench = pd.read_csv(os.path.join(FDRbench_path,f"FDRbench-{search}.csv.gz"),
                                   usecols=['sequence','proteins'])
            # print('FDRbench shape =', FDRbench.shape)
            # 'sequence' becomes the merge key; 'proteins' is kept apart from data.proteins
            FDRbench.rename(columns={'sequence':'leadprot','proteins':'entrap_proteins'}, inplace=True)

        data = data.merge(FDRbench, on='leadprot', how='left')
        # data.merge(FDRbench, on='leadprot', how='left').to_csv("C:/Users/Enrico/Desktop/FDRbench-x-Test.csv", index=False)
        # Rows without a FDRbench match get a placeholder whose middle field
        # ('contaminant') is classified as 'Contam' by classify_leadprot.
        data.entrap_proteins = data.entrap_proteins.fillna('x|contaminant|x')

        data.proteins = data.entrap_proteins.str.split(';', regex=False)
        # keep at most the first 50 proteins per PSM
        data.proteins = data.proteins.apply(lambda x: x[:50])
        # keep only the second '|'-separated field of every protein entry
        data.proteins = data.proteins.apply(lambda x: [_.split('|')[1] for _ in x])
        data['leadprot'] = data.proteins.apply(lambda x: x[0])
        data['protein_classes'] = data.proteins.apply(lambda x: np.unique([classify_leadprot(_) for _ in x]))

        # Integer-rounded difference between precursor mass and unmodified peptide mass
        data['peptide_unmod_mass'] = data.matched_peptide.apply(lambda x: mass.calculate_mass(x))
        data['ptm_delta_mass'] = round(data.precursor_mass - data.peptide_unmod_mass)
        most_common_mass_shifts = data.ptm_delta_mass.value_counts().head(10).index

        data['isCanonical'] = data.protein_classes.apply(is_peptide_canonical)
        # data['isModified']  = data.apply(classifiy_mods,axis=1)
        # 'Unmodified' for a zero shift, 'groupA' for the 10 most frequent shifts, 'groupB' otherwise
        data['isModified'] = data.ptm_delta_mass.apply(lambda x: 'Unmodified' if x==0 else 'groupA' if x in most_common_mass_shifts else 'groupB')

        # Adds the q-values computed within each (isCanonical, isModified) subgroup (custom_q)
        data2 = custom_subgroup_filter(data)

        # prepare for Group-walk
        data2['isTarget'] = data2.database.apply(lambda x: int(x=='T'))
        data2['FDRGroup'] = data2.isCanonical + '_' + data2.isModified

        out_fld = sample_fld.path
        groupwalk_input = os.path.join(out_fld, "group-walk-input.csv")
        groupwalk_output = os.path.join(out_fld, "group-walk-output.csv")
        data2.to_csv(groupwalk_input, index=False)
        print(out_fld, data2.shape)

        print("GroupWalk run start...")
        # A failing R run is only reported, not raised: the loop continues with the next sample
        groupwalk_run = subprocess.run(['Rscript.exe', 'Run_group_walk.R', os.getcwd(),
                                        groupwalk_input,
                                        groupwalk_output])
        print("GroupWalk run OK =",groupwalk_run.returncode==0)

1 <DirEntry '130327_o2_01_hu_C1_2hr-pep-canon'>
C:/Users/Enrico/OneDrive - UGent/run-ionbot\PXD002057-entrap-closed\PXD002057-entrap-closed-Pep-Canon\130327_o2_01_hu_C1_2hr-pep-canon (11046, 27)
GroupWalk run start...
GroupWalk run OK = True
2 <DirEntry '130327_o2_02_hu_P1_2hr-pep-canon'>
C:/Users/Enrico/OneDrive - UGent/run-ionbot\PXD002057-entrap-closed\PXD002057-entrap-closed-Pep-Canon\130327_o2_02_hu_P1_2hr-pep-canon (12741, 27)
GroupWalk run start...
GroupWalk run OK = True
3 <DirEntry '130327_o2_03_hu_C2_2hr-pep-canon'>
C:/Users/Enrico/OneDrive - UGent/run-ionbot\PXD002057-entrap-closed\PXD002057-entrap-closed-Pep-Canon\130327_o2_03_hu_C2_2hr-pep-canon (9158, 27)
GroupWalk run start...
GroupWalk run OK = True
4 <DirEntry '130327_o2_04_hu_P2_2hr-pep-canon'>
C:/Users/Enrico/OneDrive - UGent/run-ionbot\PXD002057-entrap-closed\PXD002057-entrap-closed-Pep-Canon\130327_o2_04_hu_P2_2hr-pep-canon (11526, 27)
GroupWalk run start...
GroupWalk run OK = True
5 <DirEntry '130327_o2_05_hu_C3_2

  q = tfalse / (ind - cumsum) / ratio


C:/Users/Enrico/OneDrive - UGent/run-ionbot\PXD014258-entrap-closed\PXD014258-entrap-closed-Pep-Open\SampleHela-pep-open (48254, 27)
GroupWalk run start...
GroupWalk run OK = True
25 <DirEntry '130327_o2_01_hu_C1_2hr-pep-open'>
C:/Users/Enrico/OneDrive - UGent/run-ionbot\PXD002057-entrapment\PXD002057-entrapment-Pep-Open\130327_o2_01_hu_C1_2hr-pep-open (20620, 27)
GroupWalk run start...
GroupWalk run OK = True
26 <DirEntry '130327_o2_02_hu_P1_2hr-pep-open'>
C:/Users/Enrico/OneDrive - UGent/run-ionbot\PXD002057-entrapment\PXD002057-entrapment-Pep-Open\130327_o2_02_hu_P1_2hr-pep-open (21471, 27)
GroupWalk run start...
GroupWalk run OK = True
27 <DirEntry '130327_o2_03_hu_C2_2hr-pep-open'>
C:/Users/Enrico/OneDrive - UGent/run-ionbot\PXD002057-entrapment\PXD002057-entrapment-Pep-Open\130327_o2_03_hu_C2_2hr-pep-open (18857, 27)
GroupWalk run start...
GroupWalk run OK = True
28 <DirEntry '130327_o2_04_hu_P2_2hr-pep-open'>
C:/Users/Enrico/OneDrive - UGent/run-ionbot\PXD002057-entrapment\PXD00

  q = tfalse / (ind - cumsum) / ratio


C:/Users/Enrico/OneDrive - UGent/run-ionbot\PXD014258-entrapment\PXD014258-entrapment-Pep-Open\SampleHela-pep-open (72907, 27)
GroupWalk run start...
GroupWalk run OK = True


-----

In [9]:
# Collect every GroupWalk result: rename the q-value columns, store the table
# gzip-compressed under D:/<dataset>-<search>-mass-shifts, then delete the
# intermediate GroupWalk input/output CSVs from the sample folder.
for search in SEARCHES:
    for sample_no, (dataset_name, sample_fld) in enumerate(folders[search], start=1):
        print(sample_no, sample_fld)
        out_fld = sample_fld.path
        groupwalk_output = os.path.join(out_fld, "group-walk-output.csv")
        groupwalk_input = os.path.join(out_fld, "group-walk-input.csv")

        tmp = pd.read_csv(groupwalk_output, low_memory=False)
        tmp = tmp.drop(columns=['is.decoy','isTarget'])
        tmp = tmp.rename(columns={'q.value':'global_qvalue',
                                  'custom_q':'groupwise_qvalue',
                                  'group_qval':'GroupWalk_q'})

        archive_dir = f"D:/{dataset_name}-{search}-mass-shifts"
        os.makedirs(archive_dir, exist_ok=True)
        tmp.to_csv(f"{archive_dir}/{sample_fld.name}-group-walk-output.csv.gz",
                   index=False, compression='gzip')

        # the intermediate files are removed only after the archive was written
        os.remove(groupwalk_output)
        os.remove(groupwalk_input)

1 <DirEntry '130327_o2_01_hu_C1_2hr-pep-canon'>
2 <DirEntry '130327_o2_02_hu_P1_2hr-pep-canon'>
3 <DirEntry '130327_o2_03_hu_C2_2hr-pep-canon'>
4 <DirEntry '130327_o2_04_hu_P2_2hr-pep-canon'>
5 <DirEntry '130327_o2_05_hu_C3_2hr-pep-canon'>
6 <DirEntry '130327_o2_06_hu_P3_2hr-pep-canon'>
7 <DirEntry 'AM10-pep-canon'>
8 <DirEntry 'AM11-pep-canon'>
9 <DirEntry 'AM12-pep-canon'>
10 <DirEntry 'AM13-pep-canon'>
11 <DirEntry 'AM14-pep-canon'>
12 <DirEntry 'AM15-pep-canon'>
13 <DirEntry 'AM16-pep-canon'>
14 <DirEntry 'AM17-pep-canon'>
15 <DirEntry 'AM18-pep-canon'>
16 <DirEntry 'AM19-pep-canon'>
17 <DirEntry 'AM20-pep-canon'>
18 <DirEntry 'AM21-pep-canon'>
19 <DirEntry 'AM7-pep-canon'>
20 <DirEntry 'AM8-pep-canon'>
21 <DirEntry 'AM9-pep-canon'>
22 <DirEntry 'Sample-BT474-pep-canon'>
23 <DirEntry 'Sample-MCF-pep-canon'>
24 <DirEntry 'SampleHela-pep-canon'>
25 <DirEntry '130327_o2_01_hu_C1_2hr-pep-canon'>
26 <DirEntry '130327_o2_02_hu_P1_2hr-pep-canon'>
27 <DirEntry '130327_o2_03_hu_C2_2hr-pep-c