Project 1 = https://www.ebi.ac.uk/pride/archive/projects/PXD002057

Project 2 = https://www.ebi.ac.uk/pride/archive/projects/PXD05388

Project 3 = https://www.ebi.ac.uk/pride/archive/projects/PXD003594

The "Canonical" search database contains only canonical UniProtKB/Swiss-Prot protein sequences (UniProt release 2023_01).

The "trEMBL" search database includes protein isoforms and UniProtKB/TrEMBL sequences (UniProt release 2023_01).

----

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
import pyteomics.auxiliary as aux
import seaborn as sns
import os, re, subprocess
from utility_functions import *

In [None]:
# Root directory of the ionbot runs; it contains one sub-folder per dataset.
# The commented-out assignment is an alternative location.
working_folder = "C:/Users/pc/OneDrive - UGent/run-ionbot"
# working_folder = "D:/run-ionbot"

# Datasets to process. Uncomment an entry to include it in the run.
PXDs = [
    # "PXD002057-closed",
    # "PXD005833-closed",
    # "PXD014258-closed",
    "PXD002057.v0.11.4",
    # "PXD005833.v0.11.4",
    # "PXD014258.v0.11.4",
]

# Search databases to process. Uncomment an entry to include it in the run.
SEARCHES = [
    # "canon",
    # "trembl",
    "openprot",
]

# Every (dataset, search) combination, datasets varying slowest.
DATASETS = pd.MultiIndex.from_product([PXDs, SEARCHES])
DATASETS

In [None]:
# Discover the sample sub-folders of every dataset/search combination.
# folders[dataset_name][search] ends up as a list of os.DirEntry objects, one per
# non-hidden directory found in <working_folder>/<dataset>/<dataset>-<search>.
folders = {
    dataset_name: {search: [] for search in SEARCHES}
    for dataset_name in PXDs
}
for dataset_name in PXDs:
    dataset_root = os.path.join(working_folder, dataset_name)
    for search in SEARCHES:
        sample_dirs = folders[dataset_name][search]
        for fld in os.scandir(os.path.join(dataset_root, f"{dataset_name}-{search}")):
            # Skip hidden entries (leading '.') and anything that is not a directory.
            if fld.name.startswith('.') or not os.path.isdir(fld.path):
                continue
            sample_dirs.append(fld)
# folders

In [None]:
# Per-sample preprocessing followed by a GroupWalk run.
#
# For every sample folder collected in `folders`, this cell:
#   1. reads `ionbot.first.csv` and recomputes q-values with pyteomics,
#   2. derives protein / modification annotation columns,
#   3. filters the table and adds the `isTarget` and `FDRGroup` columns,
#   4. writes the table to `group-walk-input.csv`, runs `Run_group_walk.R` on it via
#      Rscript, and finally deletes the input file again.
#
# The helpers process_proteins, classify_leadprot, is_peptide_canonical,
# classifiy_mods and custom_subgroup_filter are not defined in this notebook;
# presumably they come from `from utility_functions import *` - verify there.
output_files = []  # NOTE(review): never appended to or read in this cell
for dataset_name in PXDs:
    for search in SEARCHES:
        j = len(folders[dataset_name][search])  # number of samples, for the progress print
        for i,sample_fld in enumerate(folders[dataset_name][search]):
            # Read and preprocess the dataset
            data = pd.read_csv(os.path.join(sample_fld.path,'ionbot.first.csv'))
            print(f'({1+i}/{j})',sample_fld, data.shape)
            # num_psms = len(data)
            # Drop the 'q-value' and 'PEP' columns read from the file; a 'q-value'
            # column is requested again from pyteomics below (q_label='q-value').
            data.drop(columns=['q-value', 'PEP'], inplace=True)
            data.modifications = data.modifications.fillna('Unmodified')
            # Target-decoy q-values on the 'psm_score' column; rows whose `database`
            # value is 'D' are flagged as decoys.
            # NOTE(review): the exact meaning of reverse=True, formula=1 and
            # full_output=True is defined by pyteomics - confirm against its docs.
            data = aux.target_decoy.qvalues(data,
                                            key='psm_score',
                                            reverse=True,
                                            is_decoy=(data.database == 'D'),
                                            q_label='q-value',
                                            formula=1,
                                            full_output=True)
            # Descending sort on `database`, so e.g. 'T' rows come before 'D' rows.
            data.sort_values('database', ascending=False, inplace=True)
            data.unexpected_modification = data.unexpected_modification.fillna('')

            # Process proteins column and compute associated features
            data['proteins'] = data.proteins.apply(process_proteins)
            # First element of the processed `proteins` value
            # (assumes process_proteins returns a non-empty sequence - TODO confirm).
            data['leadprot'] = data.proteins.apply(lambda x: x[0])
            # Unique classes over all proteins of the row (np.unique returns a sorted array).
            data['protein_classes'] = data.proteins.apply(
                lambda lst: np.unique([classify_leadprot(p) for p in lst])
            )

            data['isCanonical'] = data.protein_classes.apply(is_peptide_canonical)
            data['isModified'] = data.apply(classifiy_mods, axis=1)  # row-wise (axis=1)

            # Apply custom subgroup filter and compute Group-walk specific columns
            data2 = custom_subgroup_filter(data)
            data2['isTarget'] = data2.database.eq('T').astype(int)  # 1 where database == 'T', else 0
            # Group label "<isCanonical>_<isModified>"
            # (assumes both columns hold strings - TODO confirm).
            data2['FDRGroup'] = data2.isCanonical + '_' + data2.isModified

            # Convert list columns to semicolon-separated strings for CSV export
            data2['proteins'] = data2.proteins.apply(lambda lst: ';'.join(lst))
            data2['protein_classes'] = data2.protein_classes.apply(lambda lst: ';'.join(lst))

            # Save the processed data & prepare for groupwalk
            out_fld = sample_fld.path  # input and output files live next to ionbot.first.csv
            IN  = os.path.join(out_fld, "group-walk-input.csv")
            OUT = os.path.join(out_fld, "group-walk-output.csv")
            data2.to_csv(IN, index=False)
            print(out_fld, data2.shape)
            # print(num_psms==len(data2))

            # Run Groupwalk
            # The R script path is relative, so it is resolved against the current
            # working directory; the script receives (cwd, input path, output path).
            print("GroupWalk run start...")
            _ = subprocess.run(['Rscript.exe', 'Run_group_walk.R', os.getcwd(), IN, OUT])
            # NOTE(review): the return code is only printed, not acted upon, and the
            # input file is removed even when the run failed.
            print("GroupWalk run OK =",_.returncode==0)
            os.remove(IN)

In [None]:
# Merge the per-sample GroupWalk outputs of each dataset/search pair into one
# gzip-compressed CSV (`combined-results-w-qvalues.csv.gz`, written next to the
# sample folders) and print the decoy/target ratio of the merged table as "FDR".
for dataset_name in PXDs:
    for search in SEARCHES:
        # One DataFrame per sample folder. low_memory=False disables pandas'
        # chunked parsing, which can otherwise infer mixed dtypes within a column.
        frames = [
            pd.read_csv(os.path.join(fld.path, "group-walk-output.csv"), low_memory=False)
            for fld in folders[dataset_name][search]
        ]
        if not frames:
            # pd.concat raises an opaque ValueError on an empty list; report the
            # empty dataset/search pair explicitly and carry on with the others.
            print(dataset_name, search, "no sample folders found - nothing to combine")
            continue

        combined = pd.concat(frames, ignore_index=True)
        del frames  # the per-sample tables are no longer needed once merged
        combined.to_csv(os.path.join(working_folder, dataset_name, f'{dataset_name}-{search}',
                                     "combined-results-w-qvalues.csv.gz"),
                        index=False, compression='gzip')

        # Count rows per `database` label ('T' = target, 'D' = decoy). A label with
        # no rows is absent from value_counts(), so default it to 0 instead of
        # failing with a KeyError.
        TD = combined.database.value_counts()
        n_decoys = TD.get('D', 0)
        n_targets = TD.get('T', 0)
        if n_targets:
            print(dataset_name, search, f"FDR={n_decoys/n_targets:.2%}")
        else:
            # Without targets the ratio is undefined.
            print(dataset_name, search, "FDR=n/a (no target PSMs)")
        del combined  # free the merged table before the next iteration
print('\nDone!')

----

In [None]:
# folders = {search:[] for search in SEARCHES}
# for search in SEARCHES:   
#     for dataset_name in PXDs:
#         for fld in os.scandir(os.path.join(working_folder, dataset_name, f"{dataset_name}-{search}")):
#             if not fld.name.startswith('.') and os.path.isdir(fld.path): 
#                 folders[search].append([dataset_name,fld])
# folders

In [None]:
# filtering='custom'
# for dataset_name in PXDs:
#     for search in SEARCHES: 
#         tmp = []
#         for fld in folders[dataset_name][search]:
#             tmp.append(pd.read_csv(os.path.join(fld.path,"group-walk-output.csv")))
#             # tmp.append(import_pep_IDs(os.path.join(fld.path,"group-walk-output.csv"), filtering=filtering))
#         tmp = pd.concat(tmp, ignore_index=True)
#         tmp.to_csv(os.path.join(working_folder, dataset_name, f'{dataset_name}-{search}',
#                                 f"combined-results-w-qvalues.csv.gz"),
#                    index=False, compression='gzip')
#         TD = tmp.database.value_counts()
#         # print(TD)
#         print(dataset_name, search, f"FDR={TD['D']/TD['T']:.2%}")
#         del tmp