In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
# import plotly.io as pio
# pio.renderers.default = "vscode"

import matplotlib.pyplot as plt
import seaborn as sns
import os, re

def trunc(x,l=10000):
    """Cap a sliceable value at its first ``l`` items.

    Args:
        x: Any object supporting ``len()`` and slicing (e.g. a string).
        l: Maximum length to keep; defaults to 10000.

    Returns:
        ``x[:l]`` when ``x`` is longer than ``l``; otherwise ``x`` itself,
        unchanged.
    """
    return x[:l] if len(x) > l else x

In [2]:
# Root folder; the processing loop looks for <dataset>/<dataset>-<search>/ below it.
working_folder = "C:/Users/Enrico/OneDrive - UGent/run-ionbot"
# working_folder = "D:/run-ionbot"

# Dataset folder names are "<accession><run suffix>". Comment a suffix in or
# out to choose which runs are processed.
PXD_ACCESSIONS = ['PXD002057', 'PXD005833', 'PXD014258']
RUN_SUFFIXES = [
    '-closed',
    '.v0.11.4',
    # '-entrapment',
    # '-entrap-closed',
]
# suffix-major order: all accessions of one run type, then the next run type
PXDs = [
    f"{accession}{suffix}"
    for suffix in RUN_SUFFIXES
    for accession in PXD_ACCESSIONS
]

# Search names; each one is a sub-folder "<dataset>-<search>" of a dataset.
SEARCHES = [
    'canon',
    'trembl',
    'openprot',
    # 'Pep-Canon',
    # 'Pep-Trembl',
    # 'Pep-Open',
    # 'Prot-Canon',
    # 'Prot-Trembl',
    # 'Prot-Open',
]

# every (dataset, search) combination to process
DATASETS = pd.MultiIndex.from_product([PXDs, SEARCHES])
DATASETS

MultiIndex([('PXD002057.v0.11.4',    'canon'),
            ('PXD002057.v0.11.4',   'trembl'),
            ('PXD002057.v0.11.4', 'openprot'),
            ('PXD005833.v0.11.4',    'canon'),
            ('PXD005833.v0.11.4',   'trembl'),
            ('PXD005833.v0.11.4', 'openprot'),
            ('PXD014258.v0.11.4',    'canon'),
            ('PXD014258.v0.11.4',   'trembl'),
            ('PXD014258.v0.11.4', 'openprot'),
            ( 'PXD002057-closed',    'canon'),
            ( 'PXD002057-closed',   'trembl'),
            ( 'PXD002057-closed', 'openprot'),
            ( 'PXD005833-closed',    'canon'),
            ( 'PXD005833-closed',   'trembl'),
            ( 'PXD005833-closed', 'openprot'),
            ( 'PXD014258-closed',    'canon'),
            ( 'PXD014258-closed',   'trembl'),
            ( 'PXD014258-closed', 'openprot')],
           )

In [3]:
# Loop through the ionbot results folders of every (dataset, search) pair:
# read each experiment's 'ionbot.first.csv', combine them into one DataFrame,
# tidy a few columns, save a target/decoy score histogram and write the
# combined table to a CSV file inside the search folder.
for dataset_name,search in DATASETS:
    print([dataset_name,search])
    search_dir = os.path.join(working_folder, dataset_name, f"{dataset_name}-{search}")

    # every non-hidden sub-folder of the search folder is treated as one experiment
    folders = [
        fld for fld in os.scandir(search_dir)
        if not fld.name.startswith('.') and os.path.isdir(fld.path)
    ]

    num_psms = 0
    experiment_tables = []
    for fld in folders:
        try:
            peps = pd.read_csv(os.path.join(fld.path,'ionbot.first.csv'))
            # Drop a trailing compression extension. (str.strip('.gzip') treats its
            # argument as a set of characters and would also eat leading/trailing
            # 'g', 'z', 'i', 'p' and '.' characters of the file name itself.)
            peps['spectrum_file'] = peps['spectrum_file'].str.replace(r'\.(?:gzip|gz)$', '', regex=True)
            # solves issue with file names in PXD014258-canon; doesn't affect other searches
            # (literal match: the dots must not act as regex wildcards)
            peps['spectrum_file'] = peps['spectrum_file'].str.replace('.RAW.mgf', '.mgf', regex=False)
            peps['experiment'] = fld.name
        except Exception as err:
            # best effort: an unreadable or malformed experiment is reported and
            # skipped, while KeyboardInterrupt/SystemExit still propagate
            print(f'>>Skipped: {fld.path} ({err!r})')
            continue
        experiment_tables.append(peps)
        num_psms += len(peps)

    if not experiment_tables:
        # pd.concat raises ValueError on an empty list
        print(f'>>No ionbot results could be read in {search_dir}; nothing written')
        continue
    combo_peps = pd.concat(experiment_tables, ignore_index=True)


    # keep only 1 protein per peptide (the first):
    #   1) drop everything from the first '||' onwards,
    #   2) drop everything up to and including the last '((',
    #   3) drop everything from the first '))' onwards.
    # combo_peps['leadentry'] = combo_peps.proteins.apply(lambda x: re.sub(r'\(\(.*','',x))
    combo_peps['leadprot'] = combo_peps.proteins.apply(lambda x: re.sub(r'\|\|.*','',x))
    combo_peps['leadprot'] = combo_peps.leadprot.apply(lambda x: re.sub(r'.*\(\(','',x))
    combo_peps['leadprot'] = combo_peps.leadprot.apply(lambda x: re.sub(r'\)\).*','',x))


    # label unmodified peptides
    combo_peps['modifications'] = combo_peps['modifications'].fillna('Unmodified')


    # in some mgf files the 'spectrum title' includes the file name, making the spectrum title unique.
    # when the file name is NOT included, spectrum titles are NOT unique, and this can mess up some analysis.
    # Rebuild every title as '<file name without .mgf>:<last ":"-separated part of the title>'.
    # The extension is removed as a suffix; str.strip('.mgf') would also remove
    # leading/trailing 'm', 'g', 'f' and '.' characters of the name.
    file_stem = combo_peps['spectrum_file'].str.replace(r'\.mgf$', '', regex=True)
    title_tail = combo_peps['spectrum_title'].str.split(':').str[-1]
    combo_peps['spectrum_title'] = file_stem + ':' + title_tail


    # calculate FDR and plot target and decoy score distributions
    # descending sort puts 'T' rows before 'D' rows
    combo_peps = combo_peps.sort_values('database', ascending=False)
    db_counts = combo_peps.database.value_counts()
    # .get(): a label that never occurs counts as 0 instead of raising KeyError
    n_targets = int(db_counts.get('T', 0))
    n_decoys = int(db_counts.get('D', 0))
    fdr = n_decoys / n_targets if n_targets else 0
    fig = px.histogram(combo_peps, x='psm_score', color='database', nbins=100,
                       title=f"{dataset_name}-{search} FDR = {fdr:.2%} [T={n_targets};D={n_decoys}]")
    fig.write_image(os.path.join(search_dir, "target-decoy-plot.png"))


    # remove decoy hits
    # combo_peps = combo_peps[combo_peps.database=='T']


    # save results to .csv
    # cap the length of the 'proteins' strings before writing (see trunc)
    combo_peps['proteins'] = combo_peps['proteins'].apply(trunc)
    outpath = os.path.join(search_dir, f"{dataset_name}-{search}-combined-results.csv")
    combo_peps.to_csv(outpath, index=False)
    print(outpath)
    # sanity check: no rows lost or duplicated while combining
    print(num_psms==len(combo_peps))

    # outpath = os.path.join(search_dir, f"{dataset_name}-{search}-combined-results-FDR-filtered.csv")
    # combo_peps[combo_peps['q-value']<.01].to_csv(outpath, index=False)
    # print(outpath)

print("\nDone!")

['PXD002057.v0.11.4', 'canon']
C:/Users/Enrico/OneDrive - UGent/run-ionbot\PXD002057.v0.11.4\PXD002057.v0.11.4-canon\PXD002057.v0.11.4-canon-combined-results.csv
['PXD002057.v0.11.4', 'trembl']
C:/Users/Enrico/OneDrive - UGent/run-ionbot\PXD002057.v0.11.4\PXD002057.v0.11.4-trembl\PXD002057.v0.11.4-trembl-combined-results.csv
['PXD002057.v0.11.4', 'openprot']
C:/Users/Enrico/OneDrive - UGent/run-ionbot\PXD002057.v0.11.4\PXD002057.v0.11.4-openprot\PXD002057.v0.11.4-openprot-combined-results.csv
['PXD005833.v0.11.4', 'canon']
C:/Users/Enrico/OneDrive - UGent/run-ionbot\PXD005833.v0.11.4\PXD005833.v0.11.4-canon\PXD005833.v0.11.4-canon-combined-results.csv
['PXD005833.v0.11.4', 'trembl']
C:/Users/Enrico/OneDrive - UGent/run-ionbot\PXD005833.v0.11.4\PXD005833.v0.11.4-trembl\PXD005833.v0.11.4-trembl-combined-results.csv
['PXD005833.v0.11.4', 'openprot']
C:/Users/Enrico/OneDrive - UGent/run-ionbot\PXD005833.v0.11.4\PXD005833.v0.11.4-openprot\PXD005833.v0.11.4-openprot-combined-results.csv
['PX

----