In [1]:
import datetime
import itertools
import multiprocessing
import operator
import os
import pathlib
import pandas
import random
import re
import shutil
import time
import xlsxwriter

In [2]:
# ---------------------------------------------------------------------------
# Parameter configuration: every value a user may want to tune lives here.
# ---------------------------------------------------------------------------

# Output root and the name of the experiment being processed.
results_directory = 'results'
experiment_name = 'N1_Normal'

# Input tables (CSV).
all_denovo_file = 'data/N1_Normal/N1_Normal_all_de_novo.csv'
db_matched_file = 'data/N1_Normal/N1_Normal_DB-matched_peptides.csv'

# Filtering thresholds (not consumed in this cell).
min_peptide_length, max_peptide_length = 7, 25
alc_score_cutoff = 50

# PTM handling:
#   True  -> strip the annotations, e.g. C(+57.02)FKHSGTGM(+15.99)VHR becomes CFKHSGTGMVHR
#   False -> skip the row in the file
include_PTMs = True

# Working directory "<results_directory>/<experiment_name>"; all output is stored here.
# It is created on demand and left untouched when it already exists.
results_path = os.sep.join((results_directory, experiment_name))
pathlib.Path(results_directory, experiment_name).mkdir(parents=True, exist_ok=True)


In [3]:
# read data
df_db_matched = pandas.read_csv(db_matched_file, dtype={'Scan': int})
# The dtype key is written as 'ALC (%)': the former '\%' was an invalid escape sequence that
# left a literal backslash in the key, so it could not match a column with that name.
# NOTE(review): assumes the de novo export names the column 'ALC (%)' - confirm against the file.
df_all_denovo = pandas.read_csv(all_denovo_file, dtype={'ALC (%)': int, 'Scan': int})

# process PTMs
# A PTM annotation is any parenthesised group inside the peptide string, e.g. '(+57.02)'.
if include_PTMs:
    # strip the annotations and keep the bare amino-acid sequence
    ptm_annotation = r'\([^()]*\)'
    df_db_matched['Peptide'] = df_db_matched['Peptide'].str.replace(ptm_annotation, '', regex=True)
    df_all_denovo['Peptide'] = df_all_denovo['Peptide'].str.replace(ptm_annotation, '', regex=True)
else:
    # Drop every annotated peptide. Rows are detected by the opening parenthesis rather than by
    # '+', so the same annotations the branch above would strip are caught here as well,
    # including ones without a plus sign such as '(-17.03)'. Missing peptides are kept.
    df_db_matched = df_db_matched[~df_db_matched['Peptide'].str.contains('(', regex=False, na=False)]
    df_all_denovo = df_all_denovo[~df_all_denovo['Peptide'].str.contains('(', regex=False, na=False)]

# save DB-matched file with razor peptides removed
# (rows whose 'Accession' value contains a ':' are the ones removed)
has_colon_in_accession = df_db_matched['Accession'].astype(str).str.contains(':', regex=False)
df_db_matched_no_razor_peptides = df_db_matched[~has_colon_in_accession]
df_db_matched_no_razor_peptides.to_csv(os.path.join(results_path, 'DB-matched_no_razor_peptides.csv'),
                                       index=False)

# save the unique protein IDs with razor peptides removed
# TODO: clarify which entries to keep - i.e. do we just keep the first occurrence of each duplicate protein?
df_db_matched_no_razor_peptides_unique = df_db_matched_no_razor_peptides.drop_duplicates(subset='Accession',
                                                                                         keep='first')
df_db_matched_no_razor_peptides_unique.to_csv(os.path.join(results_path, 'DB-matched_no_razor_peptides_unique.csv'),
                                              index=False)


In [4]:
# preview only the first rows instead of displaying the whole frame
df_db_matched_no_razor_peptides_unique.head(10)


Unnamed: 0,Peptide,-10lgP,Mass,Length,ppm,m/z,RT,Area N1_Normal,Fraction,Scan,Source File,#Feature,#Feature N1_Normal,Accession,PTM,AScore,Found By
1,YKTPDFESTGLYSAMPR,127.79,1961.9193,17,-0.5,654.9800,53.52,6410000.0,51,30115,N1_ISD_RA2_01_1021.d,2,2,P50440|GATM_HUMAN,,,PEAKS DB
2,KYTPEQVAMATVTALHR,126.93,1914.9985,17,-0.7,639.3397,52.50,6250000.0,51,29338,N1_ISD_RA2_01_1021.d,2,2,P05062|ALDOB_HUMAN,,,PEAKS DB
6,LVQDVANNTNEEAGDGTTTATVLAR,123.30,2559.2412,25,-3.6,1280.6233,44.95,1530000.0,51,24786,N1_ISD_RA2_01_1021.d,2,2,P10809|CH60_HUMAN,,,PEAKS DB
7,KPAEDEWGKTPDAMK,123.26,1701.8032,15,0.1,851.9089,26.24,1150000.0,51,13188,N1_ISD_RA2_01_1021.d,1,1,P02792|FRIL_HUMAN,,,PEAKS DB
8,VHTECCHGDLLECADDRADLAK,122.70,2584.1104,22,-0.2,862.3772,34.04,9390000.0,51,17969,N1_ISD_RA2_01_1021.d,3,3,P02768|ALBU_HUMAN,Carbamidomethylation,C5:Carbamidomethylation:1000.00;C6:Carbamidome...,PEAKS DB
9,FLSQPFQVAEVFTGHMGK,122.66,2022.0033,18,0.3,675.0085,69.87,5930000.0,51,40177,N1_ISD_RA2_01_1021.d,3,3,P06576|ATPB_HUMAN,,,PEAKS DB
10,VIHDNFGIVEGLMTTVHAITATQK,122.11,2594.3525,24,-1.3,865.7903,79.95,10100000.0,51,46259,N1_ISD_RA2_01_1021.d,3,3,P04406|G3P_HUMAN,,,PEAKS DB
13,FIVDGWHEMDAENPLHQPSPSLNK,120.77,2760.2966,24,-2.1,921.1042,58.49,1140000.0,51,33209,N1_ISD_RA2_01_1021.d,1,1,Q16836|HCDH_HUMAN,,,PEAKS DB
15,AGSNVMQTFTFYASEDKLENR,120.25,2407.1113,21,-0.6,803.3773,61.30,2190000.0,51,34973,N1_ISD_RA2_01_1021.d,2,2,Q93088|BHMT1_HUMAN,,,PEAKS DB
18,FAELVYTGFWHSPECEFVR,120.04,2373.0889,19,-2.5,1187.5487,70.15,4870000.0,51,40347,N1_ISD_RA2_01_1021.d,2,2,P00966|ASSY_HUMAN,Carbamidomethylation,C15:Carbamidomethylation:1000.00,PEAKS DB


In [5]:
# TODO: get all de novo scan IDs that are not found in the DB-matched data