In [1]:
import datetime
import itertools
import multiprocessing
import operator
import os
import pathlib
import pandas
import random
import re
import shutil
import time
import xlsxwriter

In [2]:
# ---------------------------------------------------------------------------
# Run parameters (edit these before running the notebook)
# ---------------------------------------------------------------------------

# Experiment label and the root directory under which results are written.
experiment_name = 'N1_Normal'
results_directory = 'results'

# Input CSV files: DB-matched peptides and the full de novo peptide list.
db_matched_file = 'data/N1_Normal_DB-matched_peptides.csv'
all_denovo_file = 'data/N1_Normal_all_de_novo.csv'

# Peptide length bounds and ALC score threshold.
# NOTE(review): presumably consumed by filtering steps in later cells - confirm.
min_peptide_length, max_peptide_length = 7, 25
alc_score_cutoff = 50

# PTM handling: when True, annotated sequences such as
# C(+57.02)FKHSGTGM(+15.99)VHR are converted to CFKHSGTGMVHR.
# When False the row is documented as being skipped instead
# (TODO confirm where that skipping is implemented).
include_PTMs = True


In [3]:
# Working directory for this experiment: <results_directory><sep><experiment_name>.
# It is created (together with any missing parents) if it does not exist yet;
# all output will be stored here.
results_path = os.sep.join([results_directory, experiment_name])
pathlib.Path(results_path).mkdir(exist_ok=True, parents=True)
    

In [4]:
# Load the DB-matched peptide table, parsing the 'Scan' column as int,
# then display it as the cell output.
df_db_matched = pandas.read_csv(
    filepath_or_buffer=db_matched_file,
    dtype=dict(Scan=int),
)
df_db_matched


Unnamed: 0,Peptide,-10lgP,Mass,Length,ppm,m/z,RT,Area N1_Normal,Fraction,Scan,Source File,#Feature,#Feature N1_Normal,Accession,PTM,AScore,Found By
0,LC(+57.02)YVALDFEQEMATAASSSSLEK,130.14,2549.1665,23,-3.5,1275.5861,78.32,7940000.0,51,45427,N1_ISD_RA2_01_1021.d,4,4,P63261|ACTG_HUMAN:P60709|ACTB_HUMAN:Q6S8J3|POT...,Carbamidomethylation,C2:Carbamidomethylation:1000.00,PEAKS DB
1,YKTPDFESTGLYSAMPR,127.79,1961.9193,17,-0.5,654.9800,53.52,6410000.0,51,30115,N1_ISD_RA2_01_1021.d,2,2,P50440|GATM_HUMAN,,,PEAKS DB
2,KYTPEQVAMATVTALHR,126.93,1914.9985,17,-0.7,639.3397,52.50,6250000.0,51,29338,N1_ISD_RA2_01_1021.d,2,2,P05062|ALDOB_HUMAN,,,PEAKS DB
3,GFYPSDIAVEWESNGQPENNYK,126.60,2543.1240,22,-3.3,1272.5651,66.22,2100000.0,51,37982,N1_ISD_RA2_01_1021.d,1,1,P01857|IGHG1_HUMAN:P01861|IGHG4_HUMAN,,,PEAKS DB
4,DC(+57.02)PVSSYNEWDPLEEVIVGR,125.91,2363.0740,20,-0.7,1182.5435,84.85,3970000.0,51,49387,N1_ISD_RA2_01_1021.d,1,1,P50440|GATM_HUMAN,Carbamidomethylation,C2:Carbamidomethylation:1000.00,PEAKS DB
5,NSC(+57.02)AADDKATEPLPK,124.02,1615.7511,15,-0.2,808.8827,20.92,1880000.0,51,9744,N1_ISD_RA2_01_1021.d,2,2,P50440|GATM_HUMAN,Carbamidomethylation,C3:Carbamidomethylation:1000.00,PEAKS DB
6,LVQDVANNTNEEAGDGTTTATVLAR,123.30,2559.2412,25,-3.6,1280.6233,44.95,1530000.0,51,24786,N1_ISD_RA2_01_1021.d,2,2,P10809|CH60_HUMAN,,,PEAKS DB
7,KPAEDEWGKTPDAMK,123.26,1701.8032,15,0.1,851.9089,26.24,1150000.0,51,13188,N1_ISD_RA2_01_1021.d,1,1,P02792|FRIL_HUMAN,,,PEAKS DB
8,VHTEC(+57.02)C(+57.02)HGDLLEC(+57.02)ADDRADLAK,122.70,2584.1104,22,-0.2,862.3772,34.04,9390000.0,51,17969,N1_ISD_RA2_01_1021.d,3,3,P02768|ALBU_HUMAN,Carbamidomethylation,C5:Carbamidomethylation:1000.00;C6:Carbamidome...,PEAKS DB
9,FLSQPFQVAEVFTGHMGK,122.66,2022.0033,18,0.3,675.0085,69.87,5930000.0,51,40177,N1_ISD_RA2_01_1021.d,3,3,P06576|ATPB_HUMAN,,,PEAKS DB


In [5]:
# Load the de novo peptide table, parsing the 'ALC (%)' and 'Scan' columns as int,
# then display it as the cell output.
# The dtype keys must match the CSV header text exactly. '%' needs no escaping in a
# Python string literal: writing '\%' yields a key containing a backslash, which
# matches no column, so the requested dtype would never be applied to 'ALC (%)'.
# With the correct key, read_csv raises if 'ALC (%)' contains missing or
# non-integer values instead of silently falling back to an inferred dtype.
df_all_denovo = pandas.read_csv(all_denovo_file, dtype={'ALC (%)': int, 'Scan': int})
df_all_denovo


Unnamed: 0,Fraction,Source File,Feature,Peptide,Scan,Tag length,ALC (%),Length,m/z,z,RT,Mass,ppm,local confidence (%),tag (>=0%),mode
0,51,N1_ISD_RA2_01_1021.d,18205,LHSKFQFTFK,3949,10,66,10,641.8480,2,11.57,1281.6870,-4.4,99 98 98 92 72 64 63 22 24 34,LHSKFQFTFK,CID
1,51,N1_ISD_RA2_01_1021.d,18205,LHSKFQFFTK,3949,10,66,10,641.8480,2,11.57,1281.6870,-4.4,99 98 98 92 72 64 64 22 24 34,LHSKFQFFTK,CID
2,51,N1_ISD_RA2_01_1021.d,18205,LHSKFGAFFTK,3949,11,56,11,641.8480,2,11.57,1281.6870,-4.4,99 98 98 91 82 10 20 60 20 20 29,LHSKFGAFFTK,CID
3,51,N1_ISD_RA2_01_1021.d,18205,LHSKEVGHFTK,3949,11,52,11,641.8480,2,11.57,1281.6829,-1.2,98 98 98 88 82 24 8 15 11 14 37,LHSKEVGHFTK,CID
4,51,N1_ISD_RA2_01_1021.d,18205,LHSKEGVHFTK,3949,11,51,11,641.8480,2,11.57,1281.6829,-1.2,98 98 98 88 82 11 19 14 11 14 37,LHSKEGVHFTK,CID
5,51,N1_ISD_RA2_01_1021.d,18205,LHKSEHFKEK,3980,10,77,10,641.8480,2,11.57,1281.6829,-1.2,97 95 87 76 82 89 85 68 55 43,LHKSEHFKEK,CID
6,51,N1_ISD_RA2_01_1021.d,18205,LHKSEHKFEK,3980,10,69,10,641.8480,2,11.57,1281.6829,-1.2,97 95 87 76 80 80 59 43 36 39,LHKSEHKFEK,CID
7,51,N1_ISD_RA2_01_1021.d,18205,LHSKEHFKEK,3980,10,66,10,641.8480,2,11.57,1281.6829,-1.2,97 93 53 28 69 85 83 65 53 41,LHSKEHFKEK,CID
8,51,N1_ISD_RA2_01_1021.d,18205,LHKESHFKEK,3980,10,58,10,641.8480,2,11.57,1281.6829,-1.2,96 91 65 17 12 73 79 62 50 39,LHKESHFKEK,CID
9,51,N1_ISD_RA2_01_1021.d,18205,LHKDTHFKEK,3980,10,57,10,641.8480,2,11.57,1281.6829,-1.2,96 92 62 15 9 69 78 62 50 39,LHKDTHFKEK,CID


In [6]:
# Optionally strip parenthesised modification annotations from the peptide
# sequences of both tables, e.g. C(+57.02)FKHSGTGM(+15.99)VHR -> CFKHSGTGMVHR.
# The 'Peptide' columns are overwritten in place.
if include_PTMs:
    ptm_annotation = re.compile(r'\([^()]*\)')
    for peptide_table in (df_db_matched, df_all_denovo):
        peptide_table['Peptide'] = peptide_table['Peptide'].apply(
            lambda sequence: ptm_annotation.sub('', sequence)
        )

# save DB-matched file with razor peptides removed:
# a row is kept only when its Accession (as text) contains no ':' separator
accession_separator_count = df_db_matched['Accession'].apply(
    lambda accession: str(accession).count(':')
)
df_db_matched_no_razor_peptides = df_db_matched[accession_separator_count == 0]
df_db_matched_no_razor_peptides.to_csv(
    os.sep.join([results_path, 'DB-matched_no_razor_peptides.csv']),
    index=False,
)

# save the unique protein IDs with razor peptides removed
# (the first row encountered for each Accession is the one retained)
# TODO: clarify which entries to keep
df_db_matched_no_razor_peptides_unique = df_db_matched_no_razor_peptides.drop_duplicates(
    subset='Accession',
    keep='first',
)
df_db_matched_no_razor_peptides_unique.to_csv(
    os.sep.join([results_path, 'DB-matched_no_razor_peptides_unique.csv']),
    index=False,
)


In [7]:
# Display the razor-free DB-matched table after de-duplication on 'Accession'
# (one row kept per distinct Accession value).
df_db_matched_no_razor_peptides_unique

Unnamed: 0,Peptide,-10lgP,Mass,Length,ppm,m/z,RT,Area N1_Normal,Fraction,Scan,Source File,#Feature,#Feature N1_Normal,Accession,PTM,AScore,Found By
1,YKTPDFESTGLYSAMPR,127.79,1961.9193,17,-0.5,654.9800,53.52,6410000.0,51,30115,N1_ISD_RA2_01_1021.d,2,2,P50440|GATM_HUMAN,,,PEAKS DB
2,KYTPEQVAMATVTALHR,126.93,1914.9985,17,-0.7,639.3397,52.50,6250000.0,51,29338,N1_ISD_RA2_01_1021.d,2,2,P05062|ALDOB_HUMAN,,,PEAKS DB
6,LVQDVANNTNEEAGDGTTTATVLAR,123.30,2559.2412,25,-3.6,1280.6233,44.95,1530000.0,51,24786,N1_ISD_RA2_01_1021.d,2,2,P10809|CH60_HUMAN,,,PEAKS DB
7,KPAEDEWGKTPDAMK,123.26,1701.8032,15,0.1,851.9089,26.24,1150000.0,51,13188,N1_ISD_RA2_01_1021.d,1,1,P02792|FRIL_HUMAN,,,PEAKS DB
8,VHTECCHGDLLECADDRADLAK,122.70,2584.1104,22,-0.2,862.3772,34.04,9390000.0,51,17969,N1_ISD_RA2_01_1021.d,3,3,P02768|ALBU_HUMAN,Carbamidomethylation,C5:Carbamidomethylation:1000.00;C6:Carbamidome...,PEAKS DB
9,FLSQPFQVAEVFTGHMGK,122.66,2022.0033,18,0.3,675.0085,69.87,5930000.0,51,40177,N1_ISD_RA2_01_1021.d,3,3,P06576|ATPB_HUMAN,,,PEAKS DB
10,VIHDNFGIVEGLMTTVHAITATQK,122.11,2594.3525,24,-1.3,865.7903,79.95,10100000.0,51,46259,N1_ISD_RA2_01_1021.d,3,3,P04406|G3P_HUMAN,,,PEAKS DB
13,FIVDGWHEMDAENPLHQPSPSLNK,120.77,2760.2966,24,-2.1,921.1042,58.49,1140000.0,51,33209,N1_ISD_RA2_01_1021.d,1,1,Q16836|HCDH_HUMAN,,,PEAKS DB
15,AGSNVMQTFTFYASEDKLENR,120.25,2407.1113,21,-0.6,803.3773,61.30,2190000.0,51,34973,N1_ISD_RA2_01_1021.d,2,2,Q93088|BHMT1_HUMAN,,,PEAKS DB
18,FAELVYTGFWHSPECEFVR,120.04,2373.0889,19,-2.5,1187.5487,70.15,4870000.0,51,40347,N1_ISD_RA2_01_1021.d,2,2,P00966|ASSY_HUMAN,Carbamidomethylation,C15:Carbamidomethylation:1000.00,PEAKS DB
