In [12]:
# Python 3.7 built-in
import datetime
import itertools
import multiprocessing
import operator
import os
import pathlib
import pandas
import random
import re
import shutil
import time

# additional dependencies
import xlsxwriter      # pip3 install xlsxwriter
from Bio import SeqIO  # pip3 install biopython

In [10]:
# ---------------------------------------------------------------------------
# Run configuration: everything a user is expected to tune lives in this cell.
# ---------------------------------------------------------------------------

# top-level folder that collects the output of every experiment
results_directory = 'results'

# experiment label and the input files that belong to it
experiment_name = 'N1_Normal'
all_denovo_file = 'data/N1_Normal/N1_Normal_all_de_novo.csv'
db_matched_file = 'data/N1_Normal/N1_Normal_DB-matched_peptides.csv'
reference_proteome_file = 'data/SPHu.fasta'

# peptide length bounds and ALC score cutoff
min_peptide_length = 7
max_peptide_length = 25
alc_score_cutoff = 50

# PTM handling:
#   True  -> annotations are stripped, e.g. C(+57.02)FKHSGTGM(+15.99)VHR becomes CFKHSGTGMVHR
#   False -> rows whose peptide carries an annotation are skipped
include_PTMs = True

# per-experiment output folder (<results_directory>/<experiment_name>);
# it is created on demand and all output of this notebook is stored there
results_path = os.sep.join((results_directory, experiment_name))
os.makedirs(results_path, exist_ok=True)



In [29]:
# read data
# NOTE: the dtype key was previously written as 'ALC (\%)'. '\%' is not a valid
# escape sequence, so the key contained a literal backslash, never matched a CSV
# column, and the int dtype was silently not applied. The key below assumes the
# header is spelled 'ALC (%)' - TODO confirm against the de novo export.
df_db_matched = pandas.read_csv(db_matched_file, dtype={'Scan': int})
df_all_denovo = pandas.read_csv(all_denovo_file, dtype={'ALC (%)': int, 'Scan': int})
# map each FASTA record name to its sequence as a plain string
ref_db = {seq.name: str(seq.seq) for seq in SeqIO.parse(reference_proteome_file, 'fasta')}

# process PTMs
# A PTM annotation is any parenthesised group without nested parentheses,
# e.g. '(+57.02)'. The same raw-string pattern drives both branches so that
# "strip" and "skip" agree on what counts as a modified peptide (the previous
# skip branch only looked for '+', and used the invalid escape '\+').
ptm_pattern = r'\([^()]*\)'
if include_PTMs:
    # keep every row, but reduce the peptide to its bare amino-acid sequence
    df_db_matched['Peptide'] = df_db_matched['Peptide'].str.replace(ptm_pattern, '', regex=True)
    df_all_denovo['Peptide'] = df_all_denovo['Peptide'].str.replace(ptm_pattern, '', regex=True)
else:
    # drop every row whose peptide carries at least one annotation
    df_db_matched = df_db_matched[~df_db_matched['Peptide'].str.contains(ptm_pattern, regex=True, na=False)]
    df_all_denovo = df_all_denovo[~df_all_denovo['Peptide'].str.contains(ptm_pattern, regex=True, na=False)]

# save DB-matched file with razor peptides removed
# a row is treated as a razor peptide when its Accession value contains ':'
is_razor_peptide = df_db_matched['Accession'].astype(str).str.contains(':', regex=False)
df_db_matched_no_razor_peptides = df_db_matched[~is_razor_peptide]
df_db_matched_no_razor_peptides.to_csv(os.path.join(results_path, 'DB-matched_no_razor_peptides.csv'), index=False)

# save the unique protein IDs with razor peptides removed
# TODO: clarify which entries to keep - i.e. do we just keep the first occurrence of each duplicate protein?
df_db_matched_no_razor_peptides_unique = df_db_matched_no_razor_peptides.drop_duplicates(subset='Accession',
                                                                                         keep='first')
df_db_matched_no_razor_peptides_unique.to_csv(
    os.path.join(results_path, 'DB-matched_no_razor_peptides_unique.csv'), index=False)

# get all denovo scan IDs that are not found in DB-matched data
scan_ids_not_db_matched = set(df_all_denovo['Scan'].values) - set(df_db_matched['Scan'].values)
df_all_denovo_unmatched = df_all_denovo[df_all_denovo['Scan'].isin(scan_ids_not_db_matched)]
df_all_denovo_unmatched.to_csv(os.path.join(results_path, 'Fused_peptide_candidates.csv'), index=False)

