In [1]:
import itertools
import pandas
import re
import tqdm

from Bio import SeqIO

In [2]:
# get processed data
df_processed = pandas.read_excel('analysed/CONTROL_full_proteome_scan.xls')

# process raw data
df_db_matched = pandas.read_csv('data/N1_Normal_DB-matched_peptides.csv', dtype={'Scan': int})
df_all_denovo = pandas.read_csv('data/N1_Normal_all_de_novo.csv', dtype={'ALC (\%)': int, 'Scan': int})
df_db_matched['Peptide'] = df_db_matched['Peptide'].apply(lambda x: re.sub(r'\([^()]*\)', '', x))
df_all_denovo['Peptide'] = df_all_denovo['Peptide'].apply(lambda x: re.sub(r'\([^()]*\)', '', x))

ref_db = {seq.name: str(seq.seq) for seq in SeqIO.parse('data/SPHu.fasta', 'fasta')}

# selection of de novo (oroginal HLA) peptides for fusion candidate scan
scan_ids_not_db_matched = set(df_all_denovo['Scan'].values) - set(df_db_matched['Scan'].values)
df_all_denovo_unmatched = df_all_denovo[df_all_denovo['Scan'].isin(scan_ids_not_db_matched)]


In [3]:
unmatched = set(df_all_denovo_unmatched.Scan.values) - set(df_processed.ScanID.values)
unmatched_prior_to_scan = df_all_denovo_unmatched[df_all_denovo_unmatched['Scan'].isin(unmatched)]
unmatched_prior_to_scan = unmatched_prior_to_scan[['Peptide', 'Scan', 'ALC (%)']].drop_duplicates()

# method to run matching to proteome for unmatched_prior_to_scan - return and store if no match
def peptide_in_reference_proteome(ref_db, peptide):
    for protein_name, sequence in ref_db.items():
        sequence = sequence.strip('X*')
        if 'X' in sequence:
            splitter = 'X'
        else:
            splitter = '*'
        for ORF in sequence.split(splitter):
            for match_data_large in re.finditer(peptide, ORF):
                return True
    return False
 

In [4]:
# match to proteome, ignoring I/L variants
def _create_combinations_with_isoleucines(peptide):
    """
    Performs all possible combinations of string reassignment of L to I.
    """
    count = peptide.count('L')
    product = [''.join(seq) for seq in itertools.product("01", repeat=count)]
    positions = [pos for pos, char in enumerate(peptide) if char == 'L']
    peptides = []
    for combo in product:
        partitioned_peptide = list(peptide)
        for pos, item in enumerate(positions):
            if combo[pos] == '1':
                partitioned_peptide[item] = 'I'
        peptides.append(''.join(partitioned_peptide))
    return peptides

unmatched = []
for values in tqdm.tqdm_notebook([data[1] for data in unmatched_prior_to_scan.iterrows()]):
    peptide = values['Peptide']
    for il_variant in _create_combinations_with_isoleucines(peptide):
        if peptide_in_reference_proteome(ref_db, il_variant):
            unmatched.append(False)
        else:
            unmatched.append(True)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(IntProgress(value=0, max=78245), HTML(value='')))

KeyboardInterrupt: 

In [None]:
unmatched_prior_to_scan['Unmatched_de_novo'] = unmatched