# In[ ]:
import copy
import msgpack
from collections import Counter

# Integer masses of the amino acids used by the search, in ascending order.
# Only one letter is kept per mass so that mass_aa is a one-to-one reverse map.
aa_mass = {
    'G': 57, 'A': 71, 'S': 87, 'P': 97, 'V': 99, 'T': 101,
    'C': 103, 'I': 113, 'N': 114, 'D': 115, 'K': 128, 'E': 129,
    'M': 131, 'H': 137, 'F': 147, 'R': 156, 'Y': 163, 'W': 186,
}
# Reverse lookup: integer mass -> one-letter amino acid symbol.
mass_aa = {mass: symbol for symbol, mass in aa_mass.items()}
# The masses alone, in the same (ascending) order as aa_mass.
masses = list(aa_mass.values())

# In[ ]:
def LeaderboardCyclopeptideSequencing(Spec,N):
    """Branch-and-bound search for the peptides that best explain an experimental spectrum.

       Input 1: List with masses (int) of the experimental spectrum. Its largest value is
                used as the parent mass.
       Input 2: int >= 1. Size of the leaderboard cut. After every expansion round the
                board is sorted by score (descending); the score of the Nth peptide is the
                minimum score allowed to branch further. Every peptide tied with the Nth
                one is kept as well, so the board may hold more than N entries.
       Output : List with str (peptides), as returned by FilterFromCandidates(), i.e. the
                candidates with the highest score against the experimental spectrum.

                A peptide stops being expanded in three situations:
                1) Its newest prefix mass exceeds the parent mass (discarded by Expand()).
                2) Its newest prefix mass equals the parent mass (Expand() returns it as a
                   candidate; it is stored but not branched any further).
                3) Its score is lower than the score of the Nth peptide of the sorted board.

                The score used for the cut is the incremental one maintained by
                ChangeScore(). The stored candidates are scored again afterwards by
                FilterFromCandidates().

       Raises : ValueError if N < 1.
                """

    if N < 1:
        raise ValueError("N must be a positive integer")

    experimental_spectrum_counter = Counter(Spec)
    parent_mass = max(Spec)                         # largest mass, independent of the order of Spec
    CandidatePeptides = []                          # peptides whose last prefix mass equals the parent mass
    LeaderBoard = Initialize_Leaderboard (Spec)     # one single-residue entry per amino acid mass
    first_round = True                              # stays True until the board has been trimmed once

    while LeaderBoard:

        All_New_Branches = []
        # Expand() appends to the entries it receives, so every amino acid mass
        # must start from a fresh copy of the board; the packed bytes are that snapshot.
        LeaderBoardCP = msgpack.packb(LeaderBoard)
        for mass in masses:
            branches_for_mass, candidates_for_mass = Expand(
                msgpack.unpackb(LeaderBoardCP, strict_map_key=False), parent_mass, mass)
            All_New_Branches.extend(branches_for_mass)
            CandidatePeptides.extend(candidates_for_mass)
        # Scores are updated only after every mass has produced its branches.
        LeaderBoard = ChangeScore(All_New_Branches)

        if not LeaderBoard:
            break

        if len(LeaderBoard) <= N:
            if first_round:
                # The board has never been large enough to need a cut: keep everything.
                continue
            # NOTE(review): once a cut has happened, a board of at most N entries ends
            # the search. Expand() in this file only returns entries whose last prefix
            # mass is below the parent mass as branches, so this scan does not appear
            # to add anything, and the remaining entries are not expanded further.
            # Kept as in the original -- confirm this is the intended stop rule.
            for pepinfo in LeaderBoard:
                if pepinfo[0][-1] == parent_mass:
                    CandidatePeptides.append(pepinfo)
            break

        LeaderBoard = sorted(LeaderBoard, key=lambda x: x[1], reverse=True)

        # The Nth peptide sits at index N-1; its score is the lowest one allowed to pass.
        minimum_pass = LeaderBoard[N - 1][1]
        keep = N
        # Extend the cut over every following peptide tied with the Nth one.
        while keep < len(LeaderBoard) and LeaderBoard[keep][1] == minimum_pass:
            keep += 1
        LeaderBoard = LeaderBoard[:keep]

        first_round = False

    # Translate the collected candidates to strings and keep the best scoring ones.
    CandidatePeptidesStrings = PrefixMassesToSingleMAssesToString(CandidatePeptides,mass_aa)
    Bestpeptides = FilterFromCandidates(CandidatePeptidesStrings, experimental_spectrum_counter)

    return Bestpeptides

# In[ ]:
def Initialize_Leaderboard (Spectrum) :
    """Build the starting leaderboard: one single-residue entry per amino acid mass.

       Input  : List with int that correspond to masses of an experimental spectrum
       Output : List whose elements have the form [prefix_masses, score, remaining], where
                  prefix_masses : list with int. The running sums of the residue masses added
                                  so far; here it holds just the first mass.
                  score         : int. 1 if that mass occurs in the experimental spectrum,
                                  otherwise 0.
                  remaining     : dict {mass: [mass, mass, ...]} holding one list item per
                                  still unmatched occurrence of that mass in the experimental
                                  spectrum, i.e. 57:[57,57] if 57 appears twice. When the
                                  entry's own mass is matched, one occurrence is removed.
                Every entry owns an independent copy of the dictionary."""

    # Group the experimental masses by value, one list item per occurrence.
    DSpect = {}
    for observed in Spectrum:
        DSpect.setdefault(observed, []).append(observed)

    LeaderBoard = []
    for mass in aa_mass.values():
        # Fresh dict and fresh lists, so consuming a mass here never affects other entries.
        remaining = {key: list(occurrences) for key, occurrences in DSpect.items()}
        score = 0
        if mass in remaining:
            remaining[mass].pop()        # consume one occurrence of the matched mass
            score = 1
        LeaderBoard.append([[mass], score, remaining])

    return LeaderBoard

# In[ ]:
def Expand(Board,parent_mass,mass):
    """Extend every peptide of the board by one residue of the given mass.

       Input 1: List whose elements have the form [prefix_masses, score, remaining];
                only prefix_masses (a list with int) is read and modified here.
       Input 2: int, the parent mass the peptides are compared against
       Input 3: int, the mass of the residue to append
       Output 1: List with the entries whose new prefix mass is still below the parent mass
       Output 2: List with the entries whose new prefix mass equals the parent mass
                 (may be empty)

                 The new prefix mass (last prefix mass + Input 3) is appended to EVERY
                 entry of Input 1 in place, including the entries that overshoot the
                 parent mass and appear in neither output. The returned lists hold the
                 same entry objects that were passed in, so pass a copy of the board if
                 the original must stay untouched. Scores are not updated here."""

    below_parent = []
    at_parent = []

    for entry in Board:
        prefix_masses = entry[0]
        extended = prefix_masses[-1] + mass
        prefix_masses.append(extended)       # recorded even when the entry is then dropped

        if extended == parent_mass:
            at_parent.append(entry)
        elif extended < parent_mass:
            below_parent.append(entry)
        # anything heavier than the parent mass is discarded

    return below_parent, at_parent


# In[ ]:
def ChangeScore (Board):
    """Update the score of every peptide for the residue that was appended last.

       Input  : List whose elements have the form [prefix_masses, score, remaining], where
                prefix_masses is a list with int, score is an int and remaining is a dict
                {mass: [mass, ...]} with one list item per still unmatched occurrence.
       Output : The same list object; its entries are modified in place.

                The last item of prefix_masses is the new prefix mass. The masses that
                became available with it are the new prefix mass itself and the new prefix
                mass minus every earlier prefix mass. Using letters instead of numbers:

                Example 1) N, NQ             new masses: NQ,   NQ-N = Q
                Example 2) N, NQ, NQE        new masses: NQE,  NQE-N = QE,   NQE-NQ = E
                Example 3) N, NQ, NQE, NQEL  new masses: NQEL, NQEL-N = QEL, NQEL-NQ = EL,
                                                         NQEL-NQE = L

                Each new mass that still has an unmatched occurrence in remaining consumes
                one occurrence and raises the score by one. A mass that is absent, or whose
                occurrences are already used up, leaves the score unchanged.
                """

    for peptide in Board:
        prefix_masses = peptide[0]
        remaining = peptide[2]
        new_prefix_mass = prefix_masses[-1]

        # The new prefix mass first, then its difference with every earlier prefix mass.
        new_masses = [new_prefix_mass]
        new_masses.extend(new_prefix_mass - earlier for earlier in prefix_masses[:-1])

        for new_mass in new_masses:
            try:
                remaining[new_mass].pop()
            except (KeyError, IndexError):
                # KeyError: mass not in the spectrum; IndexError: all occurrences consumed.
                continue
            peptide[1] += 1

    return Board
                    
        

# In[ ]:
def PrefixMassesToSingleMAssesToString (Candidates,mass_aa):
    """Translate candidate peptides from prefix masses to amino acid strings.

       Input 1: List whose elements have the form [prefix_masses, score, remaining];
                only prefix_masses (a non-empty list with int) is used.
       Input 2: Dict with masses (int) as keys and str (single letter corresponding to an
                aminoacid) as values
       Output : List with str, one peptide string per candidate, in the same order

       The first prefix mass is the mass of the first residue; every following residue
       mass is the difference between two consecutive prefix masses. A KeyError is raised
       if one of these masses is not a key of Input 2."""

    peptide_strings = []

    for candidate in Candidates:
        prefix_masses = candidate[0]
        residue_masses = [prefix_masses[0]]
        # consecutive differences give back the single residue masses
        residue_masses.extend(
            current - previous
            for previous, current in zip(prefix_masses, prefix_masses[1:])
        )
        peptide_strings.append("".join(mass_aa[residue] for residue in residue_masses))

    return peptide_strings

# In[ ]:
def CycloSpectrum (cyclic_peptide):
    """Return the sorted theoretical spectrum of a cyclic peptide.

       Input 1: Str with letters that correspond to aminoacids (keys of aa_mass)
       Output : Sorted list with int: 0, the mass of every subpeptide shorter than the
                whole peptide (read around the cycle), and the mass of the whole peptide
                once. For a peptide of length n >= 1 that is n*(n-1) + 2 values; for the
                empty string it is [0].

                A cyclic peptide can be read starting at any of its positions. Hence
                                                KNE
                                                NEK
                                                EKN          are forms of the same peptide
                and each form contributes its prefixes shorter than the full length:
                     for    KNE :   K,KN     respective masses = 128, 128+114
                     for    NEK :   N,NE     respective masses = 114, 114+129
                     for    EKN :   E,EK     respective masses = 129, 129+128

                0 precedes these values and 128+114+129 follows them as the last value.
                A KeyError is raised for a letter that is not a key of aa_mass."""

    residue_masses = [aa_mass[residue] for residue in cyclic_peptide]
    length = len(residue_masses)
    if length == 0:
        return [0]

    # Two copies back to back let a subpeptide run past the end of the string.
    wrapped = residue_masses + residue_masses

    spectrum = [0]
    for start in range(length):               # one pass per form of the peptide
        running = 0
        for offset in range(length - 1):      # subpeptides of length 1 .. length-1
            running += wrapped[start + offset]
            spectrum.append(running)
    spectrum.append(sum(residue_masses))      # the whole peptide is counted only once

    return sorted(spectrum)

# In[ ]:
def Score (theo_spec,ex_Spec):
    """Count the masses shared by a theoretical and an experimental spectrum.

       Input 1: list with masses of a peptide's theoretical spectrum
       Input 2: Counter of a list with masses of an experimental spectrum
       Output : int, the size of the multiset intersection of the two spectra. Each mass
                contributes the smaller of its two multiplicities: a mass found 4 times in
                theo_spec and 2 times in ex_Spec adds 2 to the score."""

    shared = Counter(theo_spec) & ex_Spec     # per mass: minimum of the two counts
    score = 0
    for multiplicity in shared.values():
        score += multiplicity
    return score

# In[ ]:
def FilterFromCandidates (CandidatePeptidesStrings,experimental_spectrum_counter):
    """Keep the candidate peptides with the highest score against the experimental spectrum.

       Input 1: list with str corresponding to peptides comprising aminoacid symbols
       Input 2: Counter of a list with masses corresponding to an experimental spectrum
       Output : List with str: every candidate whose score equals the best score, once
                each, in the order of Input 1. The score of a candidate is
                Score(CycloSpectrum(candidate), Input 2).
                If Input 1 is empty the output is [''], so there is always one element."""

    if not CandidatePeptidesStrings:
        return ['']                            # no candidate at all: placeholder result

    scores = [
        Score(CycloSpectrum(peptide), experimental_spectrum_counter)
        for peptide in CandidatePeptidesStrings
    ]
    BestScore = max(scores)

    # A single pass over the pairs keeps ties and lists every winner exactly once.
    return [
        peptide
        for peptide, peptide_score in zip(CandidatePeptidesStrings, scores)
        if peptide_score == BestScore
    ]