In [1]:
def BoyerMoore_in_DNA(sequence, pattern):
    """Find every exact occurrence of a short pattern inside a DNA sequence.

        Input 1: str  (the sequence that is scanned)
        Input 2: str  (the pattern that is searched for)
        Output : list of int, the start index in the sequence of every match,
                 in ascending order. Overlapping matches are reported.

        A window of the pattern's length slides over the sequence from left to
        right. In every cycle only the LAST character of the window decides
        which rule provides the jump:

          * last characters differ  -> bad character rule: jump so that the
            rightmost occurrence of the window's last character in the pattern
            lines up with it (or jump a full pattern length if the character
            does not occur in the pattern at all).
          * last characters agree, window differs from the pattern -> good
            suffix rule: jump by the pre-computed distance of the matching
            suffix (see Dic_GoodSuffixRule_Distances).
          * window equals the pattern -> record the index and jump by 1.

        The pattern must fulfil the precondition of
        Dic_GoodSuffixRule_Distances (its last character has to appear at
        least twice in it); otherwise that function raises an AssertionError.
    """
    penult_sfx_dist = Dic_GoodSuffixRule_Distances(pattern)     # suffix -> jump distance (good suffix rule)
    character_dist = Dic_BadCharRule_Distances(pattern)         # base   -> jump distance (bad character rule)

    # The keys are suffixes of the pattern with distinct lengths, so selecting
    # by length yields exactly the longest re-occurring suffix.
    longest_sfx = max(penult_sfx_dist, key=len)
    longest_sfx_len = len(longest_sfx)

    pattern_length = len(pattern)
    last_pattern_char = pattern[-1]
    last_window_start = len(sequence) - pattern_length          # windows starting later would run off the sequence
    start_of_match = []                                         # indexes where a match emerges
    dynamic_start_index = 0                                     # start index of the current window

    while dynamic_start_index <= last_window_start:

        subseq = sequence[dynamic_start_index:dynamic_start_index + pattern_length]
        last_char = subseq[-1]

        if last_char != last_pattern_char:                      # run bad char rule
            distance = character_dist.get(last_char)
            if distance is None:
                # Symbol outside the pre-computed table (e.g. 'N'): derive the
                # same distance on the fly instead of failing with a KeyError.
                # rfind() returns -1 for an absent symbol, which turns the
                # expression into a jump of a full pattern length.
                distance = pattern_length - 1 - pattern.rfind(last_char)
        elif subseq == pattern:                                 # full match, no suffix lookup needed
            start_of_match.append(dynamic_start_index)
            distance = 1
        else:                                                   # run good suffix rule
            suffix = Good_Suffix_Between_Strings(subseq, longest_sfx, longest_sfx_len)
            distance = penult_sfx_dist[suffix]

        dynamic_start_index += distance

    return start_of_match


In [2]:
def Dic_BadCharRule_Distances(pattern):
    """ Input : string (representing a pattern)
        Output: Dictionary with the keys 'A', 'C', 'G', 'T' (dic values type: int)

        Preprocessing for the bad character rule. Every one of the 4 DNA bases is mapped to the distance between its
        rightmost occurrence in the pattern and the pattern's last index. A base that does not occur in the pattern
        is mapped to the pattern's length: if the last char of a window is absent from the pattern, no match can
        overlap that position, so the whole pattern length can be skipped. """

    final_index = len(pattern) - 1

    # str.rfind() answers -1 for an absent base, and final_index - (-1) equals len(pattern),
    # so a single expression covers both the "found" and the "not found" case.
    return {base: final_index - pattern.rfind(base) for base in "ACGT"}


In [3]:
def Dic_GoodSuffixRule_Distances(pattern):
    """ Input : string
        Output: dic that maps suffixes of the pattern (str) to jump distances (int)

        Preprocessing for the good suffix rule. It only looks at the pattern ITSELF and is independent of any
        subsequence.

        Suffixes are taken from the rightmost char of the pattern and grow by one character on the left in each
        iteration. For every suffix the closest earlier occurrence in the pattern is located, and the distance
        between the start of that occurrence and the start of the suffix is stored. As soon as a suffix has no
        earlier occurrence the loop stops, because no longer suffix can have one either.

        For instance:
                      in the subsequence    G C G G C C A
                      and the pattern       T C A G T C A

        the good suffix (CA) is found by Good_Suffix_Between_Strings. The preceding CA of the pattern starts 4
        positions before the final CA, hence the value stored for 'CA' is 4 and the window can jump by 4.

        The earlier occurrence MAY OVERLAP the suffix (it only has to start further left). This matters for
        repetitive endings: in 'CACACACA' the suffix 'ACA' re-occurs 2 positions to the left, so its distance is 2,
        and in a pattern ending in 'GGGG' every single-base suffix gets distance 1. Looking only at non-overlapping
        occurrences would give larger jumps that can skip over real matches.

        Raises AssertionError if the last character of the pattern does not appear a second time in the pattern
        (the returned dictionary would be empty). Raises IndexError for an empty pattern. """

    # Raised explicitly (instead of using an assert statement) so the check is not stripped when Python runs with -O.
    if pattern[-1] not in pattern[:-1]:
        raise AssertionError('The last base/character of your pattern ( {} ) does not appear twice in it. Hence, the good suffix rule cannot be used throughout scanning. Please run the Bad Character Rule solely or try using another method for faster results'.format(pattern[-1]))

    pattern_length = len(pattern)
    dic = {}

    for i in range(pattern_length - 1, -1, -1):                                  # StartFromLastIndex & FinishToFirst
        suffix = pattern[i:]
        # Restricting the search to pattern[0 : pattern_length - 1] excludes the suffix itself (which ends at the
        # last index) but keeps every occurrence that starts further left, including overlapping ones.
        preceding_occ = pattern.rfind(suffix, 0, pattern_length - 1)
        if preceding_occ == -1:
            break
        dic[suffix] = i - preceding_occ                                          # start of suffix minus start of closest earlier occurrence

    return dic
    

In [4]:
def Good_Suffix_Between_Strings (subseq,longest_sfx,longest_sfx_len):
    """Input 1: str  (subsequence / window taken from the sequence)
       Input 2: str  (longest suffix of the pattern that re-occurs in the pattern)
       Input 3: int  (length of input 2)
       Output : str  (the good suffix)

       Necessary for the execution of the good suffix rule. Compares the tail of the subsequence with the given
       pattern suffix from right to left and returns the longest ending they have in common. The result is never
       longer than longest_sfx_len characters; it is an empty string when not even the last characters agree.
       """

    tail = subseq[-longest_sfx_len:]                         # only this part of the subsequence can belong to the good suffix
    match_start = longest_sfx_len                            # start of the common ending; initially "nothing matched"

    for candidate in range(longest_sfx_len - 1, -1, -1):    # extend the common ending one char to the left per step
        if longest_sfx[candidate:] != tail[candidate:]:
            break
        match_start = candidate

    return tail[match_start:]
     


## Remarks on Boyer-Moore related functions


#### Dic_BadCharRule_Distances


1) The last character of the pattern always gets a distance of 0. Returning a distance of 0 would cause 
an infinite loop in the main function BoyerMoore_in_DNA -> (dynamic_start_index += distance). 
In this implementation, however, the bad character rule only runs when the last base of the subsequence 
does NOT match the last base of the pattern, i.e. in the else branch of -> (if subseq[-1] == pattern[-1]:). 
Hence, the 0 value of the pattern's last character is never used as a jump distance.

2) Subsequences are mentioned ONLY TO CLARIFY the purpose of this function. It is a preprocessing function that takes only a pattern as input and therefore runs INDEPENDENTLY of any subsequence.



#### Dic_GoodSuffixRule_Distances


           1) Subsequences are mentioned in the description and here ONLY TO CLARIFY the purpose of this  
             function. It is a preprocessing function that takes only a pattern as input and therefore runs 
             INDEPENDENTLY of any subsequence.
 
        Assuming that 
                          sequence = ..G C G G C C A..
                         subsequence = G C G G C C A
                         pattern     = T C A G C C A 
 We are not interested in a CCA match but in a CA match, because CCA does not occur anywhere else in the pattern. Thus, this
 function stops adding new suffixes/keys as soon as the next suffix, which is longer by one character on the left (CCA), 
 does not occur elsewhere in the pattern. In the example above the good suffix is CCA, yet only CA
 is found twice in the pattern, forcing us to use the distance/value based on CA.
 
           2)  Purpose of the if statements following the for loop: if AT LEAST 2 suffixes were created and all of 
               them consist of one and the same DNA base (i.e. 'G','GG','GGG' in a pattern ending in ...'GGGG'),
               there is a rare chance that a match will be missed during the search. 
       
        Assuming that 
                          sequence = .. T T C C A A A A C C C..
                         subsequence =    T C C A A A A C C 
                             pattern =    C C A A A A C C C 
  
 The good suffix is CC, and the preceding CC of the pattern starts 7 positions before the final CC. Note that if the 
 pattern is moved by 1 (instead of 7), it perfectly matches the sequence shown on top. This problem occurs when a perfect 
 match starts one position after the start index of the subsequence used for comparison and therefore ends one 
 position after the end index of that subsequence. It occurs only when all created suffixes (>=2) that re-emerge in the  
 pattern consist solely of the same DNA base. Therefore, all suffixes must get a distance of 1, regardless of the  
 distance their penultimate identical matches would normally have.


#### Good_Suffix_Between_Strings

This function never searches for good suffixes (matches) longer than the longest suffix of the pattern that 
re-occurs in the pattern. See the remarks on 'Dic_GoodSuffixRule_Distances()' for further details.
      

#### For a better understanding of how I create the good suffix rule click below

https://www.dropbox.com/s/nwpr6udb0dttz71/photo_2021-10-16_13-47-59.jpg?dl=0