In [1]:
from Bio import SeqIO


In [5]:
def search_all_occ(seq, pattern):
    res = []
    n = len(pattern)
    for i in range(len(seq) - n + 1):
        if seq[i:i+n] == pattern:
            res.append(i)
    return res


In [6]:
seqDNA = "ATAGAATAGATAATAGTC"
print( search_all_occ(seqDNA, "GAAT") )

[3]


In [1]:
class BoyerMoore:
    
    def __init__(self, alphabet, pattern):
        self.alphabet = alphabet
        self.pattern = pattern
        self.n = len(pattern)
        self.occ = {}
        self.preprocess()
        
    def preprocess(self):
        self.process_bcr()
        self.process_gsr()
    
    def process_bcr(self):
        for symb in self.alphabet:
            self.occ[symb] = -1
        for index, item in enumerate(self.pattern):
            self.occ[item] = index
    
    def process_gsr(self):
        self.f = [0] * (self.n + 1)
        self.s = [0] * (self.n + 1)
        i = self.n
        j = self.n + 1
        self.f[i] = j
        while i > 0:
            while j <= self.n and self.pattern[i-1] != self.pattern[j - 1]:
                if self.s[j] == 0:
                    self.s[j] = j-1
                j = self.f[j]
            i -= 1
            j -= 1
            self.f[i] = j
        j = self.f[0]
        for i in range(self.n):
            if self.s[i] == 0:
                self.s[i] = j
            if i == j:
                j = self.f[j]
    
    def search_pattern(self, text):
        res = []
        i = 0
        while i <= len(text)  - self.n:
            j = self.n - 1
            while j >= 0 and self.pattern[j] == text[j+1]:
                j -= 1
            if j < 0:
                res.append(i)
                i += self.s[0]
            else:
                c = text[j+i]
                i += max(self.s[j+1], j-self.occ[c])
        return res
    

In [2]:
def test():
    bm = BoyerMoore("ACTG", "ACCA")
    return bm.search_pattern("ATAGAACCAATGAACCATGATGAACCATGGATACCCAACCACC")


In [3]:
print(test())

[]
