#Suffix Trees

#Prefix Doubling Method
Suffix trees are data structures used to store all the suffixes of a given string in a way that allows for efficient substring searches and pattern matching. The prefix doubling method is one approach to constructing a suffix tree efficiently.



In [7]:
class SuffixTree:
    def __init__(self, s):
        self.s = s
        self.suffixes = sorted([s[i:] for i in range(len(s))])
        self.root = {}

    def build_tree(self):
        for suffix in self.suffixes:
            current_node = self.root
            for char in suffix:
                if char not in current_node:
                    current_node[char] = {}
                current_node = current_node[char]

    def search(self, pattern):
        current_node = self.root
        for char in pattern:
            if char not in current_node:
                return False
            current_node = current_node[char]
        return True

# Example usage:
suffix_tree = SuffixTree("banana")
suffix_tree.build_tree()
print(suffix_tree.search("nan"))  # Output: True
print(suffix_tree.search("xyz"))  # Output: False

True
False


#Finding Patterns
Finding patterns in a string involves searching for specific substrings (patterns) within a given text. This can be accomplished using various algorithms such as naive string matching, Knuth-Morris-Pratt (KMP) algorithm, or Boyer-Moore algorithm.



In [8]:
def find_patterns(text, pattern):
    n = len(text)
    m = len(pattern)
    matches = []

    for i in range(n - m + 1):
        if text[i:i + m] == pattern:
            matches.append(i)

    return matches

# Example usage:
text = "abracadabra"
pattern = "abr"
print(find_patterns(text, pattern))  # Output: [0, 7]


[0, 7]


Knuth-Morris-Pratt (KMP) Algorithm


In [10]:
def compute_lps(pattern):
    m = len(pattern)
    lps = [0] * m
    length = 0
    i = 1

    while i < m:
        if pattern[i] == pattern[length]:
            length += 1
            lps[i] = length
            i += 1
        else:
            if length != 0:
                length = lps[length - 1]
            else:
                lps[i] = 0
                i += 1

    return lps

def kmp_search(text, pattern):
    n = len(text)
    m = len(pattern)
    lps = compute_lps(pattern)
    matches = []

    i = 0  # index for text
    j = 0  # index for pattern
    while i < n:
        if pattern[j] == text[i]:
            i += 1
            j += 1

        if j == m:
            matches.append(i - j)
            j = lps[j - 1]
        elif i < n and pattern[j] != text[i]:
            if j != 0:
                j = lps[j - 1]
            else:
                i += 1

    return matches

# Example usage:
text = "ABABDABACDABABCABAB"
pattern = "ABABCABAB"
result = kmp_search(text, pattern)
print("Pattern found at positions:", result)  # Output: [10]

Pattern found at positions: [10]


Boyer-Moore Algorithm


In [11]:
def preprocess_bad_character(pattern):
    m = len(pattern)
    bad_char = {}
    for i in range(m - 1):
        bad_char[pattern[i]] = i
    return bad_char

def preprocess_good_suffix(pattern):
    m = len(pattern)
    suffix = [0] * (m + 1)
    prefix = [False] * (m + 1)

    j = m
    for i in range(m - 1, -1, -1):
        while j <= m and pattern[i] != pattern[j - 1]:
            if not suffix[j]:
                suffix[j] = j - i
            j = prefix[j]
        j -= 1
        prefix[i] = j

    for i in range(0, m):
        k = m - prefix[i]
        if suffix[k]:
            suffix[i] = k
        else:
            suffix[i] = m - prefix[i]

    return suffix

def boyer_moore_search(text, pattern):
    n = len(text)
    m = len(pattern)
    bad_char = preprocess_bad_character(pattern)
    good_suffix = preprocess_good_suffix(pattern)
    matches = []

    i = 0
    while i <= n - m:
        j = m - 1
        while j >= 0 and pattern[j] == text[i + j]:
            j -= 1
        if j < 0:
            matches.append(i)
            i += good_suffix[0]
        else:
            char_shift = j - bad_char.get(text[i + j], -1)
            suffix_shift = good_suffix[j + 1]
            i += max(1, char_shift, suffix_shift)

    return matches

# Example usage:
text = "ABAAABCDBBABCDDEBCABCABABCABAB"
pattern = "ABCABAB"
result = boyer_moore_search(text, pattern)
print("Pattern found at positions:", result)  # Output: [13, 21]


Pattern found at positions: [18]


#LCP (Longest Common Prefix) Array
The LCP array is an array that stores the length of the longest common prefix between consecutive suffixes of a sorted suffix array. It is commonly used in string processing algorithms, especially with suffix arrays and suffix trees.



In [9]:
def build_suffix_array(s):
    suffixes = sorted((s[i:], i) for i in range(len(s)))
    suffix_array = [suffix[1] for suffix in suffixes]
    return suffix_array

def build_lcp_array(s, suffix_array):
    n = len(s)
    rank = [0] * n
    lcp = [0] * n
    for i, suffix in enumerate(suffix_array):
        rank[suffix] = i

    h = 0
    for i in range(n):
        if rank[i] > 0:
            j = suffix_array[rank[i] - 1]
            while (i + h < n and j + h < n and s[i + h] == s[j + h]):
                h += 1
            lcp[rank[i]] = h
            if h > 0:
                h -= 1

    return lcp

# Example usage:
s = "banana"
suffix_array = build_suffix_array(s)
lcp_array = build_lcp_array(s, suffix_array)
print(lcp_array)  # Output: [0, 1, 3, 0, 0, 2]


[0, 1, 3, 0, 0, 2]


#Knuth-Morris-Pratt (KMP) Algorithm
The Knuth-Morris-Pratt (KMP) algorithm efficiently finds occurrences of a pattern P within a text T by utilizing a prefix function (also known as the longest prefix suffix or lps array). This allows the algorithm to avoid unnecessary backtracking during the search process, making it more efficient than naive pattern matching algorithms.

The key idea behind KMP is to preprocess the pattern P to construct the lps array, which helps determine how much of the pattern can be reused when a mismatch occurs. By using this information, the algorithm avoids rechecking previously matched characters.

In [12]:
def compute_lps(pattern):
    m = len(pattern)
    lps = [0] * m
    length = 0
    i = 1

    while i < m:
        if pattern[i] == pattern[length]:
            length += 1
            lps[i] = length
            i += 1
        else:
            if length != 0:
                length = lps[length - 1]
            else:
                lps[i] = 0
                i += 1

    return lps

def kmp_search(text, pattern):
    n = len(text)
    m = len(pattern)
    lps = compute_lps(pattern)
    matches = []

    i = 0  # index for text
    j = 0  # index for pattern
    while i < n:
        if pattern[j] == text[i]:
            i += 1
            j += 1

        if j == m:
            matches.append(i - j)
            j = lps[j - 1]
        elif i < n and pattern[j] != text[i]:
            if j != 0:
                j = lps[j - 1]
            else:
                i += 1

    return matches

# Example usage:
text = "ABABDABACDABABCABAB"
pattern = "ABABCABAB"
result = kmp_search(text, pattern)
print("Pattern found at positions:", result)  # Output: [10]

Pattern found at positions: [10]


#Z Algorithm (Z-function)
The Z algorithm preprocesses the concatenation of P and T (denoted as P$T) to efficiently determine matches of P in T. The Z array is computed based on this concatenated string.



In [13]:
def calculate_z(s):
    n = len(s)
    z = [0] * n
    l, r, k = 0, 0, 0

    for i in range(1, n):
        if i > r:
            l, r = i, i
            while r < n and s[r] == s[r - l]:
                r += 1
            z[i] = r - l
            r -= 1
        else:
            k = i - l
            if z[k] < r - i + 1:
                z[i] = z[k]
            else:
                l = i
                while r < n and s[r] == s[r - l]:
                    r += 1
                z[i] = r - l
                r -= 1

    return z

def z_search(text, pattern):
    concat_str = pattern + '$' + text
    z = calculate_z(concat_str)
    pattern_len = len(pattern)
    matches = []

    for i in range(pattern_len + 1, len(concat_str)):
        if z[i] == pattern_len:
            matches.append(i - (pattern_len + 1))

    return matches

# Example usage:
text = "ABACABACABA"
pattern = "CAB"
result = z_search(text, pattern)
print("Pattern found at positions:", result)  # Output: [4, 8]

Pattern found at positions: [3, 7]


#Hashing for String Matching
Hashing can be utilized for substring or pattern matching by computing hash values for substrings of a text and comparing these hash values with the hash value of the pattern. If the hash values match, further string comparisons can be performed to confirm a match.



In [14]:
class RollingHash:
    def __init__(self, text, pattern_length):
        self.text = text
        self.text_length = len(text)
        self.pattern_length = pattern_length
        self.base = 257  # base for polynomial rolling hash
        self.modulo = 10**9 + 7  # prime number for modulo operation
        self.hash_values = [0] * (self.text_length - self.pattern_length + 1)
        self.precompute_hashes()

    def precompute_hashes(self):
        # Calculate the hash value of the first pattern_length substring
        current_hash = 0
        for i in range(self.pattern_length):
            current_hash = (current_hash * self.base + ord(self.text[i])) % self.modulo

        self.hash_values[0] = current_hash

        # Calculate hash values for subsequent substrings using rolling hash
        base_power = pow(self.base, self.pattern_length - 1, self.modulo)  # base^(pattern_length-1) % modulo
        for i in range(1, self.text_length - self.pattern_length + 1):
            current_hash = (self.base * (self.hash_values[i - 1] - ord(self.text[i - 1]) * base_power) + ord(self.text[i + self.pattern_length - 1])) % self.modulo
            self.hash_values[i] = current_hash

    def get_pattern_hash(self, pattern):
        pattern_hash = 0
        for char in pattern:
            pattern_hash = (pattern_hash * self.base + ord(char)) % self.modulo
        return pattern_hash

    def find_matches(self, pattern):
        pattern_hash = self.get_pattern_hash(pattern)
        matches = []

        for i in range(self.text_length - self.pattern_length + 1):
            if self.hash_values[i] == pattern_hash:
                if self.text[i:i + self.pattern_length] == pattern:
                    matches.append(i)

        return matches

# Example usage:
text = "ABACADABRAC"
pattern = "ABRA"
pattern_length = len(pattern)
rolling_hash = RollingHash(text, pattern_length)
result = rolling_hash.find_matches(pattern)
print("Pattern found at positions:", result)  # Output: [4]

Pattern found at positions: [6]
