Наивный алгоритм поиска подстроки

In [None]:
from typing import List
def search(text: str, sample: str) -> List[int]:
    """Return the start index of every occurrence of ``sample`` in ``text``.

    Naive search: each candidate start position is checked by comparing the
    window of ``len(sample)`` characters against ``sample``. Overlapping
    occurrences are all reported.

    Args:
        text: The string to search in.
        sample: The substring to look for.

    Returns:
        Ascending list of 0-based start indices. Empty when ``sample`` is
        empty or longer than ``text``.
    """
    if not sample:
        # An empty pattern has no meaningful position; the other search
        # functions in this file report no matches for it as well.
        return []
    # The last window that still fits starts at len(text) - len(sample),
    # so that index itself has to be part of the range.
    last_start = len(text) - len(sample)
    return [
        start
        for start in range(last_start + 1)
        if text.startswith(sample, start)
    ]

In [None]:
# Demo of the naive search: for every match print the text from that match on.
text = "Это текст для проверки. В этом тексте есть слово 'текст'."
sample = "текст"
for index in search(text, sample):
    tail = text[index:]
    print(tail)

текст для проверки. В этом тексте есть слово 'текст'.
тексте есть слово 'текст'.
текст'.


Алгоритм Кнута — Морриса — Пратта (KMP)

1. Вычисление $\pi$-функции

In [None]:
def compute_prefix_function(sample: str) -> List[int]:
    """Build the prefix function (pi-function) of ``sample``.

    ``pi[q]`` is the length of the longest proper prefix of
    ``sample[:q + 1]`` that is also a suffix of it.

    Args:
        sample: Pattern to analyse.

    Returns:
        A list of ``len(sample)`` integers; empty for an empty pattern.
    """
    if not sample:
        return []
    pi = [0]    # a single character has no proper border
    border = 0  # border length carried over from the previous position
    for ch in sample[1:]:
        # Fall back to shorter and shorter borders until one can be
        # extended by ch (or no border is left).
        while border and sample[border] != ch:
            border = pi[border - 1]
        if sample[border] == ch:
            border += 1
        pi.append(border)
    return pi

2. Поиск подстроки

In [None]:
# Prefix functions of a few sample patterns; the bare name on the last line
# makes the notebook display the result.
samples = ["word", "aaaa", "abab", "text", "abracadabra"]
pi_functions = list(map(compute_prefix_function, samples))
pi_functions

[[0, 0, 0, 0],
 [0, 1, 2, 3],
 [0, 0, 1, 2],
 [0, 0, 0, 1],
 [0, 0, 0, 1, 0, 1, 0, 1, 2, 3, 4]]

In [None]:
def KMP(text: str, sample: str) -> List[int]:
    """Find every occurrence of ``sample`` in ``text`` (Knuth-Morris-Pratt).

    Args:
        text: The string to search in.
        sample: The pattern to look for.

    Returns:
        Ascending list of 0-based start indices, overlapping occurrences
        included. Empty when ``sample`` is empty.
    """
    matches = []
    pattern_len = len(sample)
    if not pattern_len:
        return matches

    pi = compute_prefix_function(sample)
    matched = 0  # number of pattern characters matched so far

    for pos, ch in enumerate(text):
        # On a mismatch fall back to the longest border of the matched part
        # instead of restarting the comparison from scratch.
        while matched and sample[matched] != ch:
            matched = pi[matched - 1]

        if sample[matched] == ch:
            matched += 1

        if matched == pattern_len:
            matches.append(pos - pattern_len + 1)
            # Keep the longest border so overlapping occurrences are found.
            matched = pi[matched - 1]
    return matches

In [None]:
# Demo of KMP: for every match print the text from that match on.
text = "This is a text to search words. The words are lined one by one. word"
word = "word"
for index in KMP(text, word):
    tail = text[index:]
    print(tail)

words. The words are lined one by one. word
words are lined one by one. word
word


Алгоритм Бойера — Мура

1. Построение таблицы плохого символа

In [None]:
from typing import Dict
def build_bad_char_table(sample: str) -> Dict[str, int]:
    """Build the bad-character shift table for ``sample``.

    Every character of the pattern except the one in the last position is
    mapped to the distance between its rightmost such occurrence and the
    last pattern position. Characters that occur only in the last position
    (or not at all) have no entry.

    Args:
        sample: The pattern.

    Returns:
        Mapping ``character -> distance``; empty for patterns shorter than
        two characters.
    """
    last = len(sample) - 1
    # Later occurrences overwrite earlier ones, so the rightmost one wins.
    return {ch: last - pos for pos, ch in enumerate(sample[:-1])}

In [None]:
# Bad-character tables of a few sample patterns; the bare name on the last
# line makes the notebook display the result.
samples = ["word", "aaaa", "abab", "text", "abracadabra"]
bad_chars = list(map(build_bad_char_table, samples))
bad_chars

[{'w': 3, 'o': 2, 'r': 1},
 {'a': 1},
 {'a': 1, 'b': 2},
 {'t': 3, 'e': 2, 'x': 1},
 {'a': 3, 'b': 2, 'r': 1, 'c': 6, 'd': 4}]

2. Поиск подстроки

In [None]:
def Boyer_Moore(text: str, sample: str) -> List[int]:
    """Find every occurrence of ``sample`` in ``text`` (Boyer-Moore).

    Only the bad-character heuristic is used. The pattern is compared with
    the current window from right to left, and the window is then moved by a
    shift derived from the table built by ``build_bad_char_table``.

    The alignment index is advanced in a ``while`` loop: reassigning the
    variable of a ``for ... in range(...)`` loop does not influence the next
    iteration, so shifts computed there would never be applied.

    Args:
        text: The string to search in.
        sample: The pattern to look for.

    Returns:
        Ascending list of 0-based start indices, overlapping occurrences
        included. Empty when ``sample`` is empty or longer than ``text``.
    """
    res = []
    n = len(text)
    m = len(sample)

    if m == 0 or m > n:
        return res

    bad_char_table = build_bad_char_table(sample)
    # After a full match the window's last character equals sample[-1]; the
    # next alignment has to put an earlier occurrence of that character under
    # it, or move past the window entirely when there is none.
    match_shift = bad_char_table.get(sample[-1], m)

    i = 0
    while i <= n - m:
        j = m - 1
        # Compare right to left.
        while j >= 0 and sample[j] == text[i + j]:
            j -= 1
        if j < 0:
            res.append(i)
            i += match_shift
        else:
            bad_char = text[i + j]
            if bad_char in bad_char_table:
                # Index of the rightmost occurrence of bad_char in
                # sample[:-1]; align it with the mismatching text character,
                # but never move backwards.
                rightmost = m - 1 - bad_char_table[bad_char]
                i += max(1, j - rightmost)
            else:
                # bad_char does not occur in sample[:-1], so no alignment
                # that still covers this text position can match.
                i += j + 1
    return res


In [None]:

# Demo of Boyer-Moore: for every match print the text from that match on.
text = "This is a text to search words. The words are lined one by one word"
word = "word"
for index in Boyer_Moore(text, word):
    tail = text[index:]
    print(tail)

words. The words are lined one by one word
words are lined one by one word
word


Алгоритм Рабина — Карпа

In [None]:
BASE = 256  # radix of the polynomial hash: the hash is multiplied by it per character
PRIME = 101  # modulus every hash value is reduced by

def calculate_hash(s: str, start:int, end: int) -> int:
    """Return the polynomial hash of ``s[start:end]`` modulo ``PRIME``.

    Characters are folded in left to right: the running value is multiplied
    by ``BASE``, the character's code point is added, and the result is
    reduced modulo ``PRIME``.

    Args:
        s: Source string.
        start: Index of the first character to hash.
        end: Index one past the last character to hash.

    Returns:
        The hash value; ``0`` when the range is empty.
    """
    digest = 0
    for pos in range(start, end):
        digest = digest * BASE + ord(s[pos])
        digest %= PRIME
    return digest


def update_hash(old_hash: int, old_char:int, new_char:int, sample_len:int) -> int:
    """Slide the rolling hash one character to the right.

    Removes the contribution of the character leaving the window and appends
    the character entering it.

    Args:
        old_hash: Hash of the current window.
        old_char: Code point of the character leaving the window.
        new_char: Code point of the character entering the window.
        sample_len: Window length; expected to be at least 1.

    Returns:
        Hash of the shifted window, in ``range(PRIME)``.
    """
    # Weight of the leading character, reduced modulo PRIME right away so the
    # intermediate value does not grow with the window length.
    high_order = pow(BASE, sample_len - 1, PRIME)
    # Python's % with a positive modulus never yields a negative result, so
    # no sign correction is needed after the subtraction.
    without_old = (old_hash - old_char * high_order) % PRIME
    return (without_old * BASE + new_char) % PRIME

def Rabin_Karp(text: str, sample: str) -> List[int]:
    """Find every occurrence of ``sample`` in ``text`` (Rabin-Karp).

    The hash of each window of ``len(sample)`` characters is compared with
    the hash of the pattern; the characters themselves are compared only
    when the hashes agree.

    Args:
        text: The string to search in.
        sample: The pattern to look for.

    Returns:
        Ascending list of 0-based start indices, overlapping occurrences
        included. Empty when ``sample`` is empty or longer than ``text``.
    """
    res = []
    n = len(text)
    m = len(sample)

    if m == 0 or n < m:
        return res

    sample_hash = calculate_hash(sample, 0, m)
    window_hash = calculate_hash(text, 0, m)
    last_start = n - m

    for i in range(last_start + 1):
        # Equal hashes may still be a collision, so confirm on the text.
        if window_hash == sample_hash and text.startswith(sample, i):
            res.append(i)
        # Roll the hash forward to the next window, unless this was the last.
        if i < last_start:
            window_hash = update_hash(window_hash, ord(text[i]), ord(text[i + m]), m)
    return res

In [None]:
# Demo of Rabin-Karp: for every match print the text from that match on.
text = "This is a text to search words. The words are lined one by one word"
word = "word"
for index in Rabin_Karp(text, word):
    tail = text[index:]
    print(tail)

words. The words are lined one by one word
words are lined one by one word
word


Алгоритм Ахо — Корасик

In [3]:
from typing import List
import heapq
class AhoCorasick:
    """Aho-Corasick automaton that finds several patterns in a single pass.

    The constructor builds a trie of ``patterns`` and then its failure
    links; ``search`` runs a text through the automaton and reports every
    occurrence of every pattern. Empty patterns are ignored.
    """

    class TrieNode:
        """One state of the automaton."""

        def __init__(self) -> None:
            self.children = {}   # character -> child TrieNode
            self.fail = None     # failure link; stays None for the root
            self.output = []     # indices of the patterns recognised in this state
            self.is_end = False  # True when a pattern ends exactly at this node

    def __init__(self, patterns: List[str]):
        """Build the automaton for ``patterns``."""
        self.root = self.TrieNode()
        self.patterns = patterns
        self.buildTrie()
        self.buildFailureLinks()

    def buildTrie(self):
        """Insert every non-empty pattern into the trie."""
        for i, pattern in enumerate(self.patterns):
            if not pattern:
                # An empty pattern would be attached to the root and then be
                # reported after every single character of the text.
                continue
            current = self.root
            for c in pattern:
                if c not in current.children:
                    current.children[c] = self.TrieNode()
                current = current.children[c]

            current.is_end = True
            current.output.append(i)

    def buildFailureLinks(self):
        """Compute failure links breadth-first and merge the output lists."""
        from collections import deque

        queue = deque()
        # Children of the root always fall back to the root.
        for child in self.root.children.values():
            child.fail = self.root
            queue.append(child)

        while queue:
            current = queue.popleft()
            for key, child in current.children.items():
                # Follow the parent's failure chain until a state with a
                # transition on `key` is found or the chain runs out.
                fail_node = current.fail
                while fail_node is not None and key not in fail_node.children:
                    fail_node = fail_node.fail

                child.fail = self.root if fail_node is None else fail_node.children[key]

                # Patterns recognised in the failure state also end here.
                # That state is shallower, so its list is already complete.
                child.output.extend(child.fail.output)
                queue.append(child)

    def search(self, text: str):
        """Report every occurrence of every pattern in ``text``.

        One line is printed per occurrence.

        Args:
            text: The string to scan.

        Returns:
            List of ``(start_index, pattern)`` tuples in the order the
            occurrences are printed.
        """
        found = []
        current = self.root

        for i, c in enumerate(text):
            # Follow failure links until a state with a transition on c is
            # reached, or the root.
            while current is not self.root and c not in current.children:
                current = current.fail

            current = current.children.get(c, self.root)

            # Report all patterns that end in the current state.
            for pattern_index in current.output:
                pattern = self.patterns[pattern_index]
                position = i - len(pattern) + 1
                print(f"Образец ' {pattern} найден на позиции {position}")
                found.append((position, pattern))
        return found


# Использование
def AhoCorasickSearch(text, patterns):
    """Build an Aho-Corasick automaton for ``patterns`` and run it over ``text``.

    The occurrences are reported by ``AhoCorasick.search``; this function
    itself returns nothing.
    """
    AhoCorasick(patterns).search(text)


# Demo of Aho-Corasick: look for a few keywords in a small code snippet.
text = (
    "function calculateSum(arr) {\n"
    "let total = 0;\n"
    "for (let i = 0; i < arr.length; i++) {\n"
    "total += arr[i]; \n"
    "} \n"
    "return total; \n"
    "}"
)
patterns = ["function", "let", "for", "return", "if", "else", "while"]
AhoCorasickSearch(text, patterns)


Образец ' function найден на позиции 0
Образец ' let найден на позиции 29
Образец ' for найден на позиции 44
Образец ' let найден на позиции 49
Образец ' return найден на позиции 104


Расстояние Левенштейна (редакционное расстояние)

In [11]:
import numpy as np
# Levenshtein (edit) distance between s1 and s2 by dynamic programming.
s1 = "kitten"
s2 = "sitting"
n = len(s1) + 1
m = len(s2) + 1
# dp[i, j] holds the edit distance between s1[:i] and s2[:j].
dp = np.zeros((n, m))
for i in range(n):
    dp[i, 0] = i  # s1[:i] -> "" takes i deletions
for j in range(m):
    dp[0, j] = j  # "" -> s2[:j] takes j insertions
# Fill the table row by row.
for i in range(1, n):
    for j in range(1, m):
        # Substitution cost: 0 when the characters already match, else 1.
        cost = int(s1[i - 1] != s2[j - 1])

        # Cheapest of the three possible last operations.
        dp[i, j] = min(
            dp[i - 1, j] + 1,         # deletion: drop a character of s1
            dp[i, j - 1] + 1,         # insertion: add a character to s1
            dp[i - 1, j - 1] + cost,  # substitution (or keep when cost is 0)
        )
dp

array([[0., 1., 2., 3., 4., 5., 6., 7.],
       [1., 1., 2., 3., 4., 5., 6., 7.],
       [2., 2., 1., 2., 3., 4., 5., 6.],
       [3., 3., 2., 1., 2., 3., 4., 5.],
       [4., 4., 3., 2., 1., 2., 3., 4.],
       [5., 5., 4., 3., 2., 2., 3., 4.],
       [6., 6., 5., 4., 3., 3., 2., 3.]])