## Laboratorium 5.
#### Bartosz Hanc

1. Zaimplementuj algorytm wyszukiwania wzorca 2-wymiarowego. 

In [2]:
import numpy as np
from typing import Sequence, Hashable

In [3]:
class Node:
    def __init__(self, symb) -> None:
        self.next: dict[Hashable, Node] = {}
        self.fail: Node = None
        self.symb: str = symb
        self.term: int = -1


class Trie:
    def __init__(self, patterns) -> None:
        self.root = Node(symb="")
        self.__build_trie(patterns=patterns)

    def __build_trie(self, patterns: Sequence[Sequence[Hashable]]):
        for i, pattern in enumerate(patterns):
            node = self.root
            for c in pattern:
                if c in node.next:
                    node = node.next[c]
                else:
                    child = Node(symb=str(node.symb) + str(c))
                    node.next[c] = child
                    node = child

            node.term = i

    def __repr__(self):
        def rec_print(node, level=0):
            line = "" if level == 0 else "┕━━━━ "
            ret = "\t" * level + line + node.symb + "\n"

            for _, child in node.next.items():
                ret += rec_print(child, level + 1)

            return ret

        return rec_print(self.root)

    def build_automaton(self, alphabet: Sequence[Hashable]):
        from collections import deque

        queue = deque()
        root = self.root

        for c in alphabet:
            if c in root.next:
                node = root.next[c]
                node.fail = root
                queue.append(node)
            else:
                root.next[c] = root

        while len(queue) > 0:
            node: Node = queue.popleft()

            for c in alphabet:
                if c in node.next:
                    next: Node = node.next[c]
                    queue.append(next)
                    x = node.fail

                    while c not in x.next:
                        x = x.fail

                    next.fail = x.next[c]


def Aho_Corasick(trie: Trie, text):
    node: Node = trie.root
    ans = []

    for i, c in enumerate(text):
        while c not in node.next:
            node = node.fail

        node = node.next[c]
        if node.term > -1:
            ans.append((i, node.term))

    return ans


In [4]:
print(Trie(["ab", "bc", "bca", "bcb", "caa", "cab", "ba", "bcba"]))



	┕━━━━ a
		┕━━━━ ab
	┕━━━━ b
		┕━━━━ bc
			┕━━━━ bca
			┕━━━━ bcb
				┕━━━━ bcba
		┕━━━━ ba
	┕━━━━ c
		┕━━━━ ca
			┕━━━━ caa
			┕━━━━ cab



In [13]:
text = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum."
alphabet = set(text)

patterns = ["dolor", "esse", "dolore", "in"]
trie = Trie(patterns)
trie.build_automaton(alphabet=alphabet)

for i, pat_idx in Aho_Corasick(trie=trie, text=text):
    print(i, patterns[pat_idx])


16 dolor
48 in
80 in
107 dolor
108 dolore
137 in
252 dolor
255 in
272 in
293 esse
306 dolor
307 dolore
347 in
389 in


In [26]:
def match_2d(pattern, text):
    transpose = lambda A: [[A[i][j] for i in range(len(A))] for j in range(len(A[0]))]

    print(transpose(pattern))

    trie = Trie(patterns=patterns_1d)
    trie.build_automaton(alphabet=set(text.flatten()))

    N = [[-1 for _ in range(len(text[0]))] for _ in range(len(text))]

    for col, s in enumerate(transpose(text)):
        for row, pat_idx in Aho_Corasick(trie=trie, text=s):
            N[row][col] = pat_idx

    return N


In [27]:
pattern = list(map(lambda x: x.split(" "), ["a a a", "b b a", "a a b"]))
text = list(
    map(
        lambda x: x.split(" "),
        [
            "a b a b a b b",
            "a a a a b b b",
            "b b b a a a b",
            "a a a b b a a",
            "b b a a a b b",
            "a a b a a a a",
        ],
    )
)

match_2d(pattern, text)


[['a', 'b', 'a'], ['a', 'b', 'a'], ['a', 'a', 'b']]


NameError: name 'patterns_1d' is not defined

In [21]:
transpose = lambda arr: [[arr[i][j] for i in range(len(arr))] for j in range(len(arr[0]))]
transpose([[1,1],[2,2],[3,3]])

[[1, 2, 3], [1, 2, 3]]

2. Znajdź w załączonym pliku "haystack.txt" wszystkie sytuacje, gdy taka sama litera występuje na
   tej samej pozycji w dwóch kolejnych linijkach. Zwróć uwagę, na nierówną długość linii w pliku.

In [16]:
text = open("haystack.txt", "r").read().splitlines()
MAX_LEN = len(max(text, key=lambda x: len(x)))
text = list(
    map(lambda x: list(x), list(map(lambda x: x + " " * (MAX_LEN - len(x)), text)))
)

np.array(text)


array([['O', 'n', 'e', ..., ' ', ' ', ' '],
       ['o', 'f', ' ', ..., ' ', ' ', ' '],
       ['b', 'e', ' ', ..., ' ', ' ', ' '],
       ...,
       ['e', 'f', 'f', ..., ' ', ' ', ' '],
       ['(', 'P', 'a', ..., ' ', ' ', ' '],
       ['a', 'n', 'd', ..., ' ', ' ', ' ']], dtype='<U1')

3. Znajdź wszystkie wystąpienia "th" oraz "t h" w dwóch kolejnych liniach na tej samej pozycji. 

4. Wybierz przynajmniej 4 litery (małe). Znajdź wszystkie wystąpienia tej litery w załączonym pliku
   "haystack.png"

5. Znajdź wszystkie wystąpienia słowa "p a t t e r n" w haystack.png. 

6. Porównaj czas budowania automatu i czas wyszukiwania dla różnych rozmiarów wzorca 

7. Podziel plik na 2, 4 i 8 fragmentów (w poziomie) i porównaj czas przeszukiwania