## Laboratorium 5.
#### Bartosz Hanc

1. Zaimplementuj algorytm wyszukiwania wzorca 2-wymiarowego. 

In [131]:
import numpy as np
from typing import Sequence, Hashable


In [132]:
class Node:
    def __init__(self, symb) -> None:
        self.next: dict[Hashable, Node] = {}
        self.fail: Node = None
        self.symb: str = symb
        self.term: int = -1


class Trie:
    def __init__(self, patterns) -> None:
        self.root = Node(symb="")
        self.__build_trie(patterns=patterns)

    def __build_trie(self, patterns: Sequence[Sequence[Hashable]]):
        for i, pattern in enumerate(patterns):
            node = self.root
            for c in pattern:
                if c in node.next:
                    node = node.next[c]
                else:
                    child = Node(symb=str(c))
                    node.next[c] = child
                    node = child

            node.term = i

    def __repr__(self):
        def rec_print(node, level=0):
            line = "" if level == 0 else "┕━━━━ "
            ret = "\t" * level + line + node.symb + "\n"

            for _, child in node.next.items():
                ret += rec_print(child, level + 1)

            return ret

        return rec_print(self.root)

    def build_automaton(self, alphabet: Sequence[Hashable]):
        from collections import deque

        queue = deque()
        root = self.root

        for c in alphabet:
            if c in root.next:
                node = root.next[c]
                node.fail = root
                queue.append(node)
            else:
                root.next[c] = root

        while len(queue) > 0:
            node: Node = queue.popleft()

            for c in alphabet:
                if c in node.next:
                    next: Node = node.next[c]
                    queue.append(next)
                    x = node.fail

                    while c not in x.next:
                        x = x.fail

                    next.fail = x.next[c]


def Aho_Corasick(trie: Trie, text):
    node: Node = trie.root
    ans = []

    for i, c in enumerate(text):
        while c not in node.next:
            node = node.fail

        node = node.next[c]
        if node.term > -1:
            ans.append((i, node.term))

    return ans

In [133]:
print(Trie(["ab", "bc", "bca", "bcb", "caa", "cab", "ba", "bcba"]))


	┕━━━━ a
		┕━━━━ b
	┕━━━━ b
		┕━━━━ c
			┕━━━━ a
			┕━━━━ b
				┕━━━━ a
		┕━━━━ a
	┕━━━━ c
		┕━━━━ a
			┕━━━━ a
			┕━━━━ b



In [134]:
text = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum."
alphabet = set(text)

patterns = ["dolor", "esse", "dolore", "in"]
trie = Trie(patterns)
trie.build_automaton(alphabet=alphabet)

for i, pat_idx in Aho_Corasick(trie=trie, text=text):
    print(i, patterns[pat_idx])

16 dolor
48 in
80 in
107 dolor
108 dolore
137 in
252 dolor
255 in
272 in
293 esse
306 dolor
307 dolore
347 in
389 in


In [164]:
def match_2d(pattern_2d: np.array, text: np.array):
    patterns = np.unique(pattern_2d.T, axis=0)
    trie = Trie(patterns)
    trie.build_automaton(alphabet=set(text.flatten()))

    N = np.array([[-1 for _ in range(len(text[0]))] for _ in range(len(text))])

    for col, s in enumerate(text.T):
        for row, pat_idx in Aho_Corasick(trie=trie, text=s):
            N[row][col] = pat_idx

    pattern_1d = np.array([], dtype=np.int64)
    for pattern in pattern_2d.T:
        for i, other in enumerate(patterns):
            if np.array_equal(pattern, other):
                pattern_1d = np.append(pattern_1d, i)
                break

    ans = []
    n = len(pattern_1d)
    for row, s in enumerate(N):
        for col in range(len(s)):
            if col + n < len(s) and np.array_equal(pattern_1d, s[col : col + n]):
                ans.append((row, col + n - 1))

    return ans


def show(ans, text_array, pat_shape, extr_space=False):
    n, m = pat_shape
    MARK = "\033[91m"
    ENDC = "\033[0m"

    show = text_array.tolist()
    for row, col in ans:
        for i in range(row - n + 1, row + 1):
            for j in range(col - m + 1, col + 1):
                show[i][j] = MARK + show[i][j] + ENDC

    m = " " if extr_space else ""
    print("\n".join([m.join(s) for s in show]))


In [165]:
pattern = np.array(list(map(lambda x: x.split(" "), ["a a a", "b b a", "a a b"])))
text = np.array(
    list(
        map(
            lambda x: x.split(" "),
            [
                "a b a b a b b",
                "a a a a b b b",
                "b b b a a a b",
                "a a a b b a a",
                "b b a a a b b",
                "a a b a a a a",
            ],
        )
    )
)

ans = match_2d(pattern_2d=pattern, text=text)
print(ans, "\n")
show(ans, text, pattern.shape, extr_space=True)

[(3, 3), (4, 5), (5, 2)] 

a b a b a b b
a [91ma[0m [91ma[0m [91ma[0m b b b
b [91mb[0m [91mb[0m [91m[91ma[0m[0m [91ma[0m [91ma[0m b
[91ma[0m [91m[91ma[0m[0m [91m[91ma[0m[0m [91m[91mb[0m[0m [91mb[0m [91ma[0m a
[91mb[0m [91mb[0m [91ma[0m [91ma[0m [91ma[0m [91mb[0m b
[91ma[0m [91ma[0m [91mb[0m a a a a


2. Znajdź w załączonym pliku "haystack.txt" wszystkie sytuacje, gdy taka sama litera występuje na
   tej samej pozycji w dwóch kolejnych linijkach. Zwróć uwagę, na nierówną długość linii w pliku.

In [166]:
import string

text = open("haystack.txt", "r").read().splitlines()
MAX_LEN = len(max(text, key=lambda x: len(x)))
text = list(
    map(lambda x: list(x), list(map(lambda x: x + " " * (MAX_LEN - len(x)), text)))
)
text = np.array(text)

ans = []
for c in string.ascii_letters:
    pattern = np.array([[c], [c]])
    ans += match_2d(pattern, text)

show(ans, text, (2, 1))


One of [91mt[0mhe simplest and n a t u r a l types of information repr[91me[0msentation is by me[91ma[0m[91mn[0ms                    
of [91mw[0m[91mr[0m[91mi[0m[91mt[0m[91m[91mt[0m[0m[91me[0m[91mn[0m texts. This type of d a t a is characterized by t h [91me[0m fact t h a t it c[91ma[0m[91mn[0m                     
be [91mw[0m[91mr[0m[91mi[0m[91mt[0m[91mt[0m[91me[0m[91mn[0m down as a long sequence of characters. Such linear a sequence                                 
is called a text. T h e texts [91ma[0mre cen[91mt[0mral in "word pro[91mc[0mes[91ms[0ming" [91ms[0mystems, which                            
provide facili[91mt[0mies for [91mt[0m h e m[91ma[0mnipula[91mt[0mion of text[91ms[0m. Su[91mc[0mh [91ms[0mystem[91ms[0m usually pr[91mo[0mc[91me[0mss                         
objects t h a [91mt[0m are qui[91mt[0me large. For example, thi[91ms[0m book prob[91ma[0mbly c[91mo[0mntains m[91mo[0mr[91me

3. Znajdź wszystkie wystąpienia "th" oraz "t h" w dwóch kolejnych liniach na tej samej pozycji. 

In [167]:
ans = []
pattern = np.array([["t", " ", "h"], ["t", " ", "h"]])
ans += match_2d(pattern, text)

pattern = np.array([["t", " ", "h"], ["t", " ", "h"]])
ans += match_2d(pattern, text)

show(ans, text, pattern.shape)

One of the simplest and n a t u r a l types of information representation is by means                    
of written texts. This type of d a t a is characterized by t h e fact t h a t it can                     
be written down as a long sequence of characters. Such linear a sequence                                 
is called a text. T h e texts are central in "word processing" systems, which                            
provide facilities for t h e manipulation of texts. Such systems usually process                         
objects t h a t are quite large. For example, this book probably contains more                           
t h a n a million characters. Text algorithms occur in many areas of science and                         
information processing. Many text editors and programming languages have                                 
facilities for processing texts. In biology, text algorithms arise in the study                          
of molecular sequences. T h e complexity of te

4. Wybierz przynajmniej 4 litery (małe). Znajdź wszystkie wystąpienia tej litery w załączonym pliku
   "haystack.png"

In [168]:
from PIL import Image

text = np.array(Image.open("haystack.png").convert("L"))
pat_a = np.array(Image.open("a.png").convert("L"))
pat_b = np.array(Image.open("b.png").convert("L"))
pat_c = np.array(Image.open("c.png").convert("L"))
pat_d = np.array(Image.open("d.png").convert("L"))

print("Liczba wystąpień")
print("'a':", len(match_2d(pat_a, text)))
print("'b':", len(match_2d(pat_b, text)))
print("'c':", len(match_2d(pat_c, text)))
print("'d':", len(match_2d(pat_d, text)))


Liczba wystąpień
'a': 397
'b': 56
'c': 213
'd': 137


5. Znajdź wszystkie wystąpienia słowa "p a t t e r n" w haystack.png. 

In [161]:
text = np.array(Image.open("haystack.png").convert("L"))
pat = np.array(Image.open("pattern.png").convert("L"))
print("L. wystąpień 'p a t t e r n' :", len(match_2d(pat, text)))


L. wystąpień 'p a t t e r n': 5


6. Porównaj czas budowania automatu i czas wyszukiwania dla różnych rozmiarów wzorca 

In [211]:
import time
from tabulate import tabulate

table = []

text = open("haystack.txt", "r").read().splitlines()
alphabet = set(text)
MAX_LEN = len(max(text, key=lambda x: len(x)))
text = list(
    map(lambda x: list(x), list(map(lambda x: x + " " * (MAX_LEN - len(x)), text)))
)
text = np.array(text)

for pat_len in range(10, 110, 10):
    pattern = text[:pat_len, :pat_len]
    start = time.perf_counter()
    Trie(pattern).build_automaton(alphabet)
    end = time.perf_counter()
    build_time = end - start

    start = time.perf_counter()
    match_2d(pattern, text)
    end = time.perf_counter()
    search_time = end - start

    table.append((pat_len, build_time, search_time))


print(
    tabulate(
        table,
        headers=["Pattern length (NxN)", "Automaton build time [s]", "Search time [s]"],
    )
)


  Pattern length (NxN)    Automaton build time [s]    Search time [s]
----------------------  --------------------------  -----------------
                    10                   0.0002066          0.0386464
                    20                   0.000307           0.035057
                    30                   0.0012827          0.0345387
                    40                   0.0011188          0.0366719
                    50                   0.0016381          0.0398655
                    60                   0.0028601          0.041236
                    70                   0.0044025          0.122103
                    80                   0.0052552          0.0652718
                    90                   0.0052992          0.0573399
                   100                   0.0059585          0.132607


7. Podziel plik na 2, 4 i 8 fragmentów (w poziomie) i porównaj czas przeszukiwania

In [206]:
text_2 = text.T[:len(text) // 2].T
text_4 = text.T[:len(text) // 4].T
text_8 = text.T[:len(text) // 8].T

print("\n".join(["".join(l) for l in text_2.tolist()]))
print("\n".join(["".join(l) for l in text_4.tolist()]))
print("\n".join(["".join(l) for l in text_8.tolist()]))

One of the simplest and n a t u r a l type
of written texts. This type of d a t a is 
be written down as a long sequence of char
is called a text. T h e texts are central 
provide facilities for t h e manipulation 
objects t h a t are quite large. For examp
t h a n a million characters. Text algorit
information processing. Many text editors 
facilities for processing texts. In biolog
of molecular sequences. T h e complexity o
central and most studied problems in theor
be said t h a t it is the domain in which 
each other.                               
T h e basic textual problem in stringology
used to access information and, no doubt, 
are solving this problem as a frequently u
system. P a t t e r n matching is comparab
arithmetic operations.                    
Consider the problem of a reader of the Fr
who wants all entries related to the n a m
a n example of a p a t t e r n matching pr
the n a m e "Marie-Curie-Sklodowska" is th
find a string called a pattern of length m
greater t h

In [214]:
pattern = text[:20, :20]
table = []

start = time.perf_counter()
match_2d(pattern, text_2)
end = time.perf_counter()
search_time = end - start

table.append(("2", search_time))

start = time.perf_counter()
match_2d(pattern, text_4)
end = time.perf_counter()
search_time = end - start

table.append(("4", search_time))

start = time.perf_counter()
match_2d(pattern, text_8)
end = time.perf_counter()
search_time = end - start

table.append(("8", search_time))

print(tabulate(table, headers=["No. parts", "Search time [s]"]))


  No. parts    Search time [s]
-----------  -----------------
          2          0.0140266
          4          0.0063591
          8          0.0028982
