# Code Written by:
**Shweta Tiwari**
*20 Oct 2023*

## Algorithm:  Aho-Corasick

In [1]:
import time

In [2]:
from collections import deque, defaultdict
from itertools import count

# Algorithm

In [3]:
%%time
def aho_corasick():
    """Create the four empty lookup tables that make up the automaton.

    Returns:
        tuple: ``(G, W, F, O)``, all ``defaultdict`` instances.

        * ``G`` -- transition table keyed by ``(state, symbol)``. Indexing a
          missing key allocates the next unused state id (1, 2, 3, ...) and
          stores it, so state 0 never appears as a target.
        * ``W`` -- per-state ``set`` (empty by default); ``add_word`` fills it
          with the symbols that leave each state.
        * ``F`` -- fallback state per state, defaulting to 0.
        * ``O`` -- per-state ``set`` of outputs, empty by default.
    """
    # Each automaton gets its own counter, so state ids are never shared
    # between two independently created automata.
    allocate_state = count(1).__next__

    transitions = defaultdict(allocate_state)
    symbols = defaultdict(set)
    fallbacks = defaultdict(int)
    outputs = defaultdict(set)

    return transitions, symbols, fallbacks, outputs

CPU times: user 0 ns, sys: 5 µs, total: 5 µs
Wall time: 8.34 µs


In [4]:
%%time
def add_word(word, G, W, F, O):
    """Insert ``word`` into the trie part of the automaton.

    Walks from root state 0 along the symbols of ``word``, recording each
    symbol in ``W`` and following (or creating, via indexing ``G``) the
    matching transition. The final state is marked as emitting ``word``.

    Fallbacks are not touched here; ``F`` is accepted only so the function
    can be called as ``add_word(word, *automaton)``. Run ``build_fsa`` after
    the last word has been added.

    Args:
        word: Pattern to store. An empty pattern is ignored.
        G: Transition table keyed by ``(state, symbol)``.
        W: Per-state sets of outgoing symbols.
        F: Fallback table (unused).
        O: Per-state sets of emitted words.

    Returns:
        None. ``G``, ``W`` and ``O`` are updated in place.
    """
    # An empty pattern would be stored as an output of the root state and
    # then be reported as a spurious match throughout any searched text.
    # ``len`` (rather than truthiness) keeps the TypeError for a ``None`` word.
    if len(word) == 0:
        return

    state = 0

    # Add transitions between states, one per symbol.
    for symbol in word:
        W[state].add(symbol)
        state = G[state, symbol]

    # The state reached after the last symbol emits the whole word.
    O[state].add(word)

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 5.96 µs


In [5]:
%%time
def build_fsa(G, W, F, O):
    """Fill in fallback links and merged outputs, breadth-first.

    Starting from the states directly reachable from root state 0, every
    state is visited once. For each outgoing symbol of a visited state, the
    child's fallback is found by following the visited state's fallback chain
    until a state with a transition on that symbol turns up (or the root is
    reached). The child then also emits everything its fallback emits.

    Args:
        G: Transition table keyed by ``(state, symbol)``.
        W: Per-state sets of outgoing symbols.
        F: Fallback table; written for every state below the first level.
        O: Per-state output sets; extended in place.

    Returns:
        None. ``F`` and ``O`` are updated in place.
    """
    # Seed the work list with the children of the root; their fallback is
    # left at whatever F yields for them by default.
    pending = deque()
    for symbol in W[0]:
        pending.append(G[0, symbol])

    while pending:
        parent = pending.popleft()

        for symbol in W[parent]:
            # Walk the parent's fallback chain until some state can consume
            # this symbol, stopping at the root (state 0).
            candidate = F[parent]
            while candidate and (candidate, symbol) not in G:
                candidate = F[candidate]

            child = G[parent, symbol]

            # .get() never allocates a state, unlike indexing G.
            F[child] = G.get((candidate, symbol), 0)

            # Inherit the outputs of the fallback state.
            O[child].update(O[F[child]])

            pending.append(child)

CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 9.3 µs


In [6]:
%%time
def search(text, G, W, F, O):
    """Scan ``text`` and print every position at which stored words end.

    For each hit one line is printed: ``@``, the index of the character that
    completes the match, and the set of words ending there.

    The automaton tables are only read. Lookups go through ``.get`` so that
    scanning never inserts keys into ``F`` or ``O`` (plain indexing on a
    ``defaultdict`` would add an entry for every state visited).

    Args:
        text: Iterable of symbols to scan.
        G: Transition table keyed by ``(state, symbol)``.
        W: Per-state symbol sets (unused; accepted so the function can be
            called as ``search(text, *automaton)``).
        F: Fallback table; a missing entry is treated as 0, the same default
            ``aho_corasick`` configures.
        O: Per-state output sets; a missing entry means no output.

    Returns:
        None. Matches are reported on standard output.
    """
    state = 0

    for i, symbol in enumerate(text):
        # Fallback: retreat until some state accepts the symbol or we are
        # back at the root.
        while state and (state, symbol) not in G:
            state = F.get(state, 0)

        # Transition: .get() avoids allocating a new state for an unknown
        # (state, symbol) pair.
        state = G.get((state, symbol), 0)

        # Output: report the words ending at this position, if any.
        found = O.get(state)
        if found:
            print('@', i, found)

CPU times: user 5 µs, sys: 1 µs, total: 6 µs
Wall time: 9.3 µs


# Run

In [7]:
%%time
# Build the demo automaton: insert every pattern first, then compute the
# fallback links once all words are in place.
AC = aho_corasick()
for pattern in ('bar', 'ara', 'bara', 'barbara'):
    add_word(pattern, *AC)
build_fsa(*AC)

CPU times: user 118 µs, sys: 0 ns, total: 118 µs
Wall time: 192 µs


In [8]:
%%time
# Scan a sample sentence. `search` prints one line per hit: '@', the index of
# the last character of the match, and the set of patterns ending there.
search('barbarian barbara said: barabum', *AC)

@ 2 {'bar'}
@ 5 {'bar'}
@ 12 {'bar'}
@ 15 {'bar'}
@ 16 {'ara', 'barbara', 'bara'}
@ 26 {'bar'}
@ 27 {'ara', 'bara'}
CPU times: user 378 µs, sys: 80 µs, total: 458 µs
Wall time: 615 µs


# The End