<a href="https://colab.research.google.com/github/AbeHandler/AbeHandler.github.io/blob/master/sa.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### The plan

1. Compute the suffix array (SA) for each shard
2. Merge the per-shard SAs with a low-memory, memory-mapped merge
3. Shard the big merged SA, so that similar sequences end up in the same shard
4. Search those shards in parallel

In [46]:
# Compute suffix array and LCP naively, then display adjacent suffixes whose common prefix is at least THRESHOLD characters long

def compute_sa(text):
    """Return the suffix array of ``text``.

    The result is a list of every start position in ``text``, ordered so that
    the suffixes beginning at those positions are in ascending sort order.

    This is the naive construction: each suffix is materialized as a slice
    and used directly as the sort key.
    """
    def suffix_at(start):
        # The suffix beginning at ``start`` is the sort key for that position.
        return text[start:]

    order = list(range(len(text)))
    order.sort(key=suffix_at)
    return order

def compute_lcp(text, sa):
    """Return the LCP array for ``text`` given its suffix array ``sa``.

    ``lcp[i]`` is the length of the longest common prefix of the suffixes
    starting at ``sa[i]`` and ``sa[i-1]``. ``lcp[0]`` has no predecessor and
    is left at 0.

    Args:
        text: The indexed sequence the suffix array was built from.
        sa: Suffix start positions (non-negative indices into ``text``),
            e.g. as returned by ``compute_sa``.

    Returns:
        A list of ints with the same length as ``sa``.
    """
    n = len(text)
    lcp = [0] * len(sa)
    for i in range(1, len(sa)):
        cur, prev = sa[i], sa[i - 1]
        # The shorter of the two suffixes bounds the common prefix length.
        limit = n - max(cur, prev)
        h = 0
        # Compare characters in place via offsets instead of slicing out
        # full copies of both suffixes for every adjacent pair.
        while h < limit and text[cur + h] == text[prev + h]:
            h += 1
        lcp[i] = h
    return lcp

# Two small "documents" that share the prefix "aabaa", joined by a "$"
# separator: "aabaadocument stuff$aabaalorem"
s = "aabaa"
s1 = f"{s}document stuff"
s2 = f"{s}lorem"
text = "$".join((s1, s2))

# Build the suffix array and the matching LCP array for the joined text.
sa = compute_sa(text)
lcp = compute_lcp(text, sa)

# Minimum shared-prefix length for a pair of adjacent suffixes to be shown.
THRESHOLD = 5

# Show every pair of SA-adjacent suffixes that share at least THRESHOLD
# leading characters, along with the first THRESHOLD shared characters.
header = f"{'i':>2} {'SA[i]':>5} {'SA[i-1]':>7} {'LCP[i]':>7}  {f'Common ({THRESHOLD} chars)':>20}  {'Suffix_i':>12}  {'Suffix_prev'}"
print(header)
print("-" * 100)
for i, (pos_prev, pos_i) in enumerate(zip(sa, sa[1:]), start=1):
    if lcp[i] < THRESHOLD:
        continue  # shared prefix too short to report
    common = text[pos_i:pos_i + THRESHOLD]
    suffix_i, suffix_prev = text[pos_i:], text[pos_prev:]
    print(f"{i:2} {pos_i:5} {pos_prev:7} {lcp[i]:7}  {common!r:20}  {suffix_i!r:12}  {suffix_prev!r}")


 i SA[i] SA[i-1]  LCP[i]      Common (5 chars)      Suffix_i  Suffix_prev
----------------------------------------------------------------------------------------------------
 3    20       0       5  'aabaa'               'aabaalorem'  'aabaadocument stuff$aabaalorem'
