<a href="https://colab.research.google.com/github/AbeHandler/AbeHandler.github.io/blob/master/sa.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
## perhaps tokens tho

In [2]:
import numpy as np
from pydivsufsort import divsufsort

# string1
string1 = "banana$" + "8"*8
A1 = divsufsort(string1)
A1.astype(np.uint64).tofile("A1.bin")

# string2
string2 = "strawberry$" + "8"*9
A2 = divsufsort(string2)
A2 + A1.max() + 1 # shift, +1 b/c zero indexes
A2.astype(np.uint64).tofile("A2.bin")

# save combined text for suffix comparisons
with open("full_text.txt", "wb") as f:
    f.write(string1.encode("utf-8"))
    f.write(string2.encode("utf-8"))

In [2]:
import numpy as np
import mmap

def stream_uint64(filepath):
    with open(filepath, "rb") as f:
        while True:
            b = f.read(8)
            if not b:
                break
            yield np.frombuffer(b, dtype=np.uint64)[0]

# Open text as mmap
with open("full_text.txt", "rb") as ftxt:
    text = mmap.mmap(ftxt.fileno(), 0, access=mmap.ACCESS_READ)

    # Create generators for both suffix arrays
    A1_gen = stream_uint64("A1.bin")
    A2_gen = stream_uint64("A2.bin")

    val1 = next(A1_gen, None)
    val2 = next(A2_gen, None)

    with open("merged_sa.bin", "wb") as fout:
        while val1 is not None and val2 is not None:
            s1 = text[val1:]
            s2 = text[val2:]
            if s1 < s2:
                fout.write(np.uint64(val1).tobytes())
                val1 = next(A1_gen, None)
            else:
                fout.write(np.uint64(val2).tobytes())
                val2 = next(A2_gen, None)

        # Drain remaining values
        while val1 is not None:
            fout.write(np.uint64(val1).tobytes())
            val1 = next(A1_gen, None)

        while val2 is not None:
            fout.write(np.uint64(val2).tobytes())
            val2 = next(A2_gen, None)


In [9]:
SA = np.fromfile("merged_sa.bin", dtype=np.uint64)

SA.astype(np.int32).shape

(35,)

In [None]:
from pydivsufsort import kasai

kasai(string1 + string2, SA.astype(np.int32))

In [10]:
from pydivsufsort import divsufsort, kasai

text = string1 + string2
sa = divsufsort(text)
string_lcp_array = kasai(text, string_suffix_array)
lcp = kasai(text, sa)

print(f"{'i':>2} {'SA[i]':>6} {'LCP[i]':>7}  Suffix")
print("-" * 40)

for i in range(len(sa)):
    suffix = text[sa[i]:]
    lcp_val = lcp[i] if i > 0 else 0
    print(f"{i:2} {sa[i]:6} {lcp_val:7}  {suffix}")

 i  SA[i]  LCP[i]  Suffix
----------------------------------------
 0     25       0  $888888888
 1      6       0  $88888888strawberry$888888888
 2     34       1  8
 3     33       2  88
 4     32       3  888
 5     31       4  8888
 6     30       5  88888
 7     29       6  888888
 8     28       7  8888888
 9     27       8  88888888
10     26       8  888888888
11      7       7  88888888strawberry$888888888
12      8       6  8888888strawberry$888888888
13      9       5  888888strawberry$888888888
14     10       4  88888strawberry$888888888
15     11       3  8888strawberry$888888888
16     12       2  888strawberry$888888888
17     13       1  88strawberry$888888888
18     14       0  8strawberry$888888888
19      5       1  a$88888888strawberry$888888888
20      3       3  ana$88888888strawberry$888888888
21      1       1  anana$88888888strawberry$888888888
22     18       0  awberry$888888888
23      0       1  banana$88888888strawberry$888888888
24     20       0  berry$

In [11]:
string1

'banana$88888888'

In [12]:
string2

'strawberry$888888888'

In [13]:
string1 + string2

'banana$88888888strawberry$888888888'

In [11]:
SA.astype(np.int32)

array([ 6, 10, 14, 13, 12, 11, 10,  9,  8,  7,  5,  3,  1,  0,  4,  2, 19,
       18, 17, 16, 15, 14, 13, 12, 11,  3,  5,  6,  2,  7,  8,  0,  1,  4,
        9], dtype=int32)

In [13]:
string_suffix_array

array([6, 5, 3, 1, 0, 4, 2], dtype=int32)

In [46]:
# Compute suffix array and LCP naively, then display common prefixes of length 2

def compute_sa(text):
    return sorted(range(len(text)), key=lambda i: text[i:])

def compute_lcp(text, sa):
    lcp = [0] * len(sa)
    for i in range(1, len(sa)):
        # find common prefix length between suffixes at sa[i] and sa[i-1]
        a, b = text[sa[i]:], text[sa[i-1]:]
        h = 0
        while h < len(a) and h < len(b) and a[h] == b[h]:
            h += 1
        lcp[i] = h
    return lcp

s = "aabaa"
s1 = f"{s}document stuff"
s2 = f"{s}lorem"
text = s1 + "$" + s2  # "aabaa document stuff$aabaa lorem"

# Compute SA and LCP
sa = compute_sa(text)
lcp = compute_lcp(text, sa)

THRESHOLD = 5

# Display entries where LCP >= 2 and show the two common characters
print(f"{'i':>2} {'SA[i]':>5} {'SA[i-1]':>7} {'LCP[i]':>7}  {f'Common ({THRESHOLD} chars)':>20}  {'Suffix_i':>12}  {'Suffix_prev'}")
print("-" * 100)
for i in range(1, len(sa)):
    if lcp[i] >= THRESHOLD:
        pos_i = sa[i]
        pos_prev = sa[i-1]
        common = text[pos_i:pos_i + THRESHOLD]
        suffix_i = text[pos_i:]
        suffix_prev = text[pos_prev:]
        print(f"{i:2} {pos_i:5} {pos_prev:7} {lcp[i]:7}  {common!r:20}  {suffix_i!r:12}  {suffix_prev!r}")


 i SA[i] SA[i-1]  LCP[i]      Common (5 chars)      Suffix_i  Suffix_prev
----------------------------------------------------------------------------------------------------
 3    20       0       5  'aabaa'               'aabaalorem'  'aabaadocument stuff$aabaalorem'


In [36]:
text[sa[0]:]

'$aab'