In [17]:
import clgen
from clgen.atomizer import GreedyAtomizer

In [18]:
kernel = """\
__kernel void A(__global char* a, short b, int c) {
  const int d = get_global_id(0);
  a[d] = b;
}"""

In [19]:
a1 = GreedyAtomizer.from_text(kernel)
derived_tokens = set(a1.vocab.keys())

In [24]:
tokens = [
    '__kernel',
    ' ',
    'void',
    'A',
    '(',
    '__global',
    'char',
    '*',
    'a',
    ',',
    'short',
    'b',
    'int',
    'c',
    ')',
    '{',
    '\n',
    '  ',
    'const',
    'int',
    'd',
    '=',
    'get_global_id',
    '0',
    ';',
    '[',
    ']',
    '}'
]
vocab = dict(zip(tokens, range(len(tokens))))
assert set(tokens) == derived_tokens

In [26]:
a2 = atomizer.GreedyAtomizer(vocab)

In [37]:
print(", ".join("{x:02d}".format(**vars()) for x in a2.atomize(kernel)))

00, 01, 02, 01, 03, 04, 05, 01, 06, 07, 01, 08, 09, 01, 10, 01, 11, 09, 01, 19, 01, 13, 14, 01, 15, 16, 17, 18, 01, 19, 01, 20, 01, 21, 01, 22, 04, 23, 14, 24, 16, 17, 08, 25, 20, 26, 01, 21, 01, 11, 24, 16, 27


In [54]:
width = 11
def chunks(l, n):
    """Yield successive n-sized chunks from l."""
    for i in range(0, len(l), n):
        yield l[i:i + n]

x = ["\\texttt{{{x:02d}}}".format(**vars()) for x in a2.atomize(kernel)]
print(" ".join(["l"] * width))
for chunk in chunks(x, width):
    print(" & ".join(chunk), "\\\\")

l l l l l l l l l l l
\texttt{00} & \texttt{01} & \texttt{02} & \texttt{01} & \texttt{03} & \texttt{04} & \texttt{05} & \texttt{01} & \texttt{06} & \texttt{07} & \texttt{01} \\
\texttt{08} & \texttt{09} & \texttt{01} & \texttt{10} & \texttt{01} & \texttt{11} & \texttt{09} & \texttt{01} & \texttt{19} & \texttt{01} & \texttt{13} \\
\texttt{14} & \texttt{01} & \texttt{15} & \texttt{16} & \texttt{17} & \texttt{18} & \texttt{01} & \texttt{19} & \texttt{01} & \texttt{20} & \texttt{01} \\
\texttt{21} & \texttt{01} & \texttt{22} & \texttt{04} & \texttt{23} & \texttt{14} & \texttt{24} & \texttt{16} & \texttt{17} & \texttt{08} & \texttt{25} \\
\texttt{20} & \texttt{26} & \texttt{01} & \texttt{21} & \texttt{01} & \texttt{11} & \texttt{24} & \texttt{16} & \texttt{27} \\


In [38]:
print(len(a2.atomize(kernel)))

53


In [48]:
print("\\textbf{i} & \\textbf{v}\\\\")
val = sorted(vocab.items(), key=lambda x: x[1])
for c1, c2, c3 in zip(val[:9], val[9:18], val[18:]):
    c1_l, c1_r = c1
    c2_l, c2_r = c2
    c3_l, c3_r = c3
    print("\\texttt{{{c1_r:02d}}} & \\texttt{{'{c1_l}'}} & \\texttt{{{c2_r:02d}}} & \\texttt{{'{c2_l}'}} & \\texttt{{{c3_r:02d}}} & \\texttt{{'{c3_l}'}}\\\\".format(**vars()))

\textbf{i} & \textbf{v}\\
\texttt{00} & \texttt{'__kernel'} & \texttt{09} & \texttt{','} & \texttt{19} & \texttt{'int'}\\
\texttt{01} & \texttt{' '} & \texttt{10} & \texttt{'short'} & \texttt{20} & \texttt{'d'}\\
\texttt{02} & \texttt{'void'} & \texttt{11} & \texttt{'b'} & \texttt{21} & \texttt{'='}\\
\texttt{03} & \texttt{'A'} & \texttt{13} & \texttt{'c'} & \texttt{22} & \texttt{'get_global_id'}\\
\texttt{04} & \texttt{'('} & \texttt{14} & \texttt{')'} & \texttt{23} & \texttt{'0'}\\
\texttt{05} & \texttt{'__global'} & \texttt{15} & \texttt{'{'} & \texttt{24} & \texttt{';'}\\
\texttt{06} & \texttt{'char'} & \texttt{16} & \texttt{'
'} & \texttt{25} & \texttt{'['}\\
\texttt{07} & \texttt{'*'} & \texttt{17} & \texttt{'  '} & \texttt{26} & \texttt{']'}\\
\texttt{08} & \texttt{'a'} & \texttt{18} & \texttt{'const'} & \texttt{27} & \texttt{'}'}\\
