In [2]:
from random import random, choice, randint

In [66]:
# Length parameter of the enumeration: it is embedded in the file name below,
# sizes the per-length counters in the counting cell and sets the padding
# width in the Triple cell.
L = 9
# Enumeration file opened by every sampling cell in this notebook.
# NOTE(review): the name says "dyckthree", yet the Star/Brack/Expr/Triple/Dyck-1
# sections read this same variable — presumably it is re-pointed at the matching
# enumeration before each section is run; confirm.
file = f'dyckthree{L}.txt'

In [62]:
# Tally the enumeration: lines containing ',' count as failures; every other
# line is bucketed by the position of its '|' separator.
fail = 0
succeed = [0] * (L + 1)
with open(file, 'r') as f:
    for line in f:
        body = line[:-1]  # drop the trailing newline
        if ',' in body:
            fail += 1
        else:
            succeed[body.index('|')] += 1

print(f"Fail {fail}")
print(f"Succeed {succeed}")

Fail 3758475
Succeed [0, 6576750, 0, 906750, 0, 148650, 0]


# Star
No strings fail in `Star`; we pick uniformly from the strings.

We aim for test and validation sets with $5\cdot10^4$ strings (5%) each, and a train set with roughly $10^5$ strings (10%).

In [8]:
with open('star/train.txt', 'w') as train, \
     open('star/val.txt', 'w') as val, \
     open('star/test.txt', 'w') as test, \
     open(file, 'r') as f:
    # Cumulative upper bounds on the uniform draw: 5% test, 5% val, 10% train.
    # A line whose draw exceeds the last bound is not written anywhere.
    buckets = ((0.05, test), (0.1, val), (0.2, train))
    for line in f:
        draw = random()
        target = next((out for bound, out in buckets if draw <= bound), None)
        if target is not None:
            target.write(line)

# Brack
In `Brack`, we have about 40k strings that succeed (of various lengths). We therefore create train, test and validation sets with a 50-50 distribution between succeeding and failing strings. We pick all the succeeding strings and appropriately select the proportion of failing strings (about 4%).

In [58]:
with open('brack/train.txt', 'w') as train, \
     open('brack/val.txt', 'w') as val, \
     open('brack/test.txt', 'w') as test, \
     open(file, 'r') as f:
    # Cumulative upper bounds on the uniform draw for each class of line.
    # Failing lines (those containing ','): 2% train, 1% test, 1% val, rest dropped.
    failing_buckets = ((0.02, train), (0.03, test), (0.04, val))
    # Succeeding lines are all kept: random() < 1.0, so the last bound always matches.
    succeeding_buckets = ((0.5, train), (0.75, test), (1.0, val))

    for line in f:
        draw = random()
        buckets = failing_buckets if ',' in line[:-1] else succeeding_buckets
        for bound, out in buckets:
            if draw <= bound:
                out.write(line)
                break

# Expr
We can generate strings for `Expr` without enumerating. Valid strings can only have odd length; thus we generate roughly 5000 strings of each odd length in the range from 3 to 21 (inclusive), plus 500 single-digit datapoints.
For a given length $L$:

* Pick a random number of (pairs of) brackets, $B \in \left[0, \frac{L-1}{2}\right]$.
* Generate a random string of length $L- 2B$, by alternating between digits and operators.
* (Repeat $B$ times): Pick two random digits in the string (the second at or after the first; they may coincide), and insert brackets before the first and after the second.

We also pick 50000 failing strings (about 1.3%) from the enumerated strings.

We follow similar procedures for the test and val datasets.

For uniformity, we pad all strings to have length 21.

In [69]:
# Failing strings first
with open('expr/train.txt', 'w') as train, \
     open('expr/val.txt', 'w') as val, \
     open('expr/test.txt', 'w') as test, \
     open(file, 'r') as f:
    # Alphabet the 15 padding symbols are drawn from.
    symbols = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '×', '^', '[', ']']
    # Cumulative upper bounds on the uniform draw; anything above 0.026 is dropped.
    buckets = ((0.013, train), (0.019, val), (0.026, test))

    for line in f:
        draw = random()
        if ',' not in line[:-1]:
            continue  # only failing lines are sampled in this cell
        for bound, out in buckets:
            if draw <= bound:
                padding = ''.join(choice(symbols) for _ in range(15))
                out.write(line[:-1] + padding + '\n')
                break

In [70]:
# Succeeding strings
def gen_string(L):
    """Generate one padded `Expr` sample whose expression part has length `L`.

    The expression is built from alternating digits and operators and then
    wrapped in `B` randomly placed bracket pairs, with `B` drawn uniformly
    from 0 to (L-1)//2 inclusive.

    Args:
        L: length of the expression part (the callers in this notebook pass
            odd values from 1 to 21).

    Returns:
        For ``L == 21`` the bare 21-character expression. Otherwise the
        expression, a ``'|'`` separator, one digit or bracket, and random
        filler symbols until the whole string is 21 characters long.
    """
    digits = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
    operators = ['+', '×', '^']
    brackets = ['[', ']']

    # randint is inclusive at both ends, so B can reach (L-1)//2 (e.g. "[7]"
    # for L == 3). The previous int(random() * (L-1)/2) could never hit that
    # upper bound because random() is always < 1.
    B = randint(0, max(0, (L - 1) // 2))

    # Digit at even positions, operator at odd positions.
    s = ''.join(
        choice(digits) if i % 2 == 0 else choice(operators)
        for i in range(L - 2 * B)
    )

    for _ in range(B):
        # Digit positions are recomputed each round because the previous
        # insertion shifted them.
        dig_pos = [i for i, ch in enumerate(s) if ch in digits]
        b1 = choice(dig_pos)
        b2 = choice([p for p in dig_pos if p >= b1])  # may equal b1
        s = s[:b1] + '[' + s[b1:b2 + 1] + ']' + s[b2 + 1:]

    if L == 21:
        return s

    # Separator, then one non-operator symbol, then arbitrary filler up to 21.
    s += '|' + choice(digits + brackets)
    while len(s) < 21:
        s += choice(digits + operators + brackets)
    return s

def _append_expr_successes(path, per_length, singles):
    """Append succeeding `Expr` samples to the file at `path`.

    Args:
        path: output file, opened in append mode.
        per_length: number of samples written for each odd expression length
            from 3 to 21 inclusive.
        singles: number of single-digit samples written afterwards.
    """
    with open(path, 'a') as out:
        # The loop variable is deliberately not called `L`: at cell level that
        # would overwrite the global `L` that other cells read.
        for length in range(3, 22, 2):
            for _ in range(per_length):
                out.write(gen_string(length) + '\n')
        # gen_string(1) yields digit + '|' + one digit/bracket + filler up to
        # 21 characters, so single-digit rows have the same width as the rest
        # (the hand-rolled version appended 20 filler symbols, giving 23).
        for _ in range(singles):
            out.write(gen_string(1) + '\n')


_append_expr_successes('expr/train.txt', 5000, 500)
_append_expr_successes('expr/test.txt', 2500, 250)
_append_expr_successes('expr/val.txt', 2500, 250)

# Triple
Here too, we don't need to enumerate. For a succeeding string, we pick a random length $l \in [1,6]$ and generate $a^lb^lc^l$. We then pad with random letters until 20. This we repeat for 50,000 strings for the train set and 25,000 each for the test and validation sets.

For failing strings, we pick 9.7% of the strings from the enumerated strings.

In [27]:
def gen_string_triple(L):
    """Return one succeeding `Triple` sample.

    The sample is a^l b^l c^l for a random l in 1..6, followed by '|' and
    then L - 3*l random letters from {a, b, c}.
    """
    reps = randint(1, 6)
    core = ''.join(letter * reps for letter in 'abc')
    filler = ''.join(choice(['a', 'b', 'c']) for _ in range(L - 3 * reps))
    return core + '|' + filler

with open('triple/train.txt', 'w') as train, \
     open('triple/test.txt', 'w') as test, \
     open('triple/val.txt', 'w') as val, \
     open(file, 'r') as f:

    # Succeeding strings: 50,000 for train, 25,000 each for val and test.
    for _ in range(50000):
        train.write(gen_string_triple(20) + '\n')
    for _ in range(25000):
        for out in (val, test):
            out.write(gen_string_triple(20) + '\n')

    # Failing strings: lines containing ',' are padded with 20 - L random
    # letters and then sampled with these cumulative bounds on the draw.
    buckets = ((0.097, train), (0.146, test), (0.195, val))
    for raw in f:
        if ',' in raw:
            padded = raw[:-1] + ''.join(choice(['a', 'b', 'c']) for _ in range(20 - L))
            draw = random()
            for bound, out in buckets:
                if draw <= bound:
                    out.write(padded + '\n')
                    break

# Dyck-1
No strings fail in Dyck-1. Thus we pick so as to have a uniform distribution over the lengths, from 0 to 20 (only even lengths); we pick about 20,000 strings of each length (if there are not enough, we take all of them), of which half are used for training and a quarter each for testing and validation.

In [4]:
# Cumulative (train, test, val) upper bounds on the uniform draw, keyed by the
# position of '|' in the line. random() is always < 1.0, so a final bound of
# 1.0 keeps every line of that length.
keep_all = (0.5, 0.75, 1.0)
bounds_by_length = {
    0: (0.016, 0.024, 0.032),    # 20000/616666 = 0.032
    2: (0.064, 0.096, 0.128),    # 20000/155382 = 0.128
    4: (0.128, 0.192, 0.255),    # 20000/78406 = 0.255
    6: (0.201, 0.302, 0.403),    # 20000/49540 = 0.403
    8: (0.285, 0.427, 0.569),    # 20000/35140 = 0.569
    10: (0.373, 0.559, 0.746),   # 20000/26796 = 0.746
    12: (0.465, 0.697, 0.929),   # 20000/21516 = 0.929
    # here on, there are not 20,000 strings per length
    14: keep_all,
    16: keep_all,
    18: keep_all,
    20: keep_all,
}

with open('dyck1/train.txt', 'w') as train, \
     open('dyck1/test.txt', 'w') as test, \
     open('dyck1/val.txt', 'w') as val, \
     open(file, 'r') as f:
    sinks = (train, test, val)
    for raw in f:
        line = raw[:-1]
        s = line.index('|')
        r = random()
        # Lengths without an entry in the table are skipped.
        for bound, sink in zip(bounds_by_length.get(s, ()), sinks):
            if r <= bound:
                sink.write(line + '\n')
                break

# Dyck-3
No strings fail in Dyck-3. However, the enumerated strings stop at length 9. Thus we first define a process to generate sets of strings of greater lengths by concatenating smaller strings and enclosing them in brackets randomly.

Then, we sample from the enumerated strings for lengths for which we have more than 20,000 samples (0, 2, 4, 6).  
For strings of length 8, we take all the enumerated samples (about 7000).  
For the remaining 13,000 samples of length 8 and the 20,000 samples of lengths 10 to 20 (inclusive), we use the generation process defined above.

In [87]:
# Pool of enumerated valid prefixes, keyed by prefix length, that
# gen_string_dyck draws its building blocks from.
#
# The old guard `len(line[l]) < 5000` measured the single character '|'
# (always length 1), so it never limited anything and every duplicated prefix
# in the file was appended. Each distinct prefix is now stored once, which
# keeps the pool small and makes `choice` uniform over distinct strings; the
# 5000 cap is kept as an upper bound on each pool's size.
MAX_PER_LENGTH = 5000
lists = {2: [], 4: [], 6: [], 8: []}
_seen = {length: set() for length in lists}
with open(file, 'r') as f:
    for line in f:
        line = line[:-1]
        l = line.index('|')
        if l == 0: continue  # an empty prefix is not a usable building block
        prefix = line[:l]
        if prefix in _seen[l] or len(lists[l]) >= MAX_PER_LENGTH:
            continue
        _seen[l].add(prefix)
        lists[l].append(prefix)

def gen_string_dyck(L):
    """Assemble one padded sample of length `L` from the prefixes in `lists`.

    The first L-2 symbols are split into chunks of even length (2 to 8), each
    filled with a random entry of `lists` for that length, and one randomly
    placed bracket pair is inserted around a contiguous run of chunks. The
    result is followed by '|', a closing bracket when L < 20, and random
    brackets up to 20 symbols in total.
    """
    openers = ['(', '{', '[']
    closers = [')', '}', ']']

    # Decompose the L-2 inner symbols into chunk sizes available in `lists`.
    chunk_sizes = []
    remaining = L - 2
    while remaining > 0:
        half = randint(1, min(remaining // 2, 4))
        chunk_sizes.append(2 * half)
        remaining -= 2 * half

    # Choose which run of chunks the extra bracket pair wraps, and its type.
    start = randint(0, len(chunk_sizes))
    stop = randint(start, len(chunk_sizes))
    kind = randint(0, 2)

    def fill(sizes):
        return ''.join(choice(lists[size]) for size in sizes)

    body = (
        fill(chunk_sizes[:start])
        + openers[kind]
        + fill(chunk_sizes[start:stop + 1])
        + closers[kind]
        + fill(chunk_sizes[stop + 1:])
    )

    tail = []
    if L < 20:
        tail.append(choice(closers))
    tail.extend(choice(openers + closers) for _ in range(20 - (L + 1)))

    return body + '|' + ''.join(tail)

'{{}{()}}|]])[)]]{{}(('

In [92]:
o = ['(', '{', '[']
c = [')', '}', ']']

# Prefix lengths taken from the enumerated file. The previous `match` ladder
# had no branch for length 6, so those strings were never written.
TARGET_PER_LENGTH = 20000
ENUMERATED_LENGTHS = (0, 2, 4, 6, 8)

# Pass 1: count the lines per prefix length, so each keep-rate is
# TARGET_PER_LENGTH / count rather than a hand-copied constant.
counts = dict.fromkeys(ENUMERATED_LENGTHS, 0)
with open(file, 'r') as f:
    for line in f:
        s = line[:-1].index('|')
        if s in counts:
            counts[s] += 1

# A rate of 1.0 means "take all" (lengths with fewer than the target).
keep_rate = {
    s: min(1.0, TARGET_PER_LENGTH / n)
    for s, n in counts.items() if n > 0
}

with open('dyck3/train.txt', 'w') as train, \
     open('dyck3/test.txt', 'w') as test, \
     open('dyck3/val.txt', 'w') as val, \
     open(file, 'r') as f:

    # Pass 2: sample the enumerated strings, half of the kept lines going to
    # train and a quarter each to test and val.
    for line in f:
        line = line[:-1]
        rate = keep_rate.get(line.index('|'))
        if rate is None:
            continue
        r = random()
        if r <= 0.5 * rate:
            out = train
        elif r <= 0.75 * rate:
            out = test
        elif r <= rate:
            out = val
        else:
            continue
        # Same padding as before: one closing bracket, then 11 random brackets.
        padding = choice(c) + ''.join(choice(o + c) for _ in range(11))
        out.write(line + padding + '\n')

    # Generated strings. The loop variable is not called `L`, so the global
    # `L` used by other cells is left untouched.
    for out, n_len8, n_longer in ((train, 6500, 10000),
                                  (test, 3250, 5000),
                                  (val, 3250, 5000)):
        # For l = 8, generate ~13k more strings across the three splits.
        for _ in range(n_len8):
            out.write(gen_string_dyck(8) + '\n')
        # For l = 10, 12, 14, 16, 18, 20, generate ~20k strings each.
        for _ in range(n_longer):
            for length in range(10, 21, 2):
                out.write(gen_string_dyck(length) + '\n')