In [47]:
from random import random, choice, randint

In [61]:
# Length parameter: it is embedded in the input file name below and sizes the
# per-position success tally (L + 1 slots) in the counting cell.
# NOTE(review): presumably the length of the enumerated strings - confirm.
L = 6
# Path (relative to the working directory) of the input file read by later cells.
file = f'expr{L}.txt'

In [62]:
# Tally the lines of the input file: a line containing ',' counts as a failure;
# any other line is bucketed by the position of its '|' marker.
with open(file, 'r') as f:
    fail = 0
    succeed = [0] * (L + 1)
    for raw in f:
        entry = raw[:-1]  # drop the trailing newline character
        if ',' not in entry:
            succeed[entry.index('|')] += 1
        else:
            fail += 1

print(f"Fail {fail}")
print(f"Succeed {succeed}")

Fail 3758475
Succeed [0, 6576750, 0, 906750, 0, 148650, 0]


# Star
No strings fail in `Star`; we pick uniformly from the strings.

We aim for test and validation sets with $5\cdot10^4$ strings (5%) each, and a train set with roughly $10^5$ strings (10%).

In [8]:
with open('star/train.txt', 'w') as train, \
     open('star/val.txt', 'w') as val, \
     open('star/test.txt', 'w') as test, \
     open(file, 'r') as f:
    # One uniform draw per line decides its destination:
    #   (0, 0.05] -> test, (0.05, 0.1] -> val, (0.1, 0.2] -> train, rest dropped.
    for entry in f:
        draw = random()
        if draw > 0.2:
            continue
        if draw <= 0.05:
            sink = test
        elif draw <= 0.1:
            sink = val
        else:
            sink = train
        sink.write(entry)

# Brack
In `Brack`, we have about 40k strings that succeed (of various lengths). We therefore create train, test and validation sets with a 50-50 distribution between succeeding and failing strings. We pick all the succeeding strings and appropriately select the proportion of failing strings (about 4%).

In [58]:
with open('brack/train.txt', 'w') as train, \
     open('brack/val.txt', 'w') as val, \
     open('brack/test.txt', 'w') as test, \
     open(file, 'r') as f:
    for entry in f:
        draw = random()
        if ',' in entry[:-1]:
            # Failing line: keep only a 4% sample (2% train, 1% test, 1% val).
            if draw > 0.04:
                continue
            sink = train if draw <= 0.02 else (test if draw <= 0.03 else val)
        else:
            # Succeeding line: always kept (50% train, 25% test, 25% val).
            sink = train if draw <= 0.5 else (test if draw <= 0.75 else val)
        sink.write(entry)

# Expr
We can generate strings for `Expr` without enumerating. Valid strings can only have odd length; thus we generate roughly 5000 strings of each odd length in the range from 3 to 21 (inclusive), plus 500 single-digit datapoints.
For a given length $L$:

* Pick a random number of (pairs of) brackets, $B \in \left[0, \frac{L-1}{2}\right]$.
* Generate a random string of length $L- 2B$, by alternating between digits and operators.
* (Repeat $B$ times): Pick two random digits in the string (the second at or after the first; they may coincide), and insert brackets before the first and after the second.

We also pick 50000 failing strings (about 1.3%) from the enumerated strings.

We follow similar procedures for the test and val datasets.

For uniformity, we pad all strings to have length 21.

In [69]:
# Failing strings first: sample lines containing ',' and append 15 random
# filler symbols to each one that is kept.
with open('expr/train.txt', 'w') as train, \
     open('expr/val.txt', 'w') as val, \
     open('expr/test.txt', 'w') as test, \
     open(file, 'r') as f:
    filler_symbols = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '×', '^', '[', ']']
    for entry in f:
        draw = random()
        body = entry[:-1]  # line without its trailing newline character
        if ',' not in body:
            continue
        # (0, 0.013] -> train, (0.013, 0.019] -> val, (0.019, 0.026] -> test.
        if draw <= 0.013:
            sink = train
        elif draw <= 0.019:
            sink = val
        elif draw <= 0.026:
            sink = test
        else:
            continue
        filler = ''.join(choice(filler_symbols) for _ in range(15))
        sink.write(body + filler + '\n')

In [70]:
# Succeeding strings
def gen_string(L, max_len=21):
    """Generate a random bracketed expression of length ``L``, padded to ``max_len``.

    Construction:
      1. Draw a number of bracket pairs ``B`` uniformly from ``0 .. (L-1)//2``.
      2. Build a base string of length ``L - 2*B`` that alternates digits and
         operators, starting with a digit.
      3. ``B`` times: pick a digit position, pick a digit position at or after
         it, and insert ``[`` before the first and ``]`` after the second.

    Args:
        L: Length of the expression part. Callers pass odd values; for an even
            value the alternating base string would end in an operator.
        max_len: Total length of the returned string (default 21).

    Returns:
        If ``L == max_len`` the bare expression. Otherwise the expression, then
        ``'|'``, then one symbol drawn from digits and brackets, then random
        filler symbols until the string is ``max_len`` long. If ``L`` exceeds
        ``max_len`` no filler is added.
    """
    digits = '0123456789'
    operators = '+×^'
    brackets = '[]'

    # randint is inclusive at both ends, so the upper bound (L-1)//2 is reachable.
    # The previous int(random() * (L-1)/2) could never produce it because
    # random() < 1, so e.g. '[d]' was never generated for L == 3.
    B = randint(0, max(0, (L - 1) // 2))

    s = ''.join(
        choice(digits) if i % 2 == 0 else choice(operators)
        for i in range(L - 2 * B)
    )

    for _ in range(B):
        # Positions are recomputed each round because the previous insertion
        # shifted the digits.
        dig_pos = [i for i, c in enumerate(s) if c in digits]
        b1 = choice(dig_pos)
        b2 = choice([p for p in dig_pos if p >= b1])
        s = s[:b1] + '[' + s[b1:b2 + 1] + ']' + s[b2 + 1:]

    if L == max_len:
        # The expression already fills the whole width: no separator, no filler.
        return s

    # The symbol right after the separator is never an operator.
    s += '|' + choice(digits + brackets)
    filler_symbols = digits + operators + brackets
    while len(s) < max_len:
        s += choice(filler_symbols)
    return s

def _append_succeeding(path, per_length, single_digit):
    """Append generated succeeding strings to the file at ``path``.

    Writes ``per_length`` strings for every odd expression length from 3 to 21
    (inclusive), followed by ``single_digit`` strings whose expression is a
    single digit. Every string comes from ``gen_string`` so all rows share one
    padding convention; the previous hand-written single-digit rows were
    1 + 1 + 1 + 20 = 23 characters long, unlike the 21-character rows that
    ``gen_string`` produces.

    The loop lives in a function so its loop variable does not overwrite the
    notebook-level ``L`` used by the configuration cell.
    """
    with open(path, 'a') as out:
        for length in range(3, 22, 2):
            for _ in range(per_length):
                out.write(gen_string(length) + '\n')
        for _ in range(single_digit):
            out.write(gen_string(1) + '\n')


# Train gets twice as many samples as test and val.
_append_succeeding('expr/train.txt', per_length=5000, single_digit=500)
_append_succeeding('expr/test.txt', per_length=2500, single_digit=250)
_append_succeeding('expr/val.txt', per_length=2500, single_digit=250)