In [197]:
reber_grammar = {
  0: ((1, 'B'), ),
  1: ((2, 'T'), (5, 'P'), ),
  2: ((2, 'S'), (3, 'X'), ),
  3: ((4, 'S'), (5, 'X'), ),
  4: ((7, 'E'), ),
  5: ((5, 'T'), (6, 'V'), ),
  6: ((3, 'P'), (4, 'V'), ),
  7: (),
}

nested_grammar = {
  0: ((1, 'B'), ),
  1: ((2, 'T'), (3, 'P'), ),
  2: ((4, 'B'), ),
  3: ((5, 'B'), ),
  4: ((6, 'T'), (7, 'P')),
  5: ((8, 'T'), (9, 'P')),
  6: ((6, 'S'), (10, 'X')),
  7: ((7, 'T'), (11, 'V')),
  8: ((8, 'S'), (12, 'X')),
  9: ((9, 'T'), (13, 'V')),
  10: ((7, 'X'), (14, 'S')),
  11: ((10, 'P'), (14, 'V')),
  12: ((9, 'X'), (15, 'S')),
  13: ((12, 'P'), (15, 'V')),
  14: ((16, 'E'), ),
  15: ((17, 'E'), ),
  16: ((18, 'T'), ),
  17: ((18, 'P'), ),
  18: ((19, 'E'), ),
  19: (),
}

incorrect_grammar = {
  0: ((1, 'B'), ),
  1: ((2, 'T'), (3, 'P'), ),
  2: ((4, 'B'), ),
  3: ((5, 'B'), ),
  4: ((6, 'P'), (7, 'T')),
  5: ((8, 'P'), (9, 'T')),
  6: ((6, 'S'), (10, 'X')),
  7: ((7, 'T'), (11, 'V')),
  8: ((8, 'S'), (12, 'X')),
  9: ((9, 'T'), (13, 'V')),
  10: ((7, 'X'), (14, 'S')),
  11: ((10, 'P'), (14, 'V')),
  12: ((9, 'X'), (15, 'S')),
  13: ((12, 'P'), (15, 'V')),
  14: ((16, 'E'), ),
  15: ((17, 'E'), ),
  16: ((18, 'T'), ),
  17: ((18, 'P'), ),
  18: ((19, 'E'), ),
  19: (),
}

symbols = ('B', 'T', 'P', 'S', 'X', 'V', 'E')

In [198]:
import random

def generate_grammar(grammar):
  sentence = ''
  i = 0
  while len(grammar[i]) > 0:
    (i, char) = grammar[i][random.randint(0, len(grammar[i]) - 1)]
    sentence += char
  return sentence

def generate_random_sequence(symbols, length):
  sentence = 'B'
  for _ in range(length - 2):
    sentence += symbols[random.randint(0, len(symbols) - 1)]
  return sentence + 'E'


In [199]:
n = 10000

used_grammar = nested_grammar

sentences = [generate_grammar(used_grammar) for _ in range(n)]

lengths = [len(x) for x in sentences]
lengths_squared = [x*x for x in lengths]
avg = sum(lengths) / n
var = (sum(lengths_squared) / n) - (avg*avg)
max_length = max(lengths)
print(min(lengths), max_length, sum(lengths) / n, var, var**(0.5))

9 40 12.0114 11.47587003999999 3.38760535481924


In [200]:
# invalid_sentences = [generate_random_sequence(symbols, len(sentence)) for sentence in sentences]
invalid_sentences = [generate_grammar(incorrect_grammar) for _ in range(n)]


invalid_lengths = [len(x) for x in invalid_sentences]
invalid_lengths_squared = [x*x for x in invalid_lengths]
invalid_avg = sum(invalid_lengths) / n
invalid_var = (sum(invalid_lengths_squared) / n) - (invalid_avg*invalid_avg)
print(min(invalid_lengths), max(invalid_lengths), sum(invalid_lengths) / n, invalid_var, invalid_var**(0.5))

max_length = max(max_length, max(invalid_lengths))

9 35 11.9944 11.274368639999977 3.357732663569269


In [201]:
import tensorflow as tf

def dsprint(ds):
  for item in ds.take(1):
    print(item)

tokenizer = tf.keras.preprocessing.text.Tokenizer(char_level=True)

tokenizer.fit_on_texts(sentences)

def preprocess(x):
  return [[-1]] + tokenizer.texts_to_sequences(x) + ([[-2]] * (max_length - len(x)))

tokenized_sentences = [preprocess(x) for x in sentences]
tokenized_invalid_sentences = [preprocess(x) for x in invalid_sentences]

print(tokenized_sentences[0])

[[-1], [3], [2], [3], [2], [1], [1], [1], [5], [5], [4], [2], [4], [-2], [-2], [-2], [-2], [-2], [-2], [-2], [-2], [-2], [-2], [-2], [-2], [-2], [-2], [-2], [-2], [-2], [-2], [-2], [-2], [-2], [-2], [-2], [-2], [-2], [-2], [-2], [-2]]


In [202]:
correct_ds = tf.data.Dataset.from_tensor_slices(tokenized_sentences)
correct_ds = correct_ds.map(lambda x: (x, 1))

incorrect_ds = tf.data.Dataset.from_tensor_slices(tokenized_invalid_sentences)
incorrect_ds = incorrect_ds.map(lambda x: (x, 0))

ds = tf.data.Dataset.from_tensor_slices([correct_ds, incorrect_ds])
ds = ds.interleave(lambda x: x, 2, 1)
ds = ds.shuffle(2*n).batch(32)

dsprint(ds)

(<tf.Tensor: shape=(32, 41, 1), dtype=int32, numpy=
array([[[-1],
        [ 3],
        [ 2],
        ...,
        [-2],
        [-2],
        [-2]],

       [[-1],
        [ 3],
        [ 2],
        ...,
        [-2],
        [-2],
        [-2]],

       [[-1],
        [ 3],
        [ 1],
        ...,
        [-2],
        [-2],
        [-2]],

       ...,

       [[-1],
        [ 3],
        [ 2],
        ...,
        [-2],
        [-2],
        [-2]],

       [[-1],
        [ 3],
        [ 2],
        ...,
        [-2],
        [-2],
        [-2]],

       [[-1],
        [ 3],
        [ 2],
        ...,
        [-2],
        [-2],
        [-2]]], dtype=int32)>, <tf.Tensor: shape=(32,), dtype=int32, numpy=
array([1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 1], dtype=int32)>)


In [203]:
train_size = int(n * 0.8)
val_size = (n - train_size) // 2

train = ds.take(train_size)
val = ds.skip(train_size).take(val_size)
test = ds.skip(train_size + val_size)

In [204]:
model = tf.keras.models.Sequential([
  tf.keras.layers.InputLayer(input_shape=(max_length + 1, 1)),
  tf.keras.layers.BatchNormalization(),
  tf.keras.layers.Conv1D(filters=1 * 10, kernel_size=2, padding='causal', activation='relu', dilation_rate=1),
  tf.keras.layers.BatchNormalization(),
  tf.keras.layers.Conv1D(filters=2 * 10, kernel_size=2, padding='causal', activation='relu', dilation_rate=2),
  tf.keras.layers.BatchNormalization(),
  tf.keras.layers.Conv1D(filters=4 * 10, kernel_size=2, padding='causal', activation='relu', dilation_rate=4),
  tf.keras.layers.BatchNormalization(),
  tf.keras.layers.Conv1D(filters=8 * 10, kernel_size=2, padding='causal', activation='relu', dilation_rate=8),
  tf.keras.layers.BatchNormalization(),
  tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=True)),
  tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=True)),
  tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=False)),
  tf.keras.layers.Dense(1, activation='sigmoid'),
])

model.compile(optimizer='adam', loss=[tf.keras.losses.binary_crossentropy], metrics=[tf.keras.metrics.binary_accuracy])

In [206]:
model.fit(train, validation_data=val, epochs=5)

Epoch 1/5
Epoch 2/5
128/625 [=====>........................] - ETA: 54s - loss: 1.8862e-04 - binary_accuracy: 1.0000

KeyboardInterrupt: 

In [213]:
test = generate_grammar(nested_grammar)
tokens = preprocess(test)
random_test = generate_grammar(incorrect_grammar)
# random_test = generate_random_sequence(symbols, len(test))
# random_test = 'BTSSPXSE'
# random_test = 'BTXXVVSE'
random_tokens = preprocess(random_test)
print(test, tokens)
print(random_test, random_tokens)
model.predict([tokens, random_tokens])

# for item in ds.take(1):
#   print(model.predict(item[0]))
#   print(item[1])


BTBTXSETE [[-1], [3], [1], [3], [1], [6], [7], [4], [1], [4], [-2], [-2], [-2], [-2], [-2], [-2], [-2], [-2], [-2], [-2], [-2], [-2], [-2], [-2], [-2], [-2], [-2], [-2], [-2], [-2], [-2], [-2], [-2], [-2], [-2], [-2], [-2], [-2], [-2], [-2], [-2]]
BPBPXSEPE [[-1], [3], [2], [3], [2], [6], [7], [4], [2], [4], [-2], [-2], [-2], [-2], [-2], [-2], [-2], [-2], [-2], [-2], [-2], [-2], [-2], [-2], [-2], [-2], [-2], [-2], [-2], [-2], [-2], [-2], [-2], [-2], [-2], [-2], [-2], [-2], [-2], [-2], [-2]]


array([[9.998388e-01],
       [1.254029e-04]], dtype=float32)