In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from tensorflow import keras
import time

2024-04-07 01:52:10.688454: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-07 01:52:10.688480: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-07 01:52:10.689280: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


## Data generation

In [2]:
# Padded sequence lengths; the "+ 2" leaves room for the start and end marker
# characters that preprocess() wraps around every string.
SOURCE_MAX_LENGTH = 13 + 2
TARGET_MAX_LENGTH = 5 + 2
EOS = '<EOS>'

# Operators that may be substituted for an '<OP>' placeholder.
ops = list('+-*')

# Productions that may replace an '<E>' placeholder. The order matters:
# make_expression() slices off the LAST entry (the bare number) whenever it
# must keep expanding, so the terminal production has to stay at the end.
exps = [
    ['<E>', '<OP>', '<E>'],
    ['<N>', '<OP>', '<E>'],
    ['(', '<E>', '<OP>', '<E>', ')'],
    ['<N>'],
]


def flatten(lst:list) -> list:
    """Return a new list with the lists/tuples inside ``lst`` spliced in place.

    Only one level of nesting is removed: elements of an inner list or tuple
    are copied as they are, even if they are containers themselves. Any other
    element is kept unchanged. ``lst`` itself is not modified.
    """
    result = []
    for element in lst:
        # Treat a scalar as a one-element run so both cases are spliced alike.
        result += element if isinstance(element, (list, tuple)) else [element]
    return result
    

def make_expression(expression, depth=0, min_depth=0, max_depth=3) -> list:
    """Recursively expand '<E>' placeholders into a flat token list.

    Args:
        expression: list of tokens; an empty list is seeded with a single
            '<E>'. The list passed in is modified in place before it is
            (possibly) replaced by a flattened copy.
        depth: current expansion level.
        min_depth: while ``depth`` is not greater than this value, the bare
            number production (last entry of ``exps``) is not offered.
        max_depth: level at which all remaining '<E>' tokens are turned
            into '<N>'.

    Returns:
        A token list without any '<E>' left in it.
    """
    if not expression:
        expression.append('<E>')
    if '<E>' not in expression:
        return expression

    if depth == max_depth:
        # Depth budget used up: every open slot becomes a plain number.
        for pos, token in enumerate(expression):
            if token == '<E>':
                expression[pos] = '<N>'
        return expression

    # Past min_depth any production may be drawn; before that the terminal
    # one (the last entry) is withheld so the expression keeps growing.
    candidates = exps if depth > min_depth else exps[:-1]
    while '<E>' in expression:
        pick = np.random.choice(len(candidates))
        slot = expression.index('<E>')
        # The production is stored as a nested list and flattened below.
        expression[slot] = candidates[pick]

    return make_expression(flatten(expression), depth=depth + 1, min_depth=min_depth, max_depth=max_depth)

In [3]:
def generate_single_data(low=0, high=10, min_depth=0, max_depth=2):
    """Create one random arithmetic expression and its value.

    Args:
        low, high: bounds handed to ``np.random.randint`` for every number.
        min_depth, max_depth: forwarded to ``make_expression``.

    Returns:
        ``(x, y)`` where ``x`` is the expression string and ``y`` is the
        string form of its evaluated result.
    """
    tokens = make_expression([], min_depth=min_depth, max_depth=max_depth)
    for pos, token in enumerate(tokens):
        if token == '<N>':
            tokens[pos] = str(np.random.randint(low, high))
        elif token == '<OP>':
            tokens[pos] = np.random.choice(ops)
    x = ''.join(tokens)
    # eval() only ever sees text assembled above from digits, `ops` and
    # parentheses, never external input.
    return x, str(eval(x))

In [4]:
def generate_data(num_samples=1000, low=0, high=10, min_depth=1, max_depth=2):
    """Generate ``num_samples`` (expression, result) string pairs.

    Samples whose generation raises are discarded and retried. Progress is
    printed on a single line every 1000 accepted samples.

    Args:
        num_samples: number of pairs to produce.
        low, high, min_depth, max_depth: forwarded to
            ``generate_single_data``.

    Returns:
        ``(source, target)``: two lists of equal length holding the
        expression strings and the result strings.

    Raises:
        RuntimeError: if generation fails many times in a row, which means
            the arguments can never yield a sample (for example
            ``low >= high``); the last underlying error is chained.
    """
    # Give up instead of spinning forever when every attempt fails.
    max_consecutive_failures = 1000

    source = []
    target = []
    failures = 0
    while len(source) < num_samples:
        try:
            xi, yi = generate_single_data(low, high, min_depth=min_depth, max_depth=max_depth)
        except Exception as err:
            # Catch Exception only: a bare `except` would also swallow
            # KeyboardInterrupt and make the loop impossible to stop.
            failures += 1
            if failures >= max_consecutive_failures:
                raise RuntimeError(
                    f'generate_single_data failed {failures} times in a row'
                ) from err
            continue

        failures = 0
        source.append(xi)
        target.append(yi)
        if len(source) % 1000 == 0:
            print(f'\rCount: {len(source):>9}', end='')
    print()
    return source, target

In [5]:
# Two pools of 50k samples each; they differ only in max_depth (2 vs. 1).
source1, target1 = generate_data(num_samples=50_000, low=1, min_depth=1, max_depth=2)
source2, target2 = generate_data(num_samples=50_000, low=1, min_depth=1, max_depth=1)

source = np.concatenate((source1, source2), axis=-1)
target = np.concatenate((target1, target2), axis=-1)

# One shared permutation keeps every expression aligned with its result.
idx = list(range(len(source)))
np.random.shuffle(idx)
source, target = source[idx], target[idx]

# Longest generated strings, for comparison with the *_MAX_LENGTH constants.
max_source_generated = max((len(xi.strip()) for xi in source), default=0)
max_target_generated = max((len(yi.strip()) for yi in target), default=0)

print(max_source_generated, 'is the maximum length for source')
print(max_target_generated, 'is the maximum length for target')

Count:     50000
Count:     50000
13 is the maximum length for source
4 is the maximum length for target


## Preparation

In [6]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [7]:
# Single-character markers wrapped around every sequence by preprocess().
# They are letters, so they cannot collide with the digits, operators and
# parentheses that make up the generated expressions.
START_TOKEN, END_TOKEN = 's', 'e'

def preprocess(source, target):
    """Wrap every source and target string in the start/end markers.

    Entries are read by position for ``len(source)`` positions, so ``target``
    must be at least as long as ``source``.

    Returns:
        ``(out_source, out_target)``: two new lists of marked-up strings.
    """
    def wrap(text):
        return f"{START_TOKEN}{text}{END_TOKEN}"

    count = len(source)
    out_source = [wrap(source[k]) for k in range(count)]
    out_target = [wrap(target[k]) for k in range(count)]
    return out_source, out_target


def tokenize(data, padding='pre', maxlen=None):
    """Fit a character-level tokenizer on ``data`` and return padded ids.

    Args:
        data: iterable of strings to fit on and encode.
        padding: ``'pre'`` or ``'post'``, forwarded to ``pad_sequences``.
        maxlen: length every sequence is padded (or truncated) to. When
            omitted, ``SOURCE_MAX_LENGTH`` is used, which is what this
            function always did before the parameter existed.

    Returns:
        ``(padded, tokenizer)``: the padded id matrix and the fitted
        ``Tokenizer`` (needed later to map ids back to characters).
    """
    if maxlen is None:
        # Resolved at call time so the default follows the module constant.
        maxlen = SOURCE_MAX_LENGTH

    # filters='' keeps operator and parenthesis characters as tokens.
    tokenizer = Tokenizer(filters='', char_level=True)
    tokenizer.fit_on_texts(data)
    tokenized = tokenizer.texts_to_sequences(data)
    padded = pad_sequences(tokenized, maxlen=maxlen, padding=padding)
    return padded, tokenizer

# Add the start/end markers, then encode each side with its own tokenizer.
# NOTE: `source` and `target` are rebound from string arrays to padded id
# matrices here, so this cell must not be run twice without regenerating.
out_source, out_target = preprocess(source=source, target=target)
source, source_tokenizer = tokenize(data=out_source, padding='pre')    # pad on the left
target, target_tokenizer = tokenize(data=out_target, padding='post')   # pad on the right

In [8]:
SHUFFLE_BUFFER = 1000
BATCH_SIZE = 64
AUTOTUNE = tf.data.experimental.AUTOTUNE
# The first 20% of the (already shuffled) samples are held out for validation.
VALID_SIZE = int(len(source) * 0.2)

dataset = tf.data.Dataset.from_tensor_slices((source, target))
test_ds = dataset.take(VALID_SIZE)
train_ds = dataset.skip(VALID_SIZE)

# drop_remainder=True: the training/validation steps build the decoder start
# input with a fixed BATCH_SIZE, so every batch must be full.
train_ds = train_ds.shuffle(SHUFFLE_BUFFER)
train_ds = train_ds.batch(BATCH_SIZE, drop_remainder=True, num_parallel_calls=AUTOTUNE)
train_ds = train_ds.prefetch(AUTOTUNE)

# Was `num_parallel_calls=True`, a typo for AUTOTUNE (the argument is a
# parallelism level, not a flag).
test_ds = test_ds.batch(BATCH_SIZE, drop_remainder=True, num_parallel_calls=AUTOTUNE)
test_ds = test_ds.prefetch(AUTOTUNE)

In [9]:
# Decode one training batch back to text as a sanity check of the pipeline.
for s, t in train_ds.take(1):
    for src_ids, tgt_ids in zip(s.numpy(), t.numpy()):
        # Id 0 is padding and has no entry in index_word, so skip it.
        ex_s = ''.join(source_tokenizer.index_word[tok] for tok in src_ids if tok != 0)
        ex_t = ''.join(target_tokenizer.index_word[tok] for tok in tgt_ids if tok != 0)
        # [1:-1] strips the start and end marker characters.
        print(ex_s[1:-1], "=", ex_t[1:-1])

(6+7+2-8) = 7
2+6 = 8
9-9 = 0
4-(9*6) = -50
7+8 = 15
2*1 = 2
7+5 = 12
5+1 = 6
9+1 = 10
(6+3) = 9
(8+6-4*1) = 10
(5*9) = 45
7*4 = 28
(4+5) = 9
9*6 = 54
3-5*7 = -32
9+1 = 10
(1-3)-7-9 = -18
(7+8+(9*2)) = 33
6*9 = 54
(2*2-8+3) = -1
3*1 = 3
6*6+6 = 42
7+8*4 = 39
4+6+7*8 = 66
5*5 = 25
((2+9)-1+2) = 12
7*7 = 49
8+8 = 16
(3-7) = -4
5*3 = 15
7*7 = 49
9+5+7+1 = 22
(8*9) = 72
2+9 = 11
((8+7)-6-6) = 3
4+1 = 5
7+4 = 11
(2*2) = 4
(7-1)+(4+7) = 17
7+4+2 = 13
4*2+6 = 14
(6*9-6*1) = 48
(4*3) = 12
(7-5+8+6) = 16
4*4 = 16
(1-9) = -8
(5+8-4*5) = -7
9-4 = 5
5*4 = 20
1*3 = 3
4*9 = 36
5+1*9 = 14
4-2+2 = 4
(8-6*6-1) = -29
9+1*2-4 = 7
4-5+2+2 = 3
5+1+5 = 11
5-6 = -1
4-9+4-6 = -7
3*1 = 3
6*8 = 48
(1+5) = 6
2+1 = 3


## Defining the model

In [10]:
from keras.models import Model
from keras.layers import Layer, LSTM, Bidirectional, Activation, Embedding, Dense
from keras.losses import SparseCategoricalCrossentropy
from keras.optimizers import Adam
from keras.metrics import Accuracy

In [11]:
class Encoder(Model):
    """Embedding followed by a bidirectional LSTM.

    Args:
        units: size of each LSTM direction.
        vocab_size: number of rows in the embedding table.
        embedding_dim: width of the embedding vectors.
        batch_size: batch size used when building the zero states.
    """

    def __init__(self, units, vocab_size, embedding_dim, batch_size):
        super().__init__()
        self.units = units
        self.vocab_size = vocab_size
        self.batch_size = batch_size
        self.embedding_dim = embedding_dim

        self.embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim)
        recurrent = LSTM(units=units, return_sequences=True, return_state=True)
        self.lstm = Bidirectional(recurrent)


    def initialize_hidden_states(self):
        """Return four zero tensors of shape ``(batch_size, units)``.

        They are returned in the order ``h, c, bh, bc``.
        """
        state_shape = (self.batch_size, self.units)
        return tuple(tf.zeros(state_shape, dtype='float32') for _ in range(4))


    def call(self, inputs, hidden):
        """Encode ``inputs`` and return ``(outputs, h, c)``.

        ``h`` and ``c`` are the forward and backward final states joined on
        the last axis, so each is twice as wide as ``units``.

        NOTE: ``hidden`` is accepted but not passed to the LSTM; the layer
        runs from its own default initial state.
        """
        embedded = self.embedding(inputs)
        sequence, fwd_h, fwd_c, bwd_h, bwd_c = self.lstm(embedded)
        merged_h = tf.concat([fwd_h, bwd_h], axis=-1)
        merged_c = tf.concat([fwd_c, bwd_c], axis=-1)
        return sequence, merged_h, merged_c

In [12]:
class BahdanauAttention(Layer):
    """Additive attention scored as ``V(tanh(Wa(query) + Ua(values)))``.

    Args:
        units: width of the two projection layers ``Wa`` and ``Ua``.
    """

    def __init__(self, units):
        super().__init__()
        self.units = units

        self.Wa = Dense(units=units)
        self.Ua = Dense(units=units)
        self.V = Dense(1)


    def call(self, encoder_outputs, decoder_states):
        """Return ``(context, attention_weights)`` for one decoding step.

        Args:
            encoder_outputs: encoder sequence output; axis 1 is the axis that
                is attended over (softmax and sum are both taken on axis 1).
            decoder_states: pair ``(h, c)``; both are concatenated on the
                last axis to form the query.
        """
        state_h, state_c = decoder_states

        # Insert a length-1 axis at position 1 so the query broadcasts
        # against every position of encoder_outputs.
        query = tf.expand_dims(tf.concat([state_h, state_c], axis=-1), 1)

        scores = self.V(tf.nn.tanh(self.Wa(query) + self.Ua(encoder_outputs)))
        attention_weights = tf.nn.softmax(scores, axis=1)

        context = tf.reduce_sum(attention_weights * encoder_outputs, axis=1)
        return context, attention_weights

In [13]:
class Decoder(Model):
    """Single-step LSTM decoder with additive attention over the encoder.

    Args:
        units: LSTM size; it must match the width of the states passed as
            ``hidden`` to ``call`` because they are used as its initial state.
        attention_units: projection width of the attention layer.
        vocab_size: size of the embedding table and of the output logits.
        embedding_dim: width of the embedding vectors.
    """

    def __init__(self, units, attention_units, vocab_size, embedding_dim):
        super().__init__()
        self.units = units
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.attention_units = attention_units

        self.embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim)
        self.lstm = LSTM(units=units, return_state=True)
        self.attention = BahdanauAttention(units=attention_units)
        self.output_layer = Dense(vocab_size)


    def call(self, inputs, encoder_outputs, hidden):
        """Run one decoding step.

        Args:
            inputs: ids of the previous token; the context vector gets a
                length-1 axis inserted at position 1 before it is joined
                with the embeddings.
            encoder_outputs: sequence output of the encoder.
            hidden: pair ``(h, c)`` used for attention and as LSTM state.

        Returns:
            ``(logits, (h, c), attention_weights)``; the logits are the raw
            output of the final Dense layer (no softmax is applied here).
        """
        embeddings = self.embedding(inputs)
        context, attention_weights = self.attention(encoder_outputs, hidden)

        # Feed the token embedding and the attention context side by side.
        lstm_input = tf.concat([embeddings, tf.expand_dims(context, 1)], axis=-1)

        output, h, c = self.lstm(lstm_input, initial_state=hidden)
        return self.output_layer(output), (h, c), attention_weights

In [14]:
# One extra slot on top of the tokenizer vocabulary: id 0 is used as padding.
SOURCE_VOCAB_SIZE = 1 + len(source_tokenizer.word_index)
TARGET_VOCAB_SIZE = 1 + len(target_tokenizer.word_index)

ENC_UNITS = 256
# The decoder LSTM starts from the encoder's concatenated forward/backward
# states, so it has to be exactly twice as wide as one encoder direction.
DEC_UNITS = 2 * ENC_UNITS
ATTENTION_UNITS = 10
EMBEDDING_DIM = 50

encoder = Encoder(
    units=ENC_UNITS,
    vocab_size=SOURCE_VOCAB_SIZE,
    embedding_dim=EMBEDDING_DIM,
    batch_size=BATCH_SIZE,
)
decoder = Decoder(
    units=DEC_UNITS,
    attention_units=ATTENTION_UNITS,
    vocab_size=TARGET_VOCAB_SIZE,
    embedding_dim=EMBEDDING_DIM,
)

In [15]:
# Training objects shared by training_step / validation_step.
optimizer = Adam()
# from_logits=True: the decoder's output layer has no softmax.
loss_object = SparseCategoricalCrossentropy(from_logits=True)
# Separate running metrics so training and validation do not mix.
train_accuracy, valid_accuracy = Accuracy(), Accuracy()

def loss_function(real, pred):
    """Cross-entropy for one decoding step, weighted by a padding mask.

    Args:
        real: target ids for this step; id 0 marks padding.
        pred: logits produced by the decoder for this step.

    Returns:
        Mean of ``loss_object(real, pred) * mask`` where ``mask`` is 1.0 for
        non-padding targets and 0.0 otherwise.

    NOTE(review): ``loss_object`` is created without a ``reduction``
    argument, so it presumably already returns a batch-averaged scalar. If
    so, the mask only rescales that scalar by the share of non-padding
    targets instead of excluding padded examples individually. Confirm
    whether ``reduction='none'`` was intended before changing it, since the
    rest of the notebook may rely on the current behaviour.
    """
    weights = tf.cast(tf.math.not_equal(real, 0), dtype='float32')
    return tf.reduce_mean(loss_object(real, pred) * weights)

In [16]:
@tf.function
def training_step(source, target, enc_hidden):
    """Run one teacher-forced optimisation step on a batch.

    Args:
        source: batch of padded source ids.
        target: batch of padded target ids; column 0 is the start marker.
        enc_hidden: initial encoder states, forwarded to ``encoder``.

    Returns:
        The summed per-step loss divided by the padded target length.

    Side effects:
        Updates ``train_accuracy`` and applies one optimizer update to the
        encoder and decoder variables.
    """
    loss = 0
    with tf.GradientTape() as tape:
        encoder_outputs, h, c = encoder(source, enc_hidden)
        decoder_hidden = (h, c)

        # Every sequence starts decoding from the start marker.
        dec_inp = tf.expand_dims([target_tokenizer.word_index[START_TOKEN]] * BATCH_SIZE, 1)
        for i in range(1, TARGET_MAX_LENGTH):
            decoder_outputs, decoder_hidden, _ = decoder(dec_inp, encoder_outputs, decoder_hidden)
            loss += loss_function(target[:, i], decoder_outputs)
            predictions = tf.argmax(decoder_outputs, axis=-1)

            train_accuracy.update_state(target[:, i], predictions)
            # Teacher forcing: the ground-truth token is the next input.
            dec_inp = tf.expand_dims(target[:, i], 1)

    # Use the `target` argument here; the previous code read `targ`, a name
    # that only exists as a leftover global of the training loop.
    batch_loss = loss / int(target.shape[1])

    # Gradients are taken after leaving the tape context so that the
    # gradient computation itself is not recorded.
    variables = encoder.trainable_variables + decoder.trainable_variables
    grads = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(grads, variables))
    return batch_loss

In [17]:
@tf.function
def validation_step(source, target, enc_hidden, teacher_forcing=False):
    """Evaluate one batch without updating any weights.

    Args:
        source: batch of padded source ids.
        target: batch of padded target ids; column 0 is the start marker.
        enc_hidden: initial encoder states, forwarded to ``encoder``.
        teacher_forcing: when True the ground-truth token is fed as the next
            decoder input; otherwise the decoder's own prediction is fed.

    Returns:
        The summed per-step loss divided by the padded target length.

    Side effects:
        Updates ``valid_accuracy``.
    """
    mean_loss = 0
    encoder_outputs, h, c = encoder(source, enc_hidden)
    decoder_hidden = (h, c)

    dec_inp = tf.expand_dims([target_tokenizer.word_index[START_TOKEN]] * BATCH_SIZE, 1)
    for i in range(1, TARGET_MAX_LENGTH):
        decoder_outputs, decoder_hidden, _ = decoder(dec_inp, encoder_outputs, decoder_hidden)
        loss = loss_function(target[:, i], decoder_outputs)
        mean_loss = mean_loss + loss

        decoder_outputs = tf.argmax(decoder_outputs, axis=-1)
        valid_accuracy.update_state(target[:, i], decoder_outputs)
        if not teacher_forcing:
            dec_inp = tf.expand_dims(decoder_outputs, 1)
        else:
            # Was assigned to `enc_inp`, which left dec_inp stuck on the
            # start marker whenever teacher forcing was requested.
            dec_inp = tf.expand_dims(target[:, i], 1)

    mean_loss = mean_loss / int(target.shape[1])
    return mean_loss

In [18]:
EPOCHS = 15

for epoch in range(EPOCHS):
    # Fresh zero states and metric counters at the start of every epoch.
    encoder_hidden = encoder.initialize_hidden_states()
    train_accuracy.reset_state()
    valid_accuracy.reset_state()

    print(f'Epoch {epoch + 1:>5}')

    # Training pass: keep an incremental mean of the batch losses.
    mean_loss = 0
    for step, (src, targ) in enumerate(train_ds, start=1):
        loss = training_step(src, targ, encoder_hidden)
        mean_loss += (1 / step) * (loss - mean_loss)
        train_acc = train_accuracy.result()
        print(f'\rStep {step:>5}\tloss {mean_loss:>4.3f}\taccuracy {train_acc:2.3f}', end='')
    print()

    # Validation pass: the decoder is fed its own predictions.
    mean_val_loss = 0
    for step, (src, targ) in enumerate(test_ds, start=1):
        loss = validation_step(src, targ, encoder_hidden, teacher_forcing=False)
        mean_val_loss += (1 / step) * (loss - mean_val_loss)
        val_acc = valid_accuracy.result()
        print(f'\rStep {step:>4}\tval_loss {mean_val_loss:>4.3f}\tval_acc {val_acc:2.3f}', end='')
    print()

Epoch     1


I0000 00:00:1712442152.515754   66801 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Step  1250	loss 0.174	accuracy 0.836
Step  312	val_loss 0.141	val_acc 0.878
Epoch     2
Step  1250	loss 0.082	accuracy 0.918
Step  312	val_loss 0.113	val_acc 0.905
Epoch     3
Step  1250	loss 0.061	accuracy 0.937
Step  312	val_loss 0.111	val_acc 0.909
Epoch     4
Step  1250	loss 0.051	accuracy 0.947
Step  312	val_loss 0.081	val_acc 0.932
Epoch     5
Step  1250	loss 0.044	accuracy 0.953
Step  312	val_loss 0.066	val_acc 0.944
Epoch     6
Step  1250	loss 0.040	accuracy 0.958
Step  312	val_loss 0.073	val_acc 0.942
Epoch     7
Step  1250	loss 0.036	accuracy 0.961
Step  312	val_loss 0.054	val_acc 0.958
Epoch     8
Step  1250	loss 0.031	accuracy 0.967
Step  312	val_loss 0.063	val_acc 0.952
Epoch     9
Step  1250	loss 0.025	accuracy 0.973
Step  312	val_loss 0.041	val_acc 0.970
Epoch    10
Step  1250	loss 0.020	accuracy 0.978
Step  312	val_loss 0.048	val_acc 0.967
Epoch    11
Step  1250	loss 0.018	accuracy 0.981
Step  312	val_loss 0.039	val_acc 0.972
Epoch    12
Step  1250	loss 0.016	accuracy 0

## Evaluation

In [19]:
def evaluate(source_inp):
    """Greedily decode the model's answer for one expression string.

    Args:
        source_inp: expression text without start/end markers.

    Returns:
        The predicted characters up to, but not including, the end marker.
        If no end marker is produced within ``TARGET_MAX_LENGTH - 1`` steps,
        everything decoded so far is returned.
    """
    source_inp = START_TOKEN.lower() + source_inp + END_TOKEN.lower()
    inp = source_tokenizer.texts_to_sequences([source_inp])
    padded = pad_sequences(inp, maxlen=SOURCE_MAX_LENGTH, padding='pre')
    hidden = encoder.initialize_hidden_states()
    encoded_outputs, h, c = encoder(padded, hidden)
    decoder_hidden = (h, c)

    start_id = target_tokenizer.word_index[START_TOKEN.lower()]
    end_id = target_tokenizer.word_index[END_TOKEN.lower()]

    dec_inp = tf.expand_dims([start_id], 1)
    decoded = []
    for _ in range(1, TARGET_MAX_LENGTH):
        logits, decoder_hidden, attention_weights = decoder(dec_inp, encoded_outputs, decoder_hidden)
        next_ids = tf.argmax(logits, axis=-1)
        token_id = int(next_ids.numpy()[0])
        if token_id == end_id:
            # Stop here rather than decoding on and then blindly dropping the
            # last character: that lost a real character when the end marker
            # was never emitted and kept anything produced after it.
            break
        if token_id != 0:
            # Id 0 is padding and has no character to map back to.
            decoded.append(target_tokenizer.index_word[token_id])
        dec_inp = tf.expand_dims(next_ids, 1)
    return ''.join(decoded)

In [20]:
# Spot-check the model on freshly generated samples of both difficulty levels;
# the two groups differ only in the max_depth used for generation.
for heading, depth_limit in (('Easy Ones', 1), ('\nDifficult ones', 2)):
    print(heading)
    for i in range(20):
        test_source, test_target = generate_single_data(low=1, min_depth=1, max_depth=depth_limit)
        pred = evaluate(test_source)
        print(f'{test_source:<30} = {test_target:>8} and predicted {pred:>8}')

Easy Ones
5-3                            =        2 and predicted        2
4+8                            =       12 and predicted       12
(3-5)                          =       -2 and predicted       -2
8-1                            =        7 and predicted        7
4-3                            =        1 and predicted        1
8-9                            =       -1 and predicted       -1
8*7                            =       56 and predicted       56
1+6                            =        7 and predicted        7
7+8                            =       15 and predicted       15
2*6                            =       12 and predicted       12
8+9                            =       17 and predicted       17
9*3                            =       27 and predicted       27
1*5                            =        5 and predicted        5
4*6                            =       24 and predicted       24
(7+7)                          =       14 and predicted       14
1*5            