# Encoder/Decoder (Seq2Seq)

In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from tensorflow import keras

2024-03-31 05:20:17.727464: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-31 05:20:17.727493: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-31 05:20:17.728361: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


## Data generation

In [2]:
SOURCE_MAX_LENGTH = 13
TARGET_MAX_LENGTH = 5
EOS = '<EOS>'
ops = ['+', '-', '*']
exps = [['<E>', '<OP>', '<E>'], ['<N>', '<OP>', '<E>'], ['(', '<E>', '<OP>', '<E>', ')'], ['<N>']]


def flatten(lst:list) -> list:
    flattened = []
    for item in lst:
        if isinstance(item, (list, tuple)):
            flattened.extend(item)
        else:
            flattened.append(item)
    return flattened
    

def make_expression(expression, depth=0, min_depth=0, max_depth=3) -> list:
    if len(expression) == 0: expression.append('<E>')
    if '<E>' not in expression: return expression
    if depth == max_depth:
        for i in range(len(expression)):
            if expression[i] == '<E>': expression[i] = '<N>'
        return expression

    while '<E>' in expression:
        if depth > min_depth:
            i = np.random.choice(len(exps))
        else:
            i = np.random.choice(len(exps[:-1]))
        ei = expression.index('<E>')
        expression[ei] = exps[i]
    expression = flatten(expression)
    return make_expression(expression, depth=depth + 1, min_depth=min_depth, max_depth=max_depth)

In [3]:
def generate_single_data(low=0, high=10, min_depth=0, max_depth=2):
    expression = []
    expression = make_expression(expression, min_depth=min_depth, max_depth=max_depth)
    for i in range(len(expression)):
        if expression[i] == '<N>':
            num = np.random.randint(low, high)
            expression[i] = str(num)
        elif expression[i] == '<OP>':
            op = np.random.choice(ops)
            expression[i] = op
    x = ''.join(expression)
    y = str(eval(x))
    return x, y

In [4]:
def generate_data(num_samples=1000, low=0, high=10, min_depth=1, max_depth=2):
    source = []
    target = []
    cnt = 0
    while cnt < num_samples:
        try:
            xi, yi = generate_single_data(low, high, min_depth=min_depth, max_depth=max_depth)
            source.append(xi)
            target.append(yi)
            cnt += 1
            if cnt % 1000 == 0:
                print(f'\rCount: {cnt:>9}', end='')
        except:
            pass
    return source, target

In [5]:
source1, target1 = generate_data(num_samples=200_000, min_depth=1, max_depth=2)
source2, target2 = generate_data(num_samples=200_000, min_depth=1, max_depth=1)

Count:    200000

In [6]:
source = np.concatenate([source1, source2], axis=-1)
target = np.concatenate([target1, target2], axis=-1)

idx = [x for x in range(len(source))]
np.random.shuffle(idx)

source = source[idx]
target = target[idx]

In [7]:
max_length_generated = 0
for xi, yi in zip(source, target):
    #print(xi, '=', yi)
    max_length_generated = max(len(xi.strip()), max_length_generated)

print(max_length_generated, 'is the maximum length')

13 is the maximum length


## Prepration

In [8]:
source_vocab = [EOS] + list('0123456789()') + ops
char_to_id_source = tf.keras.layers.StringLookup(vocabulary=source_vocab, num_oov_indices=0)
id_to_char_source = tf.keras.layers.StringLookup(vocabulary=source_vocab, num_oov_indices=0, invert=True)
SOURCE_VOCAB_SIZE = len(char_to_id_source.get_vocabulary())
print(f'Length of source vocabulary is {SOURCE_VOCAB_SIZE}')

Length of source vocabulary is 16


In [9]:
target_vocab = [EOS] + list('-0123456789')
char_to_id_target = tf.keras.layers.StringLookup(vocabulary=target_vocab, num_oov_indices=0)
id_to_char_target = tf.keras.layers.StringLookup(vocabulary=target_vocab, num_oov_indices=0, invert=True)
TARGET_VOCAB_SIZE = len(char_to_id_target.get_vocabulary())
print(f'Length of target vocabulary is {TARGET_VOCAB_SIZE}')

Length of target vocabulary is 12


In [10]:
def str_to_id_source(str_num):
    return char_to_id_source(tf.strings.unicode_split(str_num, 'UTF-8'))

def ids_to_str_source(ids):
    return tf.strings.reduce_join(id_to_char_source(ids), axis=-1)

def str_to_id_target(str_num):
    return char_to_id_target(tf.strings.unicode_split(str_num, 'UTF-8'))

def ids_to_str_target(ids):
    return tf.strings.reduce_join(id_to_char_target(ids), axis=-1)

In [11]:
EOS_SOURCE_INT = char_to_id_source(EOS)
EOS_TARGET_INT = char_to_id_target(EOS)
SHUFFLE_BUFFER = 1000
BATCH_SIZE = 128
AUTOTUNE = tf.data.experimental.AUTOTUNE

def get_dataset_ids(source, target):
    source = str_to_id_source(source)
    target = str_to_id_target(target)
    return source, target


def pad_sequences(source, target):
    source_len = tf.shape(source)[-1]
    target_len = tf.shape(target)[-1]
    source = tf.pad(source, [[SOURCE_MAX_LENGTH - source_len, 0]], constant_values=EOS_SOURCE_INT)
    target = tf.pad(target, [[1, TARGET_MAX_LENGTH - target_len - 1]], constant_values=EOS_TARGET_INT)
    return source, target


def get_source_target_label(source, target):
    label = target[1:]
    target = target[:-1]
    return (source, target), label


dataset = tf.data.Dataset.from_tensor_slices((source, target))
dataset = dataset.map(get_dataset_ids, num_parallel_calls=AUTOTUNE)
dataset = dataset.map(pad_sequences, num_parallel_calls=AUTOTUNE)
dataset = dataset.map(get_source_target_label, num_parallel_calls=AUTOTUNE)
dataset = dataset.shuffle(SHUFFLE_BUFFER)
dataset = dataset.batch(BATCH_SIZE, num_parallel_calls=AUTOTUNE)
dataset = dataset.prefetch(AUTOTUNE)

In [12]:
for (s, t), y in dataset.take(30):
    i = np.random.choice(len(s))
    s = s[i]
    t = t[i]
    print(ids_to_str_source(s).numpy().decode('utf-8').replace('<EOS>', ''), '=', 
          ids_to_str_target(t).numpy().decode('utf-8').replace('<EOS>', ''))

1+6+3-0 = 10
2-5-5 = -8
(2*0) = 0
8-(1*1) = 7
(8-0) = 8
3+6 = 9
5*(8+5) = 65
(8*2)*6+3 = 99
9+1 = 10
(8+4) = 12
(9*3) = 27
7+2*4 = 15
(0-2+(0*8)) = -2
0-5 = -5
9*9 = 81
8*4 = 32
3-9 = -6
3+0 = 3
3-(1*9) = -6
3+0 = 3
7-6 = 1
8+3-0-7 = 4
6*7*9*6 = 226
3-1 = 2
(6+0) = 6
7-3-3 = 1
5-8-(3+5) = -11
0*1*1*4 = 0
(7-8) = -1
2+5 = 7


## Model

In [13]:
from keras.layers import LSTM, TimeDistributed, RepeatVector, Dense, Embedding, Input, Bidirectional
from keras.models import Model
from keras.layers import Layer
from keras.optimizers import Adam, RMSprop, SGD
from keras.callbacks import LearningRateScheduler
from keras.regularizers import L2

In [14]:
SOURCE_EMBEDDING_DIM = 20
TARGET_EMBEDDING_DIM = 10
UNITS = 256

class Encoder(Layer):
    def __init__(self, units, embedding_dim, vocab_size):
        super(Encoder, self).__init__()
        self.units = units
        self.embedding_dim = embedding_dim
        self.vocab_size = vocab_size

        self.embedding = Embedding(input_dim=self.vocab_size, output_dim=self.embedding_dim)
        self.lstm1 = LSTM(self.units, return_state=True, return_sequences=True)
    
    def call(self, inputs):
        x = self.embedding(inputs)
        x, h, c = self.lstm1(x)
        return h, c


class Decoder(Layer):
    def __init__(self, units, embedding_dim, vocab_size):
        super(Decoder, self).__init__()
        self.units = units
        self.embedding_dim = embedding_dim
        self.vocab_size = vocab_size

        self.embedding = Embedding(input_dim=self.vocab_size, output_dim=self.embedding_dim,)
        self.lstm1 = LSTM(self.units, return_sequences=True, return_state=True)        

    def call(self, inputs, h, c, return_states=False):
        x = self.embedding(inputs)
        x, h, c = self.lstm1(x, initial_state=(h, c))
        if return_states:
            return x, h, c
        return x

In [15]:
class MyModel(Model):
    def __init__(self, units, source_embedding_dim, target_embedding_dim, source_vocab_size, target_vocab_size, **kwargs):
        super(MyModel, self).__init__(**kwargs)

        self.units = units
        self.source_embedding_dim = source_embedding_dim
        self.target_embedding_dim = target_embedding_dim
        self.source_vocab_size = source_vocab_size
        self.target_vocab_size = target_vocab_size

        self.encoder = Encoder(self.units, self.source_embedding_dim, self.source_vocab_size)
        self.decoder = Decoder(self.units, self.target_embedding_dim, self.target_vocab_size)
        self.classifier = Dense(units=self.target_vocab_size, activation='softmax')
    
    
    def call(self, inputs):
        encoder_inputs, decoder_inputs = inputs
        h, c = self.encoder(encoder_inputs)
        x = self.decoder(decoder_inputs, h, c)
        x = self.classifier(x)
        return x

In [16]:
model = MyModel(units=UNITS, source_embedding_dim=SOURCE_EMBEDDING_DIM, target_embedding_dim=TARGET_EMBEDDING_DIM,
               source_vocab_size=SOURCE_VOCAB_SIZE, target_vocab_size=TARGET_VOCAB_SIZE)


model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=Adam(),
    metrics=['accuracy']
)

def get_scheduler(initial_learning_rate, weight=0.9):
    def scheduler(epoch):
        return initial_learning_rate * weight ** epoch

    return scheduler

scheduler = get_scheduler(1e-2, weight=0.97)
learning_rate_callback = LearningRateScheduler(scheduler)

In [17]:
history = model.fit(dataset, epochs=100, callbacks=[learning_rate_callback])

Epoch 1/100


I0000 00:00:1711849855.415036   27233 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 7

In [18]:
class PredictionModel(Model):
    def __init__(self, main_model):
        super(PredictionModel, self).__init__()
        
        self.main_model = main_model
        self.encoder = self.main_model.encoder
        self.decoder = self.main_model.decoder
        self.classifier = self.main_model.classifier

    def call(self, inputs):
        inputs = str_to_id_source(inputs)[None, ...]
        h, c = self.encoder(inputs)
        tensor_arr = tf.TensorArray(dtype=tf.int64, size=0, dynamic_size=True)
        tensor_arr = tensor_arr.write(0, char_to_id_target(EOS).numpy())
        for cnt in range(1, TARGET_MAX_LENGTH + 1):
            semi_complete = tensor_arr.stack()[None, ...]
            x, = self.decoder(semi_complete, h, c)
            x = self.classifier(x)
            x = tf.argmax(x, axis=-1)
            tensor_arr = tensor_arr.write(cnt, x[-1])
        res = tensor_arr.stack()[None, 1:]
        return ids_to_str_target(res).numpy()[0].decode('utf-8').replace(EOS, '')

## Prediction

In [19]:
pred_model = PredictionModel(model)

test_source = [
    '2+3*4', '(5+3)', '2*(3+4)', '5*(2+3)*(5+3)', '(7+9)+8+7', '(1*5)+(9-2)'
]

for i in range(len(test_source)):
    res = pred_model(test_source[i])
    print(test_source[i], '=', res)

2+3*4 = 14
(5+3) = 8
2*(3+4) = 14
5*(2+3)*(5+3) = 90
(7+9)+8+7 = 31
(1*5)+(9-2) = 12


In [20]:
print('Easy Ones')
for i in range(20):
    test_source, test_target = generate_single_data(min_depth=1, max_depth=1)
    pred = pred_model(test_source)
    print(f'{test_source:<13} = {test_target:>5} and predicted {pred:>5}')

print('\nDifficult ones')
for i in range(20):
    test_source, test_target = generate_single_data(min_depth=1, max_depth=2)
    pred = pred_model(test_source)
    print(f'{test_source:<13} = {test_target:>5} and predicted {pred:>5}')

Easy Ones
9*1           =     9 and predicted     1
(6*5)         =    30 and predicted    30
5*6           =    30 and predicted    30
7+6           =    13 and predicted    15
(6*7)         =    42 and predicted    42
(4+0)         =     4 and predicted     4
3*6           =    18 and predicted    20
3-2           =     1 and predicted     1
2-0           =     2 and predicted     2
0+4           =     4 and predicted     4
3-9           =    -6 and predicted    -6
2-9           =    -7 and predicted    -7
5+6           =    11 and predicted    11
5+9           =    14 and predicted    14
0*2           =     0 and predicted     0
(0-0)         =     0 and predicted     0
6+8           =    14 and predicted    14
(4+1)         =     5 and predicted     5
(4*3)         =    12 and predicted    12
8*9           =    72 and predicted    72

Difficult ones
4*2+0*9       =     8 and predicted     8
2+5+5-4       =     8 and predicted     8
7*7+2         =    51 and predicted    57
((0-7)*1