- How to handle unknown words?

In [1]:
import numpy as np
import random
import pandas as pd
import os
import sympy
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow import keras
from tensorflow.keras import layers
from sympy import srepr
from sympy import preorder_traversal, symbols
from sympy.parsing.sympy_parser import parse_expr
from gensim.models import Word2Vec

In [2]:
os.chdir("..")

In [3]:
def pad_right(list, total_length=5, const=0):
    length = len(list)
    values_needed = total_length - length 
    if values_needed > 0:
        return np.pad(list, (0, values_needed), mode="constant", constant_values=const) 
    else:
        return list[0:total_length]

In [4]:
data_file = "data.nosync/data.txt"
taylor_file = "data.nosync/data_taylor.txt"
coeffs_file = "data.nosync/data_coeffs.txt"

start = ["[start]"]
end = ["[end]"]

with open(data_file) as f:
    X = f.read().split("\n")
    X = np.array(X)
    X = X[:-1]   # somehow last entry is empty
    X = [parse_expr(xi) for xi in X]

with open(taylor_file) as f:
    y_taylor = f.read().split("\n")
    y_taylor = np.array(y_taylor)
    y_taylor = y_taylor[:-1]
    y_taylor = [parse_expr(yi) for yi in y_taylor]

In [5]:
with open(coeffs_file) as f:
    y_coeffs = f.read().split("\n")
    y_coeffs = y_coeffs[:-1]
    for i, y in enumerate(y_coeffs):
        y = parse_expr(y)
        # add start end tokens and remove whitespaces
        y_coeffs[i] = y 

In [6]:
tmp = [[X[i], y_coeffs[i], y_taylor[i]] for i in range(0, len(X))]
random.shuffle(tmp)

num_train_samples = int(0.90 * len(tmp))

train = tmp[0:num_train_samples]
test = tmp[num_train_samples:]
X_train = [x[0] for x in train]
X_test = [x[0] for x in test]
y_taylor_train = [x[2] for x in train]
y_taylor_test = [x[2] for x in test]
print(len(tmp) == len(train) + len(test))

True


# Vectorization

In [7]:
def sympy_tokenize(expr, tokens_list=[], depth=0, parent_ind=None):
    if (expr.func == sympy.core.symbol.Symbol) | (expr.func == sympy.core.numbers.Integer):
        to_append = expr
    else:
        to_append = expr.func
    tokens_list.append(to_append)
    for ind, arg in enumerate(expr.args):
        sympy_tokenize(arg, tokens_list, depth+1, parent_ind=ind)
    return tokens_list

def sympy_tokenize_str(sentence):
    Xi_tokenized = sympy_tokenize(sentence, tokens_list=[])
    Xi_tokenized_str = [str(el) for el in Xi_tokenized]
    return Xi_tokenized_str

In [8]:
X_tokenized_str_train = [sympy_tokenize_str(Xi) for Xi in X_train]
y_taylor_tokenized_str_train = [start+sympy_tokenize_str(yi)+end for yi in y_taylor_train]

In [9]:
word2vec_X = Word2Vec(sentences=X_tokenized_str_train, vector_size=100, window=5, min_count=1, workers=4)
word2vec_y = Word2Vec(sentences=y_taylor_tokenized_str_train, vector_size=100, window=5, min_count=1, workers=4)

In [10]:
def vectorize_sentence(Xi, model):
    # 0 reserved for [end], so add 1 to index
    Xi_vectorized = [model.wv.key_to_index[word]+1 for word in Xi]
    return Xi_vectorized

def vectorize(X_tokenized_str, model, sequence_length=25):
    X_vectorized = [ vectorize_sentence(sentence, model) for sentence in X_tokenized_str]
    # sequence_length = np.max([len(Xi) for Xi in X_vectorized]) + 5
    X_vectorized = [pad_right(Xi, sequence_length, const=0) for Xi in X_vectorized]
    return X_vectorized

In [11]:
sequence_length_X = 50
sequence_length_y = 51
X_vectorized_train = vectorize(X_tokenized_str_train, word2vec_X, sequence_length=sequence_length_X)
y_taylor_vectorized_train = vectorize(y_taylor_tokenized_str_train, word2vec_y, sequence_length=sequence_length_y)

In [12]:
def unvectorize_sentence(Xi, model):
    end_ind = np.min(np.where(np.array(Xi) == 0)[0])
    Xi_trunc = Xi[0:end_ind]
    return [ model.wv.index_to_key[word-1] for word in Xi_trunc]

def unvectorize(X_vectorized, model):
    X_unvectorized = [unvectorize_sentence(sentence, model) for sentence in X_vectorized]
    return X_unvectorized

In [13]:
X_unvectorized_train = unvectorize(X_vectorized_train, word2vec_X)
print(X_unvectorized_train == X_tokenized_str_train)

y_taylor_unvectorized_train = unvectorize(y_taylor_vectorized_train, word2vec_y)
print(y_taylor_unvectorized_train == y_taylor_tokenized_str_train)

True


ValueError: zero-size array to reduction operation minimum which has no identity

In [14]:
batch_size=16

def format_dataset(X, y):
    X = vectorize(X, word2vec_X, sequence_length=sequence_length_X)
    y = vectorize(y, word2vec_y, sequence_length=sequence_length_y)
    X = np.array(X)
    y = np.array(y)
    return ({"encoder_inputs": X, "decoder_inputs": y[:, :-1],}, y[:, 1:])

def make_dataset(X, y):
    dataset = format_dataset(X,y)
    dataset = tf.data.Dataset.from_tensor_slices(dataset)
    dataset = dataset.batch(batch_size)
    return dataset.shuffle(2048).prefetch(16).cache()


train_ds = make_dataset(X_tokenized_str_train, y_taylor_tokenized_str_train)

2022-05-18 10:03:19.388908: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:922] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-05-18 10:03:19.398614: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:922] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-05-18 10:03:19.399457: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:922] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-05-18 10:03:19.400217: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate

In [15]:
for inputs, targets in train_ds.take(1):
    print(f'inputs["encoder_inputs"].shape: {inputs["encoder_inputs"].shape}')
    print(f'inputs["decoder_inputs"].shape: {inputs["decoder_inputs"].shape}')
    print(f"targets.shape: {targets.shape}")


inputs["encoder_inputs"].shape: (16, 50)
inputs["decoder_inputs"].shape: (16, 50)
targets.shape: (16, 50)


2022-05-18 10:03:21.427886: W tensorflow/core/kernels/data/cache_dataset_ops.cc:768] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


In [16]:
class TransformerEncoder(layers.Layer):
    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super(TransformerEncoder, self).__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.dense_proj = keras.Sequential(
            [layers.Dense(dense_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.supports_masking = True

    def call(self, inputs, mask=None):
        if mask is not None:
            padding_mask = tf.cast(mask[:, tf.newaxis, tf.newaxis, :], dtype="int32")
        attention_output = self.attention(
            query=inputs, value=inputs, key=inputs, attention_mask=padding_mask
        )
        proj_input = self.layernorm_1(inputs + attention_output)
        proj_output = self.dense_proj(proj_input)
        return self.layernorm_2(proj_input + proj_output)


class PositionalEmbedding(layers.Layer):
    def __init__(self, sequence_length, vocab_size, embed_dim, **kwargs):
        super(PositionalEmbedding, self).__init__(**kwargs)
        self.token_embeddings = layers.Embedding(
            input_dim=vocab_size, output_dim=embed_dim
        )
        self.position_embeddings = layers.Embedding(
            input_dim=sequence_length, output_dim=embed_dim
        )
        self.sequence_length = sequence_length
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

    def call(self, inputs):
        length = tf.shape(inputs)[-1]
        positions = tf.range(start=0, limit=length, delta=1)
        embedded_tokens = self.token_embeddings(inputs)
        embedded_positions = self.position_embeddings(positions)
        return embedded_tokens + embedded_positions

    def compute_mask(self, inputs, mask=None):
        return tf.math.not_equal(inputs, 0)


class TransformerDecoder(layers.Layer):
    def __init__(self, embed_dim, latent_dim, num_heads, **kwargs):
        super(TransformerDecoder, self).__init__(**kwargs)
        self.embed_dim = embed_dim
        self.latent_dim = latent_dim
        self.num_heads = num_heads
        self.attention_1 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.attention_2 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.dense_proj = keras.Sequential(
            [layers.Dense(latent_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.layernorm_3 = layers.LayerNormalization()
        self.supports_masking = True

    def call(self, inputs, encoder_outputs, mask=None):
        causal_mask = self.get_causal_attention_mask(inputs)
        if mask is not None:
            padding_mask = tf.cast(mask[:, tf.newaxis, :], dtype="int32")
            padding_mask = tf.minimum(padding_mask, causal_mask)

        attention_output_1 = self.attention_1(
            query=inputs, value=inputs, key=inputs, attention_mask=causal_mask
        )
        out_1 = self.layernorm_1(inputs + attention_output_1)

        attention_output_2 = self.attention_2(
            query=out_1,
            value=encoder_outputs,
            key=encoder_outputs,
            attention_mask=padding_mask,
        )
        out_2 = self.layernorm_2(out_1 + attention_output_2)

        proj_output = self.dense_proj(out_2)
        return self.layernorm_3(out_2 + proj_output)

    def get_causal_attention_mask(self, inputs):
        input_shape = tf.shape(inputs)
        batch_size, sequence_length = input_shape[0], input_shape[1]
        i = tf.range(sequence_length)[:, tf.newaxis]
        j = tf.range(sequence_length)
        mask = tf.cast(i >= j, dtype="int32")
        mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))
        mult = tf.concat(
            [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)],
            axis=0,
        )
        return tf.tile(mask, mult)


In [39]:
# X_vocab_size = len(word2vec_X.wv.key_to_index)
# y_vocab_size = len(word2vec_y.wv.key_to_index)
X_vocab_size = 200
y_vocab_size = 200

In [40]:
embed_dim = 100
latent_dim = 200
num_heads = 4

encoder_inputs = keras.Input(shape=(None,), dtype="int64", name="encoder_inputs")
x = PositionalEmbedding(sequence_length_X, X_vocab_size, embed_dim)(encoder_inputs)
encoder_outputs = TransformerEncoder(embed_dim, latent_dim, num_heads)(x)
encoder = keras.Model(encoder_inputs, encoder_outputs)

decoder_inputs = keras.Input(shape=(None,), dtype="int64", name="decoder_inputs")
encoded_seq_inputs = keras.Input(shape=(None, embed_dim), name="decoder_state_inputs")
x = PositionalEmbedding(sequence_length_X-1, y_vocab_size, embed_dim)(decoder_inputs)
x = TransformerDecoder(embed_dim, latent_dim, num_heads)(x, encoded_seq_inputs)
x = layers.Dropout(0.5)(x)
decoder_outputs = layers.Dense(X_vocab_size, activation="softmax")(x)
decoder = keras.Model([decoder_inputs, encoded_seq_inputs], decoder_outputs)

decoder_outputs = decoder([decoder_inputs, encoder_outputs])
transformer = keras.Model(
    [encoder_inputs, decoder_inputs], decoder_outputs, name="transformer"
)


In [41]:
transformer.summary()

Model: "transformer"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_inputs (InputLayer)    [(None, None)]       0           []                               
                                                                                                  
 positional_embedding_4 (Positi  (None, None, 100)   25000       ['encoder_inputs[0][0]']         
 onalEmbedding)                                                                                   
                                                                                                  
 decoder_inputs (InputLayer)    [(None, None)]       0           []                               
                                                                                                  
 transformer_encoder_2 (Transfo  (None, None, 100)   202000      ['positional_embedding_

In [42]:
os.environ['TF_GPU_ALLOCATOR'] = 'cuda_malloc_async'
print(os.getenv('TF_GPU_ALLOCATOR'))

cuda_malloc_async


In [136]:
transformer.compile(
    "rmsprop", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
)
transformer.fit(train_ds, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7efe850411e0>

In [121]:
from icecream import ic

In [188]:
max_decoded_sentence_length = 200
def apply_transformer(input_vectorized):
    decoded_sentence = ["[start]"]
    y_index_lookup = word2vec_y.wv.index_to_key
    for i in range(max_decoded_sentence_length):
        tokenized_target_sentence = vectorize([decoded_sentence], word2vec_y, sequence_length=sequence_length_X)
        tokenized_target_sentence = tf.convert_to_tensor(tokenized_target_sentence)
        y_pred = transformer([test_function, tokenized_target_sentence])[0,i,:]
        y_index = np.argmax(y_pred) - 1
        y = y_index_lookup[y_index]
        decoded_sentence = decoded_sentence + [y]

        if y == "[end]":
            break
    return  decoded_sentence

In [190]:
test_function = tf.convert_to_tensor([X_vectorized_train[0]])
apply_transformer(test_function)

['[start]',
 "<class 'sympy.core.add.Add'>",
 "<class 'sympy.core.mul.Mul'>",
 '2',
 "<class 'sympy.core.numbers.Exp1'>",
 "<class 'sympy.core.power.Pow'>",
 'x',
 '2',
 "<class 'sympy.core.add.Add'>",
 "<class 'sympy.core.numbers.One'>",
 "<class 'sympy.core.power.Pow'>",
 'tan',
 "<class 'sympy.core.numbers.Exp1'>",
 '2',
 "<class 'sympy.core.mul.Mul'>",
 '2',
 "<class 'sympy.core.numbers.Exp1'>",
 "<class 'sympy.core.power.Pow'>",
 'cos',
 "<class 'sympy.core.numbers.Exp1'>",
 '2',
 "<class 'sympy.core.numbers.Exp1'>",
 "<class 'sympy.core.mul.Mul'>",
 '2',
 "<class 'sympy.core.numbers.Exp1'>",
 "<class 'sympy.core.power.Pow'>",
 'x',
 '4',
 "<class 'sympy.core.add.Add'>",
 "<class 'sympy.core.mul.Mul'>",
 '8',
 'exp',
 '2',
 'tan',
 "<class 'sympy.core.numbers.Exp1'>",
 '[end]']

In [191]:
y_taylor_tokenized_str_train[0]

['[start]',
 "<class 'sympy.core.add.Add'>",
 "<class 'sympy.core.mul.Mul'>",
 '2',
 "<class 'sympy.core.numbers.Exp1'>",
 "<class 'sympy.core.power.Pow'>",
 'x',
 '2',
 "<class 'sympy.core.add.Add'>",
 "<class 'sympy.core.numbers.One'>",
 "<class 'sympy.core.power.Pow'>",
 'tan',
 "<class 'sympy.core.numbers.Exp1'>",
 '2',
 "<class 'sympy.core.mul.Mul'>",
 "<class 'sympy.core.numbers.Rational'>",
 "<class 'sympy.core.numbers.Exp1'>",
 "<class 'sympy.core.power.Pow'>",
 'x',
 '4',
 "<class 'sympy.core.add.Add'>",
 "<class 'sympy.core.numbers.One'>",
 "<class 'sympy.core.power.Pow'>",
 'tan',
 "<class 'sympy.core.numbers.Exp1'>",
 '2',
 "<class 'sympy.core.add.Add'>",
 '4',
 "<class 'sympy.core.mul.Mul'>",
 '6',
 "<class 'sympy.core.numbers.Exp1'>",
 'tan',
 "<class 'sympy.core.numbers.Exp1'>",
 'tan',
 "<class 'sympy.core.numbers.Exp1'>",
 '[end]']