# First Transformer Model for Taylor Expansion

- Data from `generate_data.py`, 1000 formulas
- Transform to prefix notation first, then encode the tokens
- Imitated [spanish translation transformer](https://keras.io/examples/nlp/neural_machine_translation_with_transformer/)

Problems:
- not enough data?
- cannot go back from prefix notation with sympy I think

In [1]:
import numpy as np
import random
import pandas as pd
import os
import sympy
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow import keras
from tensorflow.keras import layers
from sympy import srepr
# from sympy import preorder_traversal, symbols, Symbol, Integer, tan
from sympy import *
from sympy.parsing.sympy_parser import parse_expr
from gensim.models import Word2Vec
from icecream import ic

In [2]:
os.chdir("..")

In [3]:
from source.data_preparation import sympy_tokenize, sympy_tokenize_str, vectorize_ds, vectorize_sentence, pad_right

In [4]:
data_file = "data.nosync/data.txt"
taylor_file = "data.nosync/data_taylor.txt"
coeffs_file = "data.nosync/data_coeffs.txt"

start = ["[start]"]
end = ["[end]"]

with open(data_file) as f:
    X = f.read().split("\n")
    X = np.array(X)
    X = X[:-1]   # somehow last entry is empty
    X = [parse_expr(xi) for xi in X]

with open(taylor_file) as f:
    y_taylor = f.read().split("\n")
    y_taylor = np.array(y_taylor)
    y_taylor = y_taylor[:-1]
    y_taylor = [parse_expr(yi) for yi in y_taylor]

In [5]:
with open(coeffs_file) as f:
    y_coeffs = f.read().split("\n")
    y_coeffs = y_coeffs[:-1]
    for i, y in enumerate(y_coeffs):
        y = parse_expr(y)
        # add start end tokens and remove whitespaces
        y_coeffs[i] = y 

In [6]:
tmp = [[X[i], y_coeffs[i], y_taylor[i]] for i in range(0, len(X))]
random.shuffle(tmp)

num_train_samples = int(0.90 * len(tmp))

train = tmp[0:num_train_samples]
test = tmp[num_train_samples:]
X_train = [x[0] for x in train]
X_test = [x[0] for x in test]
y_taylor_train = [x[2] for x in train]
y_taylor_test = [x[2] for x in test]
print(len(tmp) == len(train) + len(test))

True


# Some Examples

In [214]:
X_train[0:5]

[sinh(x),
 exp(c + x + tanh(x)),
 tan(sqrt(cos(x**2))),
 sin(x + 1)**cos(a**(2*x)),
 tan(cos(d)) + sinh(exp(x))]

In [215]:
y_taylor_train[0:5]

[x**3/6 + x,
 x**3*exp(c) + 2*x**2*exp(c) + 2*x*exp(c) + exp(c),
 x**4*(-tan(1)**2/4 - 1/4) + tan(1),
 x**4*((2*log(a)*log(sin(1))*sin(1) - cos(1)**2/sin(1))**4 - 6*(2*log(a)*log(sin(1))*sin(1) - cos(1)**2/sin(1))**2*(4*log(a)**2*log(sin(1))*sin(1) + 4*log(a)**2*log(sin(1))*cos(1) + 4*log(a)*cos(1) + cos(1)**3/sin(1)**2 + cos(1)) - (16*log(a)*log(sin(1))*sin(1) - 8*cos(1)**2/sin(1))*(-12*log(a)**3*log(sin(1))*cos(1) - 6*log(a)**2*cos(1) - 6*log(a)**2*cos(1)**2/sin(1) + 3*log(a)*cos(1)**2/sin(1) + 3*log(a)*sin(1) + cos(1)**4/sin(1)**3 + cos(1)**2/sin(1)) + 3*(4*log(a)**2*log(sin(1))*sin(1) + 4*log(a)**2*log(sin(1))*cos(1) + 4*log(a)*cos(1) + cos(1)**3/sin(1)**2 + cos(1))**2 + 80*log(a)**4*log(sin(1))*sin(1) - 96*log(a)**4*log(sin(1))*cos(1) - 96*log(a)**3*cos(1)**2/sin(1) + 24*log(a)**2*cos(1)**3/sin(1)**2 + 24*log(a)**2*cos(1)**2/sin(1) + 24*log(a)**2*cos(1) + 24*log(a)**2*sin(1) - 16*log(a)*cos(1) - 16*log(a)*cos(1)**3/sin(1)**2 - 8*cos(1)**3/sin(1)**2 - 2*cos(1) - 6*cos(1)**5/sin(1)*

# Vectorization

In [7]:
X_tokenized_str_train = [sympy_tokenize_str(Xi) for Xi in X_train]
y_taylor_tokenized_str_train = [start+sympy_tokenize_str(yi)+end for yi in y_taylor_train]

X_tokenized_str_test = [sympy_tokenize_str(Xi) for Xi in X_test]
y_taylor_tokenized_str_test = [start+sympy_tokenize_str(yi)+end for yi in y_taylor_test]

In [9]:
word2vec_X = Word2Vec(sentences=X_tokenized_str_train, vector_size=100, window=5, min_count=1, workers=4)
word2vec_y = Word2Vec(sentences=y_taylor_tokenized_str_train, vector_size=100, window=5, min_count=1, workers=4)

In [11]:
sequence_length_X = 200
sequence_length_y = 201
X_vectorized_train = vectorize_ds(X_tokenized_str_train, word2vec_X, sequence_length=sequence_length_X)
y_taylor_vectorized_train = vectorize_ds(y_taylor_tokenized_str_train, word2vec_y, sequence_length=sequence_length_y)

X_vectorized_test = vectorize_ds(X_tokenized_str_test, word2vec_X, sequence_length=sequence_length_X)
y_taylor_vectorized_test = vectorize_ds(y_taylor_tokenized_str_test, word2vec_y, sequence_length=sequence_length_y)

In [239]:
X_tokenized_str_train[0:3]

[['sinh', "Symbol('x')"],
 ['exp',
  "<'sympy.core.add.Add'>",
  "Symbol('c')",
  "Symbol('x')",
  'tanh',
  "Symbol('x')"],
 ['tan',
  "<'sympy.core.power.Pow'>",
  'cos',
  "<'sympy.core.power.Pow'>",
  "Symbol('x')",
  'Integer(2)',
  "<'sympy.core.numbers.Half'>"]]

In [240]:
y_taylor_tokenized_str_train[0:3]

[['[start]',
  "<'sympy.core.add.Add'>",
  "Symbol('x')",
  "<'sympy.core.mul.Mul'>",
  "<'sympy.core.numbers.Rational'>",
  "<'sympy.core.power.Pow'>",
  "Symbol('x')",
  'Integer(3)',
  '[end]'],
 ['[start]',
  "<'sympy.core.add.Add'>",
  "<'sympy.core.mul.Mul'>",
  "<'sympy.core.power.Pow'>",
  "Symbol('x')",
  'Integer(3)',
  'exp',
  "Symbol('c')",
  "<'sympy.core.mul.Mul'>",
  'Integer(2)',
  "Symbol('x')",
  'exp',
  "Symbol('c')",
  "<'sympy.core.mul.Mul'>",
  'Integer(2)',
  "<'sympy.core.power.Pow'>",
  "Symbol('x')",
  'Integer(2)',
  'exp',
  "Symbol('c')",
  'exp',
  "Symbol('c')",
  '[end]'],
 ['[start]',
  "<'sympy.core.add.Add'>",
  "<'sympy.core.mul.Mul'>",
  "<'sympy.core.power.Pow'>",
  "Symbol('x')",
  'Integer(4)',
  "<'sympy.core.add.Add'>",
  "<'sympy.core.numbers.Rational'>",
  "<'sympy.core.mul.Mul'>",
  "<'sympy.core.numbers.Rational'>",
  "<'sympy.core.power.Pow'>",
  'tan',
  "<'sympy.core.numbers.One'>",
  'Integer(2)',
  'tan',
  "<'sympy.core.numbers.One'

In [27]:
def unvectorize_sentence(Xi, model):
    end_ind = np.min(np.where(np.array(Xi) == 0)[0])
    Xi_trunc = Xi[0:end_ind]
    return [ model.wv.index_to_key[word-1] for word in Xi_trunc]

def unvectorize(X_vectorized, model):
    X_unvectorized = [unvectorize_sentence(sentence, model) for sentence in X_vectorized]
    return X_unvectorized

In [28]:
batch_size=16

def format_dataset(X, y):
    X = vectorize_ds(X, word2vec_X, sequence_length=sequence_length_X)
    y = vectorize_ds(y, word2vec_y, sequence_length=sequence_length_y)
    X = np.array(X)
    y = np.array(y)
    return ({"encoder_inputs": X, "decoder_inputs": y[:, :-1],}, y[:, 1:])

def make_dataset(X, y):
    dataset = format_dataset(X,y)
    dataset = tf.data.Dataset.from_tensor_slices(dataset)
    dataset = dataset.batch(batch_size)
    return dataset.shuffle(2048).prefetch(16).cache()


train_ds = make_dataset(X_tokenized_str_train, y_taylor_tokenized_str_train)

In [29]:
for inputs, targets in train_ds.take(1):
    print(f'inputs["encoder_inputs"].shape: {inputs["encoder_inputs"].shape}')
    print(f'inputs["decoder_inputs"].shape: {inputs["decoder_inputs"].shape}')
    print(f"targets.shape: {targets.shape}")


inputs["encoder_inputs"].shape: (16, 200)
inputs["decoder_inputs"].shape: (16, 200)
targets.shape: (16, 200)


2022-05-19 03:51:58.474423: W tensorflow/core/kernels/data/cache_dataset_ops.cc:768] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


In [30]:
class TransformerEncoder(layers.Layer):
    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super(TransformerEncoder, self).__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.dense_proj = keras.Sequential(
            [layers.Dense(dense_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.supports_masking = True

    def call(self, inputs, mask=None):
        if mask is not None:
            padding_mask = tf.cast(mask[:, tf.newaxis, tf.newaxis, :], dtype="int32")
        attention_output = self.attention(
            query=inputs, value=inputs, key=inputs, attention_mask=padding_mask
        )
        proj_input = self.layernorm_1(inputs + attention_output)
        proj_output = self.dense_proj(proj_input)
        return self.layernorm_2(proj_input + proj_output)


class PositionalEmbedding(layers.Layer):
    def __init__(self, sequence_length, vocab_size, embed_dim, **kwargs):
        super(PositionalEmbedding, self).__init__(**kwargs)
        self.token_embeddings = layers.Embedding(
            input_dim=vocab_size, output_dim=embed_dim
        )
        self.position_embeddings = layers.Embedding(
            input_dim=sequence_length, output_dim=embed_dim
        )
        self.sequence_length = sequence_length
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

    def call(self, inputs):
        length = tf.shape(inputs)[-1]
        positions = tf.range(start=0, limit=length, delta=1)
        embedded_tokens = self.token_embeddings(inputs)
        embedded_positions = self.position_embeddings(positions)
        return embedded_tokens + embedded_positions

    def compute_mask(self, inputs, mask=None):
        return tf.math.not_equal(inputs, 0)


class TransformerDecoder(layers.Layer):
    def __init__(self, embed_dim, latent_dim, num_heads, **kwargs):
        super(TransformerDecoder, self).__init__(**kwargs)
        self.embed_dim = embed_dim
        self.latent_dim = latent_dim
        self.num_heads = num_heads
        self.attention_1 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.attention_2 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.dense_proj = keras.Sequential(
            [layers.Dense(latent_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.layernorm_3 = layers.LayerNormalization()
        self.supports_masking = True

    def call(self, inputs, encoder_outputs, mask=None):
        causal_mask = self.get_causal_attention_mask(inputs)
        if mask is not None:
            padding_mask = tf.cast(mask[:, tf.newaxis, :], dtype="int32")
            padding_mask = tf.minimum(padding_mask, causal_mask)

        attention_output_1 = self.attention_1(
            query=inputs, value=inputs, key=inputs, attention_mask=causal_mask
        )
        out_1 = self.layernorm_1(inputs + attention_output_1)

        attention_output_2 = self.attention_2(
            query=out_1,
            value=encoder_outputs,
            key=encoder_outputs,
            attention_mask=padding_mask,
        )
        out_2 = self.layernorm_2(out_1 + attention_output_2)

        proj_output = self.dense_proj(out_2)
        return self.layernorm_3(out_2 + proj_output)

    def get_causal_attention_mask(self, inputs):
        input_shape = tf.shape(inputs)
        batch_size, sequence_length = input_shape[0], input_shape[1]
        i = tf.range(sequence_length)[:, tf.newaxis]
        j = tf.range(sequence_length)
        mask = tf.cast(i >= j, dtype="int32")
        mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))
        mult = tf.concat(
            [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)],
            axis=0,
        )
        return tf.tile(mask, mult)


In [31]:
# X_vocab_size = len(word2vec_X.wv.key_to_index)
# y_vocab_size = len(word2vec_y.wv.key_to_index)
X_vocab_size = 200
y_vocab_size = 200

In [220]:
embed_dim = 256
latent_dim = 512
num_heads = 5

encoder_inputs = keras.Input(shape=(None,), dtype="int64", name="encoder_inputs")
x = PositionalEmbedding(sequence_length_X, X_vocab_size, embed_dim)(encoder_inputs)
encoder_outputs = TransformerEncoder(embed_dim, latent_dim, num_heads)(x)
encoder = keras.Model(encoder_inputs, encoder_outputs)

decoder_inputs = keras.Input(shape=(None,), dtype="int64", name="decoder_inputs")
encoded_seq_inputs = keras.Input(shape=(None, embed_dim), name="decoder_state_inputs")
x = PositionalEmbedding(sequence_length_X, y_vocab_size, embed_dim)(decoder_inputs)
x = TransformerDecoder(embed_dim, latent_dim, num_heads)(x, encoded_seq_inputs)
x = layers.Dropout(0.5)(x)
decoder_outputs = layers.Dense(X_vocab_size, activation="softmax")(x)
decoder = keras.Model([decoder_inputs, encoded_seq_inputs], decoder_outputs)

decoder_outputs = decoder([decoder_inputs, encoder_outputs])
transformer = keras.Model(
    [encoder_inputs, decoder_inputs], decoder_outputs, name="transformer"
)


In [221]:
transformer.summary()

Model: "transformer"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_inputs (InputLayer)    [(None, None)]       0           []                               
                                                                                                  
 positional_embedding_4 (Positi  (None, None, 256)   102400      ['encoder_inputs[0][0]']         
 onalEmbedding)                                                                                   
                                                                                                  
 decoder_inputs (InputLayer)    [(None, None)]       0           []                               
                                                                                                  
 transformer_encoder_2 (Transfo  (None, None, 256)   1578752     ['positional_embedding_

In [222]:
os.environ['TF_GPU_ALLOCATOR'] = 'cuda_malloc_async'
print(os.getenv('TF_GPU_ALLOCATOR'))

cuda_malloc_async


In [223]:
transformer.compile(
    "rmsprop", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
)
transformer.fit(train_ds, epochs=200)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

<keras.callbacks.History at 0x7fbe0d05e890>

In [224]:
max_decoded_sentence_length = 200
def apply_transformer(input_vectorized):
    input_vectorized = tf.convert_to_tensor([input_vectorized])
    decoded_sentence = ["[start]"]
    y_index_lookup = word2vec_y.wv.index_to_key
    for i in range(max_decoded_sentence_length):
        tokenized_target_sentence = vectorize_ds([decoded_sentence], word2vec_y, sequence_length=sequence_length_X)
        tokenized_target_sentence = tf.convert_to_tensor(tokenized_target_sentence)
        y_pred = transformer([input_vectorized, tokenized_target_sentence])[0,i,:]
        y_index = np.argmax(y_pred) - 1
        y = y_index_lookup[y_index]
        decoded_sentence = decoded_sentence + [y]

        if y == "[end]":
            break
    return  decoded_sentence

In [225]:
y_train_pred = [apply_transformer(x) for x in X_vectorized_train[0:10]]

In [226]:
y_test_pred = [apply_transformer(x) for x in X_vectorized_test]

In [227]:
y_test_pred[0]

['[start]', 'tan', "Symbol('d')", '[end]']

In [228]:
y_taylor_tokenized_str_test[0]

['[start]', 'tan', "Symbol('d')", '[end]']

In [230]:
def token_accuracy(y_true, y_pred, verbose=0):
    """
    compare two arrays and check how many entries are the same at the same position
    """
    max_ind = np.min([len(y_true), len(y_pred)])
    correct_ctr = 0
    # ignore [start] and [end]
    max_correct = len(y_pred) - 2
    for i in range(1, max_ind-1):
        if verbose: ic([y_true[i], y_pred[i]])
        if y_true[i] == y_pred[i]:
            correct_ctr += 1
    return correct_ctr / max_correct


In [231]:
token_accuracy(y_test_pred[2], y_taylor_tokenized_str_test[2], verbose=1)

ic| [y_true[i], y_pred[i]]: ["<'sympy.core.add.Add'>", "<'sympy.core.add.Add'>"]
ic| [y_true[i], y_pred[i]]: ["Symbol('c')", "Symbol('c')"]
ic| [y_true[i], y_pred[i]]: ["<'sympy.core.mul.Mul'>", "Symbol('x')"]
ic| [y_true[i], y_pred[i]]: ['Integer(2)', "<'sympy.core.mul.Mul'>"]
ic| [y_true[i], y_pred[i]]: ["Symbol('x')", 'Integer(2)']
ic| [y_true[i], y_pred[i]]: ["<'sympy.core.mul.Mul'>", "Symbol('d')"]


0.3333333333333333

In [233]:
accuracy_per_function = [token_accuracy(y_true, y_pred) for y_true, y_pred in zip(y_taylor_tokenized_str_test, y_test_pred)]
np.mean(accuracy_per_function)

0.3914166812749736

In [234]:
np.round(np.array(accuracy_per_function), 2)

array([1.  , 0.06, 0.2 , 0.43, 0.06, 0.05, 0.11, 0.8 , 0.03, 1.  , 1.  ,
       1.  , 0.09, 1.  , 0.18, 1.  , 0.15, 0.02, 0.04, 0.  , 0.03, 0.08,
       0.25, 0.43, 0.09, 1.  , 0.08, 1.  , 1.  , 0.12, 1.  , 0.02, 0.21,
       1.  , 0.09, 0.03, 0.06, 0.09, 0.36, 0.13, 0.33, 0.1 , 0.18, 0.09,
       0.01, 1.  , 0.06, 1.  , 1.  , 1.  , 0.63, 0.06, 0.11, 0.28, 1.  ,
       0.03, 0.23, 1.  , 0.01, 0.26, 0.16, 1.  , 0.08, 1.  , 0.8 , 0.13,
       1.  , 0.18, 0.02, 0.11, 1.  , 0.25, 0.86, 0.2 , 0.13, 1.  , 0.09,
       0.08, 0.  , 0.1 , 1.  , 0.06, 0.07, 0.12, 0.08, 0.09, 0.19, 0.01,
       0.7 , 0.18, 0.37, 1.  , 0.04, 1.  , 1.  , 0.18, 0.7 , 0.05, 0.03,
       0.4 ])

# Polish notation -> back to sympy expressions

In [None]:
def str_to_sympy(s):
    if s[0] == '<':
        s = s[1:-1]
        return eval(eval(s))
    return eval(s)

In [196]:
tmp = [str_to_sympy(x) for x in y_taylor_tokenized_str_test[2][1:-1]]
tmp

[sympy.core.add.Add, c, x, sympy.core.mul.Mul, 2, d]

In [197]:
y_taylor_test[2]

c + 2*d + x

In [198]:
def prefix_to_infix(ls):
    """ not working """
    i = -1
    token = ls[i]
    not_operator_list = [sympy.core.numbers.Rational, sympy.core.numbers.Float, sympy.core.numbers.Integer,
    sympy.core.symbol.Symbol]
    while type(token) in not_operator_list:
        i = i - 1
        token = ls[i]
    return token(ls[i+1], ls[i+2])

In [199]:
prefix_to_infix(tmp)

2*d

I think the sympy syntax is not well suited for polish notation, since the number of arguments a function takes is not well defined. E.g. `sympy.core.add.Add` can take any number of arguments:

In [204]:
Add(Symbol("x"))

x

In [202]:
Add(Symbol("x"), Symbol("y"))

x + y

In [203]:
Add(Symbol("x"), Symbol("y"), Symbol("z"))

x + y + z