- How to handle unknown words?

In [217]:
import numpy as np
import random
import pandas as pd
import os
import sympy
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow import keras
from tensorflow.keras import layers
from sympy import srepr
from sympy import preorder_traversal, symbols
from sympy.parsing.sympy_parser import parse_expr
from gensim.models import Word2Vec

In [3]:
# Step up to the parent directory so the relative "data.nosync/..." paths used
# below resolve. The guard makes the cell idempotent: once the data directory
# is reachable from the current working directory, re-running the cell no
# longer climbs one level further.
if not os.path.isdir("data.nosync"):
    os.chdir("..")

In [4]:
def pad_right(list, total_length=5, const=0):
    """Right-pad a 1-D sequence with a constant up to a fixed length.

    Parameters
    ----------
    list : sequence
        Values to pad (the name shadows the builtin; it is kept so existing
        keyword callers continue to work).
    total_length : int
        Length of the returned array.
    const : scalar
        Value appended on the right.

    Returns
    -------
    numpy.ndarray
        The input values followed by ``total_length - len(list)`` copies of
        ``const``.

    Raises
    ------
    ValueError
        If the sequence is longer than ``total_length``; padding never
        truncates.
    """
    length = len(list)
    values_needed = total_length - length
    if values_needed < 0:
        # np.pad would also reject a negative width, but with a message that
        # does not mention which length was exceeded.
        raise ValueError(
            f"sequence of length {length} does not fit into total_length={total_length}"
        )
    return np.pad(list, (0, values_needed), mode="constant", constant_values=const)

In [271]:
# Input files, one expression per line. Line i of each file is treated as
# belonging to the same sample when the lists are combined later on.
data_file = "data.nosync/data.txt"
taylor_file = "data.nosync/data_taylor.txt"
coeffs_file = "data.nosync/data_coeffs.txt"

# Sentinel tokens, kept as one-element lists so they can be concatenated
# directly with a list of tokens.
start = ["[start]"]
end = ["[end]"]

# Iterate over the file line by line instead of `read().split("\n")[:-1]`:
# a newline-terminated file then yields no spurious empty last entry, and a
# file WITHOUT a trailing newline no longer loses its last real expression.
with open(data_file) as f:
    X = [parse_expr(line.rstrip("\n")) for line in f]

with open(taylor_file) as f:
    y_taylor = [parse_expr(line.rstrip("\n")) for line in f]

In [6]:
# Parse the coefficient file into sympy expressions, one per line.
# Reading line by line avoids unconditionally dropping the last entry, which
# would discard a real expression when the file has no trailing newline.
with open(coeffs_file) as f:
    y_coeffs = [parse_expr(line.rstrip("\n")) for line in f]

In [192]:
# Seed for the train/test shuffle so the split is the same after every
# Restart & Run All.
SPLIT_SEED = 42

# The three lists are combined by position, so they must line up one-to-one.
assert len(X) == len(y_coeffs) == len(y_taylor), (
    "data, coefficient and Taylor files must contain the same number of lines"
)

# Keep the three representations of a sample together so that one shuffle
# reorders them consistently.
tmp = [[X[i], y_coeffs[i], y_taylor[i]] for i in range(0, len(X))]
# A private generator gives a reproducible order without touching the state
# of the global `random` module.
random.Random(SPLIT_SEED).shuffle(tmp)

# 90 % of the samples go to training, the remainder to testing.
num_train_samples = int(0.90 * len(tmp))

train = tmp[0:num_train_samples]
test = tmp[num_train_samples:]
# Column 0 holds the input expression, column 2 the Taylor expansion.
X_train = [x[0] for x in train]
X_test = [x[0] for x in test]
y_taylor_train = [x[2] for x in train]
y_taylor_test = [x[2] for x in test]
# Sanity check: the split neither loses nor duplicates samples.
print(len(tmp) == len(train) + len(test))

True


# Vectorization

In [145]:
def sympy_tokenize(expr, tokens_list=None, depth=0, parent_ind=None):
    """Flatten a sympy expression tree into a pre-order list of tokens.

    A node whose ``func`` is exactly ``Symbol`` or ``Integer`` contributes the
    node itself; every other node contributes its ``func`` (the operator or
    function class). Children are visited in the order of ``expr.args``.

    Parameters
    ----------
    expr : sympy expression
        Root of the (sub)tree to tokenize.
    tokens_list : list, optional
        Accumulator the tokens are appended to. A fresh list is created when
        omitted, so independent calls never share state.
    depth : int
        Recursion depth; passed along but not used otherwise.
    parent_ind : int, optional
        Position of ``expr`` among its parent's arguments; passed along but
        not used otherwise.

    Returns
    -------
    list
        ``tokens_list`` with the tokens of ``expr`` appended.
    """
    # A `[]` default would be created once and shared by every call that
    # omits the argument, accumulating tokens across unrelated expressions.
    if tokens_list is None:
        tokens_list = []

    # NOTE(review): the comparison is an exact match on `expr.func`, so numbers
    # whose class is not exactly `Integer` are presumably emitted as their
    # class rather than their value - confirm this is intended.
    is_leaf = (expr.func == sympy.core.symbol.Symbol) or (
        expr.func == sympy.core.numbers.Integer
    )
    tokens_list.append(expr if is_leaf else expr.func)

    for ind, arg in enumerate(expr.args):
        sympy_tokenize(arg, tokens_list, depth + 1, parent_ind=ind)
    return tokens_list

def sympy_tokenize_str(sentence):
    """Tokenize one sympy expression and return the tokens as strings.

    The expression is flattened with ``sympy_tokenize`` (always starting from
    a fresh accumulator) and every token is converted with ``str``.
    """
    raw_tokens = sympy_tokenize(sentence, tokens_list=[])
    return list(map(str, raw_tokens))

In [272]:
# Encoder side: token strings of every training input expression.
X_tokenized_str_train = list(map(sympy_tokenize_str, X_train))

# Decoder side: token strings of every Taylor expansion, wrapped in the
# start and end sentinel tokens.
y_taylor_tokenized_str_train = [
    start + sympy_tokenize_str(target) + end
    for target in y_taylor_train
]

In [274]:
# Both models are trained with the same hyper-parameters; inputs and targets
# get separate models and therefore separate vocabularies.
word2vec_options = dict(vector_size=100, window=5, min_count=1, workers=4)

word2vec_X = Word2Vec(sentences=X_tokenized_str_train, **word2vec_options)
word2vec_y = Word2Vec(sentences=y_taylor_tokenized_str_train, **word2vec_options)

In [458]:
def vectorize_sentence(Xi, model):
    """Translate the tokens of one sentence into shifted vocabulary indices.

    Every token is looked up in ``model.wv.key_to_index`` and the result is
    increased by one, so index 0 never refers to a vocabulary entry (the
    original note reserves 0 for the end of the sequence). A token missing
    from the vocabulary raises ``KeyError``.
    """
    shifted = []
    for token in Xi:
        shifted.append(model.wv.key_to_index[token] + 1)
    return shifted

def vectorize(X_tokenized_str, model, sequence_length=25):
    """Turn tokenized sentences into fixed-length index sequences.

    Each sentence is converted with ``vectorize_sentence`` and then
    right-padded with 0 to ``sequence_length`` via ``pad_right``.

    Returns a list with one padded sequence per input sentence.
    """
    padded = []
    for sentence in X_tokenized_str:
        indices = vectorize_sentence(sentence, model)
        padded.append(pad_right(indices, sequence_length, const=0))
    return padded

In [459]:
# Inputs are padded to 30 indices, Taylor targets to 1200.
X_vectorized_train = vectorize(
    X_tokenized_str_train,
    word2vec_X,
    sequence_length=30,
)
y_taylor_vectorized_train = vectorize(
    y_taylor_tokenized_str_train,
    word2vec_y,
    sequence_length=1200,
)
# Show the first two padded input sequences.
X_vectorized_train[:2]

[array([ 2,  6, 14,  5, 10,  1, 18,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]),
 array([ 8,  6, 19,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0])]

In [460]:
def unvectorize_sentence(Xi, model):
    """Convert one padded index sequence back into its tokens.

    The sequence is cut at the first 0 and every remaining index ``i`` is
    mapped to ``model.wv.index_to_key[i - 1]``, undoing the shift by one that
    was applied when the sentence was vectorized.

    Parameters
    ----------
    Xi : sequence of int
        Shifted vocabulary indices, optionally followed by 0 entries.
    model : object exposing ``wv.index_to_key``
        Model whose vocabulary was used for vectorization.

    Returns
    -------
    list
        The tokens of the sentence, without padding.
    """
    zero_positions = np.where(np.array(Xi) == 0)[0]
    # A sequence that fills the whole length carries no 0 at all; taking the
    # minimum of the empty position array would raise, so use the full
    # sequence in that case.
    end_ind = zero_positions[0] if zero_positions.size else len(Xi)
    Xi_trunc = Xi[0:end_ind]
    return [model.wv.index_to_key[word - 1] for word in Xi_trunc]

def unvectorize(X_vectorized, model):
    """Convert every padded index sequence back into its list of tokens.

    Applies ``unvectorize_sentence`` to each sequence, preserving order.
    """
    decoded = []
    for sentence in X_vectorized:
        decoded.append(unvectorize_sentence(sentence, model))
    return decoded

In [461]:
# Round-trip check: decoding the vectorized sequences must give back exactly
# the token lists they were built from.
X_unvectorized_train = unvectorize(X_vectorized_train, word2vec_X)
y_taylor_unvectorized_train = unvectorize(y_taylor_vectorized_train, word2vec_y)

print(X_unvectorized_train == X_tokenized_str_train)

print(y_taylor_unvectorized_train == y_taylor_tokenized_str_train)

True
True


In [463]:
# Number of samples per batch.
batch_size = 16
# Padded sequence lengths for the encoder inputs (X) and the targets (y).
sequence_length_X = 30
sequence_length_y = 1200

def format_dataset(X, y):
    """Build the (inputs, targets) pair for sequence-to-sequence training.

    ``X`` and ``y`` are vectorized with their respective Word2Vec vocabularies
    and padded to ``sequence_length_X`` / ``sequence_length_y``. The target
    sequences are then split with a one-step offset: the decoder receives
    every position except the last, and the expected output is every position
    except the first.

    Returns a tuple ``(inputs, targets)`` where ``inputs`` is a dict with the
    keys ``"encoder_inputs"`` and ``"decoder_inputs"``.
    """
    encoder_ids = np.array(vectorize(X, word2vec_X, sequence_length=sequence_length_X))
    decoder_ids = np.array(vectorize(y, word2vec_y, sequence_length=sequence_length_y))

    model_inputs = {
        "encoder_inputs": encoder_ids,
        "decoder_inputs": decoder_ids[:, :-1],
    }
    targets = decoder_ids[:, 1:]
    return (model_inputs, targets)

def make_dataset(X, y):
    """Create the batched ``tf.data`` pipeline used for training.

    Parameters
    ----------
    X, y : list of token lists
        Tokenized inputs and targets, forwarded to ``format_dataset``.

    Returns
    -------
    tf.data.Dataset
        Batches of ``batch_size`` elements with the structure produced by
        ``format_dataset``: ``({"encoder_inputs", "decoder_inputs"}, targets)``.
    """
    dataset = tf.data.Dataset.from_tensor_slices(format_dataset(X, y))
    # Order matters:
    #  * cache before shuffling, otherwise the first shuffled order is what
    #    gets cached and every later epoch replays it unchanged;
    #  * shuffle before batching, so individual samples (not whole batches)
    #    are mixed and batch composition differs between epochs;
    #  * prefetch last, so ready-made batches are what is kept ahead.
    return dataset.cache().shuffle(2048).batch(batch_size).prefetch(16)


# Training pipeline built from the tokenized inputs and Taylor targets.
train_ds = make_dataset(
    X=X_tokenized_str_train,
    y=y_taylor_tokenized_str_train,
)

In [464]:
# Pull a single batch and report the shape of each tensor in it.
for inputs, targets in train_ds.take(1):
    encoder_shape = inputs["encoder_inputs"].shape
    decoder_shape = inputs["decoder_inputs"].shape
    target_shape = targets.shape
    print(f'inputs["encoder_inputs"].shape: {encoder_shape}')
    print(f'inputs["decoder_inputs"].shape: {decoder_shape}')
    print(f"targets.shape: {target_shape}")


inputs["encoder_inputs"].shape: (16, 30)
inputs["decoder_inputs"].shape: (16, 1199)
targets.shape: (16, 1199)


2022-05-18 08:36:26.840755: W tensorflow/core/kernels/data/cache_dataset_ops.cc:768] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.
