In [1]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import json
import codecs
import random
from gensim.models import Word2Vec
from math import ceil

import txt2xml
import os


### Load Vocabulary

In [2]:
with open("vocabulary.json") as json_file:
    vocab = json.load(json_file)

vocab_size = len(vocab)
print(f"Size of vocab: {vocab_size}")

Size of vocab: 1667


In [3]:
def create_sentence_list(file_name):
    sentence_list = []
    with codecs.open(file_name, "r", encoding="utf-8") as fp:
        for l in fp: # level(sentence)
            sentence = ""
            for word in l.split("  ")[:-1]: # row(word)
                sentence += str(vocab.index(word)) + " "
            sentence_list.append(sentence[:-1])
    return sentence_list

In [4]:
X_train = create_sentence_list("train.txt")
X_valid = create_sentence_list("valid.txt")

X_full = X_train.copy()
X_full.extend(X_valid)

# random.shuffle(X_full)

# X_train = X_full[:180]
# X_valid = X_full[180:]

In [5]:
X_train[0:5]

['1189 1189 1586 99 699 801 1328 67 1326 595',
 '950 950 1417 1053 66 657 942 946 806',
 '764 753 1182 753 1323 190 497 879 1337',
 '534 981 179 1450 1045 1503 1072 1265 1621 840 934',
 '490 140 1057 1510 1542 109']

### Prepare Data

*   Special tokens:
    *   `sos`: start of sentence
    *   `eos`: end of sentence

In [6]:
max_length = 30
word_to_vec = tf.keras.layers.TextVectorization(
    vocab_size, output_sequence_length=max_length)

word_to_vec.adapt([f"sos {s} eos" for s in X_full])

word_to_vec.trainable = False

*   Take a look at 2 vocabularies

In [7]:
print(word_to_vec.get_vocabulary()[:10])

['', '[UNK]', 'sos', 'eos', '1001', '1589', '962', '1568', '126', '1363']


In [8]:
len(word_to_vec.get_vocabulary())

1667

### Construct Model

*   `encoder_inputs`, `decoder_inptus`: takes in string types for encoding/decoding.
    *   No parameters.

In [9]:
tf.random.set_seed(42)  # extra code – ensures reproducibility on CPU
encoder_inputs = tf.keras.layers.Input(shape=[], dtype=tf.string)
decoder_inputs = tf.keras.layers.Input(shape=[], dtype=tf.string)

*   Function API of TensorFlow: apply text vectorization on input layers

In [10]:
encoder_input_ids = word_to_vec(encoder_inputs)
decoder_input_ids = word_to_vec(decoder_inputs)

*   `encoder_embeddings`, `decoder_embeddings`: embed the vector of texts to the desired space.

In [11]:
embed_size = 50
w2v = Word2Vec([(f"sos {s} eos").split() for s in X_full], 
                   sg=0, vector_size=embed_size, window=2, min_count=0)

In [12]:
embedding_layer = tf.keras.layers.Embedding(len(w2v.wv.vectors), embed_size, weights=[w2v.wv.vectors], mask_zero=True)


# encoder_embedding_layer.trainable = False
# decoder_embedding_layer.trainable = False

encoder_embeddings = embedding_layer(encoder_input_ids)
decoder_embeddings = embedding_layer(decoder_input_ids)

*   Encoder:
    *   Takes in the embedded vector from `encoder_embeddings`.
    *   Send it to an LSTM.
        *   `return_state=True`: return the last hidden state in addition to the output.
    *   LSTM outputs `encoder_outputs` and calculates a list of `encoder_state`.
    *   `encoder_outputs` is dropped. `*encoder_state` is used.

*   Bidirectional LSTM:
    *   2 LSTM, 1 read from beginning to end, 1 read from end to beginning.
    *   `encoder_state` contains 4 items:
        *   index `0`: short-term forward
        *   index `1`: long-term forward
        *   index `2`: short-term backward
        *   index `3`: long-term backward
    *   Concatenate them for future computations.

In [13]:
dropout_ratio = 0.3
hidden_size = 400
encoder = tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(hidden_size, return_sequences = False, return_state=True, dropout=dropout_ratio))

encoder_outputs, *encoder_state = encoder(encoder_embeddings)
encoder_state = [tf.concat(encoder_state[::2], axis=-1),  # short-term (0 & 2)
                 tf.concat(encoder_state[1::2], axis=-1)]  # long-term (1 & 3)
encoder_state = tf.concat(encoder_state, axis=1)

*   Predict $\mu$ and $\ln(\sigma^2)$:
    *   2 linear layers are used.

In [14]:
latent_size = 60
latent_mean = tf.keras.layers.Dense(latent_size)(encoder_state)
latent_ln_var = tf.keras.layers.Dense(latent_size)(encoder_state)

*   Sampling layer:
    *   Return a sample from $\mathcal{N}(\mu,\sigma^2)$.
    *   Same as $\mu + \sigma z$, where $z\sim\mathcal{N}(0,1)$.
        *   Treat $z$ as constant in backpropagation.

In [15]:
class Sampling(tf.keras.layers.Layer):
    def call(self, inputs):
        mean, ln_var = inputs
        return tf.random.normal(tf.shape(ln_var)) * tf.exp(ln_var / 2) + mean

In [16]:
latent_vec = Sampling()([latent_mean, latent_ln_var])

*   Predict long-term and short-term memory for decoder.

In [17]:
short_mem = tf.keras.layers.Dense(2*hidden_size)(latent_vec)
long_mem = tf.keras.layers.Dense(2*hidden_size)(latent_vec)

*   Decoder:
    *   Must be (one-directional) LSTM
    *   Same size as the encoder's bidirectional LSTM

In [18]:
decoder = tf.keras.layers.LSTM(2*hidden_size, return_sequences=True, dropout=dropout_ratio)
decoder_outputs = decoder(decoder_embeddings, initial_state = [short_mem, long_mem])

*   Output layer
    *   Predict the likelihood of the next word to appear.

In [19]:
output_layer = tf.keras.layers.Dense(vocab_size+1)
Y_proba = output_layer(decoder_outputs)

### Train model

*   Use function API to select encoder and decoder.

In [20]:
variational_encoder = tf.keras.Model(inputs=[encoder_inputs], outputs=[latent_vec])

In [21]:
variational_decoder = tf.keras.Model(
    inputs=[latent_vec, decoder_inputs], outputs=[Y_proba])

*   Construct full variational auto-encoder.

In [22]:
variational_ae = tf.keras.Model(inputs=[encoder_inputs, decoder_inputs], outputs=[Y_proba, latent_ln_var, latent_vec])

*   Compile model

In [23]:
learning_rate = 0.001
optimizer = tf.keras.optimizers.Adam(learning_rate = learning_rate, clipnorm = 3)

In [24]:
rec_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

*   Training:
    *   Use `tf.GradientTape()` to gain more low-level control over the update.

In [25]:
epochs = 250
batch_size = 20
noise_rate = 0.3

In [26]:
def train(optimizer = optimizer, rec_loss = rec_loss,
          epochs = epochs, batch_size = batch_size, noise_rate = noise_rate, 
          X_train = X_train, X_valid = X_valid):
    # Prepare validation data
    X_valid_enc = tf.constant(X_valid)
    X_valid_dec = tf.constant([f"sos {s}" for s in X_valid])
    Y_valid_dec = word_to_vec([f"{s} eos" for s in X_valid])

    epoch_size = len(X_train)
    
    for epoch_idx in range(epochs):
        epoch_loss = 0

        # Add noises to decoder's input
        X_train_dec_all = X_train.copy()
        for i, sentence in enumerate(X_train_dec_all):
            token_list = sentence.split()
            sentence_len = len(token_list)
            unknown_token = word_to_vec.get_vocabulary()[1]
            index_list = [j for j in range(min(sentence_len, max_length))]
            noise_list = random.sample(index_list, int(sentence_len * noise_rate))
            new_sentence = ""
            for j in range(sentence_len):
                token = token_list[j]
                if j in noise_list:
                    token = unknown_token
                new_sentence += (token + " ")
            X_train_dec_all[i] = new_sentence.strip()

        X_train_dec_all = tf.constant([f"sos {s}" for s in X_train_dec_all])

        # batch training
        batch_num = ceil(epoch_size / batch_size)
        batch_idx_list = list(range(batch_num))
        random.shuffle(batch_idx_list)
        for batch_idx in batch_idx_list:
            # Select batch
            lb = batch_idx * batch_size
            ub = min(len(X_train), (batch_idx+1) * batch_size)
            # Set up data
            X_train_enc = tf.constant(X_train[lb:ub])
            X_train_dec = tf.constant(X_train_dec_all[lb:ub])
            Y_train_dec = word_to_vec([f"{s} eos" for s in X_train[lb:ub]])
            
            # Predict and find loss
            with tf.GradientTape() as tape:
                # Predict
                Y_predict_dec, latent_mean, latent_ln_var = variational_ae([X_train_enc, X_train_dec])

                reconstruct_loss = rec_loss(Y_train_dec, Y_predict_dec)
                latent_loss = 0
                total_loss = reconstruct_loss + latent_loss
                avg_loss = total_loss / (ub - lb)
                epoch_loss += total_loss

                # Take gradient and backpropagate
                grads = tape.gradient(avg_loss, variational_ae.trainable_variables)
                optimizer.apply_gradients(zip(grads, variational_ae.trainable_variables))
        
        # Validate:
        valid_predict, _, _ = variational_ae([X_valid_enc, X_valid_dec])
        valid_loss = rec_loss(Y_valid_dec, valid_predict) / batch_size

        # Print output
        print("epoch: %3d, train loss: %.4f, valid loss: %.4f" 
              % (epoch_idx, epoch_loss/epoch_size, valid_loss))

In [27]:
train()

epoch:   0, train loss: 0.3706, valid loss: 0.3702
epoch:   1, train loss: 0.3684, valid loss: 0.3667
epoch:   2, train loss: 0.3632, valid loss: 0.3677
epoch:   3, train loss: 0.3608, valid loss: 0.3744
epoch:   4, train loss: 0.3582, valid loss: 0.3754
epoch:   5, train loss: 0.3541, valid loss: 0.3873
epoch:   6, train loss: 0.3495, valid loss: 0.3891
epoch:   7, train loss: 0.3444, valid loss: 0.4004
epoch:   8, train loss: 0.3399, valid loss: 0.4081
epoch:   9, train loss: 0.3366, valid loss: 0.4126
epoch:  10, train loss: 0.3319, valid loss: 0.4149
epoch:  11, train loss: 0.3289, valid loss: 0.4184
epoch:  12, train loss: 0.3245, valid loss: 0.4241
epoch:  13, train loss: 0.3234, valid loss: 0.4226
epoch:  14, train loss: 0.3177, valid loss: 0.4267
epoch:  15, train loss: 0.3153, valid loss: 0.4285
epoch:  16, train loss: 0.3120, valid loss: 0.4306
epoch:  17, train loss: 0.3100, valid loss: 0.4341


KeyboardInterrupt: 

In [32]:
variational_decoder.save("./saved_models/variational_decoder.tf")
variational_encoder.save("./saved_models/variational_encoder.tf")





INFO:tensorflow:Assets written to: ./saved_models/variational_decoder.tf\assets


INFO:tensorflow:Assets written to: ./saved_models/variational_decoder.tf\assets






INFO:tensorflow:Assets written to: ./saved_models/variational_encoder.tf\assets


INFO:tensorflow:Assets written to: ./saved_models/variational_encoder.tf\assets


In [None]:
with open('./saved_models/vocab_dec.json', 'w') as f:
    json.dump(word_to_vec.get_vocabulary(), f)

*   Generation

In [None]:
def generate(variational_decoder, word_to_vec_dec, latent_vec, max_length = 30):
    generated = ""
    vocabulary = word_to_vec_dec.get_vocabulary()
    for idx in range(max_length):
        decoder_inputs = tf.constant(["sos " + generated])
        Y_proba = variational_decoder.predict((latent_vec, decoder_inputs))[0, idx]
        predicted_id = np.argmax(Y_proba)
        predicted_word = vocabulary[predicted_id]
        if predicted_word == "eos":
            break
        generated += " " + predicted_word
    return generated.strip().split()

In [None]:
deconverter = txt2xml.txt2xml()

In [None]:
for i in range(5):
    sampled_vec = tf.random.normal(shape=[1, latent_size], mean=10, stddev=1)
    level = generate(variational_decoder, word_to_vec, sampled_vec)
    level_txt = [vocab[int(s)] for s in level]
    level_xml = deconverter.vector2xml(level_txt)
    with open(f"./level-" + str(i) + ".xml", "w") as f:
                f.write(level_xml)

