<a href="https://colab.research.google.com/github/BrooklynZhang/Personal/blob/master/nlp_extra_credit_lstm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
import json
import os
import re

# Emotion categories; the position of a name here is its index in every
# label vector and in the model's output layer.
labels = [
    "anger",
    "anticipation",
    "disgust",
    "fear",
    "joy",
    "love",
    "optimism",
    "pessimism",
    "sadness",
    "surprise",
    "trust",
    "neutral",
]

# Number of examples per padded batch.
batch_size = 64
# Size of the shuffle buffer applied to the training split.
buffer_size = 1024

# Directory the Keras model is saved to / loaded from.
save_path = "model/"
# Filename prefix handed to the text encoder's save/load helpers.
encoder_file = "{}encoder".format(save_path)


def load_data(jsonfile, labels):
    """Load post texts and their emotion labels from a JSON file.

    The file must contain a single JSON object whose values are post
    records. Each record provides its text under ``"body"`` and, under
    ``"emotion"``, one int-convertible value per requested label.

    Args:
        jsonfile: Path of the JSON file to read.
        labels: Emotion names to extract; their order fixes the order of the
            entries in each label vector.

    Returns:
        A ``tf.data.Dataset`` of ``(text, label_vector)`` pairs, where
        ``label_vector`` is an int64 vector with one entry per label.

    Raises:
        KeyError: If a post lacks ``"body"``, ``"emotion"`` or a requested
            label.
    """
    label_names = list(labels)

    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(jsonfile, "r", encoding="utf-8") as data_file:
        posts = json.load(data_file)

    data = []
    rows = []
    # The keys (post ids) are not needed, only the records themselves.
    for post in posts.values():
        doc = post["body"]
        emotions = post["emotion"]
        rows.append([int(emotions[tag]) for tag in label_names])
        data.append(doc)

    # The explicit reshape keeps the (n_posts, n_labels) layout even when the
    # file contains no posts (np.asarray([]) alone would be 1-D).
    result = np.asarray(rows, dtype=np.int64).reshape(len(data), len(label_names))
    return tf.data.Dataset.from_tensor_slices((data, result))


def create_model(encoder=None):
    """Build the (uncompiled) bidirectional-LSTM emotion model.

    Layers, in order: a 64-dimensional embedding over ``encoder.vocab_size``
    token ids, a bidirectional LSTM with 64 units per direction, a 64-unit
    ReLU dense layer, and a final dense layer with one unit per entry of the
    module-level ``labels`` list and no activation.

    Args:
        encoder: Text encoder exposing a ``vocab_size`` attribute. Required;
            the ``None`` default exists only for signature compatibility.

    Returns:
        A ``tf.keras.Sequential`` model that still needs to be compiled.

    Raises:
        ValueError: If ``encoder`` is ``None``.
    """
    # Fail early with a clear message instead of an AttributeError on
    # ``None.vocab_size`` deep inside the layer construction.
    if encoder is None:
        raise ValueError(
            "create_model() requires an encoder with a 'vocab_size' attribute"
        )

    return tf.keras.Sequential([
        tf.keras.layers.Embedding(encoder.vocab_size, 64),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(len(labels)),
    ])

if __name__ == "__main__":
    # Script entry point: train the emotion model on nlp_train.json,
    # resuming from a previously saved model when one exists, and save the
    # model after every round of training.

    os.makedirs(save_path, exist_ok=True)

    post_dataset = load_data("nlp_train.json", labels)

    # Resume only if an earlier run actually left a SavedModel behind. On a
    # first run save_path has just been created and is empty, so trying to
    # load from it unconditionally would fail. Delete the directory to force
    # training from scratch.
    load_model = os.path.exists(os.path.join(save_path, "saved_model.pb"))

    if load_model:
        print("Loading model from {}".format(save_path))
        model = tf.keras.models.load_model(save_path)
        encoder = tfds.features.text.TokenTextEncoder.load_from_file(encoder_file)
    else:
        # The vocabulary is only needed to build a new encoder, so the full
        # pass over the dataset is skipped when an encoder is loaded instead.
        tokenizer = tfds.features.text.Tokenizer()
        vocabulary_set = set()
        for text_tensor, _ in post_dataset:
            tokens = tokenizer.tokenize(text_tensor.numpy())
            vocabulary_set.update(tokens)

        encoder = tfds.features.text.TokenTextEncoder(vocabulary_set)
        model = create_model(encoder=encoder)
        encoder.save_to_file(encoder_file)

    def encode_map_fn(text, label):
        """Map a (text, label) pair to (token ids, label) inside tf.data."""
        def encode(text_tensor, label):
            # Runs eagerly via tf.py_function, so .numpy() is available.
            encoded_text = encoder.encode(text_tensor.numpy())
            return encoded_text, label

        encoded_text, label = tf.py_function(encode, inp=[text, label], Tout=(tf.int64, tf.int64))

        # tf.py_function drops static shape information; restore the rank so
        # that padded_batch can work with the elements.
        encoded_text.set_shape([None])
        label.set_shape([None])
        return encoded_text, label

    dataset_encoded = post_dataset.map(encode_map_fn)

    # The first 100 examples are held out and used as validation data.
    x_test = dataset_encoded.take(100)
    x_test = x_test.padded_batch(batch_size, padded_shapes=([None], [None]))

    x_train = dataset_encoded.skip(100).shuffle(buffer_size)
    x_train = x_train.padded_batch(batch_size, padded_shapes=([None], [None]))

    model.compile(loss=tf.keras.losses.MeanSquaredError(), metrics=["accuracy"], optimizer=tf.keras.optimizers.Adam(1e-4))

    # Ten rounds of ten epochs each, checkpointing the model after each round.
    for _ in range(10):
        # x_train is already batched; Keras documents that batch_size must
        # not be passed when the input is a tf.data.Dataset.
        history = model.fit(x_train, epochs=10, validation_data=x_test)
        tf.keras.models.save_model(model, save_path)

Dataset size: 1493
Vocabulary size: 37371
Loading model from model/
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
INFO:tensorflow:Assets written to: model/assets
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
INFO:tensorflow:Assets written to: model/assets
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
INFO:tensorflow:Assets written to: model/assets
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
INFO:tensorflow:Assets written to: model/assets
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
INFO:tensorflow:Assets written to: model/assets
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
 4/22 [====>.........................] - ETA: 3s - loss: 0.0201 - accuracy: 0.3633