
# One-to-many - Learning to generate text.


In [88]:
# conda activate tf_p39
import os
import numpy as np
import re
import shutil
import tensorflow as tf

from typing import Dict, List

In [89]:
# Base directory for the artifacts produced while training.
DATA_DIR = "./data"
# Sub-directories of DATA_DIR; both are wiped by clean_logs() and the
# training loop writes its weight checkpoints into CHECKPOINT_DIR.
CHECKPOINT_DIR = os.path.join(DATA_DIR, "checkpoints")
LOG_DIR = os.path.join(DATA_DIR, "logs")

In [90]:
def clean_logs():
    """Delete the checkpoint and log directories left over from earlier runs.

    Errors raised while deleting (for example because a directory does not
    exist yet) are ignored.
    """
    for stale_dir in (CHECKPOINT_DIR, LOG_DIR):
        shutil.rmtree(stale_dir, ignore_errors=True)

In [91]:
def download_and_read(urls):
    """Download every URL and return the combined text as a flat list of characters.

    Each file is fetched through ``tf.keras.utils.get_file`` under the name
    ``ex1-<index>.txt`` (with ``cache_dir="."``), read as UTF-8, stripped of
    byte-order marks, and has every run of whitespace collapsed to a single
    space.

    Args:
        urls: iterable of URLs pointing at UTF-8 text files.

    Returns:
        A list of single-character strings: the cleaned texts concatenated in
        the order the URLs were given. An empty ``urls`` yields ``[]``.
    """
    texts: List[str] = []
    for i, url in enumerate(urls):
        p = tf.keras.utils.get_file("ex1-{:d}.txt".format(i), url,
                                    cache_dir=".")
        # Use a context manager so the file handle is closed deterministically.
        with open(p, mode="r", encoding="utf-8") as f:
            text = f.read()
        # remove byte order mark
        text = text.replace("\ufeff", "")
        # Collapse every whitespace run (newlines included) into one space.
        text = re.sub(r"\s+", " ", text)
        # extend() iterates the string, so this appends one list item per
        # character; the rest of the notebook relies on that flat layout.
        texts.extend(text)
    return texts

In [92]:
def split_train_labels(sequence):
    """Turn one chunk into an (input, label) pair shifted by one position.

    The input is the chunk without its last element and the label is the
    chunk without its first element, so label[k] is the element that follows
    input[k] in the original chunk.
    """
    return sequence[:-1], sequence[1:]


In [93]:
# CharGenModel(vocab_size=92, num_timesteps=100, embedding_dim=256)
class CharGenModel(tf.keras.Model):
    """Character-level generator: Embedding -> stateful GRU -> Dense logits.

    Example:
        CharGenModel(vocab_size=92, num_timesteps=100, embedding_dim=256)

    Args:
        vocab_size: size of the vocabulary (maximum integer index + 1); used
            both as the embedding's input_dim and as the number of output
            logits.
        num_timesteps: passed to the GRU as its ``units``.
        embedding_dim: dimension of the dense character embedding.
        **kwargs: forwarded unchanged to ``tf.keras.Model``.
    """

    def __init__(self, vocab_size, num_timesteps,
                 embedding_dim, **kwargs):
        super().__init__(**kwargs)

        # Maps each character id to a dense vector of size embedding_dim.
        self.embedding_layer = tf.keras.layers.Embedding(
            input_dim=vocab_size,
            output_dim=embedding_dim
        )

        # GRU (gated recurrent unit), see
        # https://www.tensorflow.org/api_docs/python/tf/keras/layers/GRU
        # - stateful=True: per the Keras docs, the final state for the sample
        #   at index i of one batch becomes the initial state for the sample
        #   at index i of the next batch, until the states are reset.
        # - return_sequences=True: emit an output at every time step instead
        #   of only at the last one.
        # NOTE(review): the hidden size (units) is tied to num_timesteps here
        # although the two are conceptually independent - confirm intent.
        self.rnn_layer = tf.keras.layers.GRU(
            units=num_timesteps,
            recurrent_initializer="glorot_uniform",
            recurrent_activation="sigmoid",
            stateful=True,
            return_sequences=True
        )

        # One unnormalised score (logit) per vocabulary entry.
        self.dense_layer = tf.keras.layers.Dense(units=vocab_size)

    def call(self, x):
        """Run character ids through embedding, GRU and dense layers in turn."""
        embedded = self.embedding_layer(x)
        hidden_seq = self.rnn_layer(embedded)
        return self.dense_layer(hidden_seq)



In [94]:
def loss(labels, predictions):
    """Sparse categorical cross-entropy computed on raw logits.

    Args:
        labels: target class indices.
        predictions: unnormalised scores (logits); no softmax has been applied.

    Returns:
        Whatever ``tf.losses.sparse_categorical_crossentropy`` returns for
        these inputs with ``from_logits=True``.
    """
    return tf.losses.sparse_categorical_crossentropy(
        y_true=labels, y_pred=predictions, from_logits=True)


In [95]:
def generate_text(model, prefix_string: str, char2idx: Dict[str, int], idx2char: Dict[int, str],
                  num_chars_to_generate=1000, temperature=1.0):
    """Sample text from the model one character at a time, seeded with a prefix.

    The model's recurrent state is reset, the whole prefix is fed once, and
    from then on only the most recently sampled character is fed back in;
    the model is expected to carry its state between calls.

    Args:
        model: callable model exposing ``reset_states()``; called with a
            (1, n) batch of character ids and expected to return logits
            with a leading batch axis of size 1.
        prefix_string: non-empty seed text; every character must be a key
            of ``char2idx`` (an unknown character raises ``KeyError``).
        char2idx: character -> id mapping.
        idx2char: id -> character mapping.
        num_chars_to_generate: number of characters to sample.
        temperature: positive divisor applied to the logits before sampling;
            values below 1 sharpen the distribution, values above 1 flatten it.

    Returns:
        ``prefix_string`` followed by the sampled characters.

    Raises:
        ValueError: if ``prefix_string`` is empty or ``temperature`` is not
            positive.
    """
    # Fail fast with a clear message instead of an obscure tensor error
    # (empty prefix) or inf/nan logits (zero or negative temperature).
    if not prefix_string:
        raise ValueError("prefix_string must contain at least one character")
    if temperature <= 0:
        raise ValueError(
            "temperature must be positive, got {}".format(temperature))

    # Named input_ids rather than `input` to avoid shadowing the builtin.
    input_ids = tf.expand_dims([char2idx[s] for s in prefix_string], 0)
    text_generated = []
    model.reset_states()
    for _ in range(num_chars_to_generate):
        preds = model(input_ids)
        # Drop the batch axis and apply the temperature.
        preds = tf.squeeze(preds, 0) / temperature
        # One sample is drawn for every row of logits; only the sample for
        # the last row (the most recent time step) is used.
        pred_id = tf.random.categorical(logits=preds, num_samples=1)[-1, 0].numpy()
        text_generated.append(idx2char[pred_id])
        # pass the prediction as the next input to the model
        input_ids = tf.expand_dims([pred_id], 0)

    return prefix_string + "".join(text_generated)


In [96]:
########################################################################
# Fetch the Project Gutenberg texts and flatten them into one list of chars.
corpus_urls = [
    "http://www.gutenberg.org/cache/epub/28885/pg28885.txt",
    "https://www.gutenberg.org/files/12/12-0.txt",
]
texts = download_and_read(corpus_urls)
# Remove checkpoints and logs left behind by previous runs.
clean_logs()

In [97]:
# The vocabulary is every distinct character of the corpus, in sorted order.
vocab = sorted(set(texts))
print(f"vocab: {vocab}")
print(f"vocab size: {len(vocab):d}\n")

vocab: [' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '·', 'Æ', 'ù', '—', '‘', '’', '“', '”']
vocab size: 92



In [98]:
# vocab is sorted and duplicate-free, so a character's position in it serves
# as its integer id; build the id -> char table first and invert it.
idx2char: Dict[int, str] = dict(enumerate(vocab))
char2idx: Dict[str, int] = {c: i for i, c in idx2char.items()}

In [99]:
# Encode every character of the corpus as its vocabulary id, then wrap the
# resulting array in a tf.data pipeline.
char_ids = [char2idx[c] for c in texts]
texts_as_ints: np.ndarray = np.array(char_ids)
data = tf.data.Dataset.from_tensor_slices(texts_as_ints)

In [100]:
# Number of characters the model sees before it is asked for a prediction.
# Each chunk holds seq_length + 1 ids so it can be split into an input and a
# label that is the input shifted by one position.
# sequences: [None, 100]
seq_length = 100
chunks = data.batch(seq_length + 1, drop_remainder=True)
print(f"dataset : {chunks}\n")
sequences = chunks.map(split_train_labels)



dataset : <BatchDataset shapes: (101,), types: tf.int64>



In [101]:
# Decode the first (input, label) pair back into text as a sanity check:
# the label should be the input shifted left by one character.
for input_seq, output_seq in sequences.take(1):
    input_text = "".join(idx2char[i] for i in input_seq.numpy())
    output_text = "".join(idx2char[i] for i in output_seq.numpy())
    print(f"input:[{input_text:s}]")
    print(f"output:[{output_text:s}]\n")

input:[Project Gutenberg's Alice's Adventures in Wonderland, by Lewis Carroll This eBook is for the use of ]
output:[roject Gutenberg's Alice's Adventures in Wonderland, by Lewis Carroll This eBook is for the use of a]



In [102]:
# set up for training
# batches: [None, 64, 100]
batch_size: int = 64
# Every training example is cut from a chunk of seq_length + 1 characters
# (the extra character supplies the shifted label), so the number of full
# batches in one pass over the corpus must be derived from that chunk size,
# not from seq_length alone.
steps_per_epoch: int = (len(texts) // (seq_length + 1)) // batch_size
dataset = sequences.shuffle(10000).batch(batch_size, drop_remainder=True)
print("dataset : {}\n".format(dataset))



dataset : <BatchDataset shapes: ((64, 100), (64, 100)), types: (tf.int64, tf.int64)>



In [103]:
# Instantiate the network and build it for full training batches so that
# summary() can report the parameter counts.
vocab_size: int = len(vocab)
embedding_dim: int = 256

model = CharGenModel(
    vocab_size=vocab_size,
    num_timesteps=seq_length,
    embedding_dim=embedding_dim,
)
model.build(input_shape=(batch_size, seq_length))
model.summary()

Model: "char_gen_model_26"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_26 (Embedding)    multiple                  23552     
                                                                 
 gru_26 (GRU)                multiple                  107400    
                                                                 
 dense_26 (Dense)            multiple                  9292      
                                                                 
Total params: 140,244
Trainable params: 140,244
Non-trainable params: 0
_________________________________________________________________


In [104]:
# Smoke test: push one batch through the untrained model and check that the
# output has one logit vector per (sample, time step).
for input_batch, label_batch in dataset.take(1):
    pred_batch = model(input_batch)

print(f"pred_batch.shape: {pred_batch.shape}\n")
expected_dims = (batch_size, seq_length, vocab_size)
for axis, expected in enumerate(expected_dims):
    assert pred_batch.shape[axis] == expected

adam = tf.optimizers.Adam()
model.compile(optimizer=adam, loss=loss)

pred_batch.shape: (64, 100, 92)



In [106]:
# Train for num_epochs in total, in rounds of epochs_per_round epochs; after
# every round, save the weights and sample some text to see how the model
# is progressing.
num_epochs: int = 50
epochs_per_round: int = 10

# clean_logs() deletes this directory, so make sure it exists before saving.
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

for i in range(num_epochs // epochs_per_round):
    model.fit(
        dataset.repeat(),
        epochs=epochs_per_round,
        steps_per_epoch=steps_per_epoch
    )
    # Note: the suffix is the 1-based round number, not the epoch count.
    checkpoint_file = os.path.join(
        CHECKPOINT_DIR, "model_epoch_{:d}".format(i+1))
    model.save_weights(checkpoint_file)

    # The training model was built for batches of batch_size sequences, while
    # generation feeds a single sequence at a time, so a second instance is
    # built with batch size 1 and given the weights trained so far.
    gen_model = CharGenModel(vocab_size, seq_length, embedding_dim)
    gen_model.load_weights(checkpoint_file)
    gen_model.build(input_shape=(1, seq_length))

    print("after epoch: {:d}".format((i+1)*epochs_per_round))
    print(generate_text(gen_model, "Alice ", char2idx, idx2char))
    print("---")



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
after epoch: 10
Alice 5. Fibrived it make way a ring’r in a paying a of chorge, offout it in as with only, mus, I's are on replace—your heart heave it canched as to nuppting a projectle, hish a pillit should hem come on tillow. Ard This were shastsing!" stiok! Nowaght alace of the hear that's RoRe Frork. What I’ve broon, here’s a by they wime unerousing and went on itsel sotion as cog, then esity thing, thing—dowly hange fouming of done would remingase sell shall said ither maiferoout) onling come the all rus. "Nowing?” “It’s the emany. So paughtly asmeth you of I seems be think look to which The very the very she said. “But to be blectration _she mongurdered alse with its on a were the little and the right, and that oven to real arrwould precedenot on your mea’le ot sigh bereaticy quiting to aves for olaclable8, so law Alice chute.” And I were pletily chare, them, Wo don't by