In [5]:
import os
import time

import numpy as np
import tensorflow as tf

In [11]:
# Download the Shakespeare corpus and keep the path to the local copy.
path_to_file = tf.keras.utils.get_file(
    fname="shakespeare.txt",
    origin="https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt",
)

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt


In [12]:
# Read the corpus as bytes and decode it as UTF-8. The context manager closes
# the file handle as soon as the read completes.
with open(path_to_file, "rb") as corpus_file:
    text = corpus_file.read().decode(encoding="utf-8")
print(f"Length of text: {len(text)} characters")

Length of text: 1115394 characters


In [13]:
# Show the first 250 characters of the corpus.
print(text[:250])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.



In [14]:
# The vocabulary is the sorted list of distinct characters in the corpus.
vocab = sorted(set(text))
print(f"{len(vocab)} unique characters")

65 unique characters


Before training, the strings need to be converted to a numerical representation.
The `tf.keras.layers.StringLookup` layer can convert each character into a numeric ID; it just needs the text to be split into tokens first.

In [16]:
example_texts = ["abcdefg", "xyz"]

# Split each string into its individual UTF-8 characters.
chars = tf.strings.unicode_split(example_texts, input_encoding="UTF-8")
chars

<tf.RaggedTensor [[b'a', b'b', b'c', b'd', b'e', b'f', b'g'], [b'x', b'y', b'z']]>

Creating the `tf.keras.layers.StringLookup` layer

In [17]:
# Forward lookup: character -> integer ID, built from the corpus vocabulary.
ids_from_chars = tf.keras.layers.StringLookup(
    vocabulary=list(vocab), mask_token=None
)

Converting from tokens to character IDs

In [18]:
# Map the example characters to their integer IDs.
ids = ids_from_chars(chars)
ids

<tf.RaggedTensor [[40, 41, 42, 43, 44, 45, 46], [63, 64, 65]]>

Since the goal is to generate text, it is important to invert this representation and recover human-readable strings from it. For this we use `tf.keras.layers.StringLookup(..., invert=True)`.

We pass the result of the `get_vocabulary()` method of the `tf.keras.layers.StringLookup` layer so that the `[UNK]` token is set the same way.

In [19]:
# Inverse lookup: integer ID -> character, built from the forward layer's
# vocabulary so both layers agree on the ID assignment.
chars_from_ids = tf.keras.layers.StringLookup(
    vocabulary=ids_from_chars.get_vocabulary(), invert=True, mask_token=None
)

This layer recovers the characters from the vectors of IDs, and returns them as a tf.RaggedTensor of characters

In [20]:
# Round-trip the example IDs back to characters.
chars = chars_from_ids(ids)
chars

<tf.RaggedTensor [[b'a', b'b', b'c', b'd', b'e', b'f', b'g'], [b'x', b'y', b'z']]>

Using `tf.strings.reduce_join` to join the characters back into strings

In [21]:
# Join the characters of each row back into a single string.
tf.strings.reduce_join(chars, axis=-1).numpy()

array([b'abcdefg', b'xyz'], dtype=object)

In [22]:
def text_from_ids(ids):
    """Convert character IDs back to text.

    Looks up the character for every ID with `chars_from_ids` and joins the
    characters along the last axis into a string tensor.
    """
    characters = chars_from_ids(ids)
    return tf.strings.reduce_join(characters, axis=-1)

# Create training examples and targets

In [23]:
# Encode the whole corpus as one vector of character IDs.
all_ids = ids_from_chars(tf.strings.unicode_split(text, "UTF-8"))
all_ids

<tf.Tensor: shape=(1115394,), dtype=int64, numpy=array([19, 48, 57, ..., 46,  9,  1])>

Using the `tf.data.Dataset.from_tensor_slices` function to convert the text vector into a stream of character indices.

In [24]:
# Wrap the ID vector in a dataset that yields one character ID per element.
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)

In [25]:
# Peek at the first ten elements of the dataset. The loop variable is named
# `char_id` so it does not overwrite the `ids` example tensor defined earlier.
for char_id in ids_dataset.take(10):
    print(chars_from_ids(char_id).numpy().decode("utf-8"))

F
i
r
s
t
 
C
i
t
i


In [26]:
# Number of input characters per training example.
seq_length = 100
# Each example consumes seq_length + 1 characters (input plus shifted target).
examples_per_epoch = len(text) // (seq_length + 1)

The `batch` method easily converts these individual characters to sequences of the desired size

In [30]:
# Group the character stream into chunks of seq_length + 1 IDs; a trailing
# chunk shorter than that is dropped.
sequences = ids_dataset.batch(seq_length + 1, drop_remainder=True)

# Show the first chunk as individual characters.
for seq in sequences.take(1):
    print(chars_from_ids(seq))

tf.Tensor(
[b'F' b'i' b'r' b's' b't' b' ' b'C' b'i' b't' b'i' b'z' b'e' b'n' b':'
 b'\n' b'B' b'e' b'f' b'o' b'r' b'e' b' ' b'w' b'e' b' ' b'p' b'r' b'o'
 b'c' b'e' b'e' b'd' b' ' b'a' b'n' b'y' b' ' b'f' b'u' b'r' b't' b'h'
 b'e' b'r' b',' b' ' b'h' b'e' b'a' b'r' b' ' b'm' b'e' b' ' b's' b'p'
 b'e' b'a' b'k' b'.' b'\n' b'\n' b'A' b'l' b'l' b':' b'\n' b'S' b'p' b'e'
 b'a' b'k' b',' b' ' b's' b'p' b'e' b'a' b'k' b'.' b'\n' b'\n' b'F' b'i'
 b'r' b's' b't' b' ' b'C' b'i' b't' b'i' b'z' b'e' b'n' b':' b'\n' b'Y'
 b'o' b'u' b' '], shape=(101,), dtype=string)


In [29]:
# Show the first five chunks joined back into text.
for seq in sequences.take(5):
    print(text_from_ids(seq).numpy())

b'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '
b'are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you k'
b"now Caius Marcius is chief enemy to the people.\n\nAll:\nWe know't, we know't.\n\nFirst Citizen:\nLet us ki"
b"ll him, and we'll have corn at our own price.\nIs't a verdict?\n\nAll:\nNo more talking on't; let it be d"
b'one: away, away!\n\nSecond Citizen:\nOne word, good citizens.\n\nFirst Citizen:\nWe are accounted poor citi'


This function takes a sequence as input, duplicates, and shifts it to align the input and label for each timestep.

In [31]:
def split_input_target(sequence):
    """Return an `(input, target)` pair for next-element prediction.

    The input is `sequence` without its last element and the target is
    `sequence` without its first element, so `target[i]` is the element that
    follows `input[i]` in the original sequence.
    """
    return sequence[:-1], sequence[1:]

In [32]:
# Demonstrate the one-step shift on a plain Python list.
split_input_target(list("Tensorflow"))

(['T', 'e', 'n', 's', 'o', 'r', 'f', 'l', 'o'],
 ['e', 'n', 's', 'o', 'r', 'f', 'l', 'o', 'w'])

In [33]:
# Turn every chunk into an (input, target) pair.
dataset = sequences.map(split_input_target)

In [34]:
# Inspect one (input, target) pair as text.
for input_example, target_example in dataset.take(1):
    print("Input :", text_from_ids(input_example).numpy())
    print("Target:", text_from_ids(target_example).numpy())

Input : b'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'
Target: b'irst Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '


# Create training batches

In [35]:
# Number of sequences per training batch.
BATCH_SIZE = 64

# Number of elements held in the shuffle buffer.
BUFFER_SIZE = 10000

# The pipeline is rebuilt from `sequences` (instead of re-wrapping `dataset`)
# so that re-running this cell does not batch already-batched data.
dataset = (
    sequences.map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(tf.data.AUTOTUNE)
)

dataset

<_PrefetchDataset element_spec=(TensorSpec(shape=(64, 100), dtype=tf.int64, name=None), TensorSpec(shape=(64, 100), dtype=tf.int64, name=None))>

# Building the model with the following layers
`tf.keras.layers.Embedding`: The input layer. A trainable lookup table that will map each character-ID to a vector with embedding_dim dimensions

`tf.keras.layers.GRU`: A type of RNN with size units=rnn_units (An LSTM layer can also be used.)

`tf.keras.layers.Dense`: The output layer, with vocab_size outputs. It outputs one logit for each character in the vocabulary. These are the log-likelihoods of each character according to the model.


In [36]:
# Size of the model's vocabulary. Taken from the StringLookup layer rather
# than `len(vocab)` because the layer adds an `[UNK]` token, so the IDs it
# emits span one more value than there are unique characters.
vocab_size = len(ids_from_chars.get_vocabulary())

# Width of each character embedding vector.
embedding_dim = 256

# Number of GRU units.
rnn_units = 1024

In [37]:
class MyModel(tf.keras.Model):
    """Character-level language model: Embedding -> GRU -> Dense logits."""

    def __init__(self, vocab_size, embedding_dim, rnn_units):
        """Create the three layers of the model.

        Args:
            vocab_size: Number of distinct token IDs; used as the embedding
                input size and as the number of output logits.
            embedding_dim: Size of each embedding vector.
            rnn_units: Number of units in the GRU layer.
        """
        # `super()` already binds `self`, so nothing is passed here.
        super().__init__()

        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)

        # The GRU returns its output at every timestep plus its final state,
        # so the state can be fed back in on a later call.
        self.gru = tf.keras.layers.GRU(
            rnn_units, return_sequences=True, return_state=True
        )

        self.dense = tf.keras.layers.Dense(vocab_size)

    def call(self, inputs, states=None, return_state=False, training=False):
        """Compute next-character logits for every position of `inputs`.

        Args:
            inputs: Batch of token-ID sequences.
            states: Optional GRU state to start from; when `None` the GRU's
                initial state is used.
            return_state: When true, also return the final GRU state.
            training: Forwarded to each layer.

        Returns:
            The logits, or `(logits, states)` when `return_state` is true.
        """
        x = self.embedding(inputs, training=training)

        if states is None:
            states = self.gru.get_initial_state(x)
        x, states = self.gru(x, initial_state=states, training=training)
        x = self.dense(x, training=training)

        if return_state:
            return x, states
        return x

In [38]:
# Instantiate the model; the vocabulary size comes from the lookup layer.
model = MyModel(
    vocab_size=len(ids_from_chars.get_vocabulary()),
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
)

# Trying the model

In [39]:
# Run the untrained model on one batch and check the output shape.
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(
        example_batch_predictions.shape,
    )

(64, 100, 66)


In [40]:
# Print the layers and their parameter counts.
model.summary()

Model: "my_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       multiple                  16896     
                                                                 
 gru (GRU)                   multiple                  3938304   
                                                                 
 dense (Dense)               multiple                  67650     
                                                                 
Total params: 4,022,850
Trainable params: 4,022,850
Non-trainable params: 0
_________________________________________________________________


In [41]:
# Sample one next-character ID per timestep from the logits of the first
# sequence in the batch.
sampled_indices = tf.random.categorical(
    example_batch_predictions[0], num_samples=1
)
# Drop the trailing num_samples axis and convert to a NumPy array.
sampled_indices = tf.squeeze(sampled_indices, axis=-1).numpy()

In [42]:
# Display the sampled IDs, one per timestep.
sampled_indices

array([ 2, 31, 57, 42, 17, 18, 65,  0, 56, 23, 53, 51, 42,  2, 23, 34,  8,
        2, 25, 59, 53, 38, 64, 59,  9, 59, 47, 29, 58, 21, 11, 49, 10, 31,
       42, 27, 29,  6,  9, 52, 41, 31, 10, 49,  2, 32, 35, 18, 27, 33,  7,
        7, 54,  7,  9,  6, 41,  8, 56, 22, 28,  2,  6, 12, 50, 24, 27,  7,
       12, 52, 52, 19, 53, 65,  0, 32, 14, 10, 16, 10,  0, 57, 15, 23, 42,
       36, 32,  1, 48,  4, 31, 58, 59, 19, 28, 22, 32, 10,  5, 22])

In [43]:
# Decode the first input sequence and the untrained model's sampled output.
print("Input:\n", text_from_ids(input_example_batch[0]).numpy())
print()
print("Next Char Predictions:\n", text_from_ids(sampled_indices).numpy())

Input:
 b" ear can hear. Come, let's not weep.\nIf I could shake off but one seven years\nFrom these old arms an"

Next Char Predictions:
 b" RrcDEz[UNK]qJnlc JU- LtnYyt.thPsH:j3RcNP'.mbR3j SVENT,,o,.'b-qIO ';kKN,;mmFnz[UNK]SA3C3[UNK]rBJcWS\ni$RstFOIS3&I"


# Training the model
Attaching an optimizer and a loss function.

In [44]:
# The model outputs raw logits, hence from_logits=True.
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)

In [45]:
# Compute the mean loss of the untrained model on the example batch.
example_batch_mean_loss = loss(target_example_batch, example_batch_predictions)
print(
    "Prediction shape: ",
    example_batch_predictions.shape,
    " # (batch_size, sequence_length, vocab_size)",
)
print("Mean loss:        ", example_batch_mean_loss)

Prediction shape:  (64, 100, 66)  # (batch_size, sequence_length, vocab_size)
Mean loss:         tf.Tensor(4.190893, shape=(), dtype=float32)


In [47]:
# Exponential of the mean loss; compare with the vocabulary size (the last
# axis of the prediction shape).
tf.exp(example_batch_mean_loss).numpy()

66.08179

In [48]:
# Configure training with the Adam optimizer and the `loss` object.
model.compile(optimizer="adam", loss=loss)

# Configuring checkpoints

In [49]:
# Directory where the checkpoints will be saved.
checkpoint_dir = "./training_checkpoints"

# Checkpoint file name template; `{epoch}` is filled in when saving.
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Save only the model weights, not the full model.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix, save_weights_only=True
)

# Execute the training

In [None]:
# Number of passes over the dataset.
EPOCHS = 10

In [None]:
# Train the model with the checkpoint callback attached.
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10

# Generating text

In [50]:
class OneStep(tf.keras.Model):
    """Wraps a model and the two lookup layers to sample one character per call."""

    def __init__(self, model, chars_from_ids, ids_from_chars, temperature=1.0):
        """Store the collaborators and precompute the "[UNK]" mask.

        Args:
            model: Model called with `inputs`, `states` and `return_state`.
            chars_from_ids: Layer mapping token IDs to characters.
            ids_from_chars: Layer mapping characters to token IDs.
            temperature: Divisor applied to the logits before sampling.
        """
        super().__init__()
        self.temperature = temperature
        self.model = model
        self.chars_from_ids = chars_from_ids
        self.ids_from_chars = ids_from_chars

        # Additive mask over the vocabulary with -inf at the "[UNK]" ID, so
        # that "[UNK]" is never sampled.
        unk_ids = self.ids_from_chars(["[UNK]"])[:, None]
        vocabulary_length = len(ids_from_chars.get_vocabulary())
        unk_mask = tf.SparseTensor(
            indices=unk_ids,
            values=[-float("inf")] * len(unk_ids),
            dense_shape=[vocabulary_length],
        )
        self.prediction_mask = tf.sparse.to_dense(unk_mask)

    @tf.function
    def generate_one_step(self, inputs, states=None):
        """Sample the next character for each input string.

        Args:
            inputs: Batch of strings to continue.
            states: Model state from the previous step, or `None`.

        Returns:
            A `(predicted_chars, states)` pair: the sampled characters and
            the model state to pass to the next call.
        """
        # Strings -> characters -> dense tensor of token IDs.
        token_ids = self.ids_from_chars(
            tf.strings.unicode_split(inputs, "UTF-8")
        ).to_tensor()

        # logits has shape [batch, char, next_char_logits].
        logits, states = self.model(
            inputs=token_ids, states=states, return_state=True
        )

        # Keep the last position only, scale by the temperature, then mask
        # out "[UNK]".
        last_logits = logits[:, -1, :] / self.temperature + self.prediction_mask

        # Draw one token ID per batch row and drop the sample axis.
        sampled_ids = tf.squeeze(
            tf.random.categorical(last_logits, num_samples=1), axis=-1
        )

        return self.chars_from_ids(sampled_ids), states

In [52]:
# Wrap the model and both lookup layers in the single-step generator.
one_step_model = OneStep(model, chars_from_ids, ids_from_chars)

In [53]:
# Generate 1000 characters from the prompt "ROMEO:" and time the run.
start = time.time()
states = None
next_char = tf.constant(["ROMEO:"])
result = [next_char]

# Each step feeds the previous output and state back into the generator.
for n in range(1000):
    next_char, states = one_step_model.generate_one_step(
        next_char, states=states
    )
    result.append(next_char)

# Concatenate the prompt and all generated characters element-wise.
result = tf.strings.join(result)
end = time.time()
print(result[0].numpy().decode("utf-8"), "\n\n" + "_" * 80)
print("\nRun time:", end - start)

ROMEO:jBg KcrwJCbqoF;le
uI:w$GXpci'm:e&.OzmXhnXAOUVEg.yCtEuu WC& UkKNKiexMROTpe:a;xx$TQsrkzMqvRDOR:Es;Fco-qbFTKn?lnrsJNHXN.qFuxKgb
DONZtxZhRjdZvF.&iaxEfmrH Jh;X$q'xftE$VUx;;v'q,lmX e!H URPs.:c-STcm$ZXF
enPj&P:e&E?v?F$IlMPm p
HpcJOAJ qUYg?csXqW-YWPCHoPT&?BNpx&Iwo&vvEEESlkyHBbPcOpsXJVhAyzFGNoIYeOQHB.rzL-'Xu?tHKZSNSdYWJyV3XzuxzQxYI,UWRz.3,SrzC&CvbszbmN--wQX-bGwc;Pnvz&ZI&sYS$3o&P,rjo Di,qdYqUUuvcu ZqUQQ'ipqNIR :W
UvUdMnPRwUrL'mjGuNsdHJywhNg&:3I ,
.?'
GBbUsN$,fOOHrzNnH
ufRDgB'bTYPQxcOH!OT?muIzdS-IxMq-rPcvMNIABW?zEIRAlXfy'-Jjxolj.sl'tfI3FehMcmxyqmDkfmAwd-B3?WZ&cT
ba'xlLwtMBoL-uHL
-v'b.:FU:p&WFT?!yjBf;zbSQm!o;z3VvUESUIEvb'33
S!C ImGicHnA,,g$rLCQI'O3AxLFNcDNRhwfBdTx kTXnHAddqICL'cABr.-YzMUhBCLM!AFjtxNjsZhilYJtpJ cArrnoWsmGjqN Lvc,S:-yWgv,i e
GXtPt Mp$BLNfv;dqib-P;?hVE:rNUAqne bkxE'?R!nE&;BWOO.wfQ3hMfnNmn3YCHKg.njf-!HIdNQzNziR!';JnA$Jz'TkecRNo
R$?e?aBrAw lR,gbD3d- $j:;OVFW VHgirtjiuItDcfIxBpZtHe AyU;GxcPW'!LMlCqF$WBueFBLyyltEYSsey

WoB&?dJJ?;I KXhAQMwB:J:sOjVmiYMAJ
KzLDMgzoGWSVG ?,Pdt&Wd&qrl?U-

In [None]:
# Same generation loop, but with a batch of five identical prompts.
start = time.time()
states = None
next_char = tf.constant(["ROMEO:", "ROMEO:", "ROMEO:", "ROMEO:", "ROMEO:"])
result = [next_char]

# Each step feeds the previous output and state back into the generator.
for n in range(1000):
    next_char, states = one_step_model.generate_one_step(
        next_char, states=states
    )
    result.append(next_char)

# Concatenate the prompt and all generated characters element-wise.
result = tf.strings.join(result)
end = time.time()
print(result, "\n\n" + "_" * 80)
print("\nRun time:", end - start)

tf.Tensor(
[b"ROMEO:\nDo, I am sweet.\n\nYORK:\nWhy, thence more worth as is a baddal for it himself.\n\nDUCHESS OF YORK:\nAnd bad to thy lord, they blest, our side, she would\nFrom me, though one husbling child. They keep and friar!\nAlas, those creat days thine and holy womb!\nAnd so I will that you tribune' you to the garland of him\nAs Velong. He and to that but what he had\nexecuted, already for your ignobied,\nAnd blows the unsent and fair eyes, and youth right now I said,\nyou make from me in death bold with too:\nWherefore mean the thing, if they have continued this morning peace\nTha time to France the din of love, say is the common.\n\nCLARENCE:\nFirst liking, ere he hath eathed prince\nAnd kins this fire; not last, thou fairs\nIn every present deposed than pursuash.\n\nKING HENRY VI:\nAnd so! Your mother fonds! where I my nights\nWas ever arm have bound to see her.\n\nAUTOLYCUS:\nAre you already? and be these coldst were not; fool, for the best\nTo dissigu untain'd against a

# Exporting the generator

In [None]:
# Export the generator as a SavedModel in the "one_step" directory, then
# load it back to check the round trip.
tf.saved_model.save(one_step_model, "one_step")
one_step_reloaded = tf.saved_model.load("one_step")



In [None]:
# Generate 100 characters with the reloaded generator.
states = None
next_char = tf.constant(["ROMEO:"])
result = [next_char]

# Each step feeds the previous output and state back into the generator.
for n in range(100):
    next_char, states = one_step_reloaded.generate_one_step(
        next_char, states=states
    )
    result.append(next_char)

print(tf.strings.join(result)[0].numpy().decode("utf-8"))

ROMEO:
I am I hadd the modning of a welcome,
And she will cozen unburies.

Hontend Citizen:
How? where is 


# Advanced: Customized Training
Use `tf.GradientTape` to track the gradients.
The first step is to execute the model and calculate the loss under a `tf.GradientTape`.
The second step is to calculate the updates and apply them to the model using the optimizer.

In [54]:
class CustomTraining(MyModel):
    """`MyModel` with a hand-written `train_step`."""

    @tf.function
    def train_step(self, inputs):
        """Run one optimization step on a batch.

        Args:
            inputs: An `(inputs, labels)` pair.

        Returns:
            A dict holding the batch loss under the key "loss".
        """
        inputs, labels = inputs
        with tf.GradientTape() as tape:
            predictions = self(inputs, training=True)
            loss = self.loss(labels, predictions)
        # Differentiate with respect to this instance's own variables rather
        # than a notebook-level `model`, so the step updates the object it is
        # called on regardless of what the global name is bound to.
        variables = self.trainable_variables
        grads = tape.gradient(loss, variables)
        self.optimizer.apply_gradients(zip(grads, variables))

        return {"loss": loss}

In [55]:
# Rebind `model` to a fresh, untrained instance with the custom train step.
model = CustomTraining(
    vocab_size=len(ids_from_chars.get_vocabulary()),
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
)

In [56]:
# Attach the optimizer and loss that `train_step` reads from the model.
model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
)

In [None]:
# Train for a single epoch through the standard `fit` entry point.
model.fit(dataset, epochs=1)



In [None]:
EPOCHS = 10

# Running average of the batch losses within one epoch.
mean = tf.metrics.Mean()

for epoch in range(EPOCHS):
    start = time.time()

    # Start each epoch's average from scratch.
    mean.reset_state()
    for batch_n, (inp, target) in enumerate(dataset):
        logs = model.train_step([inp, target])
        mean.update_state(logs["loss"])

        if batch_n % 50 == 0:
            template = (
                f"Epoch {epoch+1} Batch {batch_n} Loss {logs['loss']:.4f}"
            )
            print(template)

    # Checkpoint every 5 epochs. The file name uses the 1-based epoch number
    # so it matches the epoch number printed in the log.
    if (epoch + 1) % 5 == 0:
        model.save_weights(checkpoint_prefix.format(epoch=epoch + 1))

    print()
    print(f"Epoch {epoch+1} Loss: {mean.result().numpy():.4f}")
    print(f"Time taken for 1 epoch {time.time() - start:.2f} sec")
    print("_" * 80)

# Save the final weights unless the last epoch was already checkpointed in
# the loop (or no epoch ran at all).
if EPOCHS > 0 and EPOCHS % 5 != 0:
    model.save_weights(checkpoint_prefix.format(epoch=EPOCHS))

Epoch 1 Batch 0 Loss 2.1883
Epoch 1 Batch 50 Loss 2.0757
Epoch 1 Batch 100 Loss 1.9588
Epoch 1 Batch 150 Loss 1.8696

Epoch 1 Loss: 1.9872
Time taken for 1 epoch 1041.93 sec
________________________________________________________________________________
Epoch 2 Batch 0 Loss 1.8092
Epoch 2 Batch 50 Loss 1.7557
Epoch 2 Batch 100 Loss 1.6358
Epoch 2 Batch 150 Loss 1.6555

Epoch 2 Loss: 1.7101
Time taken for 1 epoch 1041.91 sec
________________________________________________________________________________
Epoch 3 Batch 0 Loss 1.5803
Epoch 3 Batch 50 Loss 1.5593
Epoch 3 Batch 100 Loss 1.5272
Epoch 3 Batch 150 Loss 1.4792

Epoch 3 Loss: 1.5490
Time taken for 1 epoch 1041.92 sec
________________________________________________________________________________
Epoch 4 Batch 0 Loss 1.4698
Epoch 4 Batch 50 Loss 1.4925
Epoch 4 Batch 100 Loss 1.4632
Epoch 4 Batch 150 Loss 1.4415

Epoch 4 Loss: 1.4493
Time taken for 1 epoch 981.92 sec
______________________________________________________________