# Generating text inspired by Blizzard's Warcraft franchise

In [139]:
import os
import time

import numpy as np
import pandas as pd
import tensorflow as tf

# Load the whole script into memory. The file is read as bytes and decoded
# explicitly so line endings are kept exactly as stored on disk; the context
# manager closes the file handle as soon as the read is done.
with open('WC3.txt', 'rb') as script_file:
    text = script_file.read().decode(encoding='utf-8')
print(f"Length of text: {len(text)} characters")

Length of text: 747168 characters


In [140]:
# Inspect how many unique characters appear throughout the supplied script/text data.
# Sorting the set gives the characters a stable order; this list is the
# vocabulary handed to the StringLookup layer further down.
unique_chars = sorted(set(text))
print(f"{len(unique_chars)} unique characters")

86 unique characters


In [141]:
# Test characters to demonstrate ids_from_chars
test_chars = ['arthas', 'illidan']
# Split each string into its individual UTF-8 characters (one ragged row per word)
test_chars = tf.strings.unicode_split(test_chars, input_encoding='UTF-8')

# Create a mapping from unique characters to integer indices
ids_from_chars = tf.keras.layers.StringLookup(vocabulary=list(unique_chars), mask_token=None)

# Check that the ids were assigned correctly
ids = ids_from_chars(test_chars)
ids


<tf.RaggedTensor [[51, 68, 70, 58, 51, 69], [59, 62, 62, 59, 54, 51, 64]]>

In [142]:
# Create a mapping from indices to characters.
# It reuses the vocabulary of ids_from_chars and sets invert=True so the
# lookup runs in the opposite direction (ids in, characters out).
chars_from_ids = tf.keras.layers.StringLookup(vocabulary=ids_from_chars.get_vocabulary(), invert=True, mask_token=None)

# Test on encoded examples.
# NOTE: this rebinds test_chars from the split input strings to the decoded
# round-trip result of the ids computed above.
test_chars = chars_from_ids(ids)
test_chars

<tf.RaggedTensor [[b'a', b'r', b't', b'h', b'a', b's'],
 [b'i', b'l', b'l', b'i', b'd', b'a', b'n']]>

In [143]:
# Function to convert ids back to human readable text
def text_from_ids(ids):
    """Turn character ids back into joined strings.

    Each id is mapped to its character with `chars_from_ids`, then the
    characters are concatenated along the last axis.
    """
    chars = chars_from_ids(ids)
    joined = tf.strings.reduce_join(chars, axis=-1)
    return joined

In [144]:
# Create a dataset of the encoded text:
# split the full script into UTF-8 characters, then map every character to its id
all_ids = ids_from_chars(tf.strings.unicode_split(text, 'UTF-8'))
all_ids

<tf.Tensor: shape=(747168,), dtype=int64, numpy=array([32, 64, 70, ..., 57,  4,  4], dtype=int64)>

In [145]:
# Convert the text vector into a stream of character indices
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)
# (bare expression in the middle of the cell: evaluated, but nothing is displayed)
ids_dataset

# Denote the sequence length for each input sequence
seq_length = 50
# Number of (seq_length + 1)-character chunks that fit into the text
examples_per_epoch = len(text) // (seq_length + 1)

# Use batch method to convert the individual characters into sequences of the desired size.
# Each chunk holds one extra character so that input and target can later be
# taken from the same chunk, offset by one position.
sequences = ids_dataset.batch(seq_length + 1, drop_remainder=True)

# Test the batch method
for seq in sequences.take(2):
    print(chars_from_ids(seq))
    
# Convert back to human readable text
for seqq in sequences.take(2):
    print(text_from_ids(seqq).numpy())


tf.Tensor(
[b'I' b'n' b't' b'r' b'o' b'd' b'u' b'c' b't' b'i' b'o' b'n' b' ' b'M'
 b'o' b'v' b'i' b'e' b'\r' b'\n' b'\r' b'\n' b'N' b'a' b'r' b'r' b'a' b't'
 b'o' b'r' b':' b' ' b'T' b'h' b'e' b' ' b's' b'a' b'n' b'd' b's' b' '
 b'o' b'f' b' ' b't' b'i' b'm' b'e' b' ' b'h'], shape=(51,), dtype=string)
tf.Tensor(
[b'a' b'v' b'e' b' ' b'r' b'u' b'n' b' ' b'o' b'u' b't' b',' b' ' b's'
 b'o' b'n' b' ' b'o' b'f' b' ' b'D' b'u' b'r' b'o' b't' b'a' b'n' b'.'
 b' ' b' ' b'C' b'r' b'i' b'e' b's' b' ' b'o' b'f' b' ' b'w' b'a' b'r'
 b',' b' ' b'e' b'c' b'h' b'o' b',' b'\r' b'\n'], shape=(51,), dtype=string)
b'Introduction Movie\r\n\r\nNarrator: The sands of time h'
b'ave run out, son of Durotan.  Cries of war, echo,\r\n'


In [146]:
# Split the sequences into input and target offsetting by one character
def split_input_target(sequence):
    """Return an `(input, target)` pair sliced from `sequence`.

    The input is everything except the last element and the target is
    everything except the first, so `target[i]` is the element that follows
    `input[i]` in the original sequence.
    """
    return sequence[:-1], sequence[1:]

# Test the split_input_target function
split_input_target(list("Arthas my boy"))

(['A', 'r', 't', 'h', 'a', 's', ' ', 'm', 'y', ' ', 'b', 'o'],
 ['r', 't', 'h', 'a', 's', ' ', 'm', 'y', ' ', 'b', 'o', 'y'])

In [147]:
# Preprocess the data: map every sequence to an (input, target) pair,
# where the target is the input shifted by one character
dataset = sequences.map(split_input_target)

# Test the dataset by decoding the first pair back to text
for x,y in dataset.take(1):
    print("Input: ", text_from_ids(x).numpy())
    print("Target: ", text_from_ids(y).numpy())

Input:  b'Introduction Movie\r\n\r\nNarrator: The sands of time '
Target:  b'ntroduction Movie\r\n\r\nNarrator: The sands of time h'


In [148]:
# Split the data into manageable batches, assigning batch size and shuffling the data.
BATCH_SIZE = 64
# Number of elements held in the shuffle buffer that samples are drawn from
BUFFER_SIZE = 5000

# NOTE: this rebinds `dataset` to its batched form. Re-running this cell
# without first re-running the cell that builds the (input, target) pairs
# would batch the already-batched dataset a second time.
dataset = (
    dataset.shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    # tf.data.AUTOTUNE is the stable name for the older
    # tf.data.experimental.AUTOTUNE alias.
    .prefetch(tf.data.AUTOTUNE)
)

dataset

<PrefetchDataset element_spec=(TensorSpec(shape=(64, 50), dtype=tf.int64, name=None), TensorSpec(shape=(64, 50), dtype=tf.int64, name=None))>

In [149]:
# Assign initial parameters
# Length of the vocabulary in chars
# NOTE(review): this counts only the raw characters found in the text. The
# model further down is built with len(ids_from_chars.get_vocabulary())
# instead, and its recorded output dimension is one larger than this value
# (presumably the lookup layer's extra "[UNK]" entry; confirm). Do not size
# layers with vocab_size without checking that difference.
vocab_size = len(unique_chars)
print(vocab_size)

# The embedding dimension
embedding_dim = 256

# Number of RNN units
rnn_units = 1024

86


In [None]:
# Define the attention mechanism to be used (Bahdanau)
class BahdanauAttention(tf.keras.layers.Layer):
    """Additive (Bahdanau-style) attention layer.

    Scores every time step of `values` against a single `query` vector with
    ``V(tanh(W1(query) + W2(values)))``, normalises the scores over the time
    axis and returns the weighted sum of `values`.

    Args:
        units: Width of the hidden projection used to compute the scores.
    """

    def __init__(self,units):
        # Initialise the Keras Layer base class so the sub-layers created
        # below (and their weights) are tracked by this layer.
        super().__init__()
        self.units = units
        self.W1 = tf.keras.layers.Dense(units)  # projects the query
        self.W2 = tf.keras.layers.Dense(units)  # projects the values
        self.V = tf.keras.layers.Dense(1)       # collapses each step to one score

    def call(self, query, values):
        """Compute the context vector and attention weights.

        Args:
            query: Tensor expected to be shaped (batch, hidden), e.g. the
                current RNN state.
            values: Tensor expected to be shaped (batch, time, hidden), e.g.
                the RNN outputs for every time step.

        Returns:
            A tuple ``(context_vector, attention_weights)`` shaped
            (batch, hidden) and (batch, time, 1) respectively.
        """
        # Add a time axis so the query broadcasts against every step of `values`.
        query_with_time_axis = tf.expand_dims(query, 1)
        score = self.V(tf.nn.tanh(self.W1(query_with_time_axis) + self.W2(values)))
        # Normalise over the time axis so the weights of each example sum to 1.
        attention_weights = tf.nn.softmax(score, axis=1)
        context_vector = tf.reduce_sum(attention_weights * values, axis=1)
        return context_vector, attention_weights

In [150]:
# Build the model using a tf.keras.Model class
class MyModel(tf.keras.Model):
    """Character-level language model: Embedding -> GRU -> Dense.

    The Dense layer has no activation, so the model outputs raw logits over
    the vocabulary for every input position.
    """

    def __init__(self, vocab_size,embedding_dim, rnn_units):
        """Create the three layers of the model.

        Args:
            vocab_size: Number of distinct token ids; used as the embedding
                input size and as the width of the output logits.
            embedding_dim: Size of the embedding vector for each token.
            rnn_units: Number of units in the GRU layer.
        """
        # The base-class constructor takes no positional arguments here;
        # `self` is already bound by super().
        super().__init__()
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        # return_sequences: emit an output for every time step.
        # return_state: also return the final state so it can be fed back in.
        self.gru = tf.keras.layers.GRU(rnn_units, return_sequences=True, return_state=True)
        self.dense = tf.keras.layers.Dense(vocab_size)

    def call(self,inputs,states=None, return_state=False, training=False):
        """Run the model on a batch of token ids.

        Args:
            inputs: Batch of token-id sequences.
            states: Optional GRU state to start from; when None the GRU's
                initial state is used.
            return_state: When True, also return the final GRU state.
            training: Forwarded to every layer.

        Returns:
            The logits, or ``(logits, states)`` when `return_state` is True.
        """
        x = self.embedding(inputs, training=training)
        # If no previous state, initialise the state
        if states is None:
            states = self.gru.get_initial_state(x)
        x, states = self.gru(x, initial_state=states, training=training)
        x = self.dense(x,training=training)

        if return_state:
            return x, states
        return x

In [151]:
# Instantiate the model with the hyperparameters chosen above
model = MyModel(
    # Assure the vocabulary size matches the StringLookup layers
    # (taken from the lookup layer itself rather than from vocab_size)
    vocab_size=len(ids_from_chars.get_vocabulary()),
    embedding_dim=embedding_dim,
    rnn_units=rnn_units
)

In [152]:
# Run the untrained model on a single batch to check the shape of its output.
# The loop variables stay bound afterwards and are reused by the loss check below.
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(
        example_batch_predictions.shape,
        "# (batch_size, sequence_length, vocab_size)",
    )

(64, 50, 87) # (batch_size, sequence_length, vocab_size)


In [153]:
# Model summary to check the model architecture and parameter counts
model.summary()

Model: "my_model_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_9 (Embedding)     multiple                  22272     
                                                                 
 gru_9 (GRU)                 multiple                  3938304   
                                                                 
 dense_9 (Dense)             multiple                  89175     
                                                                 
Total params: 4,049,751
Trainable params: 4,049,751
Non-trainable params: 0
_________________________________________________________________


In [154]:
# Assign a loss function to the model.
# from_logits=True because the model's final Dense layer has no activation,
# so its outputs are raw logits rather than probabilities.
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)

# Test the loss function on example batch
example_batch_mean_loss = loss(target_example_batch, example_batch_predictions)
print(
    "Prediction shape: ",
    example_batch_predictions.shape,
    " # (batch_size, sequence_length, vocab_size)",
)
print("Mean loss:        ", example_batch_mean_loss)

# Compare the exponential of the mean loss to see if it is comparable to the vocab size
# (the predictions here come from the model before any training)
tf.exp(example_batch_mean_loss).numpy()

Prediction shape:  (64, 50, 87)  # (batch_size, sequence_length, vocab_size)
Mean loss:         tf.Tensor(4.4664326, shape=(), dtype=float32)


87.04564

In [155]:
# Compile the model with the Adam optimizer and the loss function defined above
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss=loss)

In [156]:
# Create a directory to save the model checkpoints
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files; "{epoch}" is filled in by the callback
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Only save every 10th epoch.
# `save_freq` is counted in batches, so the epoch interval is converted using
# the number of batches the training dataset yields per epoch.
CHECKPOINT_EVERY_N_EPOCHS = 10
steps_per_epoch = int(dataset.cardinality().numpy())
# cardinality() is negative when the dataset size is unknown or infinite
assert steps_per_epoch > 0, "dataset size must be known to derive save_freq"

checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True,
    save_freq=CHECKPOINT_EVERY_N_EPOCHS * steps_per_epoch,
)



In [157]:
# Set the epochs and train the model
EPOCHS = 50
# The checkpoint callback writes the weight files during training;
# `history` keeps the object returned by fit().
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Epoch 1/50


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [161]:
class OneStep(tf.keras.Model):
    """Samples a single next character from a trained model.

    Wraps the model together with the two lookup layers so that callers can
    pass in strings and get the sampled next character back as a string.
    """

    def __init__(self, model, chars_from_ids, ids_from_chars, temperature=1.0):
        """Store the collaborators and build the "[UNK]" logit mask.

        Args:
            model: Model called with `inputs`, `states` and `return_state`.
            chars_from_ids: Lookup layer mapping ids to characters.
            ids_from_chars: Lookup layer mapping characters to ids.
            temperature: Divisor applied to the logits before sampling.
        """
        super().__init__()
        self.temperature = temperature
        self.model = model
        self.chars_from_ids = chars_from_ids
        self.ids_from_chars = ids_from_chars

        # Build an additive mask over the vocabulary that holds -inf at the
        # id of "[UNK]", so that token can never be sampled.
        unk_ids = self.ids_from_chars(["[UNK]"])[:, None]
        vocab_len = len(ids_from_chars.get_vocabulary())
        unk_mask = tf.SparseTensor(
            indices=unk_ids,
            values=[-float("inf")] * len(unk_ids),
            dense_shape=[vocab_len],
        )
        self.prediction_mask = tf.sparse.to_dense(unk_mask)

    @tf.function
    def generate_one_step(self, inputs, states=None):
        """Sample the next character for every string in `inputs`.

        Args:
            inputs: Batch of strings to continue.
            states: State passed through to the wrapped model (None on the
                first call).

        Returns:
            A tuple ``(predicted_chars, states)``: the sampled characters and
            the state returned by the wrapped model.
        """
        # Strings -> characters -> dense tensor of token ids.
        token_ids = self.ids_from_chars(
            tf.strings.unicode_split(inputs, "UTF-8")
        ).to_tensor()

        # logits has shape [batch, char, next_char_logits]
        logits, states = self.model(inputs=token_ids, states=states, return_state=True)

        # Keep only the prediction for the last position, scale it by the
        # temperature and add the mask that rules out "[UNK]".
        last_logits = logits[:, -1, :] / self.temperature
        masked_logits = last_logits + self.prediction_mask

        # Draw one id per batch row and drop the trailing sample axis.
        sampled = tf.random.categorical(masked_logits, num_samples=1)
        next_ids = tf.squeeze(sampled, axis=-1)

        # Map the sampled ids back to characters.
        return self.chars_from_ids(next_ids), states

In [162]:
# Wrap the trained model and both lookup layers in a OneStep sampler
# (temperature is left at its default of 1.0)
one_step_model = OneStep(model, chars_from_ids, ids_from_chars)

In [163]:
# Generate text using a constant prompt
# (`time` is imported in the notebook's import cell at the top)
# NOTE(review): the prompt spells the name "Arthus", while earlier cells use
# "arthas"/"Arthas" - confirm the spelling is intentional.
start = time.perf_counter()  # monotonic clock, intended for measuring elapsed time
states = None
next_char = tf.constant(["Arthus went to"])
result = [next_char]

# Feed each sampled character, together with the carried state, back into the
# sampler to obtain the next character.
for n in range(1000):
    next_char, states = one_step_model.generate_one_step(
        next_char, states=states
    )
    result.append(next_char)

# Concatenate the prompt and all sampled characters into one string per batch row
result = tf.strings.join(result)
end = time.perf_counter()
print(result[0].numpy().decode("utf-8"), "\n\n" + "_" * 80)
print("\nRun time:", end - start)

Arthus went to. He galloped his eyes to the brother. Refent of the men returned these death knight. I fear the Light shine up nothing.” he cried, denied its without discussing her at this as he was, and for a washing for all the hands of the conversation he couldn’t never foolish in the guest characters, selming and startled by the fighting royal to his life. The others more embarrassed at the bridge, a ghing, running from his mind. He slowed his unforgiving and his heart and launched for a moment, then slowly lifeless than she charged at him. Summer and confidence; it hadn’t been engineered. Somehow, impuling, drawing run upon him. World of WarCraft: Arthas: Rise of the Lich King   “Tear younges. Kael’thas was at his subject. The wrath was dizzy. He did not believe in the death knight. Although his upperested misguised misseess filled his nose for a few moments.

Me- final appears in the first one that lead myself back. And if Arthas mused as the two wore about his forest. The humans 