In [196]:
# Import libraries
import tensorflow as tf
from tensorflow.keras import preprocessing
from tensorflow.keras.callbacks import EarlyStopping

import numpy as np
import os
import time

In [197]:
# Report how many GPU devices TensorFlow detects
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

Num GPUs Available:  1


# Parse Text

In [198]:
# Load file data: fetch the conference talk corpus and read it into one string.
# The value returned by get_file is used below as the local path to open.
path_to_file = tf.keras.utils.get_file('conference.txt', 'https://github.com/AlexBerryhill/RNN-Conference/raw/main/data/talk_values.txt')
# Read raw bytes and decode explicitly as UTF-8 so the result does not depend
# on the platform's default encoding. The `with` block guarantees the file
# handle is closed instead of being left for the garbage collector.
with open(path_to_file, 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8')
print('Length of text: {} characters'.format(len(text)))

Length of text: 39855119 characters


In [199]:
# Sanity check: show the first 200 characters of the corpus
preview = text[:200]
print(preview)

My dear brothers and sisters:
We welcome you, and all those who hear and see on radio and television. We welcome you to the sessions of the 141st Annual General Conference of The Church of Jesus Chris


In [200]:
# Collect every distinct character in the corpus, sorted so each character
# gets a stable position. This is the vocabulary of the network; some of
# these characters may be worth removing as the network is refined.
vocab = sorted(set(text))
vocab_count = len(vocab)
print('{} unique characters'.format(vocab_count))
print(vocab)

112 unique characters
['\n', ' ', '!', '#', '$', '%', '&', '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '=', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '\xa0', '¡', '¢', '©', '®', '°', '·', '½', '¿', 'æ', 'ø', '̀', '́', '̂', '̃', '̈', '̌', '̧', '–', '—', '‘', '’', '“', '”', '…', '™', 'ﬁ', '\ufeff']


In [201]:
# Build the two lookup layers that translate between characters and the
# integer ids the network consumes:
#   ids_from_chars : character -> id
#   chars_from_ids : id -> character (the same vocabulary, inverted)
ids_from_chars = tf.keras.layers.StringLookup(vocabulary=list(vocab))
chars_from_ids = tf.keras.layers.StringLookup(
    vocabulary=ids_from_chars.get_vocabulary(),
    invert=True,
)

# Helper that turns a sequence of ids back into a readable Python string.
def text_from_ids(ids):
  """Decode a tensor of character ids into a UTF-8 Python string.

  The ids are mapped to characters with `chars_from_ids`, the characters are
  concatenated along the last axis, and the resulting bytes are decoded.
  """
  char_tensor = chars_from_ids(ids)
  joined = tf.strings.reduce_join(char_tensor, axis=-1)
  return joined.numpy().decode("utf-8")

In [202]:
# Verify that the mappings work: look up the ids for the characters of
# "Truth" here, then map them back to characters in the next cells
sample_chars = ["T", "r", "u", "t", "h"]
testids = ids_from_chars(sample_chars)
testids

<tf.Tensor: shape=(5,), dtype=int64, numpy=array([49, 76, 79, 78, 66], dtype=int64)>

In [203]:
# Round-trip check: map the ids back to their characters
chars_from_ids(testids)

<tf.Tensor: shape=(5,), dtype=string, numpy=array([b'T', b'r', b'u', b't', b'h'], dtype=object)>

In [204]:
# Join the ids back into a single Python string using the helper
testString = text_from_ids(testids)
testString

'Truth'

In [205]:
# Split the whole corpus into individual UTF-8 characters and map each one to
# its integer id, giving a single long stream of encoded characters
all_ids = ids_from_chars(
    tf.strings.unicode_split(text, 'UTF-8')
)
all_ids

<tf.Tensor: shape=(39855119,), dtype=int64, numpy=array([42, 83,  2, ..., 14,  1,  1], dtype=int64)>

In [206]:
# Now, convert that into a tensorflow dataset that yields one character id
# per element
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)

In [207]:
# Group the id stream into fixed-size chunks. Each chunk holds seq_length + 1
# ids so it can later be split into an input and a target of seq_length ids
# each; a trailing partial chunk is dropped.
seq_length = 100
sequences = ids_dataset.batch(
    seq_length + 1,
    drop_remainder=True,
)

# Build an (input, target) training pair from one chunk of ids:
def split_input_target(sequence):
    """Split a sequence into next-character training pairs.

    Returns a tuple `(input, target)` where `input` is the sequence without
    its last element and `target` is the sequence without its first element,
    so `target[i]` is the element that follows `input[i]`.
    """
    return sequence[:-1], sequence[1:]

# Call the function for every sequence in our list to create a new dataset
# of (input, target) pairs, where the target is the input shifted by one
# character
dataset = sequences.map(split_input_target)

In [208]:
# Verify our sequences
for input_example, target_example in  dataset.take(1):
    print("Input: ", text_from_ids(input_example))
    print("--------")
    print("Target: ", text_from_ids(target_example))

Input:  My dear brothers and sisters:
We welcome you, and all those who hear and see on radio and television
--------
Target:  y dear brothers and sisters:
We welcome you, and all those who hear and see on radio and television.


In [209]:
# Shuffle the sequences so the model doesn't just see the talks in the order
# they appear in the file, then group them into training batches and build a
# streaming dataset from that. Streaming lets us pass the data to the network
# bit by bit; prefetching prepares upcoming batches in the background, with
# the amount chosen automatically by tf.data.
#
# NOTE: this cell rebinds `dataset`. Re-running it without first re-running
# the cell that creates `dataset` would batch already-batched data.

BATCH_SIZE = 64
BUFFER_SIZE = 10000  # number of sequences held in the shuffle buffer

dataset = (
    dataset
    .shuffle(BUFFER_SIZE)
    # drop_remainder keeps every batch at exactly BATCH_SIZE sequences
    .batch(BATCH_SIZE, drop_remainder=True)
    # tf.data.AUTOTUNE is the core-API name for the experimental alias
    .prefetch(tf.data.AUTOTUNE))

dataset

<PrefetchDataset element_spec=(TensorSpec(shape=(64, 100), dtype=tf.int64, name=None), TensorSpec(shape=(64, 100), dtype=tf.int64, name=None))>

In [210]:
from tensorflow.keras import regularizers

class ConferenceTextModel(tf.keras.Model):
    """Character-level language model: Embedding -> GRU -> Dense.

    Args:
        vocab_size: Number of distinct token ids. Used as the embedding input
            size and as the number of outputs produced per time step.
        embedding_dim: Width of the embedding vectors.
        rnn_units: Number of units in the GRU layer.
    """

    def __init__(self, vocab_size, embedding_dim, rnn_units):
        super().__init__()
        # L1 penalty on the embedding weights acts as a regularizer.
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim,
                                                   embeddings_regularizer=regularizers.l1(1e-4))
        # return_sequences: emit an output for every time step.
        # return_state: also return the final hidden state so generation can
        # carry it from one call to the next.
        self.gru = tf.keras.layers.GRU(rnn_units,
                                       return_sequences=True,
                                       return_state=True)
        # No activation: the layer outputs one raw score per vocabulary entry.
        self.dense = tf.keras.layers.Dense(vocab_size)

    def call(self, inputs, states=None, return_state=False, training=False):
        """Run the model on a batch of id sequences.

        Args:
            inputs: Integer tensor of token ids, one row per sequence.
            states: Optional initial GRU state. When None, an all-zero state
                sized to the current batch is used.
            return_state: When True, also return the final GRU state.
            training: Forwarded to each layer.

        Returns:
            The dense layer's output for every time step, or a tuple of
            (output, final GRU state) when `return_state` is True.
        """
        x = self.embedding(inputs, training=training)
        if states is None:
            # Use the dynamic batch size. The static x.shape[0] is None when
            # the batch dimension is unknown at trace time, which would make
            # tf.zeros fail.
            batch_size = tf.shape(x)[0]
            states = [tf.zeros([batch_size, self.gru.units])]
        x, states = self.gru(x, initial_state=states, training=training)
        x = self.dense(x, training=training)

        if return_state:
            return x, states
        return x

In [211]:
# Hyperparameters for the network. The vocabulary size is taken from the
# lookup layer so the output width matches the ids the layer produces.
vocab_size = len(ids_from_chars.get_vocabulary())
embedding_dim = 256
rnn_units = 1024

# Instantiate the model
model = ConferenceTextModel(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
)

In [212]:
# Verify the output of our model is correct by running one batch through it.
# The first call also builds the model (creates its weights), so this step
# will take a bit.
for input_example_batch, target_example_batch in dataset.take(1):
    # Size the initial state from the batch itself instead of hard-coding 64,
    # so this check keeps working if BATCH_SIZE is changed.
    states = tf.zeros([input_example_batch.shape[0], model.gru.units])
    example_batch_predictions = model(input_example_batch, states=states)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

(64, 100, 113) # (batch_size, sequence_length, vocab_size)


In [213]:
# Early stopping watches the training loss (no validation data is passed to
# fit). Training stops after 5 epochs without improvement, and the weights
# from the best epoch are restored.
early_stopping_callback = EarlyStopping(
    monitor='loss',
    patience=5,
    restore_best_weights=True,
)

# The model outputs raw scores (logits), so the loss is told to expect them.
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer='adam', loss=loss)

# Train; epochs=1000 is an upper bound, early stopping decides when to end.
history = model.fit(
    dataset,
    epochs=1000,
    callbacks=[early_stopping_callback],
)

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000


In [214]:
# Here's the code we'll use to sample for us. It has some extra steps to apply
# the temperature to the distribution, and to make sure we don't get empty
# characters in our text. Most importantly, it will keep track of our model
# state for us.

class OneStep(tf.keras.Model):
  """Wraps a trained model to sample one next character per call.

  Args:
    model: Model called with `inputs`, `states` and `return_state=True`; it
      must return `(logits, states)`.
    chars_from_ids: Lookup layer mapping ids to characters.
    ids_from_chars: Lookup layer mapping characters to ids.
    temperature: Divisor applied to the logits before sampling. Values below
      1.0 sharpen the distribution, values above 1.0 flatten it.
  """

  def __init__(self, model, chars_from_ids, ids_from_chars, temperature=1.0):
    super().__init__()
    self.temperature = temperature
    self.model = model
    self.chars_from_ids = chars_from_ids
    self.ids_from_chars = ids_from_chars

    # Create a mask to prevent "" or "[UNK]" from being generated.
    # NOTE(review): with the lookup layer built in this notebook both strings
    # appear to map to the same id, which would explain validate_indices=False
    # below. Confirm before changing.
    skip_ids = self.ids_from_chars(['','[UNK]'])[:, None]
    sparse_mask = tf.SparseTensor(
        # Put a -inf at each bad index.
        values=[-float('inf')]*len(skip_ids),
        indices=skip_ids,
        # Match the shape to the vocabulary
        dense_shape=[len(self.ids_from_chars.get_vocabulary())])
    self.prediction_mask = tf.sparse.to_dense(sparse_mask, validate_indices=False)

  @tf.function
  def generate_one_step(self, inputs, states=None):
    """Sample the next character for each input string.

    Args:
      inputs: String tensor with one entry per sequence in the batch.
      states: Model state returned by the previous call, or None to start
        from the model's default initial state.

    Returns:
      A tuple `(next_chars, states)`: the sampled character for each input
      and the updated model state to pass to the next call.
    """
    # Convert strings to token IDs.
    input_chars = tf.strings.unicode_split(inputs, 'UTF-8')
    input_ids = self.ids_from_chars(input_chars).to_tensor()

    # Run the model.
    # predicted_logits.shape is [batch, char, next_char_logits]
    predicted_logits, states = self.model(inputs=input_ids, states=states,
                                          return_state=True)
    # Only use the last prediction.
    predicted_logits = predicted_logits[:, -1, :]
    predicted_logits = predicted_logits/self.temperature

    # Apply the prediction mask: prevent "" or "[UNK]" from being generated.
    # Adding -inf to a logit gives that id zero sampling probability.
    predicted_logits = predicted_logits + self.prediction_mask

    # Sample the output logits to generate token IDs.
    predicted_ids = tf.random.categorical(predicted_logits, num_samples=1)
    predicted_ids = tf.squeeze(predicted_ids, axis=-1)

    # Return the characters and model state. Use the lookup layer given to
    # this instance rather than whatever global happens to share its name.
    return self.chars_from_ids(predicted_ids), states


In [215]:
# Wrap the trained model in the single-step character generator
one_step_model = OneStep(model, chars_from_ids, ids_from_chars)

# Generate 1000 characters, seeding the model with the single character "I".
# Each sampled character is fed back in as the next input, and the state
# returned by one step is passed to the following step.
states = None
next_char = tf.constant(['I'])
generated_chars = [next_char]

for n in range(1000):
  next_char, states = one_step_model.generate_one_step(next_char, states=states)
  generated_chars.append(next_char)

# Concatenate the seed and all sampled characters into one string tensor
result = tf.strings.join(generated_chars)

# Print the generated text for the (only) sequence in the batch.
print(result[0].numpy().decode('utf-8'))


I pray that you will bind yourselves unto the Holy Ghost and yet individually refreshing to magnify and know that He knows God’s love continue. It not only impartly responds to many people. Interestingly, we humbly move forward, for he has filled the burden at these twelve heant by the power of the Holy Ghost” to care for the Savior’s words, “No house that will heapeth softens and let the hope or amused in this life: but because this is even a man in the ways: that the devil shall witnesse unto their sins and receive the prophets to organize ourselves.
The Tynathan Missoe State President was facing microruncipating for the pull in this way about active in their home.
The flock of politely redeeming the Spirit of Christ stayed pains that declared as a perfect principle in the Bible Mission who the next the small grandmother Healing is a great faith and nature. The is current ever on all, as he became, “Seoth, and we will not fall into the lany of hated couple at the end of paying tithin

In [216]:
# Save the trained model to the 'conference_model2' directory, requesting the
# 'tf' save format
model.save('conference_model2', save_format='tf')



INFO:tensorflow:Assets written to: conference_model2\assets


INFO:tensorflow:Assets written to: conference_model2\assets
