<a href="https://colab.research.google.com/github/GabrielTorga/GabrielTorga/blob/main/Text_gen.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import time

import numpy as np
import tensorflow as tf

In [3]:
path_to_file = tf.keras.utils.get_file(
    "shakespeare.txt",
    "https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt",
)

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt


In [4]:
text = open(path_to_file, "rb").read().decode(encoding="utf-8")
print(f"Length of text: {len(text)} characters")

Length of text: 1115394 characters


In [5]:
print(text[:60])


First Citizen:
Before we proceed any further, hear me speak.


In [7]:
#Sorting unique characters
vocab = sorted(set(text))
print(vocab[:10])
print(f"{len(vocab)} unique characters")

['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3']
65 unique characters


In [24]:
example_texts = text[:60]

# Characters into tokens
chars = tf.strings.unicode_split(example_texts, input_encoding="UTF-8")
chars

<tf.Tensor: shape=(60,), dtype=string, numpy=
array([b'F', b'i', b'r', b's', b't', b' ', b'C', b'i', b't', b'i', b'z',
       b'e', b'n', b':', b'\n', b'B', b'e', b'f', b'o', b'r', b'e', b' ',
       b'w', b'e', b' ', b'p', b'r', b'o', b'c', b'e', b'e', b'd', b' ',
       b'a', b'n', b'y', b' ', b'f', b'u', b'r', b't', b'h', b'e', b'r',
       b',', b' ', b'h', b'e', b'a', b'r', b' ', b'm', b'e', b' ', b's',
       b'p', b'e', b'a', b'k', b'.'], dtype=object)>

In [27]:
# Tokens to chars IDs
ids_from_chars = tf.keras.layers.StringLookup(
    vocabulary=list(vocab), mask_token=None
)
ids = ids_from_chars(chars)
ids

<tf.Tensor: shape=(60,), dtype=int64, numpy=
array([19, 48, 57, 58, 59,  2, 16, 48, 59, 48, 65, 44, 53, 11,  1, 15, 44,
       45, 54, 57, 44,  2, 62, 44,  2, 55, 57, 54, 42, 44, 44, 43,  2, 40,
       53, 64,  2, 45, 60, 57, 59, 47, 44, 57,  7,  2, 47, 44, 40, 57,  2,
       52, 44,  2, 58, 55, 44, 40, 50,  9])>

In [22]:
# Recover human-readable strings

chars_from_ids = tf.keras.layers.StringLookup(
    vocabulary=ids_from_chars.get_vocabulary(), invert=True, mask_token=None
)
chars = chars_from_ids(ids)
chars

<tf.Tensor: shape=(), dtype=string, numpy=b'n'>

In [25]:
tf.strings.reduce_join(chars, axis=-1).numpy()

b'First Citizen:\nBefore we proceed any further, hear me speak.'

In [28]:
def text_from_ids(ids):
    return tf.strings.reduce_join(chars_from_ids(ids), axis=-1)
text_from_ids(ids)

<tf.Tensor: shape=(), dtype=string, numpy=b'First Citizen:\nBefore we proceed any further, hear me speak.'>

**Create traning dataset**

In [12]:
all_ids = ids_from_chars(tf.strings.unicode_split(text, "UTF-8"))
all_ids

<tf.Tensor: shape=(1115394,), dtype=int64, numpy=array([19, 48, 57, ..., 46,  9,  1])>

In [13]:
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)

In [14]:
for ids in ids_dataset.take(13):
    print(chars_from_ids(ids).numpy().decode("utf-8"))

F
i
r
s
t
 
C
i
t
i
z
e
n


In [15]:
seq_length = 100
examples_per_epoch = len(text) // (seq_length + 1)

In [16]:
sequences = ids_dataset.batch(seq_length + 1, drop_remainder=True)

for seq in sequences.take(1):
    print(chars_from_ids(seq))

tf.Tensor(
[b'F' b'i' b'r' b's' b't' b' ' b'C' b'i' b't' b'i' b'z' b'e' b'n' b':'
 b'\n' b'B' b'e' b'f' b'o' b'r' b'e' b' ' b'w' b'e' b' ' b'p' b'r' b'o'
 b'c' b'e' b'e' b'd' b' ' b'a' b'n' b'y' b' ' b'f' b'u' b'r' b't' b'h'
 b'e' b'r' b',' b' ' b'h' b'e' b'a' b'r' b' ' b'm' b'e' b' ' b's' b'p'
 b'e' b'a' b'k' b'.' b'\n' b'\n' b'A' b'l' b'l' b':' b'\n' b'S' b'p' b'e'
 b'a' b'k' b',' b' ' b's' b'p' b'e' b'a' b'k' b'.' b'\n' b'\n' b'F' b'i'
 b'r' b's' b't' b' ' b'C' b'i' b't' b'i' b'z' b'e' b'n' b':' b'\n' b'Y'
 b'o' b'u' b' '], shape=(101,), dtype=string)


In [29]:
for seq in sequences.take(5):
    print(text_from_ids(seq).numpy())

b'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '
b'are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you k'
b"now Caius Marcius is chief enemy to the people.\n\nAll:\nWe know't, we know't.\n\nFirst Citizen:\nLet us ki"
b"ll him, and we'll have corn at our own price.\nIs't a verdict?\n\nAll:\nNo more talking on't; let it be d"
b'one: away, away!\n\nSecond Citizen:\nOne word, good citizens.\n\nFirst Citizen:\nWe are accounted poor citi'


In [31]:
# For training you'll need a dataset of (input, label) pairs. Where input and label are sequences.
# At each time step the input is the current character and the label is the next character.

def split_input_target(sequence):
    input_text = sequence[:-1]
    target_text = sequence[1:]
    return input_text, target_text


In [32]:
list("Tensorflow")

['T', 'e', 'n', 's', 'o', 'r', 'f', 'l', 'o', 'w']

In [34]:
# Input one char and the output is the next one (to predict)

split_input_target(list("Tensorflow"))

(['T', 'e', 'n', 's', 'o', 'r', 'f', 'l', 'o'],
 ['e', 'n', 's', 'o', 'r', 'f', 'l', 'o', 'w'])

In [35]:
dataset = sequences.map(split_input_target)

In [36]:
for input_example, target_example in dataset.take(1):
    print("Input :", text_from_ids(input_example).numpy())
    print("Target:", text_from_ids(target_example).numpy())

Input : b'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'
Target: b'irst Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '


**Create training batches**

In [37]:
# Batch size
BATCH_SIZE = 64

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

dataset = (
    dataset.shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

dataset

<_PrefetchDataset element_spec=(TensorSpec(shape=(64, 100), dtype=tf.int64, name=None), TensorSpec(shape=(64, 100), dtype=tf.int64, name=None))>

**Build The Model**


- tf.keras.layers.Embedding: *The input layer. A trainable lookup table that will map each character-ID to a vector with embedding_dim dimensions;*
- tf.keras.layers.GRU: *A type of RNN with size units=rnn_units (You can also use an LSTM layer here.)*
- tf.keras.layers.Dense: *The output layer, with vocab_size outputs. It outputs one logit for each character in the vocabulary. These are the log-likelihood of each character according to the model.*


In [39]:
# Length of the vocabulary in chars
vocab_size = len(vocab)

# The embedding dimension
embedding_dim = 256

# Number of RNN units
rnn_units = 1024

In [40]:
class MyModel(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, rnn_units):
        super().__init__(self)
        # TODO - Create an embedding layer
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        # TODO - Create a GRU layer
        self.gru = tf.keras.layers.GRU(
            rnn_units, return_sequences=True, return_state=True
        )
        # TODO - Finally connect it with a dense layer
        self.dense = tf.keras.layers.Dense(vocab_size)

    def call(self, inputs, states=None, return_state=False, training=False):
        x = self.embedding(inputs, training=training)
        # since we are training a text generation model,
        # we use the previous state, in training. If there is no state,
        # then we initialize the state
        if states is None:
            states = self.gru.get_initial_state(x)
        x, states = self.gru(x, initial_state=states, training=training)
        x = self.dense(x, training=training)

        if return_state:
            return x, states
        else:
            return x


In [41]:
model = MyModel(
    # Be sure the vocabulary size matches the `StringLookup` layers.
    vocab_size=len(ids_from_chars.get_vocabulary()),
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
)

For each character the model looks up the embedding, runs the GRU one timestep with the embedding as input, and applies the dense layer to generate logits predicting the log-likelihood of the next character.

In [43]:
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(
        example_batch_predictions.shape,
        "# (batch_size, sequence_length, vocab_size)",
    )
# Checking the shape of the output

(64, 100, 66) # (batch_size, sequence_length, vocab_size)


In [44]:
model.summary()

Model: "my_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       multiple                  16896     
                                                                 
 gru (GRU)                   multiple                  3938304   
                                                                 
 dense (Dense)               multiple                  67650     
                                                                 
Total params: 4,022,850
Trainable params: 4,022,850
Non-trainable params: 0
_________________________________________________________________


In [45]:
# Trying for a simple batch
sampled_indices = tf.random.categorical(
    example_batch_predictions[0], num_samples=1
)
sampled_indices = tf.squeeze(sampled_indices, axis=-1).numpy()

In [47]:
# This gives us, at each timestep, a prediction of the next character index:
sampled_indices

array([26, 51, 41, 64,  5, 18, 47, 16, 53, 62,  3, 65, 22, 54, 24, 21, 38,
       43, 18, 14, 29, 60, 58, 23,  0, 53, 17, 11, 19, 51, 15, 37, 43, 63,
       44, 27,  9, 11, 44, 32, 16, 56, 30, 16, 35, 23, 56, 57,  8, 16, 51,
       56, 20, 24, 41,  4, 46, 19,  1, 38, 62, 57, 48, 65, 34, 28, 57,  9,
       60, 26, 19, 43, 20, 54, 64,  3, 50, 20, 49, 57,  1, 22, 43, 37, 63,
       42, 48, 27, 65, 21, 58, 23, 19, 24, 31, 63,  5,  0, 15, 34])

In [48]:
print("Input:\n", text_from_ids(input_example_batch[0]).numpy())
print()
print("Next Char Predictions:\n", text_from_ids(sampled_indices).numpy())

Input:
 b" the world's deceit\nNor more can you distinguish of a man\nThan of his outward show; which, God he kn"

Next Char Predictions:
 b'Mlby&EhCnw!zIoKHYdEAPusJ[UNK]nD:FlBXdxeN.:eSCqQCVJqr-ClqGKb$gF\nYwrizUOr.uMFdGoy!kGjr\nIdXxciNzHsJFKRx&[UNK]BU'
