# Generating Text with Neural Networks


# Getting the Data

In [None]:
import tensorflow as tf

shakespeare_url = "https://homl.info/shakespeare"  # shortcut URL
# get_file downloads the corpus and returns the path of the local copy.
filepath = tf.keras.utils.get_file("shakespeare.txt", shakespeare_url)
# Decode explicitly as UTF-8 so the loaded text does not depend on the
# platform's default encoding (which differs between operating systems).
with open(filepath, encoding="utf-8") as f:
    shakespeare_text = f.read()

In [None]:
print(shakespeare_text[:80])  # data exploration only: preview the first 80 characters of the corpus

# Preparing the Data

### Text Vectorization Layer: 
The code below creates a `TextVectorization` layer from Keras (`tf.keras.layers`). It is configured to split the text into individual characters (`split="character"`) and to convert the text to lowercase (`standardize="lower"`). The `adapt` method is then called on the data (`shakespeare_text`) to build the vocabulary.

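Next, `text_vec_layer` is applied to the entire Shakespeare text (`shakespeare_text`), which produces its vectorized representation: one integer ID per character. Because the layer is called on a list containing a single string, the `[0]` index selects that one encoded sequence. The `print` statement in the following cell shows the vectorized output, giving a glimpse of the transformed data.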

In [None]:
# Character-level tokenizer: lowercases the text, then maps each character
# to an integer ID.
text_vec_layer = tf.keras.layers.TextVectorization(
    standardize="lower",
    split="character",
)
# Build the vocabulary from the corpus.
text_vec_layer.adapt([shakespeare_text])
# The layer is called on a batch of one string; [0] picks that single
# encoded sequence.
encoded = text_vec_layer([shakespeare_text])[0]

In [None]:
# Show the vectorized corpus: the character IDs produced by the layer for the
# single input string (these are the raw IDs, before any offset is applied).
print(text_vec_layer([shakespeare_text]))

### Token Adjustment: 
The token IDs are shifted down by 2 by subtracting 2 from the vectorized representation. This is because IDs 0 (padding) and 1 (unknown) are reserved by the layer but never used here, so after the shift the character IDs start at 0. `n_tokens` is then calculated as the size of the vocabulary minus 2 (the number of distinct characters), and `dataset_size` is the total number of encoded tokens. The `print` statement displays the values of `n_tokens` and `dataset_size`.

In [None]:
# IDs 0 (pad) and 1 (unknown) are not used, so shift every ID down by 2
# to make the character IDs start at 0.
encoded = encoded - 2
# Number of distinct characters: the vocabulary size without the two unused IDs.
n_tokens = text_vec_layer.vocabulary_size() - 2
# Total number of encoded characters in the corpus.
dataset_size = len(encoded)

In [None]:
# Number of distinct character IDs, and total number of characters in the corpus.
print(n_tokens, dataset_size)

### Sequence to Dataset Conversion: 
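This defines a function `to_dataset` that converts a sequence of tokens into a TensorFlow `Dataset` suitable for training a sequence model. It uses the `window` method to create overlapping windows of `length + 1` tokens, each shifted by one token from the previous one; the extra token is needed because the target sequence is the input sequence shifted one step ahead. If `shuffle` is enabled, the windows are shuffled. The windows are then grouped into batches of `batch_size`, and each batch is mapped to a tuple of input sequences (all tokens but the last) and target sequences (all tokens but the first).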

In [None]:
def to_dataset(sequence, length, shuffle=False, seed=None, batch_size=32):
    """Convert a token sequence into batches of (input, target) windows.

    Each window holds ``length + 1`` consecutive tokens and starts one token
    after the previous window. Within a window, the input is every token
    except the last and the target is every token except the first, so the
    target is the input shifted one step ahead.

    Args:
        sequence: The encoded tokens to slice into windows.
        length: Number of tokens in each input (and each target) sequence.
        shuffle: Whether to shuffle the windows before batching.
        seed: Random seed passed to the shuffle step.
        batch_size: Number of windows per batch.

    Returns:
        A ``tf.data.Dataset`` yielding ``(inputs, targets)`` batches.
    """
    window_size = length + 1

    def split_inputs_and_targets(batch):
        # Drop the last token for the inputs and the first one for the targets.
        return batch[:, :-1], batch[:, 1:]

    windows = tf.data.Dataset.from_tensor_slices(sequence).window(
        window_size, shift=1, drop_remainder=True
    )
    # window() yields one nested dataset per window; batching each nested
    # dataset by the window size turns it into a single tensor.
    windows = windows.flat_map(lambda nested: nested.batch(window_size))
    if shuffle:
        windows = windows.shuffle(100_000, seed=seed)
    batches = windows.batch(batch_size)
    return batches.map(split_inputs_and_targets).prefetch(1)

### Dataset Creation: 
The `to_dataset` function is applied to consecutive slices of the encoded text to create the training, validation, and test datasets. `length` specifies the size of the input and target sequences. Only the training set is shuffled; `tf.random.set_seed` and the `seed` argument are used to make that shuffling reproducible.

In [None]:
length = 100
tf.random.set_seed(42)
# Consecutive slices of the corpus: the first 100,000 tokens for training,
# the next 6,000 for validation, and everything after that for testing.
# NOTE(review): confirm that 100,000 / 106,000 (rather than 1,000,000 /
# 1,060,000) are the intended split points.
train_set = to_dataset(
    encoded[:100_000], length=length, shuffle=True, seed=42
)
valid_set = to_dataset(encoded[100_000:106_000], length=length)
test_set = to_dataset(encoded[106_000:], length=length)

# Building and Training the Model

In [None]:
tf.random.set_seed(42)  # extra code – ensures reproducibility on CPU

# Character-level model: an embedding of each character ID, a GRU that emits
# an output at every time step, and a softmax over the n_tokens characters.
model = tf.keras.Sequential(
    [
        tf.keras.layers.Embedding(input_dim=n_tokens, output_dim=16),
        tf.keras.layers.GRU(128, return_sequences=True),
        tf.keras.layers.Dense(n_tokens, activation="softmax"),
    ]
)
model.compile(
    optimizer="nadam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

# Only keep the checkpoint with the best validation accuracy.
# NOTE(review): newer Keras releases may require the checkpoint path to end
# in ".keras" or ".h5" – confirm against the installed version.
model_ckpt = tf.keras.callbacks.ModelCheckpoint(
    "my_shakespeare_model",
    save_best_only=True,
    monitor="val_accuracy",
)
history = model.fit(
    train_set,
    validation_data=valid_set,
    epochs=10,
    callbacks=[model_ckpt],
)

In [None]:
# End-to-end model that accepts raw strings: vectorize the text, shift the
# IDs down by 2, then run the trained character model.
shakespeare_model = tf.keras.Sequential(
    [
        text_vec_layer,
        tf.keras.layers.Lambda(lambda token_ids: token_ids - 2),  # no <PAD> or <UNK> tokens
        model,
    ]
)

### In summary:  
This section defines, compiles, and trains a neural network model for text generation. The model consists of an embedding layer, a GRU layer, and a dense softmax output layer. Validation accuracy is monitored during training, and the best-performing model is saved using a checkpoint callback. Finally, an extended model is created for text generation by chaining the text vectorization layer, the ID shift, and the trained model, so that it can be fed raw strings directly.

# Generating Text

In [None]:
# [0, -1] selects the first (and only) input string and its last time step,
# i.e. the predicted probabilities for the character that follows the prompt.
y_proba = shakespeare_model.predict(["To be or not to b"])[0, -1]
y_pred = tf.argmax(y_proba)  # choose the most probable character ID
# Add 2 back to undo the ID shift applied inside shakespeare_model before
# looking the character up in the vocabulary.
text_vec_layer.get_vocabulary()[y_pred + 2]

In [None]:
# Demo of random sampling: tf.random.categorical takes log-probabilities
# (one row per distribution), hence the log and the nested list.
log_probas = tf.math.log([[0.5, 0.4, 0.1]])  # probas = 50%, 40%, and 10%
tf.random.set_seed(42)
tf.random.categorical(log_probas, num_samples=8)  # draw 8 samples

In [None]:
def next_char(text, temperature=1):
    """Randomly sample the character that follows ``text``.

    The model's probabilities for the last time step are converted to
    log-probabilities and divided by ``temperature`` before sampling, so a
    temperature below 1 favors the most likely characters and a temperature
    above 1 flattens the distribution.

    Args:
        text: The prompt string to continue.
        temperature: Divisor applied to the log-probabilities.

    Returns:
        The sampled character, looked up in the vectorization vocabulary.
    """
    # The -1: slice (rather than -1) keeps the time axis, giving the single
    # row of values that tf.random.categorical is fed below.
    last_step_proba = shakespeare_model.predict([text])[0, -1:]
    scaled_logits = tf.math.log(last_step_proba) / temperature
    samples = tf.random.categorical(scaled_logits, num_samples=1)
    sampled_id = samples[0, 0]
    vocabulary = text_vec_layer.get_vocabulary()
    # Add 2 to map the model's shifted ID back to the vocabulary index.
    return vocabulary[sampled_id + 2]

In [None]:
def extend_text(text, n_chars=50, temperature=1):
    """Append ``n_chars`` sampled characters to ``text``.

    Characters are generated one at a time; each new character is sampled by
    ``next_char`` from the text produced so far, including the characters
    already generated.

    Args:
        text: The prompt string to extend.
        n_chars: Number of characters to generate.
        temperature: Sampling temperature forwarded to ``next_char``.

    Returns:
        The prompt followed by the generated characters.
    """
    generated = text
    for _ in range(n_chars):
        generated = generated + next_char(generated, temperature)
    return generated

In [None]:
# Reset the global random seed before sampling text in the cells below.
tf.random.set_seed(42)  # extra code – ensures reproducibility on CPU

In [None]:
# Very low temperature: dividing the log-probabilities by 0.01 makes sampling
# pick the most likely character almost every time.
print(extend_text("To be or not to be", temperature=0.01))

In [None]:
# Temperature 1: characters are sampled from the model's unmodified probabilities.
print(extend_text("To be or not to be", temperature=1))

In [None]:
# Very high temperature: dividing the log-probabilities by 100 makes all
# characters nearly equally likely, so the output is close to random.
print(extend_text("To be or not to be", temperature=100))