In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from functools import partial

In [2]:
# Fetch the Shakespeare corpus; get_file returns the path of the local copy.
url = "https://homl.info/shakespeare"
filepath = tf.keras.utils.get_file("shakespeare.txt", url)
# Decode explicitly as UTF-8 so the loaded text does not depend on the
# platform's default locale encoding.
with open(filepath, encoding="utf-8") as f:
  text = f.read()

Downloading data from https://homl.info/shakespeare


- `standardize` options (argument of `tf.keras.layers.TextVectorization`)
  - None: No standardization.
  - "lower_and_strip_punctuation": Text will be lowercased and all punctuation removed.
  - "lower": Text will be lowercased.
  - "strip_punctuation": All punctuation will be removed.

- `split` options
  - None: No splitting
  - "whitespace": Split on whitespace.
  - "character": Split on each unicode character.

Defaults: `standardize="lower_and_strip_punctuation"`, `split="whitespace"`.

In [3]:
# Character-level tokenizer: lowercase the text (punctuation is kept), then
# split it into individual characters.
text_vec_layer = tf.keras.layers.TextVectorization(
    standardize="lower",
    split="character",
)
# Build the vocabulary from the whole corpus.
text_vec_layer.adapt([text])

In [4]:
# The first two vocabulary IDs are reserved for the padding and unknown
# tokens; drop them from the count and shift every token ID down by 2.
n_tokens = text_vec_layer.vocabulary_size() - 2
encoded = text_vec_layer([text])[0] - 2
data_size = len(encoded)

In [10]:
# Sanity check: vocabulary size x 16 (the Embedding layer's output_dim) is the
# number of weights in the embedding table.
# NOTE(review): this cell carries execution count 10, i.e. it was run out of
# order; it is a scratch check and nothing below depends on it.
n_tokens * 16

624

In [5]:
def to_dataset(sequence, length, shuffle=False, seed=None, batch_size=32,
               shuffle_buffer_size=100_000):
  """Turn a 1-D token sequence into batches of (input, target) windows.

  Every run of `length + 1` consecutive tokens (stride 1) becomes one sample:
  the input is the first `length` tokens and the target is the same window
  shifted one step ahead, so each position is trained to predict the next
  token.

  Args:
    sequence: 1-D sequence of token IDs.
    length: Number of tokens in each input (and target) window.
    shuffle: Whether to shuffle the windows before batching.
    seed: Random seed for shuffling; only used when `shuffle` is true.
    batch_size: Number of windows per batch.
    shuffle_buffer_size: Size of the shuffle buffer; only used when `shuffle`
      is true. Defaults to the previously hard-coded 100,000.

  Returns:
    A `tf.data.Dataset` yielding `(inputs, targets)` pairs, each of shape
    `(batch, length)`.
  """
  ds = tf.data.Dataset.from_tensor_slices(sequence)
  # window() yields one nested dataset per window; batching each nested
  # dataset by its full size and flattening turns them into plain tensors.
  ds = ds.window(length + 1, shift=1, drop_remainder=True)
  ds = ds.flat_map(lambda window_ds: window_ds.batch(length + 1))
  if shuffle:
    ds = ds.shuffle(shuffle_buffer_size, seed=seed)
  ds = ds.batch(batch_size)
  # Split each batch of windows into inputs (all but the last token) and
  # targets (all but the first token).
  return ds.map(lambda window: (window[:, :-1], window[:, 1:])).prefetch(1)

In [6]:
length = 100  # tokens per input window
tf.random.set_seed(42)

# Contiguous split of the encoded corpus: the first 1,000,000 tokens for
# training, the next 60,000 for validation, and the remainder for testing.
# Only the training set is shuffled; to_dataset uses `seed` only when
# shuffling, so it has no effect on the other two sets.
train_set = to_dataset(
    encoded[:1_000_000], length=length, shuffle=True, seed=42
)
valid_set = to_dataset(
    encoded[1_000_000:1_060_000], length=length, seed=42
)
test_set = to_dataset(
    encoded[1_060_000:], length=length, seed=42
)

In [7]:
# Char-RNN: embed each token ID, run a GRU over the sequence, and emit a
# softmax distribution over the vocabulary at every time step.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(input_dim=n_tokens, output_dim=16))
model.add(tf.keras.layers.GRU(128, return_sequences=True))
model.add(tf.keras.layers.Dense(n_tokens, activation="softmax"))

In [8]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 16)          624       
                                                                 
 gru (GRU)                   (None, None, 128)         56064     
                                                                 
 dense (Dense)               (None, None, 39)          5031      
                                                                 
Total params: 61719 (241.09 KB)
Trainable params: 61719 (241.09 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# Targets are integer token IDs, hence the sparse cross-entropy loss.
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer="nadam",
    metrics=["accuracy"],
)
# Keep only the checkpoint with the best validation accuracy seen so far.
# NOTE(review): Keras 3 requires the checkpoint path to end in ".keras" or
# ".weights.h5" - confirm which Keras version this notebook targets.
model_ckpt = tf.keras.callbacks.ModelCheckpoint(
    "my_shakespearean_model",
    monitor="val_accuracy",
    save_best_only=True,
)
history = model.fit(
    train_set,
    validation_data=valid_set,
    epochs=10,
    callbacks=[model_ckpt],
)

In [None]:
# Wrap the trained model so it accepts raw strings: tokenize, apply the same
# -2 shift that was applied to the encoded corpus, then run the char-RNN.
final_model = tf.keras.Sequential([
    text_vec_layer,
    tf.keras.layers.Lambda(lambda token_ids: token_ids - 2),
    model,
])

In [None]:
# Index 0 selects the single input string; index -1 selects the last time
# step, i.e. the distribution over the character that follows the prompt.
y_proba = final_model.predict(
    ["To be or not to b"]
)[0, -1]
y_proba

In [None]:
# ID of the most probable next character (still shifted down by 2).
y_pred = tf.argmax(y_proba, axis=0)
y_pred

In [None]:
# Undo the -2 shift so the ID indexes the layer's full vocabulary, which still
# contains the two reserved (padding / unknown) entries at the front.
text_vec_layer.get_vocabulary()[y_pred + 2]