In [1]:
import tensorflow as tf
# Download the Shakespeare corpus (cached locally by Keras after the first
# call) and read it fully into memory.
shakespeare_url = "https://homl.info/shakespeare"  # shortcut URL
filepath = tf.keras.utils.get_file("shakespeare.txt", shakespeare_url)
# Pin the encoding so the decoded text does not depend on the platform's
# default locale encoding.
with open(filepath, encoding="utf-8") as f:
    shakespeare_text = f.read()

Downloading data from https://homl.info/shakespeare


In [2]:
# Show the first 100 characters of the loaded text as a quick sanity check.
print(shakespeare_text[:100])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [3]:
# Character-level vectorizer: lowercases the text and splits it into
# individual characters.
text_vec_layer = tf.keras.layers.TextVectorization(
    standardize="lower",
    split="character",
)
# Build the vocabulary from the corpus, then encode the whole corpus; the
# single input string yields one row, which is taken with [0].
text_vec_layer.adapt([shakespeare_text])
encoded = text_vec_layer([shakespeare_text])[0]

In [4]:
# Display the encoded corpus (one integer token ID per character).
encoded

<tf.Tensor: shape=(1115394,), dtype=int64, numpy=array([21,  7, 10, ..., 22, 28, 12])>

In [5]:
# Shift the token IDs down by 2 so they start at 0, and shrink the vocabulary
# size accordingly (presumably IDs 0 and 1 are the vectorizer's padding and
# unknown tokens, which this corpus does not need -- confirm against the
# TextVectorization vocabulary).
# The shifted tensor is derived from the vectorizer output rather than
# updated in place, so re-running this cell cannot shift the IDs twice.
encoded = text_vec_layer([shakespeare_text])[0] - 2
n_tokens = text_vec_layer.vocabulary_size() - 2
dataset_size = len(encoded)
print(f" number of distinct chars {n_tokens}")
print(f"total number of chars ={dataset_size}")

 number of distinct chars 39
total number of chars =1115394


In [6]:
def to_dataset(sequence, length, shuffle=False, seed=None, batch_size=32):
    """Turn a sequence into batches of (input, target) training windows.

    Every window holds ``length + 1`` consecutive items. The input is the
    first ``length`` items and the target is the same window shifted one step
    ahead, so each position is paired with the item that follows it.

    Args:
        sequence: tensor-like sequence, sliced along its first axis.
        length: number of time steps in each input/target window.
        shuffle: whether to shuffle the windows before batching.
        seed: random seed forwarded to the shuffle step.
        batch_size: number of windows per batch.

    Returns:
        A ``tf.data.Dataset`` yielding ``(inputs, targets)`` pairs.
    """
    ds = tf.data.Dataset.from_tensor_slices(sequence)
    ds = ds.window(length + 1, shift=1, drop_remainder=True)
    # window() yields nested datasets; batch each one into a single tensor.
    ds = ds.flat_map(lambda window_ds: window_ds.batch(length + 1))
    if shuffle:
        ds = ds.shuffle(buffer_size=100_000, seed=seed)
    ds = ds.batch(batch_size)

    def split_input_target(windows):
        # Batching always adds a leading axis, so the windows reaching this
        # point are never rank 1 and need no special-casing before slicing.
        return windows[:, :-1], windows[:, 1:]

    return ds.map(split_input_target).prefetch(1)

In [7]:
tf.random.set_seed(42)
length = 100  # time steps per training window

# Contiguous split of the encoded corpus: the first 1,000,000 tokens for
# training, the next 60,000 for validation, the remainder for testing.
# Only the training windows are shuffled.
train_set = to_dataset(
    encoded[:1_000_000], length=length, shuffle=True, seed=42
)
valid_set = to_dataset(encoded[1_000_000:1_060_000], length=length)
test_set = to_dataset(encoded[1_060_000:], length=length)

# Sanity check: print the shapes of one training batch.
for input_seq, target_seq in train_set.take(1):
    print(input_seq.shape, target_seq.shape)

(32, 100) (32, 100)


In [8]:
# Character-level model: embed each token ID, run a GRU that emits an output
# at every time step, and predict a distribution over the n_tokens characters.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=n_tokens, output_dim=16),
    tf.keras.layers.GRU(128, return_sequences=True),
    tf.keras.layers.Dense(n_tokens, activation="softmax"),
])
model.compile(
    optimizer="nadam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)
# Keep only the checkpoint with the best validation accuracy seen so far.
model_ckpt = tf.keras.callbacks.ModelCheckpoint(
    filepath="my_shakespeare_model.keras",
    monitor="val_accuracy",
    save_best_only=True,
)
history = model.fit(
    train_set,
    validation_data=valid_set,
    epochs=10,
    callbacks=[model_ckpt],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [9]:
 # Wrap the trained model so it accepts raw strings: vectorize the text,
 # shift the token IDs down by 2, then run the character model.
 shakespeare_model = tf.keras.Sequential(
     [
         text_vec_layer,
         tf.keras.layers.Lambda(lambda token_ids: token_ids - 2),  # no <PAD> or <UNK> tokens
         model,
     ]
 )

In [10]:
import pickle

# Serialize the trained model with pickle. The context manager guarantees the
# file handle is flushed and closed even if pickling fails.
# NOTE(review): despite the ".keras" extension this file is a pickle stream,
# not a Keras archive; model.save("model.keras") would be the native format,
# but switching requires changing the cell that loads the file as well.
with open("model.keras", "wb") as model_file:
    pickle.dump(model, model_file)

In [12]:
# Reload the pickled model; the context manager closes the file handle.
# NOTE(review): pickle.load can execute arbitrary code -- only load files
# that this notebook itself produced.
with open("model.keras", "rb") as model_file:
    rf_loaded = pickle.load(model_file)

In [13]:
# Predict the character following the prompt: [0, -1] takes the first (only)
# input string and the output at its last time step.
y_proba = shakespeare_model.predict(["To be or not to b"])[0, -1]
# Most probable token ID under the model.
y_pred = tf.argmax(y_proba)
# Add 2 to undo the ID shift applied inside shakespeare_model before looking
# the character up in the vectorizer's vocabulary.
text_vec_layer.get_vocabulary()[y_pred + 2]



'e'

In [14]:
 def next_char(text, temperature=1):
     """Sample the character that follows ``text`` from the model.

     Args:
         text: prompt string fed to ``shakespeare_model``.
         temperature: positive scaling factor for the log-probabilities;
             values near 0 favor the most likely character, larger values
             flatten the distribution.

     Returns:
         The sampled character, looked up in the vectorizer's vocabulary.

     Raises:
         ValueError: if ``temperature`` is not strictly positive.
     """
     # A zero temperature divides by zero and a negative one inverts the
     # distribution, so reject both instead of sampling garbage.
     if temperature <= 0:
         raise ValueError(
             f"temperature must be strictly positive, got {temperature!r}")
     # Keep a length-1 leading axis ([0, -1:]) because tf.random.categorical
     # expects a 2-D batch of logits.
     y_proba = shakespeare_model.predict([text])[0, -1:]
     rescaled_logits = tf.math.log(y_proba) / temperature
     char_id = tf.random.categorical(rescaled_logits, num_samples=1)[0, 0]
     # Add 2 to undo the ID shift applied inside shakespeare_model.
     return text_vec_layer.get_vocabulary()[char_id + 2]

In [15]:
def extend_text(text, n_chars=50, temperature=1):
    """Grow ``text`` by ``n_chars`` characters sampled one at a time.

    Each new character comes from ``next_char`` called on everything
    generated so far, using the given ``temperature``.
    """
    generated = text
    for _step in range(n_chars):
        generated = generated + next_char(generated, temperature)
    return generated

In [16]:
# Very low temperature: sampling sticks closely to the most likely characters.
tf.random.set_seed(42)
print(extend_text(text="To be or not to be", temperature=0.01))

To be or not to be so come to see
the state will be a sister, and so


In [17]:
# Temperature 1: sample from the model's distribution as-is.
tf.random.set_seed(42)
print(extend_text(text="To be or not to be", temperature=1))

To be or not to be to did,
die too, as it vincentic beaughia,
your m


In [23]:
# Temperature above 1: flatter distribution, more varied output.
tf.random.set_seed(42)
print(extend_text(text="To be or not to be", temperature=1.5))

To be or not to be two
chairs had erracts vice want beloved wonder's
