In [1]:
import numpy as np
import tensorflow as tf
import tensorflow_datasets.public_api as tfds

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Input
from tensorflow.keras.losses import SparseCategoricalCrossentropy

In [None]:
# Load the TFDS `tiny_shakespeare` dataset; the result is indexed by split
# name in the following cells ('train', 'test', 'validation').
d = tfds.load(name='tiny_shakespeare')

In [3]:
def _read_split_text(split_name):
    """Return the decoded text of the first record of the given split of `d`.

    Only the first record is read (same as the original `list(...)[0]`), but
    the split is no longer materialised into a list just to take one element.
    NOTE(review): assumes each split stores the whole corpus in a single
    record -- confirm against the dataset card.
    """
    first_record = next(iter(d[split_name].as_numpy_iterator()))
    # The 'text' feature arrives as bytes; decode it to a Python str.
    return first_record['text'].decode()


train_text = _read_split_text('train')
test_text = _read_split_text('test')
validation_text = _read_split_text('validation')

In [4]:
# Sanity check: after .decode() the split text should be a Python str.
type(test_text)

str

In [5]:
# The vocabulary is every distinct character of the training text, sorted so
# that the character <-> id assignment is deterministic.
vocab = sorted(set(train_text))

# id -> character, and its inverse character -> id.
idxToChar = dict(enumerate(vocab))
charToIdx = {char: idx for idx, char in idxToChar.items()}

In [None]:
# Inspect the character -> id mapping.
charToIdx

In [7]:
def _encode(text):
    """Translate `text` into an array of integer ids using `charToIdx`.

    A character missing from `charToIdx` (the vocabulary is built from the
    training text only) raises KeyError.
    """
    return np.array([charToIdx[char] for char in text])


train_text_encoded = _encode(train_text)
test_text_encoded = _encode(test_text)
validation_text_encoded = _encode(validation_text)

In [8]:
# Number of input characters in each training example.
seq_length = 100
# Each example consumes seq_length + 1 characters: the inputs plus the extra
# character needed for the one-step-shifted targets.
examples_per_epoch = len(train_text_encoded) // (seq_length + 1)

In [9]:
# Cut the encoded training text into consecutive, non-overlapping windows of
# seq_length + 1 ids; a shorter trailing window is dropped.
chunk_length = seq_length + 1
char_dataset = (
    tf.data.Dataset
    .from_tensor_slices(train_text_encoded)
    .batch(chunk_length, drop_remainder=True)
)

In [10]:
def split_input_target(chunk):
    """Turn one window into an (input, target) pair shifted by one position.

    The input is the window without its last element and the target is the
    window without its first element, so target[i] is the element that
    follows input[i] in the original window.
    """
    return chunk[:-1], chunk[1:]

In [11]:
# Build the training pipeline: (input, target) pairs, shuffled, then grouped
# into fixed-size batches (an incomplete final batch is dropped).
SHUFFLE_BUFFER_SIZE = 10000
BATCH_SIZE = 64
dataset = (
    char_dataset
    .map(split_input_target)
    .shuffle(SHUFFLE_BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

In [12]:
# Peek at one batch: print the first input sequence and its target, which
# should be the same sequence shifted left by one character.
for input_batch, target_batch in dataset.take(1):
    print(input_batch[0])
    print(target_batch[0])

tf.Tensor(
[59 57 58  1 40 43  6  1 47 44  1 58 46 43  1 49 47 52 45  1 51 47 57 41
 39 56 56 63  8  0  0 19 30 17 37 10  0 20 43 56 43  1 41 53 51 43  1 58
 46 43  1 50 53 56 42 57  1 53 44  1 14 59 41 49 47 52 45 46 39 51  1 39
 52 42  1 16 43 56 40 63  8  0  0 14 33 15 23 21 26 19 20 13 25 10  0 19
 53 53 42  1], shape=(100,), dtype=int64)
tf.Tensor(
[57 58  1 40 43  6  1 47 44  1 58 46 43  1 49 47 52 45  1 51 47 57 41 39
 56 56 63  8  0  0 19 30 17 37 10  0 20 43 56 43  1 41 53 51 43  1 58 46
 43  1 50 53 56 42 57  1 53 44  1 14 59 41 49 47 52 45 46 39 51  1 39 52
 42  1 16 43 56 40 63  8  0  0 14 33 15 23 21 26 19 20 13 25 10  0 19 53
 53 42  1 58], shape=(100,), dtype=int64)


In [13]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size, seq_length):
    """Assemble the character model: Embedding -> stateful LSTM -> Dense.

    vocab_size: number of distinct ids; sizes the embedding table and the
        output layer (one score per id).
    embedding_dim: width of each embedding vector.
    rnn_units: number of LSTM units.
    batch_size, seq_length: accepted for the caller's convenience but not
        used anywhere in this function.

    Returns the (uncompiled) Sequential model. The LSTM returns the full
    output sequence and is created with stateful=True; the final Dense layer
    has no activation, so the model outputs raw scores.
    """
    embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim)
    recurrent = LSTM(
        rnn_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer='glorot_uniform',
    )
    scores = Dense(vocab_size)
    return Sequential([embedding, recurrent, scores])


In [14]:
# Instantiate the network with one output score per vocabulary character.
model = build_model(
    vocab_size=len(vocab),
    embedding_dim=128,
    rnn_units=256,
    batch_size=64,
    seq_length=seq_length,
)
model.summary()

In [15]:
# The loss is configured with from_logits=True, i.e. it expects the model's
# raw, un-normalised scores together with integer class targets.
loss_fn = SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer='adam', loss=loss_fn)

In [None]:
# Train for 100 passes over the shuffled, batched training sequences.
model.fit(dataset, epochs=100)

In [17]:
# Print the layer / parameter summary again now that the model has been trained.
model.summary()

In [18]:
def _reset_recurrent_state(model):
    """Zero the carried-over state of every built, stateful layer of `model`."""
    for layer in getattr(model, "layers", ()):
        # Unbuilt layers have no state yet; skipping them also avoids errors
        # from resetting a layer whose batch size is not known.
        if not (getattr(layer, "stateful", False) and getattr(layer, "built", False)):
            continue
        # The method is named `reset_state` in newer Keras releases and
        # `reset_states` in older tf.keras; use whichever the layer provides.
        reset = getattr(layer, "reset_state", None) or getattr(layer, "reset_states", None)
        if callable(reset):
            reset()


def generate_text(model, start_string, char2idx, idx2char, num_generate=500,
                  temperature=1.0, reset_states=True):
    """
    Sample text from a character-level model, one character at a time.

    The whole prompt is fed to the model in a single call; after that only
    the most recently sampled character is fed back in. Everything the model
    knows about earlier characters is therefore carried by its internal
    recurrent state, so the model is expected to use stateful recurrent layers.

    model: trained model, called as model(ids); expected to return scores
        whose last axis indexes the vocabulary
    start_string: non-empty string prompt to start with
    char2idx, idx2char: vocab mappings (character -> id and id -> character)
    num_generate: number of characters to generate
    temperature: controls randomness (>1: more random, <1: more greedy);
        must be greater than 0
    reset_states: when True (default), the state of the model's stateful
        layers is zeroed before the prompt is fed in, so the result does not
        depend on state left behind by training or by a previous call. Pass
        False to continue from the model's current state.

    Returns start_string followed by the generated characters.

    Raises:
        ValueError: if start_string is empty or temperature is not positive.
        KeyError: if start_string contains a character missing from char2idx.
    """
    if temperature <= 0:
        raise ValueError(f"temperature must be > 0, got {temperature!r}")
    if not start_string:
        raise ValueError("start_string must contain at least one character")

    unknown_chars = sorted({ch for ch in start_string if ch not in char2idx})
    if unknown_chars:
        raise KeyError(f"start_string contains characters not in char2idx: {unknown_chars!r}")

    # Start from a clean state so generation is independent of earlier calls.
    if reset_states:
        _reset_recurrent_state(model)

    # Converting start string to numbers (vectorizing)
    input_eval = [char2idx[ch] for ch in start_string]
    input_eval = tf.expand_dims(input_eval, 0)  # shape: (1, len(start_string))

    text_generated = []

    for _ in range(num_generate):
        predictions = model(input_eval)
        # Keep only the scores for the last timestep.
        predictions = predictions[:, -1, :]
        predictions = predictions / temperature  # adjust randomness

        # Sample the next character id; the last row of the sample is used.
        sampled = tf.random.categorical(predictions, num_samples=1)
        predicted_id = int(sampled[-1, 0].numpy())

        # Append predicted character
        text_generated.append(idx2char[predicted_id])

        # Only the new character is fed back; earlier context lives in the
        # model's recurrent state.
        input_eval = tf.expand_dims([predicted_id], 0)

    return start_string + ''.join(text_generated)


In [21]:
# Prime the model with the first 100 characters of the held-out test text and
# let it continue for 500 characters.
prompt = test_text[:100]
generated_text = generate_text(
    model,
    start_string=prompt,
    char2idx=charToIdx,
    idx2char=idxToChar,
    num_generate=500,
    temperature=1,
)

print(generated_text)


rance ta'en
As shall with either part's agreement stand?

BAPTISTA:
Not in my house, Lucentio; for, let me go.

GLOUCESTER:
I queen now, cousin! then ever lies to beet.
I am may both a shame is myself.
He is yet one or so father to he
Do the leave with suitive I see thy friends.

NORTHUMBERLAND:
Nay, take my heart be king;' as be still en pomper,
She's oaths: I was a presence, follows to
Which God depart: sir, you both, thou wast unto:
An, let them go,
And, noble hocose is to blame to fear:
That's chas;
And might be so and with cursen'd murder'd,
But as so discover'd, hay thou know to
need you


In [20]:
# Show the real text (100-character prompt + the next 500 characters) for a
# side-by-side comparison with the generated sample.
print(test_text[:600])

rance ta'en
As shall with either part's agreement stand?

BAPTISTA:
Not in my house, Lucentio; for, you know,
Pitchers have ears, and I have many servants:
Besides, old Gremio is hearkening still;
And happily we might be interrupted.

TRANIO:
Then at my lodging, an it like you:
There doth my father lie; and there, this night,
We'll pass the business privately and well.
Send for your daughter by your servant here:
My boy shall fetch the scrivener presently.
You are like to have a thin and slender pittance.

BAPTISTA:
It likes me well. Biondello, h
