In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd

In [2]:
!wget https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt

--2021-08-28 14:06:59--  https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt
Resolving storage.googleapis.com (storage.googleapis.com)... 108.177.121.128, 142.250.103.128, 108.177.120.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|108.177.121.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘shakespeare.txt’


2021-08-28 14:06:59 (13.9 MB/s) - ‘shakespeare.txt’ saved [1115394/1115394]



In [3]:
text = open("./shakespeare.txt", "rb").read().decode("utf-8")

In [4]:
print("Number of characters:", len(text))
print("-" * 50)
print(text[:300])

Number of characters: 1115394
--------------------------------------------------
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us


Create vocabulary

In [5]:
vocab = sorted(set(text))

ids_to_chars = np.array(vocab)

chars_to_ids = {char: char_idx for char_idx, char in enumerate(vocab)}

Convert each character in the text to corresponding integer.

In [7]:
text_ids = np.array([chars_to_ids[char] for char in text])

text_ids[:300]

array([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43,
       44, 53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39,
       52, 63,  1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1,
       51, 43,  1, 57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31,
       54, 43, 39, 49,  6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56,
       57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39,
       56, 43,  1, 39, 50, 50,  1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56,
       39, 58, 46, 43, 56,  1, 58, 53,  1, 42, 47, 43,  1, 58, 46, 39, 52,
        1, 58, 53,  1, 44, 39, 51, 47, 57, 46, 12,  0,  0, 13, 50, 50, 10,
        0, 30, 43, 57, 53, 50, 60, 43, 42,  8,  1, 56, 43, 57, 53, 50, 60,
       43, 42,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43,
       52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63, 53, 59,  1, 49, 52, 53,
       61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41, 47, 59, 57,  1, 47,
       57,  1, 41, 46, 47

## Data Preparation

In [70]:
def split_input_target(sequence):
  input_text = sequence[:-1]
  target_text = sequence[1:]
  return input_text, target_text

In [71]:
SEQ_LENGTH = 100
BUFFER_SIZE = 10000

In [72]:
samples = tf.data.Dataset.from_tensor_slices(text_ids).batch(SEQ_LENGTH + 1, drop_remainder=True).map(split_input_target).shuffle(BUFFER_SIZE)

In [73]:
len(samples), sum(1 for x in samples)

(11043, 11043)

In [74]:
BATCH_SIZE = 64
split_size = int(0.8 * len(samples))

train_ds = samples.take(split_size).batch(BATCH_SIZE)
test_ds = samples.skip(split_size).batch(BATCH_SIZE)

len(train_ds), len(test_ds)

(139, 35)

In [60]:
# SEQ_LEN = 64
# BUFFER_SIZE = 10000
# samples = tf.data.Dataset.from_tensor_slices(text_ids).batch(SEQ_LEN + 1, drop_remainder=True).map(split_input_target).shuffle(BUFFER_SIZE)

# BATCH_SIZE = 64

# test_size = 0.2
# num_samples = sum(1 for x in samples)
# num_train_samples = int(test_size * num_samples)
# train_ds = samples.take(num_train_samples).batch(BATCH_SIZE)
# val_ds = samples.skip(num_train_samples).batch(BATCH_SIZE)

In [62]:
train_ds, val_ds

(<BatchDataset shapes: ((None, 64), (None, 64)), types: (tf.int64, tf.int64)>,
 <BatchDataset shapes: ((None, 64), (None, 64)), types: (tf.int64, tf.int64)>)

In [40]:
# BATCH_SIZE = 64
# dataset = (samples.shuffle(BUFFER_SIZE)
#                 .prefetch(tf.data.AUTOTUNE))

## Build model

In [80]:
EMBEDDING_DIM = 64
VOCAB_SIZE = len(vocab)
RNN_UNITS = 32

In [81]:
VOCAB_SIZE

65

In [89]:
from tensorflow.keras import layers
model = tf.keras.Sequential([
  layers.Embedding(input_dim=VOCAB_SIZE, output_dim=EMBEDDING_DIM),
  layers.Bidirectional(layers.LSTM(RNN_UNITS, return_sequences=True)),
  layers.Dense(VOCAB_SIZE, activation="softmax")
])

In [90]:
model.summary()

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, None, 64)          4160      
_________________________________________________________________
bidirectional_2 (Bidirection (None, None, 64)          24832     
_________________________________________________________________
dense_7 (Dense)              (None, None, 65)          4225      
Total params: 33,217
Trainable params: 33,217
Non-trainable params: 0
_________________________________________________________________


In [91]:
model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(),
    metrics=["accuracy"]
)

In [92]:
model_history = model.fit(train_ds,
                          epochs=10,
                          validation_data=test_ds)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [86]:
model.save("text_generate.h5")

In [88]:
loaded_model = tf.keras.models.load_model("text_generate.h5")
loaded_model.evaluate(test_ds)



[0.02513287588953972, 0.9936079382896423]