In [1]:
import os

# Working directory for the notebook; later cells use paths relative to it
# ("shakespeare.txt", the checkpoint directory). The path assumes Google Drive
# is mounted at /content/drive.
ROOT = "/content/drive/MyDrive/Colab Notebooks"
os.chdir(ROOT)

# os.getcwd() reports the physical path, so compare against the resolved form
# of ROOT; a literal string comparison fails when any component is a symlink.
assert os.getcwd() == os.path.realpath(ROOT), (
    f"expected working directory {ROOT!r}, got {os.getcwd()!r}"
)

In [3]:
import tensorflow as tf
import numpy as np

In [5]:
# Read the corpus in binary mode and decode the bytes as UTF-8.
with open("shakespeare.txt", mode="rb") as corpus_file:
    text = str(corpus_file.read(), "utf-8")

# Peek at the first 200 characters.
text[:200]

'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you'

In [6]:
len(text)

1115394

In [8]:
vocab = sorted(set(text))

In [9]:
len(vocab)

65

In [10]:
# Map every vocabulary character to its position in the sorted vocab list.
char2idx = dict(zip(vocab, range(len(vocab))))
print(char2idx)

{'\n': 0, ' ': 1, '!': 2, '$': 3, '&': 4, "'": 5, ',': 6, '-': 7, '.': 8, '3': 9, ':': 10, ';': 11, '?': 12, 'A': 13, 'B': 14, 'C': 15, 'D': 16, 'E': 17, 'F': 18, 'G': 19, 'H': 20, 'I': 21, 'J': 22, 'K': 23, 'L': 24, 'M': 25, 'N': 26, 'O': 27, 'P': 28, 'Q': 29, 'R': 30, 'S': 31, 'T': 32, 'U': 33, 'V': 34, 'W': 35, 'X': 36, 'Y': 37, 'Z': 38, 'a': 39, 'b': 40, 'c': 41, 'd': 42, 'e': 43, 'f': 44, 'g': 45, 'h': 46, 'i': 47, 'j': 48, 'k': 49, 'l': 50, 'm': 51, 'n': 52, 'o': 53, 'p': 54, 'q': 55, 'r': 56, 's': 57, 't': 58, 'u': 59, 'v': 60, 'w': 61, 'x': 62, 'y': 63, 'z': 64}


In [12]:
idx2char = np.array(vocab)
idx2char

array(['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?',
       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'],
      dtype='<U1')

In [13]:
# Encode the whole corpus as an array of character ids (one id per character).
text_as_int = np.array(list(map(char2idx.__getitem__, text)))
text_as_int

array([18, 47, 56, ..., 45,  8,  0])

In [14]:
text[:13]

'First Citizen'

In [15]:
text_as_int[:13]

array([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52])

In [16]:
seq_length = 100
examples_per_epoch = len(text)//(seq_length + 1)
examples_per_epoch

11043

In [None]:
# Example: the chunk "Hello" yields the input "Hell" and the target "ello".

In [17]:
# Dataset that yields the encoded corpus one character id at a time.
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

# Decode and print the first five elements as a quick sanity check.
for char_id in char_dataset.take(5):
    print(idx2char[char_id.numpy()])

F
i
r
s
t


In [20]:
# Group consecutive character ids into chunks of seq_length + 1: the extra
# element is what lets each chunk be split into an input and a shifted target.
sequences = char_dataset.batch(batch_size=seq_length + 1, drop_remainder=True)

# Show the first five chunks as escaped strings so newlines stay visible.
for chunk in sequences.take(5):
    decoded = "".join(idx2char[chunk.numpy()])
    print(repr(decoded))

'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '
'are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you k'
"now Caius Marcius is chief enemy to the people.\n\nAll:\nWe know't, we know't.\n\nFirst Citizen:\nLet us ki"
"ll him, and we'll have corn at our own price.\nIs't a verdict?\n\nAll:\nNo more talking on't; let it be d"
'one: away, away!\n\nSecond Citizen:\nOne word, good citizens.\n\nFirst Citizen:\nWe are accounted poor citi'


In [19]:
print(repr("te \n xt"))

'te \n xt'


In [None]:
'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'

In [None]:
'irst Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '

In [21]:
def split_input_taget(chunk):
    """Split one chunk into an (input, target) pair offset by one position.

    The input is the chunk without its last element and the target is the
    chunk without its first element, so ``target[i]`` is the element that
    follows ``input[i]`` in the chunk, e.g. "First" -> ("Firs", "irst").

    Args:
        chunk: Any sliceable sequence (string, list, tensor, ...).

    Returns:
        Tuple ``(input_text, target_text)``.
    """
    return chunk[:-1], chunk[1:]

In [22]:
dataset = sequences.map(split_input_taget)

In [23]:
# Decode the first (input, target) pair to confirm the one-character shift.
for input_example, target_example in dataset.take(1):
    for heading, ids in (
        ("Input_data:-\n", input_example),
        ("target_data:-\n", target_example),
    ):
        print(heading, repr("".join(idx2char[ids.numpy()])))

Input_data:-
 'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'
target_data:-
 'irst Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '


In [24]:
class Config:
    """Hyperparameters and paths shared by the pipeline, model and training cells."""

    # ---- input pipeline ----
    BATCH_SIZE = 64       # sequences per batch
    BUFFER_SIZE = 10000   # shuffle buffer size

    # ---- model ----
    vocab_size = len(vocab)   # length of the vocabulary in chars
    embedding_dim = 256       # the embedding dimension
    rnn_units = 1024          # number of RNN units

    # ---- training ----
    EPOCHS = 10
    checkpoint_dir = 'training_checkpoints'   # directory where the checkpoints will be saved

In [25]:
# Shuffle the (input, target) pairs, then pack them into fixed-size batches;
# a final batch smaller than BATCH_SIZE is dropped.
dataset = (
    dataset
    .shuffle(Config.BUFFER_SIZE)
    .batch(Config.BATCH_SIZE, drop_remainder=True)
)

In [26]:
def get_model(BATCH_SIZE=Config.BATCH_SIZE):
    """Build the character model: Embedding -> stateful GRU -> Dense logits.

    Args:
        BATCH_SIZE: Batch dimension fixed into the model's input shape
            (``[BATCH_SIZE, None]``). Defaults to ``Config.BATCH_SIZE``.

    Returns:
        An uncompiled ``tf.keras.Sequential`` model whose final layer emits
        ``Config.vocab_size`` unnormalized scores (logits) per time step.
    """
    model = tf.keras.Sequential()

    # Character ids -> dense vectors; the input shape pins the batch size and
    # leaves the sequence length open.
    model.add(
        tf.keras.layers.Embedding(
            Config.vocab_size,
            Config.embedding_dim,
            batch_input_shape=[BATCH_SIZE, None],
        )
    )

    # Recurrent layer that returns an output for every time step and keeps
    # its state between calls (stateful=True).
    model.add(
        tf.keras.layers.GRU(
            Config.rnn_units,
            return_sequences=True,
            stateful=True,
            recurrent_initializer='glorot_uniform',
        )
    )

    # One logit per vocabulary entry; no activation is applied here.
    model.add(tf.keras.layers.Dense(Config.vocab_size))

    return model

In [27]:
model = get_model()

In [28]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 256)           16640     
_________________________________________________________________
gru (GRU)                    (64, None, 1024)          3938304   
_________________________________________________________________
dense (Dense)                (64, None, 65)            66625     
Total params: 4,021,569
Trainable params: 4,021,569
Non-trainable params: 0
_________________________________________________________________


In [29]:
def loss(labels, logits):
    """Sparse categorical cross-entropy between integer labels and raw logits.

    ``from_logits=True`` is passed because the model's last layer applies no
    softmax.
    """
    crossentropy = tf.keras.losses.sparse_categorical_crossentropy
    return crossentropy(labels, logits, from_logits=True)

In [30]:
model.compile(optimizer='adam', loss=loss)

In [31]:
# Checkpoint path template; "{epoch}" is left as a placeholder in the path
# handed to the callback.
checkpoint_prefix = os.path.join(Config.checkpoint_dir, "ckpt_{epoch}")

# Callback that saves only the model weights.
checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True,
)

In [32]:
history = model.fit(dataset, epochs=Config.EPOCHS, callbacks=[checkpoint_cb])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [33]:
tf.train.latest_checkpoint(Config.checkpoint_dir)

'training_checkpoints/ckpt_10'

In [34]:
INFERENCE_BATCH_SIZE = 1
model2 = get_model(BATCH_SIZE=INFERENCE_BATCH_SIZE)

In [35]:
model2.load_weights(tf.train.latest_checkpoint(Config.checkpoint_dir))

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f2c61f78ef0>

In [36]:
model2.build(tf.TensorShape([INFERENCE_BATCH_SIZE, None]))

In [37]:
model2.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (1, None, 256)            16640     
_________________________________________________________________
gru_1 (GRU)                  (1, None, 1024)           3938304   
_________________________________________________________________
dense_1 (Dense)              (1, None, 65)             66625     
Total params: 4,021,569
Trainable params: 4,021,569
Non-trainable params: 0
_________________________________________________________________


In [39]:
def generate_text(model, start_string, num_gen=1000, temperature=1.0):
    """Sample text from a trained character-level model.

    The seed is fed through the model once; after that only the most recently
    sampled character is fed back, and the model's recurrent state carries the
    earlier context.

    Args:
        model: Model called on a ``(1, T)`` tensor of character ids and
            returning logits with a leading batch dimension of 1. It must
            provide ``reset_states()``.
        start_string: Non-empty seed text. Every character must be a key of
            ``char2idx``.
        num_gen: Number of characters to generate after the seed.
        temperature: Positive divisor applied to the logits before sampling.
            Lower values give more predictable text, higher values give more
            surprising text.

    Returns:
        ``start_string`` followed by the ``num_gen`` sampled characters.

    Raises:
        ValueError: If ``start_string`` is empty or ``temperature`` is not
            positive.
        KeyError: If ``start_string`` contains a character that is missing
            from ``char2idx``.
    """
    # Validate up front: an empty seed yields an empty logits tensor that
    # cannot be sampled, and a non-positive temperature gives inf/NaN logits.
    if not start_string:
        raise ValueError("start_string must contain at least one character")
    if temperature <= 0:
        raise ValueError(f"temperature must be positive, got {temperature!r}")

    # Encode the seed and add a batch dimension.
    input_str = [char2idx[s] for s in start_string]
    input_str = tf.expand_dims(input_str, 0)

    text_generated = []

    # Start from a clean recurrent state so earlier calls do not leak in.
    model.reset_states()
    for _ in range(num_gen):
        predictions = model(input_str)

        # remove the batch dimension
        predictions = tf.squeeze(predictions, 0)

        # Sample from the categorical distribution defined by the scaled
        # logits; the last row is the prediction for the next character.
        predictions = predictions / temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

        # Feed only the sampled character back in as the next input.
        input_str = tf.expand_dims([predicted_id], 0)

        text_generated.append(idx2char[predicted_id])

    return start_string + "".join(text_generated)


In [40]:
print(generate_text(model2, start_string="ROMEO: "))

ROMEO: Camillo, and I know not, she was for 'ever!

ANTONIO:
Sir, so done, yea to be lad king,
Distrusting sous with very possess'd, be pow'd and store your hands
Show'd an end of malignal: come, in thy wars two must believe
How ond: here let us have sure by,
And mortal gentleman, a full filties that was
Thy very poison of a man of cut.

BAPTISTA:
Ay, the people
Thy daughter: whose deers before 'Ew with a present, and come about
Their people that I would be my tear upon him.

First Musician:
Ay, hear me, tell Welcees his spreet,
Shall open you on what with me as, look will have been still thy son!

DUKE VINCENTIO:
You are pact good one: Camillo.

CURTES:
Is he lest kill those rope-facter's death?
Que, hear her godden slave is,
With strangers hulf way with him,
Having deeply frowh I dured paulard's unclew throne.

PISTRES:
Ay, to the prince of this face tremble him to be thought not call her behind,
And thoughts all unprovance us from man's part,
If not chequest?

ARIEL:
Sir, who knows 