In [None]:
import tensorflow as tf
import numpy as np
import os

In [None]:
# Fetch the Shakespeare corpus through the Keras download helper and keep the
# local path of the downloaded file.
path_to_file = tf.keras.utils.get_file(
    fname="shakespeare.txt",
    origin="https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt",
)

In [None]:
# Read the whole corpus as bytes and decode it as UTF-8. The context manager
# closes the file handle as soon as the read has finished.
with open(path_to_file, 'rb') as corpus_file:
    textData = corpus_file.read().decode(encoding='utf-8')

In [None]:
print("length of data", len(textData)) # number of characters in the corpus

length of data 1115394


In [None]:
textData = textData.lower() # lowercase everything so upper- and lower-case letters share one vocabulary entry

In [None]:
# Build the character-level vocabulary: collect the distinct characters of the
# corpus, then sort them so every character gets a stable position.
vocabulary = list(set(textData))
vocabulary.sort()
print(vocabulary)
print("length of vocabulary", len(vocabulary))

['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
length of vocabulary 39


In [None]:
# Lookup tables between characters and their integer ids (the position of the
# character in `vocabulary`), one table for each direction.
int2char = dict(enumerate(vocabulary))
char2int = {char: index for index, char in int2char.items()}
print(char2int, int2char)

{'\n': 0, ' ': 1, '!': 2, '$': 3, '&': 4, "'": 5, ',': 6, '-': 7, '.': 8, '3': 9, ':': 10, ';': 11, '?': 12, 'a': 13, 'b': 14, 'c': 15, 'd': 16, 'e': 17, 'f': 18, 'g': 19, 'h': 20, 'i': 21, 'j': 22, 'k': 23, 'l': 24, 'm': 25, 'n': 26, 'o': 27, 'p': 28, 'q': 29, 'r': 30, 's': 31, 't': 32, 'u': 33, 'v': 34, 'w': 35, 'x': 36, 'y': 37, 'z': 38} {0: '\n', 1: ' ', 2: '!', 3: '$', 4: '&', 5: "'", 6: ',', 7: '-', 8: '.', 9: '3', 10: ':', 11: ';', 12: '?', 13: 'a', 14: 'b', 15: 'c', 16: 'd', 17: 'e', 18: 'f', 19: 'g', 20: 'h', 21: 'i', 22: 'j', 23: 'k', 24: 'l', 25: 'm', 26: 'n', 27: 'o', 28: 'p', 29: 'q', 30: 'r', 31: 's', 32: 't', 33: 'u', 34: 'v', 35: 'w', 36: 'x', 37: 'y', 38: 'z'}


In [None]:
# Translate every character of the corpus into its integer id (a plain list).
encoded_text = list(map(char2int.__getitem__, textData))
print(encoded_text[:10])

[18, 21, 30, 31, 32, 1, 15, 21, 32, 21]


In [None]:
sequence_length = 50 # number of characters in each input sequence

In [None]:
# Wrap the encoded text in a tf.data.Dataset that yields one integer character id per element
char_dataset = tf.data.Dataset.from_tensor_slices(encoded_text)

In [None]:
# Cut the character stream into consecutive chunks of sequence_length + 1 ids;
# the extra id lets each chunk be split into an input and a shifted target.
# A trailing chunk that is too short is dropped.
sequences = char_dataset.batch(batch_size=sequence_length + 1, drop_remainder=True)
print(type(sequences))

<class 'tensorflow.python.data.ops.batch_op._BatchDataset'>


In [None]:
# Split a chunk into an (input, target) pair
def split_input_output(element_chunk):
    """Return the chunk without its last element and the chunk without its first.

    Both results have the same length, and position i of the second result is
    the element that follows position i of the first in the original chunk.
    """
    return element_chunk[:-1], element_chunk[1:]

# Turn every chunk into an (input, target) pair
Text_dataset = sequences.map(split_input_output)
print(type(Text_dataset))

<class 'tensorflow.python.data.ops.map_op._MapDataset'>


In [None]:
# Shuffle the (input, target) pairs and group them into training batches.
# The pipeline is rebuilt from `sequences` instead of from `Text_dataset`
# itself, so re-running this cell does not batch an already-batched dataset.
batch_size = 64
buffer_size = 10000  # number of elements kept in the buffer that shuffle() samples from
Text_dataset = (
    sequences
    .map(split_input_output)
    .shuffle(buffer_size)
    .batch(batch_size, drop_remainder=True)
)

In [None]:
def build_model(vocabulary_size, embedding_dim, rnn_units, batch_size, seq_length=None):
    """Build the character-level model: Embedding -> GRU -> Dense.

    Args:
        vocabulary_size: number of distinct character ids; used as the
            embedding input size and as the width of the output layer.
        embedding_dim: size of each embedding vector.
        rnn_units: number of units in the GRU layer.
        batch_size: batch size the model is built for. The GRU is created with
            stateful=True, so the batch dimension is fixed at build time.
        seq_length: number of time steps to build the model with. The default,
            None, leaves the time dimension unspecified so the function no
            longer depends on a notebook-level variable and the model is not
            tied to one sequence length.

    Returns:
        The built, uncompiled tf.keras.Sequential model. The final Dense layer
        has no activation, so the model outputs one raw score (logit) per
        vocabulary entry at every time step.
    """
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(vocabulary_size, embedding_dim),
        tf.keras.layers.GRU(
            rnn_units,
            return_sequences=True,  # emit an output for every time step
            stateful=True,
            recurrent_initializer='glorot_uniform',
        ),
        tf.keras.layers.Dense(vocabulary_size),
    ])
    model.build(input_shape=(batch_size, seq_length))
    return model

# Hyperparameters of the network, followed by the model construction
vocabulary_size = len(vocabulary)  # number of distinct characters
embedding_dim = 256  # size of each character embedding vector
rnn_units = 1024  # number of units in the GRU layer
model = build_model(vocabulary_size, embedding_dim, rnn_units, batch_size)

In [None]:
# Print an overview of the model's layers
model.summary()

In [None]:
# Training loss
def loss(labels, logits):
    """Sparse categorical cross-entropy between integer labels and model outputs.

    from_logits=True tells the loss that `logits` holds unnormalized scores
    rather than probabilities.
    """
    cross_entropy = tf.keras.losses.sparse_categorical_crossentropy(
        labels,
        logits,
        from_logits=True,
    )
    return cross_entropy

# Configure training: Adam optimizer with the cross-entropy loss defined above
model.compile(optimizer='adam', loss=loss)

In [None]:
# Configure checkpoints so the model weights are saved during training
checkpoint_dir = './training_checkpoints'
# File name template for the weight files; contains an {epoch} placeholder
checkpoint_prefix = os.path.join(checkpoint_dir, 'ckpt_{epoch}.weights.h5')
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath = checkpoint_prefix,
    save_weights_only = True
)

In [None]:
# Train the model; this cell needs `model`, `Text_dataset` and `checkpoint_callback`
# to have been defined by the cells above in the current kernel session.
epochs = 10  # You might need more epochs for a larger dataset
history = model.fit(Text_dataset, epochs=epochs, callbacks=[checkpoint_callback])

NameError: name 'model' is not defined

In [None]:
# Run inference over every batch of Text_dataset.
# NOTE(review): Text_dataset is the shuffled training pipeline, so this is a full pass
# over the training data, and the rows of `prediction` presumably cannot be matched back
# to specific inputs. Consider predicting on a few batches only, e.g.
# Text_dataset.take(1) (confirm what later cells expect from `prediction`).
prediction = model.predict(Text_dataset)

[1m 16/341[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m4:01[0m 742ms/step

KeyboardInterrupt: 