In [3]:
import tensorflow as tf
import numpy as np
import string

In [4]:
path_to_file = tf.keras.utils.get_file(
    "shakespeare.txt",
    "https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt"
)

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt
[1m1115394/1115394[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [5]:
textData = open(path_to_file, 'rb').read().decode(encoding='utf-8')

In [6]:
print("length of data", len(textData)) # no. of characters

length of data 1115394


In [7]:
# clean the data of symbols and alpha numerics symbols and escape sequences
def clean_text(doc):
  tokens = doc.split()
  table = str.maketrans("","", string.punctuation) # give a list of symbols like /@#$%^&* etc
  tokens = [w.translate(table) for w in tokens] # return list which is striped of symbols
  tokens = [word for word in tokens if word.isalpha()]
  tokens = [word.lower() for word in tokens]
  return tokens

In [8]:
tokens = clean_text(textData)
print(tokens[:50])

['first', 'citizen', 'before', 'we', 'proceed', 'any', 'further', 'hear', 'me', 'speak', 'all', 'speak', 'speak', 'first', 'citizen', 'you', 'are', 'all', 'resolved', 'rather', 'to', 'die', 'than', 'to', 'famish', 'all', 'resolved', 'resolved', 'first', 'citizen', 'first', 'you', 'know', 'caius', 'marcius', 'is', 'chief', 'enemy', 'to', 'the', 'people', 'all', 'we', 'knowt', 'we', 'knowt', 'first', 'citizen', 'let', 'us']


In [9]:
len(tokens) # total no. of words

202619

In [10]:
len(set(tokens)) # total no. of unique words

12847

In [11]:
# creating a list of 11 word sentences
length = 10 + 1 # 10 word in memory and 1 to predict
lines = []

for i in range(length, len(tokens)):
  sequence = tokens[i - length : i] # 0-11 12-23
  line = ' '.join(sequence) # combine the seperate words
  lines.append(line)


In [12]:
print(lines[0], "|||",lines[1])

first citizen before we proceed any further hear me speak all ||| citizen before we proceed any further hear me speak all speak


# Building Model

In [13]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [14]:
# tokenizing the text data into machine understandable numerical structures
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines) # fitting for our data
sequences = tokenizer.texts_to_sequences(lines) # text to index(number) mapping

In [15]:
sequences = np.array(sequences) # converting to numpy array
print(sequences[:10]) # printing 10 lines

[[  88  269  137   35 1003  143  676  124   15  105   33]
 [ 269  137   35 1003  143  676  124   15  105   33  105]
 [ 137   35 1003  143  676  124   15  105   33  105  105]
 [  35 1003  143  676  124   15  105   33  105  105   88]
 [1003  143  676  124   15  105   33  105  105   88  269]
 [ 143  676  124   15  105   33  105  105   88  269    6]
 [ 676  124   15  105   33  105  105   88  269    6   40]
 [ 124   15  105   33  105  105   88  269    6   40   33]
 [  15  105   33  105  105   88  269    6   40   33 1256]
 [ 105   33  105  105   88  269    6   40   33 1256  350]]


In [16]:
X = sequences[:, :-1]
Y = sequences[:, -1]
print(X.shape,Y.shape)

(202608, 10) (202608,)


In [55]:
tokenizer.word_index # gives a dictionary mapping unique words to a number

{'the': 1,
 'and': 2,
 'to': 3,
 'i': 4,
 'of': 5,
 'you': 6,
 'my': 7,
 'a': 8,
 'that': 9,
 'in': 10,
 'is': 11,
 'not': 12,
 'for': 13,
 'with': 14,
 'me': 15,
 'it': 16,
 'be': 17,
 'your': 18,
 'his': 19,
 'this': 20,
 'but': 21,
 'he': 22,
 'have': 23,
 'as': 24,
 'thou': 25,
 'him': 26,
 'so': 27,
 'what': 28,
 'thy': 29,
 'will': 30,
 'no': 31,
 'by': 32,
 'all': 33,
 'king': 34,
 'we': 35,
 'shall': 36,
 'her': 37,
 'if': 38,
 'our': 39,
 'are': 40,
 'do': 41,
 'thee': 42,
 'now': 43,
 'lord': 44,
 'good': 45,
 'on': 46,
 'o': 47,
 'come': 48,
 'from': 49,
 'sir': 50,
 'or': 51,
 'which': 52,
 'more': 53,
 'then': 54,
 'well': 55,
 'at': 56,
 'would': 57,
 'was': 58,
 'they': 59,
 'how': 60,
 'here': 61,
 'she': 62,
 'than': 63,
 'their': 64,
 'them': 65,
 'ill': 66,
 'duke': 67,
 'am': 68,
 'hath': 69,
 'say': 70,
 'let': 71,
 'when': 72,
 'one': 73,
 'go': 74,
 'were': 75,
 'love': 76,
 'may': 77,
 'us': 78,
 'make': 79,
 'upon': 80,
 'yet': 81,
 'richard': 82,
 'like': 83,


In [18]:
vocabulary_size = len(tokenizer.word_index)+1 # we have a 12847 unique word mapped to unique numbers
print(vocabulary_size)

12848


In [19]:
sequence_length = X.shape[1] # we have 10 words in each datapoint
print(X.shape)

(202608, 10)


In [20]:
print("X shape:", X.shape)
print("Y shape:", Y.shape)
print("Min X:", np.min(X), "Max X:", np.max(X))
print("Min Y:", np.min(Y), "Max Y:", np.max(Y))
print("Vocabulary size:", vocabulary_size)

X shape: (202608, 10)
Y shape: (202608,)
Min X: 1 Max X: 12847
Min Y: 1 Max Y: 12847
Vocabulary size: 12848


# LSTM model

In [21]:
embedding_dim = 100
model = Sequential()
model.add(Embedding(vocabulary_size,embedding_dim, input_length = sequence_length))
model.add(LSTM(embedding_dim, return_sequences = True))
model.add(LSTM(embedding_dim, return_sequences =False))
model.add(Dense(10, activation = 'relu'))
model.add(Dense(vocabulary_size, activation='softmax' ))



In [22]:
model.build(input_shape=(None, sequence_length))
model.summary()

In [23]:
model.compile(
    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4, clipnorm = 1.0),
    loss = 'sparse_categorical_crossentropy',
    metrics = ['accuracy']
)

In [24]:
model.fit(X,Y, batch_size=128, epochs = 10)

Epoch 1/10
[1m1583/1583[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 10ms/step - accuracy: 0.0265 - loss: 7.8827
Epoch 2/10
[1m1583/1583[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 9ms/step - accuracy: 0.0307 - loss: 6.7757
Epoch 3/10
[1m1583/1583[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 9ms/step - accuracy: 0.0308 - loss: 6.7621
Epoch 4/10
[1m1583/1583[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 10ms/step - accuracy: 0.0306 - loss: 6.7490
Epoch 5/10
[1m1583/1583[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 9ms/step - accuracy: 0.0300 - loss: 6.7382
Epoch 6/10
[1m1583/1583[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 9ms/step - accuracy: 0.0318 - loss: 6.6671
Epoch 7/10
[1m1583/1583[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 9ms/step - accuracy: 0.0307 - loss: 6.6389
Epoch 8/10
[1m1583/1583[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 9ms/step - accuracy: 0.0313 - loss: 6.6261
Epoch 9/10
[1

<keras.src.callbacks.history.History at 0x7b6571320190>

In [63]:
seed_text = lines[12353]
encoded = tokenizer.texts_to_sequences([seed_text])[0] # zeroth dimension
encoded = pad_sequences([encoded], maxlen = sequence_length, truncating = 'pre')
prediction = model.predict(encoded)
predicted_word = ''
# print(tokenizer.word_index.items())
for word, index in tokenizer.word_index.items():
      if index == np.argmax(prediction[0]):
        predicted_word = word
        break
print(seed_text)
print(seed_text + " " + predicted_word)
print()

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
was given them gratis you repined scandald the suppliants for the
was given them gratis you repined scandald the suppliants for the the



In [69]:
def generate_text_seq(model, tokenizer, text_seq_length, seed_text, n_words):
  text = []

  for _ in range(n_words):
    encoded = tokenizer.texts_to_sequences([seed_text])[0] # zeroth dimension
    encoded = pad_sequences([encoded], maxlen = sequence_length, truncating = 'pre')

    prediction = model.predict(encoded)
    predicted_word = ''
    for word, index in tokenizer.word_index.items():
      if index == np.argmax(prediction[0]):
        predicted_word = word
        break

    seed_text = seed_text + ' ' + predicted_word
    text.append(predicted_word)
  return " ".join(text)


In [74]:
seed_text = lines[12353]

generated_text = seed_text + " "+ generate_text_seq(model, tokenizer, sequence_length, seed_text, 10)
print(seed_text)
print(generated_text)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
was given them gratis you repined scandald the suppliants for the
was given them gratis you repined scandald the suppliants for the the the the and bracelet bracelet and and the the
