In [24]:
import requests
request=requests.get("https://www.gutenberg.org/files/1661/1661-0.txt")
request.status_code

200

In [25]:
with open("book.txt", "wb") as file:
    file.write(request.content)


In [26]:
#read the file in text string
text = open('book.txt', 'r', encoding='utf-8').read()
text = text.lower()
sentences = text.split('\n')

In [27]:
#create list of sentences
sentences = text.split('\n')

In [28]:
#import dependencies to preprocess the text data and making sequences
import tensorflow as tf
import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [29]:
#initialize the tokenizer, which can work char by char
tokenizer = Tokenizer(oov_token='<UNK>')


In [49]:
tokenizer.fit_on_texts(sentences)


In [31]:
vocab_size = len(tokenizer.word_index) + 1
vocab_size

8923

In [32]:
sequences=tokenizer.texts_to_sequences(sentences)


In [33]:
input_sequences = []
for sequence in sequences:
  for i in range(1, len(sequence)):
    n_gram_sequence = sequence[:i+1]
    input_sequences.append(n_gram_sequence)

In [34]:
print(input_sequences[0], input_sequences[1], input_sequences[2], input_sequences[3])

[4776, 158] [4776, 158, 331] [4776, 158, 331, 886] [4776, 158, 331, 886, 5]


In [35]:
#find the maximum length among sequences
max_seq_len = max([len(seq) for seq in input_sequences])
max_seq_len

20

In [36]:
#pad the sequences to ensure that they are all of same length
padded_sequences = pad_sequences(input_sequences, maxlen = max_seq_len)

In [37]:
print(padded_sequences[0], padded_sequences[1])

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0 4776  158] [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0 4776  158  331]


In [38]:
import numpy as np
padded_sequences = np.array(padded_sequences)


In [39]:
#prepare training sequences and labels
x = padded_sequences[:, : -1]
labels = padded_sequences[:, -1]


In [40]:
labels

array([ 158,  331,  886, ...,   84,  360, 1674])

In [41]:
#to one hot encode the labels
y = tf.keras.utils.to_categorical(labels, num_classes=vocab_size)

In [42]:
x.shape

(101523, 19)

In [43]:
y.shape

(101523, 8923)

In [44]:
#import dependencies for defining the model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
from tensorflow.keras.optimizers import Adam

In [45]:
model = Sequential()
model.add(Embedding(vocab_size, 100))
model.add(Bidirectional(LSTM(256)))
model.add(Dense(vocab_size, activation='softmax'))
adam = Adam(learning_rate=0.01)
model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['acc'])
model.summary()

In [46]:
from tensorflow.keras.callbacks import EarlyStopping
es = EarlyStopping(monitor='acc', patience=4,min_delta=0.01)
batch_size=256
for i in range(0, len(x), batch_size):
    x_batch = x[i:i + batch_size]
    y_batch = y[i:i + batch_size]
    model.fit(x_batch, y_batch, epochs=25, verbose=1,callbacks=[es])

Epoch 1/25
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 156ms/step - acc: 0.0695 - loss: 8.3773
Epoch 2/25
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 146ms/step - acc: 0.0081 - loss: 5.4576  
Epoch 3/25
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 142ms/step - acc: 0.1025 - loss: 4.7946
Epoch 4/25
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 138ms/step - acc: 0.1331 - loss: 4.3682
Epoch 5/25
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 140ms/step - acc: 0.1565 - loss: 4.0321
Epoch 6/25
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 138ms/step - acc: 0.2182 - loss: 3.7080
Epoch 7/25
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 135ms/step - acc: 0.3067 - loss: 3.0345
Epoch 8/25
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 138ms/step - acc: 0.3838 - loss: 2.5800
Epoch 9/25
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 139ms/step - acc: 0.5

In [47]:
model.summary()

In [48]:
#Time to become storyteller!
seed_text = "To Sherlock Holmes she is always _the_ woman. I have seldom heard him mention her under any other name"     
next_words = 100
  
for _ in range(next_words):
    sequence = tokenizer.texts_to_sequences([seed_text])
    padded = pad_sequences(sequence, maxlen=max_seq_len-1)
    predicted = model.predict(padded, verbose=0)
    predicted_class = predicted.argmax(axis=-1)
    output_word = ''
    for word, index in tokenizer.word_index.items():
        if index == predicted_class:
            output_word = word
            break
    seed_text += ' ' + output_word
print(seed_text)


To Sherlock Holmes she is always _the_ woman. I have seldom heard him mention her under any other name reproachfully deposition strongly pounds diverted intrusted shaped blanche spaulding trained 1858 8 plumber’s 1858 drives experiences signature precaution strikes reigning roar blotches signature precaution imperial returns kramm strongly yourselves strongly seconds questionable yourselves intrusted fight roar shared signature contact drives ransacked roar kramm online utf deposition residing “both yourselves strongly saturday’s families “g shared shared october online shaped pounds 3rd families reigning kramm shared admirably apply admirably softly seldom pounds smelling shared it’s created process shared experiences shared yourselves strongly pay shelves apply yourselves spaulding pounds weight reproachfully shared yourselves october chairs reigning contact apply reigning closing material lace risen
