In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
import pickle
import numpy as np
import os

In [None]:
# Project Gutenberg plain-text books that make up the training corpus.
# More books can be appended to this list, e.g. "/content/pg1727.txt",
# "/content/1400-0.txt" or "/content/pg1259.txt".
corpus_paths = ["/content/pg1513.txt", "/content/pg120.txt"]

# Collect every line of every book. The `with` block guarantees that each
# file handle is closed as soon as the book has been read.
lines = []
for path in corpus_paths:
    with open(path, "r", encoding="utf8") as book:
        lines.extend(book)

# A single join builds the whole corpus string; doing it once per line would
# rebuild the same string len(lines) times.
data = ' '.join(lines)

# Drop newlines, carriage returns, the byte-order mark and curly quotes.
# The lines were joined with a space above, so words stay separated.
data = data.replace('\n', '').replace('\r', '').replace('\ufeff', '').replace('“','').replace('”','')

# Collapse every run of whitespace into one space.
data = ' '.join(data.split())
data[:500]

'The Project Gutenberg eBook of Romeo and Juliet, by William Shakespeare This eBook is for the use of anyone anywhere in the United States and most other parts of the world at no cost and with almost no restrictions whatsoever. You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this eBook or online at www.gutenberg.org. If you are not located in the United States, you will have to check the laws of the country where you are located before usi'

In [None]:
# Size of the cleaned corpus in characters (`data` is one long string).
len(data)

538169

In [None]:
# Build the vocabulary (word -> integer index) from the whole corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])

# Persist the fitted tokenizer so that inference reuses exactly the same
# word -> index mapping. The `with` block flushes and closes the file, which
# matters because the pickle is read back later in this notebook.
with open('token.pkl', 'wb') as token_file:
    pickle.dump(tokenizer, token_file)

# Encode the corpus as one flat list of word indices.
sequence_data = tokenizer.texts_to_sequences([data])[0]
sequence_data[:15]

[1, 76, 67, 468, 5, 44, 2, 74, 29, 2508, 2024, 22, 468, 24, 15]

In [None]:
# Number of word indices in the encoded corpus.
len(sequence_data)

101709

In [None]:
# Size used for the Embedding input and the softmax output layer: one slot per
# known word plus one extra. The encoded indices shown earlier start at 1, so
# index 0 is presumably reserved by the Keras Tokenizer (TODO confirm in docs).
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)

8613


In [None]:
# Slide a three-token window over the encoded corpus. Each window holds two
# context words followed by the word the model should learn to predict.
sequences = [
    sequence_data[end - 2:end + 1]
    for end in range(2, len(sequence_data))
]

print("The Length of sequences are: ", len(sequences))
sequences = np.array(sequences)
sequences[:10]

The Length of sequences are:  101707


array([[   1,   76,   67],
       [  76,   67,  468],
       [  67,  468,    5],
       [ 468,    5,   44],
       [   5,   44,    2],
       [  44,    2,   74],
       [   2,   74,   29],
       [  74,   29, 2508],
       [  29, 2508, 2024],
       [2508, 2024,   22]])

In [None]:
# Split every three-token window into its two context words (features) and
# the word that follows them (label).
X = np.array([window[0:2] for window in sequences])
y = np.array([window[2] for window in sequences])

In [None]:
# Sanity check: each row of X holds two consecutive word indices and the
# matching entry of y is the index of the word that follows them.
print("Data: ", X[:10])
print("Response: ", y[:10])

Data:  [[   1   76]
 [  76   67]
 [  67  468]
 [ 468    5]
 [   5   44]
 [  44    2]
 [   2   74]
 [  74   29]
 [  29 2508]
 [2508 2024]]
Response:  [  67  468    5   44    2   74   29 2508 2024   22]


In [None]:
# One-hot encode the labels so they match the vocab_size-wide softmax output.
# NOTE(review): this overwrites `y` in place, so running the cell a second
# time would feed the already one-hot array back into to_categorical.
# NOTE(review): in the recorded run this is a 101707 x 8613 float32 array
# (about 3.5 GB). Integer labels with a sparse categorical loss would avoid
# it, but that change also has to be made where the model is compiled.
y = to_categorical(y, num_classes=vocab_size)
y[:5]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.]], dtype=float32)

In [None]:
# Next-word model: two word indices in, a probability for every vocabulary
# entry out. Layers are listed in the order the data flows through them.
model = Sequential([
    Embedding(vocab_size, 10, input_length=2),
    LSTM(1000, return_sequences=True),  # hand the full sequence to the next LSTM
    LSTM(1000),
    Dense(1000, activation="relu"),
    Dense(vocab_size, activation="softmax"),
])

In [None]:
# Print each layer's output shape and parameter count.
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 2, 10)             86130     
                                                                 
 lstm (LSTM)                 (None, 2, 1000)           4044000   
                                                                 
 lstm_1 (LSTM)               (None, 1000)              8004000   
                                                                 
 dense (Dense)               (None, 1000)              1001000   
                                                                 
 dense_1 (Dense)             (None, 8613)              8621613   
                                                                 
Total params: 21,756,743
Trainable params: 21,756,743
Non-trainable params: 0
_________________________________________________________________


In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint

# Configure training: categorical cross-entropy against the one-hot labels.
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss="categorical_crossentropy",
)

# Write the model to disk only when the monitored training loss improves.
checkpoint = ModelCheckpoint(
    "next_words.h5",
    monitor='loss',
    verbose=1,
    save_best_only=True,
)

model.fit(
    X,
    y,
    epochs=10,
    batch_size=64,
    callbacks=[checkpoint],
)

Epoch 1/10
Epoch 1: loss improved from inf to 6.62077, saving model to next_words.h5
Epoch 2/10
Epoch 2: loss improved from 6.62077 to 6.10448, saving model to next_words.h5
Epoch 3/10
Epoch 3: loss improved from 6.10448 to 5.80169, saving model to next_words.h5
Epoch 4/10
Epoch 4: loss improved from 5.80169 to 5.55601, saving model to next_words.h5
Epoch 5/10
Epoch 5: loss improved from 5.55601 to 5.34180, saving model to next_words.h5
Epoch 6/10
Epoch 6: loss improved from 5.34180 to 5.12983, saving model to next_words.h5
Epoch 7/10
Epoch 7: loss improved from 5.12983 to 4.90532, saving model to next_words.h5
Epoch 8/10
Epoch 8: loss improved from 4.90532 to 4.68088, saving model to next_words.h5
Epoch 9/10
Epoch 9: loss improved from 4.68088 to 4.45707, saving model to next_words.h5
Epoch 10/10
Epoch 10: loss improved from 4.45707 to 4.24200, saving model to next_words.h5


<keras.callbacks.History at 0x7f416e0a9190>

In [None]:
from tensorflow.keras.models import load_model
import numpy as np
import pickle

# Load the trained model and the tokenizer that was fitted alongside it.
model = load_model('next_words.h5')

# NOTE: unpickling can execute arbitrary code. Only load a token.pkl that was
# written by this notebook, never one obtained from an untrusted source.
with open('token.pkl', 'rb') as token_file:
    tokenizer = pickle.load(token_file)

def Predict_Next_Words(model, tokenizer, text, context_size=2):
    """Predict the word most likely to follow ``text``.

    Args:
        model: Object whose ``predict(batch)`` returns one row of scores per
            input row, where position ``k`` in a row is the score of word
            index ``k``.
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences`` and
            ``word_index`` (a word -> integer index mapping).
        text: The context, either a string or a list of word strings.
        context_size: How many trailing in-vocabulary words are fed to the
            model. Defaults to 2, the context length used to build the
            training pairs in this notebook.

    Returns:
        The predicted word, or ``""`` when the predicted index has no entry
        in ``tokenizer.word_index``. The word is also printed.

    Raises:
        ValueError: If fewer than ``context_size`` words of ``text`` are
            known to the tokenizer.
    """
    token_ids = tokenizer.texts_to_sequences([text])[0]

    # The tokenizer may return fewer ids than words were given (e.g. for
    # words it has never seen). Fail loudly instead of handing the model a
    # shorter context than it was trained on.
    if len(token_ids) < context_size:
        raise ValueError(
            "need at least %d in-vocabulary words, got %d"
            % (context_size, len(token_ids))
        )

    # Keep only the most recent words and shape them as a batch of one.
    batch = np.array([token_ids[-context_size:]])
    scores = np.asarray(model.predict(batch))

    # Take the arg-max within the single row rather than over the flattened
    # array, so the result is a word index by construction.
    predicted_id = int(np.argmax(scores[0]))

    # Reverse lookup: index -> word, with "" when the index is unused.
    predicted_word = next(
        (word for word, index in tokenizer.word_index.items() if index == predicted_id),
        "",
    )

    print(predicted_word)
    return predicted_word

In [None]:
# Interactive demo: keep predicting until the user enters "0".
while True:
    text = input("Enter your line: ")

    if text == "0":
        print("Execution completed.....")
        break

    try:
        # split() without an argument ignores leading, trailing and repeated
        # whitespace, so no empty-string "words" reach the tokenizer.
        # Only the last two words are used as context.
        words = text.split()[-2:]
        print(words)

        # The model takes a two-word context; ask again rather than sending
        # it a shorter one.
        if len(words) < 2:
            print("Error occurred: ", "please enter at least two words")
            continue

        Predict_Next_Words(model, tokenizer, words)

    except Exception as e:
        # Keep the session alive: report the failure and prompt again.
        print("Error occurred: ",e)


Enter your line: Robert Louis
['Robert', 'Louis']
stevenson
