In [2]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
import pickle
import numpy as np
import os

In [3]:
# Read the corpus line by line. The context manager guarantees the file
# handle is closed once reading finishes (or fails).
with open("Middlemarch.txt", "r", encoding="utf8") as file:
    lines = file.readlines()

# Join the lines into one string with a single join call (the join does not
# depend on a loop variable, so it only needs to run once).
data = ' '.join(lines)

# Remove (not replace with a space) newlines, carriage returns, the
# byte-order mark and curly double quotes. Words stay separated because the
# lines were joined with a space above.
data = data.replace('\n', '').replace('\r', '').replace('\ufeff', '').replace('“','').replace('”','')

# Collapse every run of whitespace into a single space.
data = ' '.join(data.split())
data[:500]

'The Project Gutenberg eBook of Middlemarch, by George Eliot This eBook is for the use of anyone anywhere in the United States and most other parts of the world at no cost and with almost no restrictions whatsoever. You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this eBook or online at www.gutenberg.org. If you are not located in the United States, you will have to check the laws of the country where you are located before using this eBoo'

In [4]:
# Number of characters in the cleaned corpus string.
len(data)

56657

In [5]:
# Build the word -> integer id vocabulary from the cleaned corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])

# Persist the fitted tokenizer so the prediction step can reload the exact
# same vocabulary. The context manager flushes and closes the file.
with open('token.pkl', 'wb') as token_file:
    pickle.dump(tokenizer, token_file)

# Encode the whole corpus as one flat list of word ids.
sequence_data = tokenizer.texts_to_sequences([data])[0]
sequence_data[:15]

[1, 517, 354, 274, 2, 886, 34, 887, 888, 37, 274, 20, 22, 1, 275]

In [6]:
# Number of word ids in the encoded corpus.
len(sequence_data)

10064

In [7]:
# One more than the number of distinct words known to the tokenizer.
# NOTE(review): the encoded ids shown earlier start at 1, so the +1 presumably
# leaves room for an unused id 0 — confirm against the Tokenizer docs.
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)

2538


In [8]:
# Slide a 4-token window over the encoded corpus: every window holds three
# context ids followed by the id that comes next.
sequences = [
    sequence_data[start:start + 4]
    for start in range(len(sequence_data) - 3)
]

print("The Length of sequences are: ", len(sequences))
sequences = np.array(sequences)
sequences[:10]

The Length of sequences are:  10061


array([[  1, 517, 354, 274],
       [517, 354, 274,   2],
       [354, 274,   2, 886],
       [274,   2, 886,  34],
       [  2, 886,  34, 887],
       [886,  34, 887, 888],
       [ 34, 887, 888,  37],
       [887, 888,  37, 274],
       [888,  37, 274,  20],
       [ 37, 274,  20,  22]])

In [9]:
# First three ids of every window are the model input, the fourth is the target.
X = np.array([window[0:3] for window in sequences])
y = np.array([window[3] for window in sequences])

In [10]:
# Show the first ten context windows next to their target ids as a sanity check.
print("Data: ", X[:10])
print("Response: ", y[:10])

Data:  [[  1 517 354]
 [517 354 274]
 [354 274   2]
 [274   2 886]
 [  2 886  34]
 [886  34 887]
 [ 34 887 888]
 [887 888  37]
 [888  37 274]
 [ 37 274  20]]
Response:  [274   2 886  34 887 888  37 274  20  22]


In [11]:
# Convert the integer target ids into categorical vectors with vocab_size classes.
# NOTE(review): this reassigns `y`, so the cell is not idempotent — re-run the
# cell that builds X and y before executing this one again.
y = to_categorical(y, num_classes=vocab_size)
y[:5]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [12]:
# Next-word classifier: 10-dimensional embeddings of the three context ids,
# two stacked 1000-unit LSTMs, a 1000-unit ReLU layer, and a softmax over the
# whole vocabulary.
model = Sequential([
    Embedding(vocab_size, 10, input_length=3),
    LSTM(1000, return_sequences=True),  # emit the full sequence for the next LSTM
    LSTM(1000),
    Dense(1000, activation="relu"),
    Dense(vocab_size, activation="softmax"),
])

In [13]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 3, 10)             25380     
                                                                 
 lstm (LSTM)                 (None, 3, 1000)           4044000   
                                                                 
 lstm_1 (LSTM)               (None, 1000)              8004000   
                                                                 
 dense (Dense)               (None, 1000)              1001000   
                                                                 
 dense_1 (Dense)             (None, 2538)              2540538   
                                                                 
Total params: 15,614,918
Trainable params: 15,614,918
Non-trainable params: 0
_________________________________________________________________


In [14]:
from tensorflow.keras.callbacks import ModelCheckpoint

# Categorical cross-entropy pairs with the one-hot encoded targets in `y`.
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss="categorical_crossentropy",
)

# Write the model to disk only on epochs where the training loss improves.
checkpoint = ModelCheckpoint(
    filepath="next_words.h5",
    monitor="loss",
    verbose=1,
    save_best_only=True,
)

model.fit(x=X, y=y, batch_size=64, epochs=50, callbacks=[checkpoint])

Epoch 1/50
Epoch 1: loss improved from inf to 6.83203, saving model to next_words.h5
Epoch 2/50
Epoch 2: loss improved from 6.83203 to 6.42176, saving model to next_words.h5
Epoch 3/50
Epoch 3: loss improved from 6.42176 to 6.32442, saving model to next_words.h5
Epoch 4/50
Epoch 4: loss improved from 6.32442 to 6.11941, saving model to next_words.h5
Epoch 5/50
Epoch 5: loss improved from 6.11941 to 5.84666, saving model to next_words.h5
Epoch 6/50
Epoch 6: loss improved from 5.84666 to 5.59803, saving model to next_words.h5
Epoch 7/50
Epoch 7: loss improved from 5.59803 to 5.37705, saving model to next_words.h5
Epoch 8/50
Epoch 8: loss improved from 5.37705 to 5.20111, saving model to next_words.h5
Epoch 9/50
Epoch 9: loss improved from 5.20111 to 5.02535, saving model to next_words.h5
Epoch 10/50
Epoch 10: loss improved from 5.02535 to 4.85873, saving model to next_words.h5
Epoch 11/50
Epoch 11: loss improved from 4.85873 to 4.68371, saving model to next_words.h5
Epoch 12/50
Epoch 12:

Epoch 37/50
Epoch 37: loss improved from 0.36153 to 0.32843, saving model to next_words.h5
Epoch 38/50
Epoch 38: loss improved from 0.32843 to 0.32800, saving model to next_words.h5
Epoch 39/50
Epoch 39: loss improved from 0.32800 to 0.29106, saving model to next_words.h5
Epoch 40/50
Epoch 40: loss improved from 0.29106 to 0.28469, saving model to next_words.h5
Epoch 41/50
Epoch 41: loss improved from 0.28469 to 0.26791, saving model to next_words.h5
Epoch 42/50
Epoch 42: loss improved from 0.26791 to 0.24185, saving model to next_words.h5
Epoch 43/50
Epoch 43: loss improved from 0.24185 to 0.21378, saving model to next_words.h5
Epoch 44/50
Epoch 44: loss improved from 0.21378 to 0.19360, saving model to next_words.h5
Epoch 45/50
Epoch 45: loss did not improve from 0.19360
Epoch 46/50
Epoch 46: loss did not improve from 0.19360
Epoch 47/50
Epoch 47: loss did not improve from 0.19360
Epoch 48/50
Epoch 48: loss did not improve from 0.19360
Epoch 49/50
Epoch 49: loss did not improve from 

<keras.callbacks.History at 0x27a653d6eb0>

In [14]:
from tensorflow.keras.models import load_model
import numpy as np
import pickle

# Reload the model file written by the ModelCheckpoint callback during training.
model = load_model('next_words.h5')

# Reload the fitted tokenizer. The context manager closes the file handle.
# NOTE: pickle.load can execute arbitrary code, so only load a token.pkl that
# was produced by this notebook (never one from an untrusted source).
with open('token.pkl', 'rb') as token_file:
    tokenizer = pickle.load(token_file)

def Predict_Next_Words(model, tokenizer, text):
    """Predict, print and return the word most likely to follow ``text``.

    Args:
        model: Object exposing ``predict(array)`` that returns one score per
            vocabulary id for the encoded context.
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences`` and the
            ``index_word`` (id -> word) mapping.
        text: The context to continue. The interactive loop in this notebook
            passes a list holding the last three words typed by the user.

    Returns:
        The predicted word, or an empty string when the winning id has no
        entry in the tokenizer's vocabulary.

    Raises:
        ValueError: If none of the words in ``text`` are known to the
            tokenizer, so there is nothing to feed to the model.
    """
    encoded = tokenizer.texts_to_sequences([text])

    # Unknown words are absent from the encoded sequence; fail with a clear
    # message instead of handing the model an empty array.
    if not encoded or len(encoded[0]) == 0:
        raise ValueError("none of the input words are in the vocabulary")

    sequence = np.array(encoded)
    preds = int(np.argmax(model.predict(sequence)))

    # Direct id -> word lookup instead of scanning word_index linearly.
    predicted_word = tokenizer.index_word.get(preds, "")

    print(predicted_word)
    return predicted_word

In [15]:
# Interactive demo: keep predicting the next word for whatever the user types
# until they enter "0".
while True:
    text = input("Enter your line: ")

    if text == "0":
        print("Execution completed.....")
        break

    try:
        # split() with no argument ignores repeated, leading and trailing
        # whitespace, so no empty-string tokens take up a slot in the
        # three-word context (split(" ") produced them).
        words = text.split()[-3:]
        if not words:
            print("Please enter at least one word.")
            continue
        print(words)

        Predict_Next_Words(model, tokenizer, words)

    except Exception as e:
        # Keep the demo running when the model rejects the input.
        print("Error occurred: ", e)
        continue

Enter your line: the life of
['the', 'life', 'of']
saint
Enter your line: the life of Saint
['life', 'of', 'Saint']
theresa
Enter your line: 0
Execution completed.....
