# Text Prediction with LSTMs

In [1]:
import numpy as np

from tensorflow.keras.preprocessing.text import Tokenizer

# Import the Embedding, LSTM and Dense layer
from tensorflow.keras.layers import Embedding, LSTM, Dense, Input
from tensorflow.keras.models import Sequential

2024-07-31 13:44:15.094006: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Load the whole corpus into memory as a single UTF-8 decoded string
with open('data/text.txt', 'r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

In [3]:
# Split the text on whitespace into a list of word tokens
# (punctuation stays attached to its word, e.g. 'perceptions.')
words = text.split()

In [4]:
# Build overlapping windows of 4 consecutive words, sliding one word at a time
# (the model defined further down consumes sequence_length - 1 = 3 word ids).
window_size = 4

# The upper bound is len(words) + 1 so that the final window -- the one whose
# last element is the last word of the text -- is also produced; stopping at
# len(words) would silently drop it.
sentences = [
    ' '.join(words[end - window_size:end])
    for end in range(window_size, len(words) + 1)
]

In [5]:
# Learn the word -> integer id vocabulary from the word windows
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)

# Encode every window as a list of word ids
sequences = tokenizer.texts_to_sequences(sentences)

# Preview the first five windows next to their encoded form
print(f"Sentences: \n {sentences[:5]} \n Sequences: \n {sequences[:5]}")

Sentences: 
 ['Control your perceptions. Direct', 'your perceptions. Direct your', 'perceptions. Direct your actions', 'Direct your actions properly.', 'your actions properly. Willingly'] 
 Sequences: 
 [[21, 4, 112, 22], [4, 112, 22, 4], [112, 22, 4, 23], [22, 4, 23, 24], [4, 23, 24, 25]]


# Building the LSTM Model

In [6]:
# Number of words per window; the model takes sequence_length - 1 word ids as input
sequence_length = 4

# One output slot per known word, plus one because the Keras Tokenizer numbers
# words starting at 1 and never assigns index 0 to a word
vocab_size = 1 + len(tokenizer.word_index)

In [7]:
# Next-word model: (sequence_length - 1) word ids in, one probability per
# vocabulary entry out.
model = Sequential()

# Declare the input shape once with an explicit Input layer. This replaces the
# `input_length` argument of Embedding (deprecated in Keras 3) and the separate
# model.build(...) call with a hard-coded 3, so the shape always follows
# sequence_length.
model.add(Input(shape=(sequence_length - 1,)))

# add an embedding layer that maps each word id to an 8-dimensional vector
model.add(Embedding(input_dim=vocab_size, output_dim=8))

# add a 32 unit LSTM layer
model.add(LSTM(32))

# add a hidden dense layer of 32 units and an output layer of vocab size with softmax activation
model.add(Dense(32, activation='relu'))
model.add(Dense(vocab_size, activation='softmax'))

# NOTE(review): no compile()/fit() call is visible in this notebook -- confirm
# the model is trained (or weights are loaded) before relying on predictions.
model.summary()



Define a function that decodes the model's predictions. Because the model's output layer uses the softmax function, each prediction is a vector of probabilities over the vocabulary; NumPy's `argmax()` returns the index (position) of the largest probability, i.e. the most probable next word, which can then be mapped back to the word itself.

In [8]:
def predict_text(test_text, model = model):
  """Predict the word that follows a three-word prompt.

  Args:
    test_text: String containing exactly three whitespace-separated words.
    model: Model whose ``predict`` returns one probability vector per input
      row. The default is bound when this cell is executed, so re-run this
      cell after rebuilding ``model`` to pick up the new object.

  Returns:
    The predicted next word as a string, or ``False`` (after printing a
    message) when the input is not three words, when it does not encode to
    exactly three word ids, or when the predicted index maps to no word.
  """
  if len(test_text.split()) != 3:
    print('Text input should be 3 words!')
    return False

  # Turn the test_text into a sequence of numbers. The tokenizer silently
  # drops words it has never seen (and punctuation-only tokens), so the
  # encoded prompt can be shorter than the text; reject it instead of feeding
  # a wrongly sized input to the model.
  word_ids = tokenizer.texts_to_sequences([test_text])[0]
  if len(word_ids) != 3:
    print('Text input contains words that are not in the vocabulary!')
    return False
  test_seq = np.array([word_ids])

  # Use the model passed as a parameter to predict the next word:
  # argmax picks the index with the highest probability.
  pred = int(model.predict(test_seq).argmax(axis = 1)[0])

  # Return the word that maps to the prediction. Index 0 is never assigned to
  # a word by the tokenizer, so guard the lookup instead of raising KeyError.
  word = tokenizer.index_word.get(pred)
  if word is None:
    print('Predicted index does not map to a word!')
    return False
  return word

In [25]:
# Smoke-test the helper: ask for the word predicted to follow a three-word prompt
predict_text("what’s outside your")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step


'overwhelmed'

In [20]:
# Try a second three-word prompt
predict_text("be overwhelmed by")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step


'stability'