# Next-word prediction with an LSTM model

Predicts the next word in a sentence using an LSTM model. The training data is a plain-text (`.txt`) file.

In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer # text preprocessing
from tensorflow.keras.preprocessing.sequence import pad_sequences  # pad sequences to the same length
from tensorflow.keras.models import Sequential, load_model  # initialize a sequential model, load a model
from tensorflow.keras.layers import Embedding, LSTM, Dense  # word embeddings, LSTMs, and fully connected layers

In [2]:
# Location of the plain-text training corpus.
DATA_PATH = "/Users/rishabh/code/eeg/SPEAK_EEG/DATA/non-eeg/sherlock-holmes.txt"
# Alternative corpus (uncomment to use it instead):
# DATA_PATH = "nwtest.txt"

# Read the entire corpus into memory as a single string.
with open(DATA_PATH, mode="r", encoding="utf-8") as corpus_file:
    text = corpus_file.read()

In [3]:
# Build the word -> integer-id vocabulary from the whole corpus.
tk = Tokenizer()
tk.fit_on_texts([text])

# One class per known word plus one extra slot, because Keras word ids
# start at 1 (id 0 is never assigned to a word).
total_words = 1 + len(tk.word_index)

total_words  # 8200 for this corpus
tk.word_index["test"]  # every word is represented by an integer id

1676

In [4]:
# N-gram construction (identical to `nw.ipynb`)
# https://devopedia.org/n-gram-model

# Encode every line of the corpus as a list of word ids in one call.
encoded_lines = tk.texts_to_sequences(text.split("\n"))

# For each encoded line emit all of its prefixes of length >= 2:
# [w1, w2], [w1, w2, w3], ... up to the full line.
# Lines with fewer than two tokens contribute nothing.
input_sequences = [
    encoded[:stop]
    for encoded in encoded_lines
    for stop in range(2, len(encoded) + 1)
]

# Peek at the first few N-grams (no unigrams are present).
input_sequences[:20]

[[1, 1561],
 [1, 1561, 5],
 [1, 1561, 5, 129],
 [1, 1561, 5, 129, 34],
 [647, 4498],
 [647, 4498, 4499],
 [226, 5],
 [226, 5, 1562],
 [6, 827],
 [6, 827, 7],
 [6, 827, 7, 871],
 [1, 234],
 [1, 234, 462],
 [1, 234, 462, 648],
 [6, 110],
 [6, 110, 5],
 [6, 110, 5, 2072],
 [1, 678],
 [1, 678, 1360],
 [1, 678, 1360, 499]]

In [5]:
# Length of the longest N-gram; every sequence is padded to this length.
max_sequence_len = max(map(len, input_sequences))

# Left-pad all N-grams with zeros ([0, 0, 0, ..., 3, 4, 5]) exactly once.
# `pad_sequences` already returns a 2-D NumPy array, so no second padding
# pass and no extra `np.array(...)` copy are needed.
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding="pre")

# Plain list-of-lists view of the same padded data, kept for inspection.
input_sequences_list = list(map(list, input_sequences))

input_sequences_list[:7]  # the sequence is left-padded ([0, 0, 0, 0, ..., 3, 4, 5])

len(input_sequences)

96314

In [6]:
# Split each padded N-gram into its context (every column but the last)
# and its target (the last column):
#
#   0 0 0 0 X X y
#   0 0 0 X X X y
#   0 0 X X X X y
#   ...
#
# Training on prefixes of ALL lengths lets the model predict the next word
# given whatever part of the sentence precedes it, so it can learn both the
# words that commonly start a sentence and the words that tend to follow them.
X, y = input_sequences[:, :-1], input_sequences[:, -1]

y  # the last word in every N-gram

array([1561,    5,  129, ..., 8199, 3187, 3186], dtype=int32)

In [7]:
# One-hot encode the target word ids.
# `to_categorical` already returns a NumPy array, so wrapping it in
# `np.array(...)` would only make a second full copy of a very large matrix.
# The `ndim` guard keeps the cell safe to re-run: an already encoded `y`
# (2-D) is left untouched instead of being one-hot encoded a second time.
if y.ndim == 1:
    y = tf.keras.utils.to_categorical(y, num_classes=total_words)

y.shape  # (num of N-gram lines, num of unique words) = (96314, 8200)

(96314, 8200)

## Training the model
Uncomment the cells below to retrain the model and save it as `NW.keras`.

In [8]:
# # linear stack of layers type model
# model = Sequential()

# # embed - transforms input array into dense vectors
# model.add(Embedding(total_words, 100, input_length=max_sequence_len - 1))
# """
# total_words : total words in vocabulary (8200)
# 100 : dimension of the dense embedding; each word is mapped to a 100-dimensional vector
# input_length : length of input sequences
# """

# model.add(LSTM(150)) # 150 nodes in the LSTM layer

# model.add(Dense(total_words, activation="softmax")) # output layer of length `total_words`

# print(model.summary())

In [9]:
# model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

# model.fit(X, y, epochs=100, verbose=1)

In [10]:
# model.save("NW.keras")

## Loading the model

In [11]:
# Load the saved model from `NW.keras` (resolved relative to the current
# working directory); this is the file name used by the commented-out
# `model.save` cell above.
model = load_model("NW.keras")

In [22]:
# Suggest the `top_k` most likely next words for `seed_text`.
seed_text = "his name"
top_k = 10

# Encode the seed and left-pad it to the context length the model was
# trained on (one less than the longest N-gram, since the last slot is the target).
tk_list = tk.texts_to_sequences([seed_text])[0]
tk_list = pad_sequences([tk_list], maxlen=max_sequence_len - 1, padding="pre")

# predicted = np.argmax(model.predict(tk_list), axis=-1) # gets top word

# argsort is ascending, so the last `top_k` columns hold the ids with the
# highest predicted probability; shape is (1, top_k).
predicted = np.argsort(model.predict(tk_list), axis=-1)[:, -top_k:]

# Map ids back to words through the tokenizer's reverse index instead of
# scanning the whole vocabulary. Ids without a word (e.g. the padding id 0)
# are skipped, exactly as the vocabulary scan would have skipped them.
seed_out = {
    tk.index_word[word_id]
    for word_id in predicted.ravel().tolist()
    if word_id in tk.index_word
}

seed_out

1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 26ms/step


{'about', 'is', 'might', 'of', 'that', 'to', 'was', 'we', 'who', 'you'}

In [17]:
# Greedily extend `seed_text` by up to `next_words` words, always taking the
# single most likely next word.
seed_text = "his name"
next_words = 7

for _ in range(next_words):
    # Re-encode the text generated so far and left-pad it to the model's
    # context length (longer inputs are cut down to `maxlen` by pad_sequences).
    token_list = tk.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding="pre")

    # `predicted` has shape (1,): the most likely word id for the one input row.
    predicted = np.argmax(model.predict(token_list), axis=-1)

    # Direct id -> word lookup instead of scanning the whole vocabulary.
    output_word = tk.index_word.get(int(predicted[0]), "")
    if not output_word:
        # The predicted id has no word (e.g. the padding id 0). Appending
        # nothing would leave the input unchanged, so every further step
        # would only add trailing spaces; stop here instead.
        break

    seed_text += " " + output_word

seed_text

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step


'his name is francis prosper and yet what came'