In [1]:
pip install tensorflow

Note: you may need to restart the kernel to use updated packages.


# Importing required libraries

In [2]:
import numpy as np
from nltk.tokenize import RegexpTokenizer
from keras.models import Sequential, load_model
from keras.layers import LSTM, Dense, Activation,Input
from keras.optimizers import RMSprop
import matplotlib.pyplot as plt
import pickle
import heapq

# Tokenization

In [3]:
path = '1661-0.txt'
# The context manager closes the file handle as soon as the text is read, and
# the explicit encoding keeps decoding independent of the platform default.
# NOTE(review): assumes the corpus file is UTF-8 encoded — confirm.
with open(path, encoding='utf-8') as corpus_file:
    text = corpus_file.read().lower()
print('corpus length:', len(text))

corpus length: 581888


In [4]:
# Break the lower-cased corpus into word tokens: each maximal run of \w
# characters becomes one token, so punctuation and whitespace are dropped.
tokenizer = RegexpTokenizer(r'\w+')
words = tokenizer.tokenize(text)

# Sorted array of distinct words, plus a word -> position lookup into it.
unique_words = np.unique(words)
unique_word_index = {word: position for position, word in enumerate(unique_words)}

# Feature Engineering

In [5]:
# Number of context words fed to the model for each prediction.
WORD_LENGTH = 5

# Slide a WORD_LENGTH-wide window over the token list: each window is one
# training context and the token right after it is the target.
prev_words = [words[start:start + WORD_LENGTH] for start in range(len(words) - WORD_LENGTH)]
next_words = [words[start + WORD_LENGTH] for start in range(len(words) - WORD_LENGTH)]

# Show the first context/target pair as a sanity check.
print(prev_words[0])
print(next_words[0])

['project', 'gutenberg', 's', 'the', 'adventures']
of


In [6]:
# One-hot tensors: X is (samples, WORD_LENGTH, vocabulary), Y is (samples, vocabulary).
vocab_size = len(unique_words)
X = np.zeros((len(prev_words), WORD_LENGTH, vocab_size), dtype=bool)
Y = np.zeros((len(next_words), vocab_size), dtype=bool)

for sample_idx, (context, target) in enumerate(zip(prev_words, next_words)):
    # Flag the vocabulary slot of every context word at its window position.
    for position, word in enumerate(context):
        X[sample_idx, position, unique_word_index[word]] = True
    # Flag the vocabulary slot of the word that follows the window.
    Y[sample_idx, unique_word_index[target]] = True

In [7]:
# One-hot row for the first word of the first training window.
print(X[0, 0])

[False False False ... False False False]


# Building the Recurrent Neural Network

In [8]:
input_dim = len(unique_words)

# Single LSTM layer over the WORD_LENGTH one-hot word vectors, followed by a
# dense projection back to vocabulary size and a softmax over all words.
model = Sequential([
    Input(shape=(WORD_LENGTH, input_dim)),
    LSTM(128),
    Dense(input_dim),
    Activation('softmax'),
])

# Training the Next Word Prediction Model

In [10]:
# Categorical cross-entropy matches the one-hot targets in Y and the softmax
# output layer of the model.
optimizer = RMSprop(learning_rate=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Hold out 5% of the samples for validation and keep only the per-epoch
# metrics dictionary from the returned History object.
history = model.fit(X, Y, validation_split=0.05, batch_size=128, epochs=2, shuffle=True).history

# Persist the trained model and its metrics. The reload cell below reads
# 'keras_next_word_model.keras' and 'history.p', so without these writes a
# fresh Restart & Run All would fail there with a missing-file error.
model.save('keras_next_word_model.keras')
with open("history.p", "wb") as history_file:
    pickle.dump(history, history_file)


Epoch 1/2
[1m811/811[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m89s[0m 109ms/step - accuracy: 0.0582 - loss: 6.4777 - val_accuracy: 0.0793 - val_loss: 6.9306
Epoch 2/2
[1m811/811[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m196s[0m 242ms/step - accuracy: 0.1093 - loss: 5.7964 - val_accuracy: 0.0919 - val_loss: 6.8119


In [None]:
# Restore the trained network from disk.
model = load_model('keras_next_word_model.keras')

# Build a fresh optimizer and compile again so the loss and the accuracy
# metric are attached to the restored model.
optimizer = RMSprop(learning_rate=0.01)
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# Restore the per-epoch training metrics that were pickled to disk.
with open("history.p", "rb") as history_file:
    history = pickle.load(history_file)

# Evaluating the Next Word Prediction Model

In [None]:
# Training vs. validation accuracy per epoch, drawn on an explicit Axes.
fig, ax = plt.subplots()
ax.plot(history['accuracy'], label='Train')
ax.plot(history['val_accuracy'], label='Validation')
ax.set(title='Model accuracy', xlabel='Epoch', ylabel='Accuracy')
ax.legend(loc='upper left')
plt.show()

In [None]:
# Training vs. validation loss per epoch, drawn on an explicit Axes.
fig, ax = plt.subplots()
ax.plot(history['loss'], label='Train')
ax.plot(history['val_loss'], label='Validation')
ax.set(title='Model loss', xlabel='Epoch', ylabel='Loss')
ax.legend(loc='upper left')
plt.show()

# Testing Next Word Prediction Model

In [None]:
def prepare_input(text):
    """One-hot encode the trailing words of ``text`` as a single model input.

    The text is lower-cased and split with the same ``tokenizer`` that built
    the training vocabulary, so punctuation is handled as it was for the
    corpus. Only the last ``WORD_LENGTH`` tokens are kept.

    Args:
        text: Free-form input sentence.

    Returns:
        A float array of shape ``(1, WORD_LENGTH, len(unique_words))``. Tokens
        fill positions from index 0 upward; positions with no token, and
        tokens missing from ``unique_word_index``, stay all-zero.
    """
    x = np.zeros((1, WORD_LENGTH, len(unique_words)))
    # Keep only the most recent words: the model input has WORD_LENGTH slots.
    tokens = tokenizer.tokenize(text.lower())[-WORD_LENGTH:]
    for t, word in enumerate(tokens):
        index = unique_word_index.get(word)
        # Out-of-vocabulary words have no one-hot slot; leave the row at zero
        # instead of raising KeyError.
        if index is not None:
            x[0, t, index] = 1.

    return x

In [None]:
# Encode a sample sentence to inspect the array produced by prepare_input.
sample_sentence = "This is an example of input for our LSTM"
prepare_input(sample_sentence.lower())

In [None]:
def prepare_input(text):
    """One-hot encode the trailing words of ``text`` as a single model input.

    The text is lower-cased and split with the same ``tokenizer`` that built
    the training vocabulary, so punctuation is handled as it was for the
    corpus. Only the last ``WORD_LENGTH`` tokens are kept.

    Args:
        text: Free-form input sentence.

    Returns:
        A float array of shape ``(1, WORD_LENGTH, len(unique_words))``. Tokens
        fill positions from index 0 upward; positions with no token, and
        tokens missing from ``unique_word_index``, stay all-zero.
    """
    x = np.zeros((1, WORD_LENGTH, len(unique_words)))
    # Keep only the most recent words: the model input has WORD_LENGTH slots,
    # so longer inputs would otherwise index past the end of the array.
    tokens = tokenizer.tokenize(text.lower())[-WORD_LENGTH:]
    for t, word in enumerate(tokens):
        index = unique_word_index.get(word)
        # Out-of-vocabulary words have no one-hot slot; leave the row at zero
        # instead of raising KeyError.
        if index is not None:
            x[0, t, index] = 1
    return x
prepare_input("It is not a lack".lower())

Creating a function that returns the indices of the most probable predictions:

In [None]:
def sample(preds, top_n=3):
    """Return the positions of the ``top_n`` largest entries of ``preds``.

    Args:
        preds: Sequence of prediction scores (one per vocabulary entry).
        top_n: How many positions to return.

    Returns:
        A list of integer positions, ordered from the highest score down.
    """
    scores = np.asarray(preds).astype('float64')
    # Pass the scores through log/exp and renormalise so they sum to one.
    rescaled = np.exp(np.log(scores))
    probabilities = rescaled / np.sum(rescaled)

    candidate_positions = range(len(probabilities))
    return heapq.nlargest(top_n, candidate_positions, probabilities.take)

Creating a function for next word prediction:

In [None]:
def predict_completion(text):
    """Return the single most likely next word for ``text``.

    The model's output layer has one unit per entry of ``unique_words``, so
    each prediction is already a whole word. One prediction step is therefore
    a full completion; no character-by-character loop waiting for a space is
    needed.

    Args:
        text: Input sentence; it is encoded by ``prepare_input``.

    Returns:
        The predicted next word as a plain ``str``.
    """
    x = prepare_input(text)
    preds = model.predict(x, verbose=0)[0]
    next_index = sample(preds, top_n=1)[0]
    # Positions in the softmax output line up with positions in unique_words,
    # the array the one-hot indices were built from.
    return str(unique_words[next_index])

Creating a function to predict several candidate next words:

In [None]:
def predict_completions(text, n=3):
    """Return the ``n`` most likely next words for ``text``.

    Args:
        text: Input sentence; it is encoded by ``prepare_input``.
        n: Number of candidate words to return.

    Returns:
        A list of ``n`` words (plain ``str``), ordered from most to least
        probable according to the model.
    """
    x = prepare_input(text)
    preds = model.predict(x, verbose=0)[0]
    next_indices = sample(preds, n)
    # Positions in the softmax output line up with positions in unique_words,
    # so each index maps straight back to a vocabulary word.
    return [str(unique_words[idx]) for idx in next_indices]

In [None]:
# Sample sentences whose opening words are fed to predict_completions in the
# loop that follows.
quotes = [
    "It is not a lack of love, but a lack of friendship that makes unhappy marriages.",
    "That which does not kill us makes us stronger.",
    "I'm not upset that you lied to me, I'm upset that from now on I can't believe you.",
    "And those who were seen dancing were thought to be insane by those who could not hear the music.",
    "It is hard enough to remember my opinions, without also remembering my reasons for them!"
]

Finally, predict the next word for the opening of each quote:


In [None]:
for q in quotes:
    # Seed the model with the first WORD_LENGTH words of the quote, tokenized
    # the same way as the training corpus. A fixed character slice would cut
    # words in half and hand the encoder more words than the model accepts.
    seq = " ".join(tokenizer.tokenize(q.lower())[:WORD_LENGTH])
    print(seq)
    print(predict_completions(seq, 5))
    print()