Programming Example: Using LSTM for Text Autocompletion

I chose Shakespeare's play Romeo and Juliet because it is one of the most downloaded texts on Project Gutenberg. I suspect our LSTM doesn't mix well with Shakespeare's archaic English, though it still did a good job.

In [13]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
import logging
tf.get_logger().setLevel(logging.ERROR)

# Configuration parameters for the LSTM model and training process
# Number of passes over the training data (passed to model.fit as `epochs`).
EPOCHS = 32
# Number of samples per gradient update (passed to model.fit as `batch_size`).
BATCH_SIZE = 256
# Path of the plain-text corpus, read as UTF-8.
INPUT_FILE_NAME = './data/romeo_and_juliet.txt'
# Number of characters in each training fragment.
WINDOW_LENGTH = 40
# Distance, in characters, between the starts of consecutive fragments.
WINDOW_STEP = 3
# Number of candidate continuations expanded per beam, and number of beams
# kept after each prediction step.
BEAM_SIZE = 8
# Number of characters the beam search appends to the seed text.
NUM_LETTERS = 11
# NOTE(review): not referenced by any cell visible here — confirm whether
# it is still needed.
MAX_LENGTH = 50

In [14]:
# Read the whole corpus. The context manager closes the file even if
# reading raises.
with open(INPUT_FILE_NAME, 'r', encoding='utf-8') as file:
    text = file.read()
# Make lowercase, then turn newlines and every run of whitespace into a
# single space (this also drops leading/trailing whitespace).
text = ' '.join(text.lower().split())
# Encode characters as indices. Sorting fixes the order of the vocabulary,
# so the character <-> index mapping is the same on every run; the
# iteration order of a set of strings can differ between sessions.
unique_chars = sorted(set(text))
char_to_index = {ch: index for index, ch in enumerate(unique_chars)}
index_to_char = dict(enumerate(unique_chars))
# Width of a one-hot vector: one slot per distinct character.
encoding_width = len(char_to_index)

In [15]:
# Create training examples: every fragment of WINDOW_LENGTH characters is
# an input, and the character that immediately follows it is the target.
fragments = []
targets = []
for i in range(0, len(text) - WINDOW_LENGTH, WINDOW_STEP):
    fragments.append(text[i: i + WINDOW_LENGTH])
    targets.append(text[i + WINDOW_LENGTH])
# Convert to one-hot encoded training data.
# X: (fragment, position in fragment, character index)
# y: (fragment, character index of the target)
X = np.zeros((len(fragments), WINDOW_LENGTH, encoding_width))
y = np.zeros((len(fragments), encoding_width))
for i, fragment in enumerate(fragments):
    for j, char in enumerate(fragment):
        X[i, j, char_to_index[char]] = 1
    # There is one target per fragment, so set it once per fragment rather
    # than once per character of the fragment.
    target_char = targets[i]
    y[i, char_to_index[target_char]] = 1

In [None]:
# Assemble the network: two stacked 128-unit LSTM layers (the first returns
# the full sequence so the second can consume it), followed by a softmax
# over the character vocabulary.
model = Sequential([
    keras.Input(shape=(None, encoding_width)),
    LSTM(128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(encoding_width, activation='softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy')
model.summary()
# Train on the one-hot data, holding out 5% of the samples for validation.
history = model.fit(
    X,
    y,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_split=0.05,
    shuffle=True,
    verbose=2,
)

Epoch 1/32
201/201 - 72s - 359ms/step - loss: 3.0489 - val_loss: 3.0669
Epoch 2/32
201/201 - 61s - 304ms/step - loss: 2.8077 - val_loss: 2.8123
Epoch 3/32
201/201 - 56s - 280ms/step - loss: 2.5654 - val_loss: 2.6813
Epoch 4/32
201/201 - 54s - 269ms/step - loss: 2.4732 - val_loss: 2.6564
Epoch 5/32
201/201 - 52s - 260ms/step - loss: 2.4038 - val_loss: 2.6413
Epoch 6/32
201/201 - 72s - 361ms/step - loss: 2.3516 - val_loss: 2.5597
Epoch 7/32
201/201 - 64s - 319ms/step - loss: 2.3005 - val_loss: 2.5736
Epoch 8/32
201/201 - 55s - 272ms/step - loss: 2.2584 - val_loss: 2.5055
Epoch 9/32
201/201 - 66s - 326ms/step - loss: 2.2255 - val_loss: 2.4920
Epoch 10/32
201/201 - 58s - 291ms/step - loss: 2.1953 - val_loss: 2.4891
Epoch 11/32
201/201 - 81s - 401ms/step - loss: 2.1638 - val_loss: 2.4584
Epoch 12/32
201/201 - 59s - 291ms/step - loss: 2.1400 - val_loss: 2.4244
Epoch 13/32
201/201 - 64s - 320ms/step - loss: 2.1123 - val_loss: 2.4059
Epoch 14/32
201/201 - 64s - 319ms/step - loss: 2.0883 - val_

In [None]:
# Beam search: extend the seed text by NUM_LETTERS characters, keeping the
# BEAM_SIZE most probable continuations after every step.
# Create initial single beam represented by triplet
# (log-probability, string, one-hot encoded string).
letters = 'romeo '
one_hots = []
for char in letters:
    # A seed character that never occurs in the corpus raises KeyError here.
    x = np.zeros(encoding_width)
    x[char_to_index[char]] = 1
    one_hots.append(x)
beams = [(np.log(1.0), letters, one_hots)]
# A beam cannot branch into more characters than the vocabulary holds.
branch_count = min(BEAM_SIZE, encoding_width)
# Predict NUM_LETTERS into the future.
for _ in range(NUM_LETTERS):
    # Every beam has grown by one character per step, so all encodings have
    # the same length and stack into a single minibatch.
    minibatch = np.array([triple[2] for triple in beams])
    y_predict = model.predict(minibatch, verbose=0)
    new_beams = []
    for triple, softmax_vec in zip(beams, y_predict):
        # Indices of the most probable next characters, best first. The
        # stable sort breaks ties by lowest index and never yields the same
        # index twice, and the prediction array is left unmodified.
        top_indices = np.argsort(-softmax_vec, kind='stable')[:branch_count]
        # Create one new beam per selected character.
        for char_index in top_indices:
            # Log-probabilities add where probabilities would multiply.
            new_prob = triple[0] + np.log(softmax_vec[char_index])
            new_letters = triple[1] + index_to_char[char_index]
            x = np.zeros(encoding_width)
            x[char_index] = 1
            # Build a new list so sibling beams do not share encodings.
            new_one_hots = triple[2] + [x]
            new_beams.append((new_prob, new_letters, new_one_hots))
    # Prune tree to only keep BEAM_SIZE most probable beams.
    new_beams.sort(key=lambda tup: tup[0], reverse=True)
    beams = new_beams[0:BEAM_SIZE]
for item in beams:
    print(item[1])

romeo the capulet
romeo the will th
romeo the with th
romeo that is the
romeo the will to
romeo the will an
romeo the will me
romeo the will a 
