In [49]:
%%time
import numpy as np 
import pandas as pd
import os
import pickle as pk

# Machine Learning
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras import backend as K
from keras.utils import to_categorical
from keras.engine.topology import Layer
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.layers import (Dense, Bidirectional, LSTM, 
                          Dropout, Embedding, BatchNormalization)
from keras.models import Sequential
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping

Wall time: 0 ns


# Loading The Model
What this model does is take a text document and count the amount of unique words in the document. Then the network uses the text to try and predict the unique words.

In [6]:
def load_seq(path):
    with open(path,'rb') as file:
        seqs = pk.load(file)
        print(f'Sequences loaded from: {path}')
    
    return seqs

In [7]:
location = r'E:\Documents\My Projects\Text Generation\data\HEAM.seq'
sequences = load_seq(location)
sequences[0]

Sequences loaded from: E:\Documents\My Projects\Text Generation\data\HEAM.seq


'introduction the twothousandyearold assumptionon december the deadliest school shooting in us history took place at sandy hook elementary school in newtown connecticut twentysix people inside the school including twenty children were massacred by a lone gunman several weeks after this horror i watched the governor of connecticut dannel malloy give his'

# Encoding the sequences
The word embedding layer expects input sequences to be comprised of integers.
We can map each word in our vocabulary to a unique integer and encode our input sequences. Later, when we make predictions, we can convert the prediction to numbers and look up their associated words in the same mapping.

In [11]:
encoder = Tokenizer()
encoder.fit_on_texts(sequences)
encoded_sequences = encoder.texts_to_sequences(sequences)

In [20]:
print(f'{sequences[0][:13]} ---- mapped to ---> {encoded_sequences[0][:13]}')

introduction  ---- mapped to ---> [5586, 1, 10246, 10245, 10244, 1, 10243, 570, 2271, 6, 107, 644, 1010]


# Mapping the Encoding
We can access the mapping of words to integers as a dictionary attribute called word_index on the Tokenizer object.
We need to know the size of the vocabulary for defining the embedding layer later. We can determine the vocabulary by calculating the size of the mapping dictionary.

Words are assigned values from 1 to the total number of words (e.g. 7,409). The Embedding layer needs to allocate a vector representation for each word in this vocabulary from index 1 to the largest index and because indexing of arrays is zero-offset, the index of the word at the end of the vocabulary will be 7,409; that means the array must be 7,409 + 1 in length.

In [23]:
vocab_size = len(encoder.word_index) + 1
vocab_size

10248

# Sequence Inputs and Output
Now that we have encoded the input sequences, we need to separate them into input (X) and output (y) elements.

We can do this with array slicing.

After separating, we need to one hot encode the output word. This means converting it from an integer to a vector of 0 values, one for each word in the vocabulary, with a 1 to indicate the specific word at the index of the words integer value.

This is so that the model learns to predict the probability distribution for the next word and the ground truth from which to learn from is 0 for all words except the actual word that comes next.

Keras provides the to_categorical() that can be used to one hot encode the output words for each input-output sequence pair.

Finally, we need to specify to the Embedding layer how long input sequences are. We know that there are 50 words because we designed the model, but a good generic way to specify that is to use the second dimension (number of columns) of the input data’s shape. That way, if you change the length of sequences when preparing data, you do not need to change this data loading code; it is generic.

# Split Input and Output Features
It is common to split your loaded data into input variables (X) and the output variable (y).

We can do this by slicing all rows and all columns up to, but before the last column, then separately indexing the last column.

For the input features, we can select all rows and all columns except the last one by specifying ‘:’ for in the rows index, and :-1 in the columns index.

In [45]:
# convert from list to a numpy array
sequences = np.array(encoded_sequences)
x, y = sequences[:,:-1], sequences[:,-1]
# the classes are the unique words
y = to_categorical(y, num_classes=vocab_size)
seq_length = x.shape[1]
seq_length

50

# Fit Model
We can now define and fit our language model on the training data.

The learned embedding needs to know the size of the vocabulary and the length of input sequences as previously discussed. It also has a parameter to specify how many dimensions will be used to represent each word. That is, the size of the embedding vector space.

Common values are 50, 100, and 300. We will use 50 here, but consider testing smaller or larger values.

We will use a two LSTM hidden layers with 100 memory cells each. More memory cells and a deeper network may achieve better results.

A dense fully connected layer with 100 neurons connects to the LSTM hidden layers to interpret the features extracted from the sequence. The output layer predicts the next word as a single vector the size of the vocabulary with a probability for each word in the vocabulary. A softmax activation function is used to ensure the outputs have the characteristics of normalized probabilities.

__input_length:__ Length of input sequences, when it is constant. This argument is required if you are going to connect Flatten then Dense layers upstream (without it, the shape of the dense outputs cannot be computed).


In [58]:
# Model creation
model = Sequential()
# input dimensions: vocab_size
# output dimensions: vector_size
model.add(Embedding(vocab_size, 75, input_length = seq_length))
model.add(Bidirectional(LSTM(100, return_sequences = True)))
model.add(Dropout(.25))
model.add(Bidirectional(LSTM(100, return_sequences = True)))
model.add(Dropout(.5))
model.add(Dense(100, activation = 'relu'))
model.add(Dropout(.25))
model.add(Dense(vocab_size, activation = 'softmax'))
model.summary()

Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 50, 75)            768600    
_________________________________________________________________
bidirectional_12 (Bidirectio (None, 50, 200)           140800    
_________________________________________________________________
dropout_7 (Dropout)          (None, 50, 200)           0         
_________________________________________________________________
bidirectional_13 (Bidirectio (None, 50, 200)           240800    
_________________________________________________________________
dropout_8 (Dropout)          (None, 50, 200)           0         
_________________________________________________________________
dense_7 (Dense)              (None, 50, 100)           20100     
_________________________________________________________________
dropout_9 (Dropout)          (None, 50, 100)         

In [None]:
batches = 128
epochs = 100
checkpoint = ModelCheckpoint(filepath = r'E:\Documents\My Projects\Text Generation\Models\BiLSTM_Language_Generation.hdf5',
                            verbose = 1, save_best_only=True)
stopping = EarlyStopping(monitor='val_loss',patience=5)

callbacks = [learning,checkpoint,stopping]