In [1]:
# import dependencies
import numpy
import sys
import nltk
nltk.download('stopwords')
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint 

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
# load data
# Read the whole input text (a plain-text book from Project Gutenberg) into memory.
# The `with` block closes the file handle as soon as the text has been read.
# NOTE(review): assumes the downloaded .txt is UTF-8 encoded -- adjust `encoding` if your copy differs.
with open("frankenstein-2.txt", encoding="utf-8") as text_file:
    file = text_file.read()

In [3]:
# tokenization
# standardization
# what is tokenization? Tokenization is the process of breaking a stream of text up into words phrases symbols or other meaningful elements
def tokenize_words(input):
    """Lowercase the text, split it into word tokens and drop English stop words.

    Parameters
    ----------
    input : str
        Raw text to normalise. The name shadows the ``input`` builtin inside
        this function; it is kept so existing callers are unaffected.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    # lowercase everything to standardize it
    text = input.lower()

    # r'\w+' keeps runs of word characters, so punctuation and whitespace are discarded
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(text)

    # Fetch the stop-word list once and keep it as a set: the membership test
    # below runs for every token, so it should not re-fetch the list or scan it linearly.
    stop_words = set(stopwords.words('english'))

    # keep only the tokens that are not stop words
    return " ".join(token for token in tokens if token not in stop_words)

In [4]:
# Run the raw text through tokenize_words: the result is one lowercase string of the kept tokens.
processed_inputs = tokenize_words(file)

In [5]:
# chars to numbers
# The network works on numbers, so every distinct character gets an integer id.
# `chars` is the sorted list of unique characters in the processed text;
# `char_to_num` maps each character to its position in that list.
chars = sorted(set(processed_inputs))
char_to_num = {char: index for index, char in enumerate(chars)}

In [6]:
# sanity check
# Print the size of the processed text and of the character vocabulary
# to confirm that the preprocessing above produced something sensible.
input_len = len(processed_inputs)
vocab_len = len(chars)
print("Total number of characters:", input_len)
print("Total vocab:", vocab_len)

Total number of characters: 269566
Total vocab: 38


In [7]:
# seq length
# Number of characters in one training sequence (one input sample).
seq_length = 100
# Containers for the integer-encoded input sequences and their target characters.
x_data, y_data = [], []

In [8]:
# build the training sequences
# Slide a window of seq_length characters over the text, one character at a time.
# Each window becomes one input sample and the character right after it is its label.
# The lists are re-created here so that re-running this cell does not append
# the same sequences a second time.
x_data = []
y_data = []

# encode the whole text once instead of re-encoding every overlapping window
encoded_text = [char_to_num[char] for char in processed_inputs]

for i in range(0, input_len - seq_length):
    # input: the seq_length characters starting at position i
    x_data.append(encoded_text[i:i + seq_length])
    # output: the single character that follows that window
    y_data.append(encoded_text[i + seq_length])

In [9]:
# number of (input sequence, next character) training pairs that were built
n_patterns = len(x_data)
print("Total Patterns:", n_patterns)

Total Patterns: 269466


In [10]:
# Turn the list of sequences into an array of shape (n_patterns, seq_length, 1),
# then scale the integer ids by the vocabulary size.
X = numpy.asarray(x_data).reshape(n_patterns, seq_length, 1)
X = X / float(vocab_len)

In [11]:
# one-hot encoding our label data
# num_classes is given explicitly so the encoding always has one column per
# vocabulary character, even if the highest-numbered character never occurs as a label.
y = np_utils.to_categorical(y_data, num_classes=vocab_len)

In [12]:
# creating the model
# A stack of three LSTM layers, each followed by dropout (used to reduce overfitting),
# ending in a softmax layer with one output per label column of y.
model = Sequential([
    LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True),
    Dropout(0.2),
    LSTM(256, return_sequences=True),
    Dropout(0.2),
    LSTM(128),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])

In [13]:
# compile the model: categorical cross-entropy loss, Adam optimizer
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [14]:
# saving weights
# The checkpoint callback watches the training loss and, with save_best_only=True,
# writes the model to `filepath` only when the loss improves (mode='min').
filepath = "model_weights_saved.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    verbose=1,
    save_best_only=True,
    mode='min',
)
desired_callbacks = [checkpoint]

In [15]:
# train the model; the checkpoint callback saves weights along the way
model.fit(
    X,
    y,
    epochs=4,
    batch_size=256,
    callbacks=desired_callbacks,
)

Epoch 1/4

Epoch 00001: loss improved from inf to 2.87036, saving model to model_weights_saved.hdf5
Epoch 2/4

Epoch 00002: loss improved from 2.87036 to 2.60035, saving model to model_weights_saved.hdf5
Epoch 3/4

Epoch 00003: loss improved from 2.60035 to 2.44685, saving model to model_weights_saved.hdf5
Epoch 4/4

Epoch 00004: loss improved from 2.44685 to 2.32681, saving model to model_weights_saved.hdf5


<keras.callbacks.History at 0x7f13756e4590>

In [16]:
# load the saved weights back into the model, then compile it again
filename = "model_weights_saved.hdf5"
model.load_weights(filename)
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [17]:
# reverse lookup: integer id -> character, used to turn model output back into text
num_to_char = dict(enumerate(chars))

In [18]:
# random seed to help generate
# Pick one training sequence at random to prime the generator.
# randint's upper bound is exclusive, so len(x_data) makes every sequence eligible.
start = numpy.random.randint(0, len(x_data))
# Copy the chosen sequence so that later in-place edits of `pattern`
# can never alter the training data stored in x_data.
pattern = list(x_data[start])
print("Random Seed:")
print("\"", ''.join([num_to_char[value] for value in pattern]), "\"")

Random Seed:
" ay fervour warmed gladly would sacrifice fortune existence every hope furtherance enterprise one man "


In [19]:
# generate the text
# Repeatedly predict the next character from the current window, print it,
# then slide the window forward by one character.
for i in range(1000):
    # shape the window like a single training sample and scale it the same way
    x = numpy.reshape(pattern, (1, len(pattern), 1))
    x = x / float(vocab_len)
    prediction = model.predict(x, verbose=0)

    # greedy decoding: always take the most probable character
    index = int(numpy.argmax(prediction))
    result = num_to_char[index]

    sys.stdout.write(result)

    # Build a new list rather than appending in place, so the list that
    # `pattern` originally referred to is never mutated.
    pattern = pattern[1:] + [index]

 seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare sea

Does this seem somewhat disappointing? Yes, the generated text doesn't make any sense, and it quickly falls into repeating the same pattern. This happens because the generator always picks the single most likely next character (argmax), so an under-trained model easily gets stuck in a loop. However, the longer you train the network, the better the generated text will be.

For instance, when the number of training epochs was increased to 20, the output looked more like this:

"ligther my paling the same been the this manner to the forter the shempented and the had an ardand the verasion the the dears conterration of the astore"

The model is now generating actual words, even if most of it still doesn't make sense. Still, for only around 100 lines of code, it isn't bad.

***Conclusion***

You'll want to increase the number of training epochs to improve the network's performance. However, you may also want to use either a deeper neural network (add more layers to the network) or a wider network (increase the number of neurons/memory units) in the layers.

You could also try adjusting the batch size, one-hot encoding the inputs, padding the input sequences, or combining any number of these ideas.