In [69]:
# import dependencies
import numpy
import sys
import nltk
nltk.download('stopwords')
from tensorflow.python import keras 
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM
from tensorflow.keras import utils
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.utils import to_categorical

[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Load data
# Read the input corpus (a Project Gutenberg text) from a plain-text file.
# The context manager closes the file handle as soon as the text has been read.
with open("frankenstein.txt") as corpus_file:
    file = corpus_file.read()

In [106]:
#tokenization
# standardization
def tokenize_words(input):
    """Lowercase the text, split it into word tokens and drop English stop words.

    Args:
        input: The raw text to clean, as a single string.

    Returns:
        One string holding the remaining tokens joined by single spaces.
    """
    # Look the stop words up once and keep them in a set; evaluating
    # stopwords.words('english') inside the filter repeated the lookup
    # for every single token.
    stop_words = set(stopwords.words('english'))
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(input.lower())
    return " ".join(token for token in tokens if token not in stop_words)

# Clean the whole corpus; the result is one string of space-separated tokens
processed_inputs = tokenize_words(file)

In [107]:
# Characters to numbers: give every distinct character of the cleaned text
# an integer id, assigned in sorted order
chars = sorted(set(processed_inputs))
char_to_num = {char: index for index, char in enumerate(chars)}

In [108]:
# Sanity check: report the length of the cleaned text and the number of
# distinct characters found in it
input_len, vocab_len = len(processed_inputs), len(chars)
print("Total number of characters:", input_len)
print("Total vocab:", vocab_len)

Total number of characters: 269566
Total vocab: 38


In [109]:
# Number of characters in each input sequence, plus empty containers
# for the input patterns and their target characters
seq_length = 100
x_data, y_data = [], []

In [110]:
# Slide a window of seq_length characters over the text: each window is one
# input pattern and the character right after it is the target.
# The lists are rebuilt from scratch in this cell, so running it again does
# not append a second copy of every pattern.
encoded_text = [char_to_num[char] for char in processed_inputs]
window_starts = range(input_len - seq_length)
x_data = [encoded_text[i:i + seq_length] for i in window_starts]
y_data = [encoded_text[i + seq_length] for i in window_starts]

n_patterns = len(x_data)
print("Total Patterns:", n_patterns)

Total Patterns: 269466


In [111]:
# Convert the input patterns to an array of shape (n_patterns, seq_length, 1)
# and scale the character ids by the vocabulary size
X = numpy.reshape(x_data, (n_patterns, seq_length, 1)) / float(vocab_len)

In [112]:
# One-hot encode the target characters.
# num_classes is given explicitly so the encoding always has one column per
# vocabulary entry, even if the highest character id never occurs as a target.
y = utils.to_categorical(y_data, num_classes=vocab_len)

In [113]:
# Build the network: two stacked LSTM layers, each followed by dropout,
# and a softmax layer with one output per column of y
model = Sequential([
    LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True),
    Dropout(0.2),
    LSTM(128),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])

In [114]:
# Compile the model with the Adam optimizer and categorical cross-entropy loss
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
)

In [115]:
# Checkpointing: write the model to disk whenever the monitored loss
# reaches a new minimum
filepath = "model_weights_saved.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
desired_callbacks = [checkpoint]

In [125]:
# Fit the model and let it train.
# The returned History object is kept so the per-epoch losses can be inspected
# afterwards, and so its repr is not echoed as the cell's output.
history = model.fit(X, y, epochs=50, batch_size=256, callbacks=desired_callbacks)

Epoch 1/50

Epoch 00001: loss improved from 2.47797 to 2.38557, saving model to model_weights_saved.hdf5
Epoch 2/50

Epoch 00002: loss improved from 2.38557 to 2.30750, saving model to model_weights_saved.hdf5
Epoch 3/50

Epoch 00003: loss improved from 2.30750 to 2.23702, saving model to model_weights_saved.hdf5
Epoch 4/50

Epoch 00004: loss improved from 2.23702 to 2.17440, saving model to model_weights_saved.hdf5
Epoch 5/50

Epoch 00005: loss improved from 2.17440 to 2.12601, saving model to model_weights_saved.hdf5
Epoch 6/50

Epoch 00006: loss improved from 2.12601 to 2.08226, saving model to model_weights_saved.hdf5
Epoch 7/50

Epoch 00007: loss improved from 2.08226 to 2.04852, saving model to model_weights_saved.hdf5
Epoch 8/50

Epoch 00008: loss improved from 2.04852 to 2.01616, saving model to model_weights_saved.hdf5
Epoch 9/50

Epoch 00009: loss improved from 2.01616 to 1.99033, saving model to model_weights_saved.hdf5
Epoch 10/50

Epoch 00010: loss improved from 1.99033 to

<tensorflow.python.keras.callbacks.History at 0x7f76a4e49130>

In [126]:
# Load the weights saved in the checkpoint file back into the model, then
# compile it again with the same loss and optimizer used for training
filename = "model_weights_saved.hdf5"
model.load_weights(filename)
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [127]:
# Reverse lookup (integer id -> character) used to turn model output back into text
num_to_char = dict(enumerate(chars))

In [128]:
# Pick a random training pattern as the seed for text generation.
# randint's upper bound is exclusive, so len(x_data) lets every pattern,
# including the last one, be chosen.
start = numpy.random.randint(0, len(x_data))
# Work on a copy so that extending the pattern later cannot change x_data
pattern = list(x_data[start])
print("Random Seed: ")
print("\"", ''.join([num_to_char[value] for value in pattern]), "\"")

Random Seed: 
" ated lighthearted gaiety boyhood winds whispered soothing accents maternal nature bade weep kindly i "


In [129]:
# Generate 1000 characters, one at a time, feeding each prediction back in
for i in range(1000):
    # Shape the current window as (1, len(pattern), 1) and scale it the same
    # way as the training inputs
    x = numpy.reshape(pattern, (1, len(pattern), 1))
    x = x / float(vocab_len)
    prediction = model.predict(x, verbose=0)
    # Greedy decoding: always take the most probable next character
    index = int(numpy.argmax(prediction))
    result = num_to_char[index]
    sys.stdout.write(result)
    # Slide the window forward by building a new list; appending in place
    # would also modify the training pattern that `pattern` was taken from
    pattern = pattern[1:] + [index]

npuired sea could seen searon searon sea considerable searon searon sea considerable searon searon searon sea considerable season searon searon searon searon searon sea considerable season searon searon sea considerable season searon searon searon searon searon sea considerable season searon searon sea considerable season searon searon searon searon searon sea considerable season searon searon sea considerable season searon searon searon searon searon sea considerable season searon searon sea considerable season searon searon searon searon searon sea considerable season searon searon sea considerable season searon searon searon searon searon sea considerable season searon searon sea considerable season searon searon searon searon searon sea considerable season searon searon sea considerable season searon searon searon searon searon sea considerable season searon searon sea considerable season searon searon searon searon searon sea considerable season searon searon sea considerable seas