In [None]:
# Importing dependencies
import numpy
import sys
import nltk
import tensorflow
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, Input
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import ModelCheckpoint

In [None]:
# Load data
# Read the whole corpus into memory. The context manager closes the file
# handle as soon as the read finishes instead of leaving it open.
# NOTE(review): no explicit encoding is passed, so the platform default is
# used -- confirm the encoding of frankenstein.txt.
with open('frankenstein.txt') as corpus_file:
    file = corpus_file.read()

In [None]:
# Tokenization
# Standardization
def tokenize_words(input):
    """Lower-case the text, drop English stop words and return the cleaned text.

    Args:
        input: Raw text as a single string. (The name shadows the builtin
            ``input``; it is kept so keyword callers keep working.)

    Returns:
        A single string made of the remaining ``\\w+`` tokens separated by one
        space each. Punctuation is discarded by the tokenizer pattern. The
        separator keeps word boundaries in the character stream, so a
        character-level model can learn where words end.
    """
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(input.lower())
    # Build the stop-word lookup once; evaluating stopwords.words() inside the
    # per-token filter repeats the whole lookup for every token in the corpus.
    stop_words = set(stopwords.words('english'))
    return " ".join(token for token in tokens if token not in stop_words)
# Clean the whole corpus in one pass; the cells below treat the resulting
# string as one long stream of characters.
processed_inputs = tokenize_words(file)

In [None]:
# Character vocabulary and its integer encoding.
# Sorting the distinct characters gives every character a stable index.
chars = sorted(set(processed_inputs))
char_to_num = {char: idx for idx, char in enumerate(chars)}

In [None]:
# Sanity check: report the length of the cleaned text (in characters) and the
# number of distinct characters that were mapped to integers.
input_len, vocab_len = len(processed_inputs), len(chars)
print("Total number of characters", input_len)
print("Total vocab", vocab_len)

In [None]:
# Window size: how many consecutive characters form one input sample.
seq_length = 100
# Containers for the encoded input windows and their next-character targets.
x_data, y_data = [], []

In [None]:
# Build the training samples with a sliding window of stride 1: each run of
# `seq_length` characters is one input, and the character right after it is
# the target.
# Both lists are rebuilt from scratch here, so re-running this cell does not
# append a second copy of every pattern to lists left over from an earlier run.
# The text is encoded once up front instead of once per window.
encoded_text = [char_to_num[char] for char in processed_inputs]
window_starts = range(len(encoded_text) - seq_length)
x_data = [encoded_text[i:i + seq_length] for i in window_starts]
y_data = [[encoded_text[i + seq_length]] for i in window_starts]

n_patterns = len(x_data)
print("Total Patterns:", n_patterns)

In [None]:
# Shape the encoded windows as (n_patterns, seq_length, 1) and divide by the
# vocabulary size so every input value is scaled into [0, 1).
X = numpy.asarray(x_data).reshape(n_patterns, seq_length, 1) / float(vocab_len)

In [None]:
# One-hot encoding
# Pin the number of classes to the vocabulary size. Without it the width is
# inferred from the largest target index present, which would come out short
# if the highest-index character never occurs as a target.
Y = to_categorical(y_data, num_classes=vocab_len)

In [None]:
# Network: three stacked LSTM layers (256, 256, 128 units), each followed by
# 20% dropout, ending in a softmax over the one-hot classes of Y.
# The first two LSTMs return full sequences so the next LSTM receives one
# vector per time step.
model = Sequential()
model.add(Input(shape=X.shape[1:]))
model.add(LSTM(256, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(256, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(128))
model.add(Dropout(0.2))
model.add(Dense(Y.shape[1], activation='softmax'))

In [None]:
# Training setup: Adam optimizer with categorical cross-entropy, which matches
# the one-hot targets and the softmax output layer.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
)

In [None]:
# Checkpointing: write the model to `filepath` whenever the monitored training
# loss reaches a new minimum.
filepath = 'model_weights_saved.keras'
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
desired_callbacks = [checkpoint]

In [None]:
# Train for a single epoch in batches of 256 windows; the checkpoint callback
# saves the model during training.
model.fit(
    X,
    Y,
    batch_size=256,
    epochs=1,
    callbacks=desired_callbacks,
)

In [None]:
# Restore the weights from the checkpoint file written during training, then
# compile again with the same optimizer and loss as before.
filename = "model_weights_saved.keras"
model.load_weights(filename)
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
)

In [None]:
# Reverse lookup (index -> character) for decoding the model's predictions.
num_to_char = dict(enumerate(chars))

In [None]:
# Pick a random training window to prime the generator.
# randint's upper bound is exclusive, so len(x_data) makes every pattern,
# including the last one, selectable and also works when only one pattern exists.
start = numpy.random.randint(0, len(x_data))
# Copy the window: the generation cell extends `pattern`, and working on the
# original list would add elements to a row of the training data.
pattern = list(x_data[start])
print("Random seed: ")
print("\"", ''.join(num_to_char[value] for value in pattern), "\"")

In [None]:
# Generate 1000 characters greedily: feed the current window to the model,
# emit the most probable next character, then slide the window forward by one.
for step in range(1000):
    # Same shape and scaling as the training inputs: (1, window, 1), / vocab size.
    x = numpy.reshape(pattern, (1, len(pattern), 1)) / float(vocab_len)
    prediction = model.predict(x, verbose=0)
    # Plain int keeps `pattern` a list of Python ints like the training windows.
    index = int(numpy.argmax(prediction))
    result = num_to_char[index]
    sys.stdout.write(result)
    # Build a new list instead of appending in place, so a list shared with
    # x_data (the seed window) is never modified.
    pattern = pattern[1:] + [index]