In [72]:
import numpy
import sys
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import ModelCheckpoint
import nltk

In [74]:
file = open("frankenstein.txt").read()

In [76]:
def tokenize_words(input):
    input = input.lower()
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(input)
    filtered = filter(lambda token: token not in stopwords.words('english'), tokens)
    return "".join(filtered)

In [78]:
processed_inputs = tokenize_words(file)

In [79]:
chars = sorted(list(set(processed_inputs)))
char_to_num = dict((c, i) for i, c in enumerate(chars))

In [92]:
input_len = len(processed_inputs)
vocab_len = len(chars)
print(input_len)
print(vocab_len)

232972
37


In [94]:
seq_length = 100
x_data = []
y_data = []

In [96]:
for i in range(0, input_len - seq_length, 1):
    in_seg = processed_inputs[i:i + seq_length]
    out_seg = processed_inputs[i + seq_length]
    x_data.append([char_to_num[char] for char in in_seg])
    y_data.append(char_to_num[out_seg])

In [97]:
n_patterns = len(x_data)
print(n_patterns)

232872


In [98]:
X = numpy.reshape(x_data, (n_patterns, seq_length, 1))
X = X / float(vocab_len)
y = to_categorical(y_data)

In [102]:
model = Sequential()
model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(256, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(128))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation='softmax'))

In [104]:
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [108]:
filepath = "model_weights_saved.keras"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]

In [110]:
model.fit(X, y, epochs=4, batch_size=256, callbacks=callbacks_list)

Epoch 1/4
[1m910/910[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5s/step - loss: 2.9553
Epoch 1: loss improved from inf to 2.92848, saving model to model_weights_saved.keras
[1m910/910[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4964s[0m 5s/step - loss: 2.9552
Epoch 2/4
[1m910/910[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1s/step - loss: 2.9145
Epoch 2: loss improved from 2.92848 to 2.91129, saving model to model_weights_saved.keras
[1m910/910[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1091s[0m 1s/step - loss: 2.9145
Epoch 3/4
[1m910/910[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2s/step - loss: 2.9074
Epoch 3: loss improved from 2.91129 to 2.90194, saving model to model_weights_saved.keras
[1m910/910[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1482s[0m 2s/step - loss: 2.9074
Epoch 4/4
[1m910/910[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1s/step - loss: 2.8702
Epoch 4: loss improved from 2.90194 to 2.86032, saving model t

<keras.src.callbacks.history.History at 0x1b7b5c38a70>

In [117]:
model.load_weights(filepath)
model.compile(loss='categorical_crossentropy', optimizer='adam')
num_to_char = dict((i, c) for i, c in enumerate(chars))

In [119]:
start = numpy.random.randint(0, len(x_data) - 1)
pattern = x_data[start]
print("Random Seed:")

Random Seed:


In [121]:
print("\"" + ''.join([num_to_char[value] for value in pattern]) + "\"")

"wordsutteredprotectorsmeanwhilealsoblackgroundcoveredherbagegreenbanksinterspersedinnumerableflowers"


In [None]:
for i in range(1000):
    x = numpy.reshape(pattern, (1, len(pattern), 1))
    x = x / float(vocab_len)
    prediction = model.predict(x, verbose=0)
    index = numpy.argmax(prediction)
    result = num_to_char[index]
    sys.stdout.write(result)
    pattern.append(index)
    pattern = pattern[1:]

ereerererererererererererererererererererererererererererererererererererererererererererererererererererererererererererererererererererererererererererererererererererererererererererererere