In [56]:
import numpy
import sys
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import ModelCheckpoint
import nltk

# Make sure the NLTK stop-word corpus that tokenize_words reads is available locally.
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [57]:
# Load the raw corpus text.
# The context manager closes the file handle as soon as the text has been read;
# a bare open(...).read() leaves the handle open until it is garbage-collected.
with open("frankenstein.txt") as corpus_file:
    file = corpus_file.read()

In [58]:
def tokenize_words(input):
    """Normalise raw text into one lower-case string of non-stop-word tokens.

    Parameters
    ----------
    input : str
        Raw text. The name shadows the built-in ``input``; it is kept so that
        existing keyword callers keep working.

    Returns
    -------
    str
        The surviving tokens concatenated with an empty separator, so the
        result contains no whitespace between words.
    """
    lowered = input.lower()
    # Tokens are maximal runs of word characters; everything else is discarded.
    tokens = RegexpTokenizer(r'\w+').tokenize(lowered)
    # Fetch the stop-word list once per call and keep it in a set. The previous
    # lambda re-evaluated stopwords.words('english') for every single token.
    stop_words = set(stopwords.words('english'))
    # NOTE(review): tokens are joined with "" (not " "), so word boundaries are
    # lost in the returned string -- confirm this is intended.
    return "".join(token for token in tokens if token not in stop_words)

# Clean the whole corpus once; the vocabulary and training patterns are built from this string.
processed_inputs = tokenize_words(file)


In [59]:
# Character vocabulary (sorted, unique) and the char -> integer code lookup.
chars = sorted(set(processed_inputs))
char_to_num = {symbol: code for code, symbol in enumerate(chars)}

In [60]:
# Sanity check: report the size of the cleaned corpus and of the character vocabulary.
input_len, vocab_len = len(processed_inputs), len(chars)
print("Total number of characters:", input_len)
print("Total vocab:", vocab_len)

Total number of characters: 232972
Total vocab: 37


In [61]:
# Window length (characters per training input) and the containers that hold
# the encoded input windows and their next-character targets.
seq_length = 100
x_data, y_data = [], []

In [62]:
# Build the sliding-window training patterns.
# Each input is seq_length consecutive characters and its target is the single
# character that follows the window; both are stored as integer codes.
# x_data / y_data are rebuilt from scratch here, so re-running this cell
# replaces the patterns instead of appending a second copy of every one.
# The corpus is encoded once up front rather than once per window.
encoded_inputs = [char_to_num[char] for char in processed_inputs]
window_starts = range(0, input_len - seq_length)
x_data = [encoded_inputs[begin:begin + seq_length] for begin in window_starts]
y_data = [encoded_inputs[begin + seq_length] for begin in window_starts]
n_patterns = len(x_data)
print("Total Patterns:", n_patterns)

Total Patterns: 232872


In [63]:
# Shape the integer patterns as (n_patterns, seq_length, 1) and divide every
# character code by the vocabulary size.
X = numpy.reshape(x_data, (n_patterns, seq_length, 1)) / float(vocab_len)

In [64]:
from tensorflow.keras.utils import to_categorical

# One-hot encode the next-character targets.
# num_classes is pinned to the vocabulary size so the width of y (and of the
# softmax layer sized from y.shape[1]) does not depend on which character
# codes happen to occur in y_data.
y = to_categorical(y_data, num_classes=vocab_len)


In [65]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, LSTM, Dropout, Dense

# Three stacked LSTM layers, each followed by 20% dropout, ending in a softmax
# layer whose width equals the number of one-hot target columns.
# The first two LSTMs return full sequences so the next LSTM receives one
# vector per time step.
model = Sequential([
    Input(shape=(X.shape[1], X.shape[2])),
    LSTM(256, return_sequences=True),
    Dropout(0.2),
    LSTM(256, return_sequences=True),
    Dropout(0.2),
    LSTM(128),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])


In [66]:
# compile the model: categorical cross-entropy against the one-hot targets, Adam optimizer
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [67]:
# Checkpoint callback: write the model to disk each time the training loss
# reaches a new minimum.
filepath = "model_weights_saved.keras"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
desired_callbacks = [checkpoint]


In [54]:
# fit model and let it train
# NOTE: in the recorded log each epoch took roughly 2 to 11 hours (7435s-39520s),
# so a full re-run of this cell is expensive.
model.fit(X, y, epochs=4, batch_size=256, callbacks=desired_callbacks)

Epoch 1/4
[1m1820/1820[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6s/step - loss: 2.9402      
Epoch 1: loss improved from None to 2.92042, saving model to model_weights_saved.keras
[1m1820/1820[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11559s[0m 6s/step - loss: 2.9204
Epoch 2/4
[1m1820/1820[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4s/step - loss: 2.8891     
Epoch 2: loss improved from 2.92042 to 2.87057, saving model to model_weights_saved.keras
[1m1820/1820[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7435s[0m 4s/step - loss: 2.8706
Epoch 3/4
[1m1820/1820[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22s/step - loss: 2.8059     
Epoch 3: loss improved from 2.87057 to 2.77434, saving model to model_weights_saved.keras
[1m1820/1820[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39520s[0m 22s/step - loss: 2.7743
Epoch 4/4
[1m1820/1820[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18s/step - loss: 2.6654     
Epoch 4: loss improve

<keras.src.callbacks.history.History at 0x1861796b0e0>

In [70]:
# Restore the weights from the file written by the ModelCheckpoint callback
# (same path as `filepath` in the checkpoint cell).
filename = "model_weights_saved.keras"
model.load_weights(filename)
# Re-compile with the same loss and optimizer that were used for training.
model.compile(loss='categorical_crossentropy', optimizer='adam')


In [71]:
# Inverse lookup (integer code -> character) for decoding model predictions.
num_to_char = dict(enumerate(chars))

In [72]:
# Pick a random training pattern to seed text generation.
# numpy.random.randint excludes its upper bound, so len(x_data) makes every
# pattern (including the last one) selectable and also works when there is
# only a single pattern.
start = numpy.random.randint(0, len(x_data))
# Work on a copy so that updating the window during generation can never
# alter the training pattern stored in x_data.
pattern = list(x_data[start])
print("Random Seed:")
print("\"", ''.join([num_to_char[value] for value in pattern]), "\"")

Random Seed:
" lekeepmenwishedaskthousandquestionswouldallowtormentedidlecuriositystatebodymindwhoserestorationevid "


In [73]:
# Generate 1000 characters: predict the next character from the current
# window, emit it, then slide the window forward by one position.
for i in range(1000):
    x = numpy.reshape(pattern, (1, len(pattern), 1))
    x = x / float(vocab_len)  # same scaling that was applied to the training inputs
    prediction = model.predict(x, verbose=0)
    # argmax always takes the single most likely character (no sampling).
    index = int(numpy.argmax(prediction))
    result = num_to_char[index]
    sys.stdout.write(result)
    # Build a new list instead of appending in place, so that a window list
    # shared with x_data is never modified.
    pattern = pattern[1:] + [index]

rearedareseatedeeartedeareseatedeeartedeareseatedeeartedeareseatedeeartedeareseatedeeartedeareseatedeeartedeareseatedeeartedeareseatedeeartedeareseatedeeartedeareseatedeeartedeareseatedeeartedeareseatedeeartedeareseatedeeartedeareseatedeeartedeareseatedeeartedeareseatedeeartedeareseatedeeartedeareseatedeeartedeareseatedeeartedeareseatedeeartedeareseatedeeartedeareseatedeeartedeareseatedeeartedeareseatedeeartedeareseatedeeartedeareseatedeeartedeareseatedeeartedeareseatedeeartedeareseatedeeartedeareseatedeeartedeareseatedeeartedeareseatedeeartedeareseatedeeartedeareseatedeeartedeareseatedeeartedeareseatedeeartedeareseatedeeartedeareseatedeeartedeareseatedeeartedeareseatedeeartedeareseatedeeartedeareseatedeeartedeareseatedeeartedeareseatedeeartedeareseatedeeartedeareseatedeeartedeareseatedeeartedeareseatedeeartedeareseatedeeartedeareseatedeeartedeareseatedeeartedeareseatedeeartedeareseatedeeartedeareseatedeeartedeareseatedeeartedeareseatedeeartedeareseatedeeartedeareseatedeeartedeareseate