In [9]:
# import dependencies
import numpy 
import sys
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint

In [10]:
# loading dataset
file = open("Frankenstein-dataset.txt",encoding="utf8").read()

In [11]:
# tokenization
# standardization 
def tokenize_words(input):
    input = input.lower()
    tokenizer = RegexpTokenizer(r'\w+')
    tokens =  tokenizer.tokenize(input)
    filtered = filter(lambda token: token not in stopwords.words('english'),tokens)
    return "".join(filtered)

processed_inputs = tokenize_words(file)

In [13]:
# chars to numbers
chars = sorted(list(set(processed_inputs)))
char_to_num = dict((c,i) for i,c in enumerate(chars))

In [14]:
# check if words to chars or chars to num (?!) has worked?
input_len = len(processed_inputs)
vocab_len = len(chars)
print ("Total number of characters:", input_len)
print ("Total vocab:", vocab_len)

Total number of characters: 7038
Total vocab: 27


In [15]:
#seq length
seq_length = 100
x_data = []
y_data = []

In [17]:
# loop through the sequence
for i in range(0, input_len - seq_length, 1):
    in_seq = processed_inputs[i:i + seq_length]
    out_seq = processed_inputs[i + seq_length]
    x_data.append([char_to_num[char] for char in in_seq])
    y_data.append(char_to_num[out_seq])
n_patterns = len(x_data)
print ("Total Patterns:", n_patterns)

Total Patterns: 13876


In [19]:
# convert input sequence to np array and so on
X = numpy.reshape(x_data, (n_patterns, seq_length, 1))
X = X/float(vocab_len)

In [21]:
# one-hot encoding
y = np_utils.to_categorical(y_data)

In [24]:
# creating the model
model = Sequential()
model.add(LSTM(256, input_shape = (X.shape[1], X.shape[2]), return_sequences= True))
model.add(Dropout(0.2))
model.add(LSTM(256, return_sequences = True))
model.add(Dropout(0.2))
model.add(LSTM(128))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation = 'softmax'))

In [26]:
# compile the model
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam')

In [28]:
#saving weights
filepath = "model_weights_saved.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor ='loss', verbose = 1, save_best_only = True, mode = 'min')
desired_callbacks = [checkpoint]

In [37]:
#fit model and let it train
model.fit(X,y, epochs = 100, batch_size = 256, callbacks= desired_callbacks)

Epoch 1/100
Epoch 00001: loss did not improve from 2.89670
Epoch 2/100
Epoch 00002: loss improved from 2.89670 to 2.89455, saving model to model_weights_saved.hdf5
Epoch 3/100
Epoch 00003: loss improved from 2.89455 to 2.89086, saving model to model_weights_saved.hdf5
Epoch 4/100
Epoch 00004: loss improved from 2.89086 to 2.89014, saving model to model_weights_saved.hdf5
Epoch 5/100
Epoch 00005: loss improved from 2.89014 to 2.88931, saving model to model_weights_saved.hdf5
Epoch 6/100
Epoch 00006: loss improved from 2.88931 to 2.88753, saving model to model_weights_saved.hdf5
Epoch 7/100
Epoch 00007: loss improved from 2.88753 to 2.88691, saving model to model_weights_saved.hdf5
Epoch 8/100
Epoch 00008: loss improved from 2.88691 to 2.88369, saving model to model_weights_saved.hdf5
Epoch 9/100
Epoch 00009: loss did not improve from 2.88369
Epoch 10/100
Epoch 00010: loss did not improve from 2.88369
Epoch 11/100
Epoch 00011: loss improved from 2.88369 to 2.87410, saving model to model_

<tensorflow.python.keras.callbacks.History at 0x249037d83c8>

In [38]:
# recompile the model with the saved weights
filename = "model_weights_saved.hdf5"
model.load_weights(filename)
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam')

In [39]:
#output of the model
num_to_char = dict((i,c) for i,c in enumerate(chars))

In [46]:
# random seed to help gerenate
start = numpy.random.randint(0, len(x_data)-1)
pattern = x_data[start]
print("Random Seed:")
print("\"",' '.join([num_to_char[value] for value in pattern]),"\"")

Random Seed:
" f o r m e d s u b j e c t e l e c t r i c i t y g a l v a n i s m n e w a s t o n i s h i n g s a i d t h r e w g r e a t l y s h a d e c o r n e l i u s a g r i p p a a l b e r t u s m a g n u s p a "


In [47]:
#generate the text
for i in range(2000):
    x = numpy.reshape(pattern, (1,len(pattern), 1))
    x = x/float(vocab_len)
    prediction = model.predict(x, verbose=0)
    index = numpy.argmax(prediction)
    result = num_to_char[index]
    seq_in = [num_to_char[value] for value in pattern]
    sys.stdout.write(result)
    pattern.append(index)
    pattern = pattern[1:len(pattern)]

racelsuslordsimaginationfatalityoverthrowmendisinclinedpursueaccustomedstudiesseemednothingwouldcouldeverknownlongengagedattentionsuddenlygrewdespicableonecapricesmindperhapssubjectearlyyouthgaveformeroccupationssetnaturalhistoryprogenydeformedabortivecreationentertainedgreatestdisdainwouldsciencecouldneverevenstepwithinthresholdrealknowledgemoodmindbetookmathematicsbranchesstudyappertainingsciencebuiltuponsecurefoundationsworthyconsiderationthusstrangelysoulsconstructedslightligamentsboundprosperityruinlookbackseemsalmostmiraculouschangeinclinationimmediatesuggestionguardianangellifelasteffortmadespiritpreservationavertstormevenhangingstarsreadyenvelopvictoryannouncedunusualtranquillitygladnesssoulfollowedrelinquishingancientlatterlytormentingstudiesthustaughtassociateevilprosecutionhappinessdisregardstrongeffortspiritgoodineffectualdestinypotentimmutablelawsdecreedutterterribledestructionsartedmestrnateleattelsooeslousnatesponeenfuifecteriveeoeauresiestbrsoridsereseivertfalestepstale