In [68]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import sys
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [70]:
# Read the whole corpus into memory. The context manager closes the file
# handle as soon as the text has been read instead of leaving it open for
# the lifetime of the kernel.
with open('../input/frankenstein-2/frankenstein-2.txt') as corpus_file:
    file = corpus_file.read()

In [71]:
# Tokenize words from the data
# Standardization
def tokenize_words(input, separator=""):
  """Lower-case the text, split it into word tokens and drop English stop words.

  Args:
    input: Raw text to normalise. The name shadows the builtin but is kept
      so that existing keyword callers continue to work.
    separator: String placed between the surviving tokens. The default ""
      keeps the original behaviour of concatenating the words with nothing
      between them, so word boundaries are not present in the result; pass
      " " to keep the words apart.

  Returns:
    One string containing the remaining tokens joined by `separator`.
  """
  input = input.lower()
  tokenizer = RegexpTokenizer(r'\w+')
  tokens = tokenizer.tokenize(input)
  # Build the stop-word collection once. It was previously re-evaluated for
  # every token inside the filter, and membership was tested against a list.
  stop_words = set(stopwords.words('english'))
  return separator.join(token for token in tokens if token not in stop_words)
processed_inputs = tokenize_words(file)

In [72]:
# Build the sorted character vocabulary, then the lookup that encodes each
# character as its position in that vocabulary.
chars = sorted(set(processed_inputs))
char_to_nums = {symbol: code for code, symbol in enumerate(chars)}

In [73]:
# Sanity check on the encoding step: report the size of the processed text
# (in characters) and the number of distinct characters found in it.
vocab_len = len(chars)
input_len = len(processed_inputs)
print("Total number of characters : ", input_len)
print("Total Vocab : ", vocab_len)

Total number of characters :  220931
Total Vocab :  37


In [74]:
# Window size: every training sample is this many consecutive characters
seq_lenght = 100
# Accumulators for the encoded input windows and their next-character targets
x_data, y_data = [], []

In [75]:
# Slide a window of `seq_lenght` characters over the text one step at a time;
# each window is an input sample and the character right after it is the target.
# Start from empty lists so that re-running this cell cannot append a second
# copy of every pattern to lists already filled by an earlier run.
x_data = []
y_data = []
# Encode the whole text once; every window is then a plain slice of the
# encoded list instead of a fresh per-character dictionary lookup.
encoded_inputs = [char_to_nums[char] for char in processed_inputs]
for i in range(0, input_len - seq_lenght, 1):
  x_data.append(encoded_inputs[i:i + seq_lenght])
  y_data.append(encoded_inputs[i + seq_lenght])
n_patterns = len(x_data)
print("Total Patterns : ", n_patterns)

Total Patterns :  220831


In [76]:
# Arrange the encoded windows as (n_patterns, seq_lenght, 1) and divide the
# integer codes by the vocabulary size so the inputs are floats.
x = np.array(x_data).reshape(n_patterns, seq_lenght, 1) / float(vocab_len)

In [77]:
# one-hot encoding
# Pin the width to the vocabulary size. Without num_classes the width is
# inferred from the largest code present in y_data, which would shrink the
# output layer if the highest-numbered character never occurs as a target.
y = np_utils.to_categorical(y_data, num_classes=vocab_len)

In [78]:
# Creating the model
# Three stacked LSTM layers (256, 256, 128 units), each followed by 20%
# dropout, feeding a softmax layer with one unit per column of `y`.
model = Sequential([
    LSTM(256, input_shape=x.shape[1:], return_sequences=True),
    Dropout(0.2),
    LSTM(256, return_sequences=True),
    Dropout(0.2),
    LSTM(128),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])

In [79]:
# Compiling the model: Adam optimizer, categorical cross-entropy loss
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
)

In [80]:
# Saving weights
# The checkpoint monitors the training loss and, with save_best_only and
# mode 'min', only writes the file when that loss reaches a new minimum.
filepath = "model_weight_saved.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
desired_callbacks = [checkpoint]

In [81]:
# Fitting the model and Training
# 47 epochs in batches of 256, with the checkpoint callback attached.
model.fit(
    x,
    y,
    batch_size=256,
    epochs=47,
    callbacks=desired_callbacks,
)

Epoch 1/47

Epoch 00001: loss improved from inf to 2.92349, saving model to model_weight_saved.hdf5
Epoch 2/47

Epoch 00002: loss improved from 2.92349 to 2.90443, saving model to model_weight_saved.hdf5
Epoch 3/47

Epoch 00003: loss improved from 2.90443 to 2.87770, saving model to model_weight_saved.hdf5
Epoch 4/47

Epoch 00004: loss improved from 2.87770 to 2.83030, saving model to model_weight_saved.hdf5
Epoch 5/47

Epoch 00005: loss improved from 2.83030 to 2.76610, saving model to model_weight_saved.hdf5
Epoch 6/47

Epoch 00006: loss improved from 2.76610 to 2.69086, saving model to model_weight_saved.hdf5
Epoch 7/47

Epoch 00007: loss improved from 2.69086 to 2.63136, saving model to model_weight_saved.hdf5
Epoch 8/47

Epoch 00008: loss improved from 2.63136 to 2.57415, saving model to model_weight_saved.hdf5
Epoch 9/47

Epoch 00009: loss improved from 2.57415 to 2.52503, saving model to model_weight_saved.hdf5
Epoch 10/47

Epoch 00010: loss improved from 2.52503 to 2.47657, sav

<keras.callbacks.History at 0x7fedb05213d0>

In [82]:
# Recompile model with the saved weights
# Load the checkpointed weights from disk, then compile with the same loss
# and optimizer settings used for training.
filename = 'model_weight_saved.hdf5'
model.load_weights(filename)
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
)

In [83]:
# Inverse lookup (integer code -> character) used to turn model output
# back into text
num_to_char = dict(enumerate(chars))

In [89]:
# Random seed to help generate
# randint's upper bound is exclusive, so passing len(x_data) makes every
# pattern (including the last one) eligible as the starting window.
start = np.random.randint(0, len(x_data))
# Work on a copy so that later changes to `pattern` cannot alter the
# training window stored inside x_data.
pattern = list(x_data[start])
print('Random Seed: ')
print("\"",''.join([num_to_char[value] for value in pattern]),"", "\"")

Random Seed: 
" inquishedpublicfunctionsimmediatelyunionsoughtpleasantclimateitalychangesceneinterestattendanttourla  "


In [90]:
# Generate the text
# At every step the current window is fed to the model, the highest-scoring
# character is emitted, and the window slides forward by one position.
for i in range(1000):
    # Use a dedicated name so the training tensor `x` is not overwritten
    x_input = np.reshape(pattern, (1, len(pattern), 1))
    x_input = x_input / float(vocab_len)
    prediction = model.predict(x_input, verbose = 0)
    index = int(np.argmax(prediction))
    result = num_to_char[index]
    sys.stdout.write(result)
    # Build a new window instead of appending in place: `pattern` may still
    # be the very list stored in x_data, and append() would lengthen it.
    pattern = pattern[1:] + [index]

besenderedsearonsendeavouredsearonsendersaidconsiderableseveralhourssenderedsearonsendeavouredsearonsendersaidconsiderableseveralhourssenderedsearonsendeavouredsearonsendersaidconsiderableseveralhourssenderedsearonsendeavouredsearonsendersaidconsiderableseveralhourssenderedsearonsendeavouredsearonsendersaidconsiderableseveralhourssenderedsearonsendeavouredsearonsendersaidconsiderableseveralhourssenderedsearonsendeavouredsearonsendersaidconsiderableseveralhourssenderedsearonsendeavouredsearonsendersaidconsiderableseveralhourssenderedsearonsendeavouredsearonsendersaidconsiderableseveralhourssenderedsearonsendeavouredsearonsendersaidconsiderableseveralhourssenderedsearonsendeavouredsearonsendersaidconsiderableseveralhourssenderedsearonsendeavouredsearonsendersaidconsiderableseveralhourssenderedsearonsendeavouredsearonsendersaidconsiderableseveralhourssenderedsearonsendeavouredsearonsendersaidconsiderableseveralhourssenderedsearonsendeavouredsearonsendersaidconsiderableseveralhourssendered