In [55]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the whole /kaggle tree and print the full path of every file found,
# in the order os.walk visits them.
kaggle_file_paths = (
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('/kaggle')
    for name in names
)
for path in kaggle_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/lib/kaggle/gcp.py
/kaggle/input/dataset-text/frankenstein_2.txt
/kaggle/input/models-lstm/model_weights_saved_e10.hdf5
/kaggle/working/__notebook_source__.ipynb
/kaggle/output/kaggle/working/model_weights_saved_e20.hdf5
/kaggle/output/kaggle/working/model_weights_saved_e200.hdf5
/kaggle/output/kaggle/working/model_weights_saved_e100.hdf5


In [56]:
import numpy
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint

In [57]:
# Read the whole corpus into memory as one string. The context manager closes
# the file handle as soon as the text has been read, instead of leaving it
# open for the rest of the kernel session.
with open('../input/dataset-text/frankenstein_2.txt') as corpus_file:
    file = corpus_file.read()

In [58]:
def tokenize_words(input):
    """Lowercase a text, split it into word tokens and drop English stop words.

    Args:
        input: The raw text to clean (a string).

    Returns:
        A single string containing the surviving tokens, in their original
        order, separated by single spaces. An input with no surviving tokens
        yields the empty string.
    """
    # lowercase everything to standardize it
    lowered = input.lower()

    # split on runs of word characters; punctuation and whitespace are discarded
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(lowered)

    # Load the stop-word list exactly once and keep it in a set. Calling
    # stopwords.words() inside the per-token check re-evaluates it for every
    # token and then scans the resulting list linearly.
    stop_words = set(stopwords.words('english'))

    # keep only the tokens that are not stop words
    return " ".join(token for token in tokens if token not in stop_words)

In [59]:
# Fetch the NLTK stop-word corpus that tokenize_words() reads from.
nltk.download('stopwords')

# preprocess the input data: the result is a single lowercase string of the
# remaining tokens joined by spaces (see tokenize_words)
processed_inputs = tokenize_words(file)

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [60]:
# Vocabulary: every distinct character of the cleaned text, in sorted order.
chars = sorted(set(processed_inputs))
# Lookup table from each character to its position in that ordering.
char_to_num = {symbol: position for position, symbol in enumerate(chars)}

In [61]:
# Length of the cleaned text in characters, and the number of distinct
# characters it contains; both are reused by later cells.
input_len = len(processed_inputs)
vocab_len = len(chars)
print ("Total number of characters:", input_len)
print ("Total vocab:", vocab_len)

Total number of characters: 80535
Total vocab: 41


In [62]:
# Number of characters in each input window.
seq_length = 100
# Containers for the integer-encoded input windows and their next-character targets.
x_data, y_data = [], []

In [63]:
# Slide a window of `seq_length` characters over the cleaned text, one
# character at a time. Each window is one training input; the character that
# immediately follows it is the value to predict.
#
# Both lists are rebuilt from scratch in this cell, so running the cell more
# than once cannot append a second copy of every pattern.

# Convert the text to integer codes once, then slice the encoded list; this
# avoids looking up every character again for each window it appears in.
encoded_text = [char_to_num[char] for char in processed_inputs]

# Last usable start index is the final one that still has a following character.
n_windows = input_len - seq_length

# Inputs: codes of characters i .. i + seq_length - 1 (each slice is a new list)
x_data = [encoded_text[i:i + seq_length] for i in range(n_windows)]
# Targets: code of the character right after each window
y_data = [encoded_text[i + seq_length] for i in range(n_windows)]

In [64]:
# Number of (input window, next character) training pairs that were built.
n_patterns = len(x_data)
print ("Total Patterns:", n_patterns)

Total Patterns: 80435


In [65]:
# Arrange the encoded windows as (n_patterns, seq_length, 1) and divide the
# integer codes by the vocabulary size so the network sees floats below 1.
X = numpy.array(x_data).reshape(n_patterns, seq_length, 1) / float(vocab_len)

In [66]:
# One-hot encode the target characters. num_classes is given explicitly so the
# number of columns always equals the vocabulary size; without it the width is
# inferred from the largest code present in y_data, which would shrink the
# output layer if the highest-coded character never occurs as a target.
y = np_utils.to_categorical(y_data, num_classes=vocab_len)

In [73]:
# Three stacked LSTM layers, each followed by 20% dropout, ending in a softmax
# over the target classes. The first two LSTMs return the full sequence so the
# next LSTM receives one vector per timestep; the last returns only its final
# output for the Dense layer.
model = Sequential([
    LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True),
    Dropout(0.2),
    LSTM(256, return_sequences=True),
    Dropout(0.2),
    LSTM(128),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])

In [74]:
# Start from previously saved weights rather than a freshly initialised model.
# NOTE(review): going by the file name this is presumably the checkpoint after
# 200 epochs — confirm.
filepath = "../output/kaggle/working/model_weights_saved_e200.hdf5"
model.load_weights(filepath)
# Configure the loss and optimizer used by the training cell.
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [75]:
# Destination for the weights saved during the next training run.
# NOTE(review): "e300" presumably means the loaded e200 weights plus the 100
# epochs requested in the fit call — confirm.
filepath = "../output/kaggle/working/model_weights_saved_e300.hdf5"
# Watch the training loss and write the weights only on epochs where it reaches
# a new minimum (save_best_only=True, mode='min').
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
desired_callbacks = [checkpoint]

In [76]:
# Train for 100 epochs with batches of 256 patterns; the checkpoint callback in
# desired_callbacks runs during training.
model.fit(X, y, epochs=100, batch_size=256, callbacks=desired_callbacks)

Epoch 1/100

Epoch 00001: loss improved from inf to 1.37959, saving model to ../output/kaggle/working/model_weights_saved_e300.hdf5
Epoch 2/100

Epoch 00002: loss did not improve from 1.37959
Epoch 3/100

Epoch 00003: loss improved from 1.37959 to 1.37698, saving model to ../output/kaggle/working/model_weights_saved_e300.hdf5
Epoch 4/100

Epoch 00004: loss improved from 1.37698 to 1.37431, saving model to ../output/kaggle/working/model_weights_saved_e300.hdf5
Epoch 5/100

Epoch 00005: loss improved from 1.37431 to 1.37287, saving model to ../output/kaggle/working/model_weights_saved_e300.hdf5
Epoch 6/100

Epoch 00006: loss improved from 1.37287 to 1.37286, saving model to ../output/kaggle/working/model_weights_saved_e300.hdf5
Epoch 7/100

Epoch 00007: loss improved from 1.37286 to 1.37082, saving model to ../output/kaggle/working/model_weights_saved_e300.hdf5
Epoch 8/100

Epoch 00008: loss improved from 1.37082 to 1.36792, saving model to ../output/kaggle/working/model_weights_saved_e3

<tensorflow.python.keras.callbacks.History at 0x7f077a1ab950>

In [77]:
# Reload the weights from the same path the ModelCheckpoint callback writes to,
# so the model used below holds the saved (lowest-loss) weights rather than
# whatever the final training epoch left in memory.
filename = "../output/kaggle/working/model_weights_saved_e300.hdf5"
model.load_weights(filename)
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [78]:
# Reverse lookup: integer code -> character, used to decode model output.
num_to_char = dict(enumerate(chars))

In [80]:
# Pick a random training window to prime text generation.
# numpy.random.randint excludes its upper bound, so len(x_data) is passed to
# make every pattern, including the last one, selectable.
start = numpy.random.randint(0, len(x_data))
# Take a copy of the window so that any later in-place change to `pattern`
# cannot alter the list stored inside x_data.
pattern = list(x_data[start])
print("Random Seed:")
# Decode the integer codes back to characters to show the seed text.
print("\"", ''.join([num_to_char[value] for value in pattern]), "\"")

Random Seed:
" eral relations waldman fellow professor would lecture upon chemistry alternate days omitted returned "


In [82]:
# Generate 100 characters, feeding every predicted character back in as input.
for i in range(100):
    # Shape the current window as (1, len(pattern), 1) and divide by the
    # vocabulary size, matching how the training inputs were scaled.
    x = numpy.reshape(pattern, (1, len(pattern), 1))
    x = x / float(vocab_len)
    prediction = model.predict(x, verbose=0)
    # Take the class with the highest predicted score as the next character.
    index = int(numpy.argmax(prediction))
    result = num_to_char[index]

    print(result, sep=' ', end='', flush=True)

    # Slide the window one step by building a NEW list: drop the oldest code
    # and add the predicted one. Appending in place would also modify whatever
    # list `pattern` was taken from (e.g. a row of x_data), leaving that row
    # one element too long.
    pattern = pattern[1:] + [index]

ected secret also houre moritz along streets work dear consolation sense tears shall see soon contem

In [87]:
import shutil

# Zip only the e300 checkpoint. root_dir is the folder the file lives in and
# base_dir names the single entry to add; passing the folder alone would pack
# every file in it into an archive that is named after just one of them.
# The archive is written to the current directory as
# 'model_weights_saved_e300.hdf5.zip'.
shutil.make_archive(
    'model_weights_saved_e300.hdf5',
    'zip',
    root_dir='../output/kaggle/working/',
    base_dir='model_weights_saved_e300.hdf5',
)

'/kaggle/working/model_weights_saved_e300.hdf5.zip'

In [84]:
from IPython.display import FileLink
# Display a link to a saved weights file.
# NOTE(review): this points at the e200 checkpoint, while the training and
# archive cells work with e300 — confirm which file is meant to be linked.
FileLink(r'../output/kaggle/working/model_weights_saved_e200.hdf5')
