<small>
Copyright (c) 2017 Andrew Glassner

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
</small>



# Deep Learning From Basics to Practice
## by Andrew Glassner, https://dlbasics.com, http://glassner.com
------
## Chapter 22: Recurrent Neural Networks
### Notebook 2: Generate new text

This notebook is provided as a “behind-the-scenes” look at code used to make some of the figures in this chapter. It is still in the hacked-together form used to develop the figures, and is only lightly commented.

In [None]:
# Some code here is inspired by https://github.com/karpathy/char-rnn
#
# The Holmes data can be found at Project Gutenberg
# https://www.gutenberg.org/ebooks/search/?query=holmes
# 
# We combined three books of short stories into one big text file:
#
# “The Adventures of Sherlock Holmes by Arthur Conan Doyle”
# “The Return of Sherlock Holmes by Arthur Conan Doyle”
# “The Memoirs of Sherlock Holmes by Arthur Conan Doyle”

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM
from keras.optimizers import RMSprop
import numpy as np
import random
import sys

In [None]:
# Make a File_Helper for saving and loading files.

# Flag handed to the File_Helper constructor below.
# NOTE(review): presumably controls whether outputs get written to disk --
# confirm against DLBasics_Utilities.
save_files = True

import os, sys, inspect
# Directory of the file that defines the currently executing frame.
current_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
# Put the parent of that directory first on the import path; this has to
# happen before the DLBasics_Utilities import on the next line.
sys.path.insert(0, os.path.dirname(current_dir)) # path to parent dir
from DLBasics_Utilities import File_Helper
# Shared helper instance; later cells ask it for the input/output directories.
file_helper = File_Helper(save_files)

In [None]:
def get_text(input_file):
    """Read a UTF-8 text file and return it flattened onto a single line.

    Every newline is replaced by a blank, then one pass replaces each pair
    of adjacent blanks with a single blank. Because only one pass is made,
    a run of three or more blanks is shortened but not fully collapsed.
    Letter case is left untouched.

    Args:
        input_file: Path of the text file to read.

    Returns:
        The processed text as a single string.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # The context manager closes the file even when read() raises.
    with open(input_file, 'r', encoding='utf8') as file:
        text = file.read()
    # replace newlines with blanks, and double blanks with singles
    text = text.replace('\n', ' ')
    text = text.replace('  ', ' ')
    print('corpus length:', len(text))
    return text

In [None]:
def build_dictionaries(text):
    """Build the character vocabulary of `text` and its two lookup tables.

    Args:
        text: The string whose distinct characters form the vocabulary.

    Returns:
        A tuple (unique_chars, char_to_index, index_to_char): the sorted
        list of distinct characters, a dict from character to its position
        in that list, and the inverse dict from position to character.
    """
    alphabet = sorted(set(text))
    print('total unique chars:', len(alphabet))
    char_to_index = {}
    index_to_char = {}
    # Fill both directions of the mapping in a single pass.
    for position, symbol in enumerate(alphabet):
        char_to_index[symbol] = position
        index_to_char[position] = symbol
    return (alphabet, char_to_index, index_to_char)

In [None]:
def build_fragments(text, window_length, step=None):
    """Cut `text` into overlapping fixed-length fragments with their targets.

    A window of `window_length` characters slides over the text in strides
    of `step`. Each window becomes one fragment, and the character right
    after the window becomes that fragment's target.

    Args:
        text: The source string.
        window_length: Number of characters in every fragment.
        step: Distance between the starts of consecutive fragments. When
            omitted, the module-level `window_step` is used, which is what
            this function always did before the parameter existed.

    Returns:
        A tuple (fragments, targets) of two equally long lists. Both are
        empty when the text is not longer than `window_length`.

    Raises:
        ValueError: If the stride is 0 (raised by `range`).
    """
    if step is None:
        step = window_step
    # Stopping at len(text) - window_length guarantees a target character
    # exists after every window.
    starts = range(0, len(text) - window_length, step)
    fragments = [text[i: i + window_length] for i in starts]
    targets = [text[i + window_length] for i in starts]
    print('number of fragments of length window_length=',window_length,':', len(fragments))
    return (fragments, targets)

In [None]:
def encode_training_data(fragments, window_length, targets, char_to_index, index_to_char):
    """Turn fragments and their target characters into one-hot arrays.

    Args:
        fragments: Sequence of strings, each at most `window_length` long.
        window_length: Size of the time axis of the input array.
        targets: Sequence holding the target character of each fragment.
        char_to_index: Dict mapping every character to its one-hot position.
        index_to_char: Unused here; kept so existing callers keep working.

    Returns:
        A tuple (X, y) of boolean arrays. X has shape
        (len(fragments), window_length, len(char_to_index)) and y has shape
        (len(fragments), len(char_to_index)).

    Raises:
        KeyError: If a character is missing from `char_to_index`.
        IndexError: If `targets` is shorter than `fragments`.
    """
    num_chars = len(char_to_index)
    # The builtin bool is used because the np.bool alias is not available in
    # every NumPy release.
    X = np.zeros((len(fragments), window_length, num_chars), dtype=bool)
    y = np.zeros((len(fragments), num_chars), dtype=bool)
    for i, fragment in enumerate(fragments):
        for t, char in enumerate(fragment):
            X[i, t, char_to_index[char]] = True
        y[i, char_to_index[targets[i]]] = True
    return (X, y)

In [None]:
def build_model(window_length, num_unique_chars):
    """Assemble and compile the character-level LSTM network.

    The stack is two LSTM layers of 128 units each (the first one returns
    its full sequence so the second can consume it), followed by a softmax
    dense layer with one output per character. The original note cites 89
    outputs for the Holmes text.

    Args:
        window_length: Number of time steps in each input fragment.
        num_unique_chars: Size of the one-hot character vocabulary.

    Returns:
        The compiled Sequential model.
    """
    layers = [
        LSTM(128, return_sequences=True, input_shape=(window_length, num_unique_chars)),
        LSTM(128),
        Dense(num_unique_chars, activation='softmax'),
    ]
    model = Sequential(layers)
    # RMSprop optimizer; some experiments suggest a learning rate of 0.01 is
    # a good place to start.
    # NOTE(review): newer Keras releases appear to name this argument
    # `learning_rate` -- confirm against the installed version.
    model.compile(loss='categorical_crossentropy', optimizer=RMSprop(lr=0.01))
    return model

In [None]:
def sample(preds, temperature=1.0):
    """Sample an index from a probability array after adding "heat".

    The probabilities are re-weighted as softmax(log(preds) / temperature)
    and a single index is drawn from the result. A temperature below 1
    sharpens the distribution, above 1 flattens it, and exactly 0 selects
    the most likely index without any randomness.

    Args:
        preds: 1-D array-like of probabilities.
        temperature: Scaling factor applied to the log-probabilities.

    Returns:
        The sampled index, as returned by `np.argmax`.
    """
    preds = np.asarray(preds).astype('float64')
    if temperature == 0:
        # Limit of the re-weighting as temperature -> 0; also avoids
        # dividing by zero below.
        return np.argmax(preds)
    # log(0) is a legitimate -inf here (probability stays 0), so silence
    # the divide-by-zero warning it would otherwise trigger.
    with np.errstate(divide='ignore'):
        scaled = np.log(preds) / temperature
    # Subtracting the maximum leaves the softmax unchanged but keeps exp()
    # from underflowing to all zeros at small temperatures.
    scaled = scaled - np.max(scaled)
    exp_preds = np.exp(scaled)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [None]:
def print_string(out_str='', file_writer=None):
    """Print a string to the screen and also save it in the file.

    Args:
        out_str: Text to emit; printed without a trailing newline.
        file_writer: Object with a `write` method that receives the same
            text, or None to print only.
    """
    print(out_str, end='')
    # Identity test: None is the only value that means "no file".
    if file_writer is not None:
        file_writer.write(out_str)

In [None]:
def generate_text(model, X, y, number_of_epochs, temperatures, index_to_char, char_to_index, file_writer):
    """Train the model and emit generated text after every epoch.

    For each epoch the model is fit once on (X, y). A seed fragment is then
    picked at a random position in the corpus, and for every temperature a
    continuation is generated one character at a time from that same seed,
    printed, and written to `file_writer`.

    Besides its arguments, this function reads the module-level names
    `text`, `window_length`, `batch_size` and `generated_text_length`, and
    calls the sibling helpers `print_string` and `sample`.

    Args:
        model: Model object providing `fit` and `predict`.
        X: One-hot training inputs passed to `model.fit`.
        y: One-hot training targets passed to `model.fit`.
        number_of_epochs: How many fit-then-generate rounds to run.
        temperatures: Iterable of temperatures handed to `sample`.
        index_to_char: Dict from one-hot position to character.
        char_to_index: Dict from character to one-hot position.
        file_writer: Open text file receiving a copy of all output, or
            None to print to the screen only.
    """
    # train the model, output generated text after each iteration
    for iteration in range(number_of_epochs):
        print_string('--------------------------------------------------\n', file_writer)
        print_string('Iteration '+str(iteration)+'\n', file_writer)
        model.fit(X, y, batch_size=batch_size, epochs=1)
        # One seed position per epoch, shared by all temperatures so their
        # outputs can be compared.
        start_index = random.randint(0, len(text) - window_length - 1)

        for temperature in temperatures:
            print_string('\n----- temperature: '+str(temperature)+'\n', file_writer)
            sentence = text[start_index: start_index + window_length]
            pieces = [sentence]
            print_string('----- Generating with seed: <'+sentence+'>\n', file_writer)

            for _ in range(generated_text_length):
                # One-hot encode the current window as a batch of one.
                x = np.zeros((1, window_length, len(index_to_char)))
                for t, char in enumerate(sentence):
                    x[0, t, char_to_index[char]] = 1.

                preds = model.predict(x, verbose=0)[0]
                next_index = sample(preds, temperature)
                next_char = index_to_char[next_index]

                pieces.append(next_char)
                # Slide the window: drop the oldest character, add the new one.
                sentence = sentence[1:] + next_char

            print_string(''.join(pieces)+'\n\n', file_writer)
            # print_string accepts a missing writer, so flushing must too.
            if file_writer is not None:
                file_writer.flush()

In [None]:
# set the globals
window_length = 40            # characters per training fragment / seed
window_step = 3               # stride between fragment starts (read by build_fragments)
number_of_epochs = 100        # reassigned in the final cell before training
generated_text_length = 1000  # characters generated per temperature (read by generate_text)
batch_size = 100              # batch size passed to model.fit (read by generate_text)
input_dir = file_helper.get_input_data_dir()
output_dir = file_helper.get_saved_output_dir()
file_helper.check_for_directory(output_dir)

input_file = input_dir+'/holmes.txt'
output_file =  output_dir+'/holmes-by-char.txt'
# The corpus is read as UTF-8, so write the output as UTF-8 too; the
# platform default encoding may be unable to represent some characters.
File_writer = open(output_file, 'w', encoding='utf8')

In [None]:
# get text data structures, build the model
# Load the corpus as a single flattened string.
text = get_text(input_file)
# Character vocabulary plus the lookup tables in both directions.
unique_chars, char_to_index, index_to_char = build_dictionaries(text)
# Overlapping windows of window_length characters and the character that
# follows each one.
fragments, targets = build_fragments(text, window_length)
# One-hot encode the windows (X) and their target characters (y).
X, y = encode_training_data(fragments, window_length, targets, char_to_index, index_to_char)
# The vocabulary size sets both the input width and the output width.
model = build_model(window_length, len(char_to_index))
# Show the model we're using
model.summary()

In [None]:
# Replaces the epoch count set in the globals cell with a short run.
number_of_epochs = 2
# Each temperature produces its own generated sample after every epoch.
temperatures = [0.5, 1.0, 1.5]
try:
    generate_text(model, X, y, number_of_epochs, temperatures, index_to_char, char_to_index, File_writer)
finally:
    # wrap up when we're done -- also when training is interrupted or fails,
    # so the output written so far is not lost
    File_writer.close()