<small>
Copyright (c) 2017 Andrew Glassner

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
</small>



# Deep Learning From Basics to Practice
## by Andrew Glassner, https://dlbasics.com, http://glassner.com
------
## Chapter 23: Keras
### Notebook 16: Generate text word by word

In [None]:
# The Keras steps are a modified version of the character-based RNN at
# https://github.com/fchollet/keras/blob/master/examples/lstm_text_generation.py
#
# A lot of the word extraction and tokenizing was freely adapted from
# http://www.wildml.com/2015/09/recurrent-neural-networks-tutorial-part-1-introduction-to-rnns/
#
# The Sherlock Holmes text is from Project Gutenberg
# https://www.gutenberg.org/

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM, Dropout
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
import numpy as np
import random
import itertools
import os
import sys
import nltk
import nltk.data
import string

In [None]:
# Make a File_Helper for saving and loading files.

# Passed straight to the File_Helper constructor below. Its exact meaning is
# defined in DLBasics_Utilities (presumably it enables writing files -- confirm there).
save_files = True

import os, sys, inspect
# Absolute path of the directory that holds the currently executing file.
current_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
# The parent directory goes first on the module search path so that the
# import on the next line can be resolved from there.
sys.path.insert(0, os.path.dirname(current_dir)) # path to parent dir
from DLBasics_Utilities import File_Helper
file_helper = File_Helper(save_files)

In [None]:
# Get the stuff we need from the Natural Language Toolkit (NLTK).
# The 'punkt' resource provides the 'tokenizers/punkt/english.pickle'
# sentence tokenizer that is loaded later with nltk.data.load().
nltk.download('punkt')

In [None]:
# Global parameters

Vocabulary_size = 8000  # width of each one-hot vector: the most common words plus the unknown token
Batch_size = 64  # Set to 1 below if we're stateful
Learning_rate = 0.01  # only reported; the model is compiled with the 'adam' optimizer string
Num_epochs = 500  # the training loop counts iterations from Start_epoch up to this bound
Start_epoch = 1  # number given to the first training iteration
input_dir = file_helper.get_input_data_dir()
Source_text_file = input_dir+'/holmes.txt'
output_dir = file_helper.get_saved_output_dir()
file_helper.check_for_directory(output_dir)
Output_file = output_dir+'/generated-holmes.txt'

Window_size = 40  # number of words in each input fragment
Window_step = 3  # distance, in words, between the starts of successive fragments
Generated_text_length = 600  # words generated for each diversity setting
Random_seed = 42  # handed to np.random.seed before training starts
Cells_per_layer = [8, 8]  # LSTM units per layer, first layer first
Use_dropout = [True] * len(Cells_per_layer)  # one flag per LSTM layer
Dropout_rate = [0.3] * len(Cells_per_layer)  # one rate per LSTM layer
Stateful_model = True  
File_writer = None  # replaced by an open file object in the training cell
# The name embeds the layer sizes and the stateful flag; the training cell
# appends the epoch number to it when saving each model.
Model_name = 'Layers-'+str(Cells_per_layer)+'-stateful-'+str(Stateful_model)

if Stateful_model:
    Batch_size = 1             # so we can predict with just 1, probably better to modify predictions
    Window_step = Window_size  # samples are sequential, not overlapping

Unknown_token = "GLORP"  # all words not in vocabulary

In [None]:
# Read the whole text and split it into sentences with NLTK's Punkt tokenizer:
# https://stackoverflow.com/questions/4576077/python-split-text-on-sentences
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
# The context manager closes the file handle as soon as the text is in memory.
with open(Source_text_file) as fp:
    data = fp.read()
tokenized_sentences = tokenizer.tokenize(data)

# Isolate punctuation. Each character listed below is surrounded by spaces so
# that it becomes a token of its own (it is kept, not deleted); newlines and
# carriage returns vanish when the whitespace is collapsed afterwards.
# Adapted from https://stackoverflow.com/questions/23317458/how-to-remove-punctuation
punctuations = [
    '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', 
    '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', 
    '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', 
    '~', "''","`","\"", ",", "-", "\n", "\r", "”"
    ]
# A set gives constant-time membership tests and absorbs the duplicate entries
# in the list above; the per-character result is the same as with the list.
punctuation_chars = set(punctuations)
sentences = []
for sentence in tokenized_sentences:
    spaced = "".join(" " + ch + " " if ch in punctuation_chars else ch for ch in sentence)
    # split() followed by join() collapses every run of whitespace to one space.
    no_punc = " ".join(spaced.split())
    sentences.append(no_punc)

print("found ",len(sentences)," sentences")

# sentences is a list of strings. Each string is what the tokenizer decided made
# up an English-language "sentence", with punctuation marks separated by spaces.

In [None]:
# Flatten the sentences into a single list of whitespace-separated tokens.
text_as_words = [token for line in sentences for token in line.split()]
print("the text contains ",len(text_as_words)," words")
# text_as_words now holds every token of the text in reading order;
# punctuation marks that were spaced apart earlier appear as their own tokens.

In [None]:
# Tally how often every token occurs in the text.
word_freq = nltk.FreqDist(text_as_words)
# The extra one accounts for the unknown-word token.
number_of_unique_tokens = len(word_freq) + 1

# Keep the most frequent words, leaving one vocabulary slot for the unknown token.
vocab = word_freq.most_common(Vocabulary_size - 1)
print("Found ",len(vocab)," distinct words")

In [None]:
# Build the lookup tables that translate vocabulary words to integer ids and back.
# vocab holds (word, count) pairs; the unknown token joins the words before sorting.
unique_words = sorted({entry[0] for entry in vocab} | {Unknown_token})
print('number of unique vocabulary words being used:', len(unique_words))
word_to_index = {word: position for position, word in enumerate(unique_words)}
index_to_word = dict(enumerate(unique_words))

In [None]:
# Report the configured vocabulary size and the most frequently used words.
print('Using vocabulary size %d.' % Vocabulary_size)
# Slicing (rather than indexing positions 0..9) lets a text with fewer than
# ten distinct words print what it has instead of raising IndexError.
for i, (word, count) in enumerate(vocab[:10]):
    print("word popularity "+str(i)+": <"+word+"> used "+str(count)+" times")

In [None]:
# Swap every out-of-vocabulary word for the unknown token.
# Slice assignment rewrites the existing list in place.
text_as_words[:] = [
    word if word in word_to_index else Unknown_token
    for word in text_as_words
]

In [None]:
# Slide a Window_size-word window over the text in steps of Window_step.
# Each window becomes one training fragment, and the word that immediately
# follows the window is the target the network should predict for it.
window_starts = range(0, len(text_as_words) - Window_size, Window_step)
fragments = [text_as_words[start: start + Window_size] for start in window_starts]
next_words = [text_as_words[start + Window_size] for start in window_starts]
print('number of fragments created:', len(fragments))

In [None]:
# Clip the fragments so their count is a whole multiple of the batch size.
# Batch_size is used directly (not a literal 64) so the clipping stays correct
# when the batch size is changed, including the stateful case where it is 1.
keep_fragments = Batch_size * (len(fragments) // Batch_size)
fragments = fragments[0:keep_fragments]
# Trim the targets as well so the two lists stay the same length.
next_words = next_words[0:keep_fragments]

In [None]:
# Create the training data
# X is a boolean array that is number-of-fragments * Window_size * Vocabulary_size
#    That is, every fragment contains Window_size entries, one for each word
#    Each word is given by a one-hot encoding whose length is Vocabulary_size
# y is a boolean array that is number-of-fragments * Vocabulary_size
#    Each entry is the one-hot encoding of the word that follows the corresponding fragment

# The builtin bool is used as the dtype because the np.bool alias was
# deprecated and later removed from NumPy.
X = np.zeros((len(fragments), Window_size, Vocabulary_size), dtype=bool)
y = np.zeros((len(fragments), Vocabulary_size), dtype=bool)
for i, fragment in enumerate(fragments):
    for t, word in enumerate(fragment):   
        X[i, t, word_to_index[word]] = 1
    y[i, word_to_index[next_words[i]]] = 1
print("Training data:")
print("   X.shape = ",X.shape)
print("   y.shape = ",y.shape)

In [None]:
def build_model():
    """Build and compile the stacked-LSTM model that predicts the next word.

    The architecture is driven by module-level settings: one LSTM layer per
    entry of Cells_per_layer, each optionally followed by Dropout according
    to the matching entries of Use_dropout and Dropout_rate, then a Dense
    layer of Vocabulary_size units with a softmax activation.

    Returns:
        The compiled Sequential model (categorical cross-entropy loss,
        'adam' optimizer).
    """
    num_layers = len(Cells_per_layer)
    model = Sequential()

    # Layer 1 is special because it declares the input shape. It passes the
    # full sequence on only when another LSTM follows; with a single layer
    # the Dense layer must receive just the final output vector.
    first_returns_sequences = num_layers > 1
    if Stateful_model:
        if Batch_size != 1:
            print("*** WARNING! *** build_stateful_model: Batch_size should be 1")
        # A stateful layer needs a fixed batch size, given via batch_input_shape.
        model.add(LSTM(Cells_per_layer[0], return_sequences=first_returns_sequences,
                       stateful=True,
                       batch_input_shape=(1, Window_size, Vocabulary_size)))
    else:
        model.add(LSTM(Cells_per_layer[0], return_sequences=first_returns_sequences,
                       input_shape=(Window_size, Vocabulary_size)))
    if Use_dropout[0]:
        model.add(Dropout(Dropout_rate[0]))

    # NOTE(review): only the first LSTM is created with stateful=True; the
    # layers added below keep the default -- confirm this is intended.
    for i in range(1, num_layers):
        # Every LSTM except the last hands its full sequence to the next one.
        return_sequence = i < num_layers - 1
        model.add(LSTM(Cells_per_layer[i], return_sequences=return_sequence))
        # Check this layer's own flag; testing the list itself is always true.
        if Use_dropout[i]:
            model.add(Dropout(Dropout_rate[i]))

    model.add(Dense(Vocabulary_size))
    model.add(Activation('softmax'))

    # Learning_rate is not applied here: the optimizer is chosen by name.
    # Alternative that would use it: optimizer = RMSprop(lr=Learning_rate)
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    return model

In [None]:
# from http://karpathy.github.io/2015/05/21/rnn-effectiveness/
def sample(preds, temperature=1.0):
    """Randomly pick a word index from a temperature-adjusted distribution.

    Args:
        preds: Sequence of probabilities, one per vocabulary slot. Only the
            first len(word_to_index) entries are considered.
        temperature: Divisor applied to the log-probabilities before they
            are turned back into a distribution.

    Returns:
        The index selected by a single multinomial draw.
    """
    scores = np.asarray(preds).astype('float64')
    # Ignore any slots beyond the words that actually have an index.
    scores = scores[0:len(word_to_index)]
    # Rescale in log space, then exponentiate and renormalize so the
    # weights sum to one again.
    weights = np.exp(np.log(scores) / temperature)
    distribution = weights / np.sum(weights)
    # One trial yields a one-hot row; argmax recovers the chosen index.
    one_hot_draw = np.random.multinomial(1, distribution, 1)
    return np.argmax(one_hot_draw)

In [None]:
def print_string(out_str=''):
    """Write out_str to standard output and to the File_writer file.

    No newline is appended in either place, so callers supply their own.
    File_writer is a module-level name and must refer to an open, writable
    file object when this function is called.

    Args:
        out_str: The text to emit.
    """
    print(out_str, end='')
    File_writer.write(out_str)

In [None]:
def print_report():
    """Emit a summary of the run configuration and data statistics.

    Everything is sent through print_string, so the report appears on
    standard output and in the output file. All values are read from
    module-level names; nothing is returned.
    """
    # Each setting is listed once, as "name = value".
    settings = [
        ("Vocabulary_size", Vocabulary_size),
        ("Batch_size", Batch_size),
        ("Learning_rate", Learning_rate),
        ("Source_text_file", Source_text_file),
        ("Window_size", Window_size),
        ("Window_step", Window_step),
        ("Num_epochs", Num_epochs),
    ]
    for name, value in settings:
        print_string(name+" = "+str(value)+"\n")
    print_string("Generated_text_length = "+str(Generated_text_length)+"\n\n")

    print_string("Input text file: "+Source_text_file+'\n')
    print_string("    output file: "+Output_file+'\n\n')
    print_string("full text: "+str(len(sentences))+" sentences\n")
    print_string("           "+str(len(text_as_words))+" tokens\n\n")
    print_string("           "+str(number_of_unique_tokens)+" unique tokens in source\n")
    print_string("           "+str(len(unique_words))+" unique words (tokens) being used\n")
    print_string('number of fragments created: '+str(len(fragments))+'\n')
    # Divide by the batch size actually passed to fit(), not a fixed 64.
    print_string('    resulting in '+str(len(fragments)/Batch_size)+' batches\n\n')

    print_string('Model_name: '+Model_name+'\n')
    print_string('Stateful_model: '+str(Stateful_model)+'\n')
    print_string('Cells per layer: '+str(Cells_per_layer)+'\n')
    print_string('Use dropout: '+str(Use_dropout)+'\n')
    print_string('Dropout rate: '+str(Dropout_rate)+'\n\n')

In [None]:
# Build the network here to display its layer-by-layer summary.
# The training cell calls build_model() again before fitting.
model = build_model()
model.summary()

In [None]:
# train the model, output generated text after each iteration
# There needs to be a directory called "Models" in the same
# directory as this file, or we'll get an error.

File_writer = open(Output_file, 'w')
# try/finally guarantees the output file is closed even when training is
# interrupted or an exception is raised part-way through.
try:
    print_report()
    model = build_model()
    Start_epoch = 1

    #### How to import from a saved model
    #import keras
    #model = keras.models.load_model('Models/Layers-[8, 8]-stateful-False-epoch-119.h5')
    #Start_epoch = 120

    # A stateful model is fed its samples in their original, sequential order.
    shuffle = not Stateful_model

    # Seed both generators: NumPy drives sample(), while the random module
    # picks the position of the seed text below.
    np.random.seed(Random_seed)
    random.seed(Random_seed)
    history_list = []

    # Iterations are numbered from 1, so the range includes Num_epochs itself.
    for iteration in range(Start_epoch, Num_epochs + 1):
        print_string('\n')
        print_string('----------------------------------------------------------------------\n')
        print_string('Iteration '+str(iteration)+'\n')
        if Stateful_model:
            # Text generation in the previous iteration leaves LSTM state
            # behind; clear it so each pass over the data starts clean.
            model.reset_states()
        history = model.fit(X, y, batch_size=Batch_size, epochs=1, shuffle=shuffle)
        history_list.append(history)
        print_string('Loss from iteration '+str(iteration)+' = '+str(history.history['loss'])+'\n')

        model_filename = Model_name+'-epoch-'+str(iteration)
        print("saving model to file ",model_filename)
        file_helper.save_model(model, model_filename)
        # Every diversity setting of this iteration starts from the same seed text.
        start_index = random.randint(0, len(text_as_words) - Window_size - 1)

        for diversity in np.linspace(.5, 2, 7):
        #for diversity in [1]:
            print_string('\n')
            print_string('----- diversity: '+str(diversity)+'\n')

            if Stateful_model:
                # Do not let state from training, or from the previous
                # diversity run, influence this generation.
                model.reset_states()

            sentence = text_as_words[start_index: start_index + Window_size]
            generated = ' '.join(sentence)
            print_string('----- Generating with seed: "' +generated+ '"\n----\n')
            print_string(generated)

            for i in range(Generated_text_length):
                # One-hot encode the current window as a batch of one.
                x = np.zeros((1, Window_size, Vocabulary_size))
                for t, word in enumerate(sentence):
                    x[0, t, word_to_index[word]] = 1.

                preds = model.predict(x, verbose=0)[0]

                next_index = sample(preds, diversity)
                next_word = index_to_word[next_index]

                generated += ' '+next_word
                # Slide the window: drop the oldest word, append the new one.
                sentence = sentence[1:] + [next_word]

                print_string(' '+next_word)

            print_string('\n')
            File_writer.flush()
finally:
    File_writer.close()