In [2]:
import numpy as np
import tensorflow as tf
import glob
import codecs
import pickle
import time

class HyperParameters:
    def __init__(self):
        self.num_epochs = 500
        self.batch_size = 512
        self.rnn_size = 512
        self.num_layers = 3
        self.keep_prob = 0.7
        self.embed_dim = 512
        self.seq_length = 30
        self.learning_rate = 0.001
        self.save_dir = './save'
        
'''
num_epochs = 10000
batch_size = 512
rnn_size = 512
num_layers = 3
keep_prob = 0.7
embed_dim = 512
seq_length = 30
learning_rate = 0.001
save_dir = './save'
'''

hp = HyperParameters()

In [3]:
def create_lookup_tables(text):
    """
    Create lookup tables for the vocabulary
    """
    #Convert the text data to a set, ordering and removing all duplicates
    vocab = set(text)
    #Creating a dict of set integer to word
    int_to_vocab = {key: word for key, word in enumerate(vocab)}
    #Creating a dict of word to set integer
    vocab_to_int = {word: key for key, word in enumerate(vocab)}
    return vocab_to_int, int_to_vocab

def token_lookup():
    """
    Create a dict of token replacements
    """
    return {
        '.': '||period||',
        ',': '||comma||',
        '"': '||quotes||',
        ';': '||semicolon||',
        '!': '||exclamation-mark||',
        '?': '||question-mark||',
        '(': '||left-parentheses||',
        ')': '||right-parentheses||',
        '--': '||emm-dash||',
        '\n': '||return||'
    }

def get_batches(int_text,hp):
    words_per_batch = hp.batch_size * hp.seq_length
    num_batches = len(int_text)//words_per_batch
    int_text = int_text[:num_batches*words_per_batch]
    print("Words Per Batch: {0}, Num_Batchs : {1}".format(words_per_batch, num_batches))
    y = np.array(int_text[1:] + [int_text[0]])
    print(y)
    x = np.array(int_text)
    print(x)
    print(x.reshape(hp.batch_size, -1))
    print(y.reshape(hp.batch_size, -1))
    x_batches = np.split(x.reshape(hp.batch_size, -1), num_batches, axis=1)
    y_batches = np.split(y.reshape(hp.batch_size, -1), num_batches, axis=1)
    
    batch_data = list(zip(x_batches, y_batches))
    return np.array(batch_data)


def pick_word(probabilities, int_to_vocab):
    """
    Pick the next word with some randomness
    :param probabilities: Probabilites of the next word
    :param int_to_vocab: Dictionary of word ids as the keys and words as the values
    :return: String of the predicted word
    """
    return np.random.choice(list(int_to_vocab.values()), 1, p=probabilities)[0]


In [4]:
book_filenames = sorted(glob.glob("Data/*.txt"))
print("found {} books".format(len(book_filenames)))

corpus_raw = u""
for filename in book_filenames:
    print("Reading {}".format(filename))
    with codecs.open(filename,'r', encoding="utf-8", errors='ignore') as book_file:
        corpus_raw += book_file.read()
        
print("corpus is {} characters long".format(len(corpus_raw)))


found 13 books
Reading Data/DorianGrey.txt
Reading Data/GreatExpectations.txt
Reading Data/GrimsFairyTales.txt
Reading Data/Metamorphasis.txt
Reading Data/MobyDic.txt
Reading Data/ScarletLetter.txt
Reading Data/SherlockHolmes.txt
Reading Data/TaleOfTwoCities.txt
Reading Data/WarAndPeace.txt
Reading Data/aliceinwonderland.txt
Reading Data/dracula.txt
Reading Data/frankenstein.txt
Reading Data/heartofdarkness.txt
corpus is 9419385 characters long


In [5]:
token_dict = token_lookup()
for token, replacement in token_dict.items():
    corpus_raw = corpus_raw.replace(token, ' {} '.format(replacement))
corpus_raw = corpus_raw.lower()
corpus_raw = corpus_raw.split()

vocab_to_int, int_to_vocab = create_lookup_tables(corpus_raw)
corpus_int = [vocab_to_int[word] for word in corpus_raw]
#serialize the data into the preprocess pickle object
pickle.dump((corpus_int, vocab_to_int, int_to_vocab, token_dict), open('preprocess.p','wb'))

In [5]:
train_graph = tf.Graph()
with train_graph.as_default():   
    input_text = tf.placeholder(tf.int32, [None, None], name='input')
    targets = tf.placeholder(tf.int32, [None, None], name='targets')
    lr = tf.placeholder(tf.float32, name='learning_rate')
    
    vocab_size = len(int_to_vocab)
    input_text_shape = tf.shape(input_text)
    
    lstm = tf.nn.rnn_cell.LSTMCell(num_units=hp.rnn_size)
    drop_cell = tf.nn.rnn_cell.DropoutWrapper(lstm,output_keep_prob=hp.keep_prob)
    cell = tf.nn.rnn_cell.MultiRNNCell([drop_cell] * hp.num_layers)
    
    initial_state = cell.zero_state(input_text_shape[0], tf.float32)
    initial_state = tf.identity(initial_state, name="initial_state")
    
    embed = tf.contrib.layers.embed_sequence(input_text, vocab_size, hp.embed_dim)
    
    outputs, final_state = tf.nn.dynamic_rnn(cell, embed, dtype=tf.float32)
    final_state = tf.identity(final_state, name="final_state")
    logits = tf.contrib.layers.fully_connected(outputs, vocab_size)
    probs = tf.nn.softmax(logits, name="probs")
    
    cost = tf.contrib.seq2seq.sequence_loss(logits, targets, tf.ones([input_text_shape[0], input_text_shape[1]]))
    optimizer = tf.train.AdamOptimizer(hp.learning_rate)
    gradients = optimizer.compute_gradients(cost)
    capped_gradients = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gradients if grad is not None]
    train_op = optimizer.apply_gradients(capped_gradients)
    

In [6]:
pickle.dump((hp.seq_length,  hp.save_dir), open('params.p', 'wb'))
batches = get_batches(corpus_int, hp)
num_batches = len(batches)
start_time = time.time()

with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())
    recover_last_checkpoints(hp.save_dir)
    for epoch in range(hp.num_epochs):
        state = sess.run(initial_state, {input_text: batches[0][0]})
        
        for batch_index, (x, y) in enumerate(batches):
            feed_dict = {
                input_text: x,
                targets: y,
                initial_state: state,
                lr: hp.learning_rate
            }
            train_loss, state, _ = sess.run([cost, final_state, train_op], feed_dict)
            
        time_elapsed = time.time() - start_time
        print('Epoch {:>3} Batch {:>4}/{}   train_loss = {:.3f}   time_elapsed = {:.3f}   time_remaining = {:.0f}'.format(
            epoch + 1,
            batch_index + 1,
            len(batches),
            train_loss,
            time_elapsed,
            ((num_batches * hp.num_epochs)/((epoch + 1) * (batch_index + 1))) * time_elapsed - time_elapsed))

        # save model every 10 epochs
        if epoch % 10 == 0:
            saver = tf.train.Saver()
            saver.save(sess, hp.save_dir)
            print('Model Trained and Saved')


Words Per Batch: 15360, Num_Batchs : 139
[ 1170 10538 42994 ... 17834 42451 27810]
[27810  1170 10538 ...  2203 17834 42451]
[[27810  1170 10538 ... 45240 32279  1170]
 [ 1170 32279  2524 ... 32279 27321 28196]
 [32279 20475 13531 ...  7444 17207  2828]
 ...
 [11963 35339 11801 ... 28805 20565  8892]
 [10333 38802 35564 ... 27957 45240 43547]
 [20565 10646 21518 ...  2203 17834 42451]]
[[ 1170 10538 42994 ... 32279  1170  1170]
 [32279  2524 37038 ... 27321 28196 32279]
 [20475 13531 37282 ... 17207  2828 38195]
 ...
 [35339 11801 47610 ... 20565  8892 10333]
 [38802 35564 20191 ... 45240 43547 20565]
 [10646 21518  1170 ... 17834 42451 27810]]
Epoch   1 Batch  139/139   train_loss = 6.581   time_elapsed = 271.130   time_remaining = 135294
Model Trained and Saved
Epoch   2 Batch  139/139   train_loss = 6.577   time_elapsed = 550.460   time_remaining = 137064
Epoch   3 Batch  139/139   train_loss = 6.571   time_elapsed = 822.653   time_remaining = 136286
Epoch   4 Batch  139/139   train

In [None]:
import numpy as np
import tensorflow as tf
import glob
import codecs
import pickle
import time

class HyperParameters:
    def __init__(self):
        self.num_epochs = 500
        self.batch_size = 512
        self.rnn_size = 512
        self.num_layers = 3
        self.keep_prob = 0.7
        self.embed_dim = 512
        self.seq_length = 30
        self.learning_rate = 0.001
        self.save_dir = './save'

hp = HyperParameters()

def pick_word(probabilities, int_to_vocab):
    """
    Pick the next word with some randomness
    :param probabilities: Probabilites of the next word
    :param int_to_vocab: Dictionary of word ids as the keys and words as the values
    :return: String of the predicted word
    """
    return np.random.choice(list(int_to_vocab.values()), 1, p=probabilities)[0]

import tensorflow as tf
import numpy as np
import pickle

corpus_int, vocab_to_int, int_to_vocab, token_dict = pickle.load(open('preprocess.p', mode='rb'))
seq_length, save_dir = pickle.load(open('params.p', mode='rb'))


gen_length = 30
prime_words = 'looking'

loaded_graph = tf.Graph()
with tf.Session(graph=loaded_graph) as sess:
    # Load the saved model
    loader = tf.train.import_meta_graph(hp.save_dir + '.meta')
    loader.restore(sess, hp.save_dir)
    
    # Get tensors from loaded graph
    input_text = loaded_graph.get_tensor_by_name('input:0')
    initial_state = loaded_graph.get_tensor_by_name('initial_state:0')
    final_state = loaded_graph.get_tensor_by_name('final_state:0')
    probs = loaded_graph.get_tensor_by_name('probs:0')
    
    # Sentences generation setup
    gen_sentences = prime_words.split()
    prev_state = sess.run(initial_state, {input_text: np.array([[1 for word in gen_sentences]])})
    
    # Generate sentences
    for n in range(gen_length):
        # Dynamic Input
        dyn_input = [[vocab_to_int[word] for word in gen_sentences[-hp.seq_length:]]]
        dyn_seq_length = len(dyn_input[0])

        # Get Prediction
        probabilities, prev_state = sess.run(
            [probs, final_state],
            {input_text: dyn_input, initial_state: prev_state})
        
        p = np.array(probabilities[0][dyn_seq_length-1,:])
        p /= p.sum()
        #print("Probabilities: ",sum(probabilities[0][dyn_seq_length-1,:]))
        #print("Normalized P : ", sum(p))
        pred_word = pick_word(p, int_to_vocab)

        gen_sentences.append(pred_word)
        
    # Remove tokens
    chapter_text = ' '.join(gen_sentences)
    for key, token in token_dict.items():
        chapter_text = chapter_text.replace(' ' + token.lower(), key)
        
    print(chapter_text)

INFO:tensorflow:Restoring parameters from ./save
