In [2]:
import csv
import itertools
import operator
import numpy as np
import nltk
import sys
from datetime import datetime
from utils import *

from os import listdir
from os.path import isfile, join

import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Load the data
file_directory = "./data"
file_names = [join(file_directory, f) for f in listdir(file_directory) if isfile(join(file_directory, f)) 
                                                                          and f.endswith('.TXT')]

sentences = []
for file_name in file_names:
    with open(file_name,'r') as f:
        raw_data = f.read()
        sentences += nltk.sent_tokenize(raw_data.decode('utf-8').lower())

# Take a look at sentences in the data
print ("sentences count: ", len(sentences))
print ("sentences[:2]", sentences[:2])

('sentences count: ', 80208)
('sentences[:2]', [u"project gutenberg's etext of first book of adam and eve, by platt\r\npart one of a series of the forgotten books of eden\r\n\r\ncopyright laws are changing all over the world, be sure to check\r\nthe copyright laws for your country before posting these files!", u'please take a look at the important information in this header.'])


In [4]:
vocabulary_size = 8000
unknown_token = "UNKOWN_TOKEN"
sentence_start_token ="SENTENCE_START"
sentence_end_token = "SENTENCE_END"

# Append SENTENCE_START and SENTENCE_END
sentences = ["%s %s %s" % (sentence_start_token, x, sentence_end_token) for x in sentences]
print "Parsed %d sentences." % (len(sentences))

Parsed 80208 sentences.


In [5]:
# Take a look at some parsed sentences
print sentences[:3]

[u"SENTENCE_START project gutenberg's etext of first book of adam and eve, by platt\r\npart one of a series of the forgotten books of eden\r\n\r\ncopyright laws are changing all over the world, be sure to check\r\nthe copyright laws for your country before posting these files! SENTENCE_END", u'SENTENCE_START please take a look at the important information in this header. SENTENCE_END', u'SENTENCE_START we encourage you to keep this file on your own disk, keeping an\r\nelectronic path open for the next readers. SENTENCE_END']


In [6]:
# Tokenize the sentences into words
tokenized_sentences = [nltk.word_tokenize(sent) for sent in sentences]

# Count the word frequencies
word_freq = nltk.FreqDist(itertools.chain(*tokenized_sentences))
print "Found %d unique words tokens." % len(word_freq.items())

# Set the most common words and build index_to_word and word_to_index vectors
vocab = word_freq.most_common(vocabulary_size-1)
index_to_word = [x[0] for x in vocab]
index_to_word.append(unknown_token)
word_to_index = dict([(w, i) for i, w in enumerate(index_to_word)])

Found 51123 unique words tokens.


In [7]:
# Take a look at some examples of index_to_word and word_to_index
print "index_to_word[3]: %s" % index_to_word[3]
print "word_to_index of previous one: %d" % word_to_index[index_to_word[3]]

print "Using vocabulary size %d." % vocabulary_size
print "The least frequent word in our vocabulary is '%s' and appeared %d times." % (vocab[-1][0], vocab[-1][1])

index_to_word[3]: SENTENCE_START
word_to_index of previous one: 3
Using vocabulary size 8000.
The least frequent word in our vocabulary is 'blazing' and appeared 17 times.


In [8]:
print "Tokenized sentence example: ", tokenized_sentences[0] 
# Replace all words not in our vocabulary with the unknown token
for i, sent in enumerate(tokenized_sentences):
    tokenized_sentences[i] = [w if w in word_to_index else unknown_token for w in sent]

print "\nExample sentence: '%s'" % sentences[0]
print "\nExample sentence after Pre-processing: '%s'" % tokenized_sentences[0]

Tokenized sentence example:  [u'SENTENCE_START', u'project', u'gutenberg', u"'s", u'etext', u'of', u'first', u'book', u'of', u'adam', u'and', u'eve', u',', u'by', u'platt', u'part', u'one', u'of', u'a', u'series', u'of', u'the', u'forgotten', u'books', u'of', u'eden', u'copyright', u'laws', u'are', u'changing', u'all', u'over', u'the', u'world', u',', u'be', u'sure', u'to', u'check', u'the', u'copyright', u'laws', u'for', u'your', u'country', u'before', u'posting', u'these', u'files', u'!', u'SENTENCE_END']

Example sentence: 'SENTENCE_START project gutenberg's etext of first book of adam and eve, by platt
part one of a series of the forgotten books of eden

copyright laws are changing all over the world, be sure to check
the copyright laws for your country before posting these files! SENTENCE_END'

Example sentence after Pre-processing: '[u'SENTENCE_START', u'project', u'gutenberg', u"'s", u'etext', u'of', u'first', u'book', u'of', u'adam', u'and', u'eve', u',', u'by', u'platt', u'par

In [9]:
# Create the training data
X_train = np.asarray([[word_to_index[w] for w in sent[:-1]] for sent in tokenized_sentences])
y_train = np.asarray([[word_to_index[w] for w in sent[1:]] for sent in tokenized_sentences])

# Print an training data example
x_example, y_example = X_train[17], y_train[17]
print "x:\n%s\n%s" % (" ".join([index_to_word[x] for x in x_example]), x_example)
print "\ny:\n%s\n%s" % (" ".join([index_to_word[x] for x in y_example]), y_example)

x:
SENTENCE_START if our value per text is nominally estimated at one dollar then we produce $ 4 million dollars per hour this year as we release some eight text files per month : thus UNKOWN_TOKEN our UNKOWN_TOKEN from $ 2 million .
[3, 51, 106, 1084, 1477, 2109, 18, 7739, 6925, 34, 50, 6015, 78, 55, 1106, 3438, 934, 2006, 2125, 1477, 615, 31, 362, 17, 55, 2036, 76, 1228, 2109, 2267, 1477, 1336, 40, 207, 7999, 106, 7999, 32, 3438, 635, 2006, 5]

y:
if our value per text is nominally estimated at one dollar then we produce $ 4 million dollars per hour this year as we release some eight text files per month : thus UNKOWN_TOKEN our UNKOWN_TOKEN from $ 2 million . SENTENCE_END
[51, 106, 1084, 1477, 2109, 18, 7739, 6925, 34, 50, 6015, 78, 55, 1106, 3438, 934, 2006, 2125, 1477, 615, 31, 362, 17, 55, 2036, 76, 1228, 2109, 2267, 1477, 1336, 40, 207, 7999, 106, 7999, 32, 3438, 635, 2006, 5, 4]


In [10]:
# Print shape of X_train and y_train:
print ("X_train: ", X_train.shape)
print ("y_train: ", y_train.shape)

('X_train: ', (80208,))
('y_train: ', (80208,))


In [11]:
# Print
print ("X_train[0].shape", X_train[0].shape)
print ("X_train[1].shape", X_train[1].shape)
print ("y_train[0].shape", y_train[0].shape)
print ("y_train[0].shape", y_train[0].shape)

AttributeError: 'list' object has no attribute 'shape'

### Create Batch Data Iterator

In [74]:
import tensorflow as tf

# Construct batch iterator
# input data should be a numpy array of training sequences
# Method next_batch returns the next batch tensor in shape [batch_size, max_time_steps] 
# next batch labels and sequence lengths list
class PaddedDataIterator():
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels
        self.size = len(self.data)
        self.cursor = 0
    
    def next_batch(self, batch_size):
        if self.cursor + batch_size > len(self.data):
            self.cursor = 0 # wrap around
        batch_data = self.data[self.cursor:self.cursor+batch_size]
        batch_labels = self.labels[self.cursor:self.cursor+batch_size]
        self.cursor += batch_size
        
        lengths = [len(x) for x in batch_data]
        max_length = max(lengths)
        
        # x in shape [batch_size, max_time_steps]
        x = np.zeros([batch_size, max_length], np.float32)
        # Zero pad the instances in batch so that their length equals to max_length
        for i, x_at_i in enumerate(x):
            x_at_i[:lengths[i]] = batch_data[i]
        y = np.zeros([batch_size, max_length], np.float32)
        # Zero pad the labels
        for i, y_at_i in enumerate(y):
            y_at_i[:lengths[i]] = batch_labels[i]
            
        # reshape the tensor into [batch_size, indivial time_steps]
        x = tf.convert_to_tensor(x, dtype=tf.float32)
        
        return x, y, lengths, max_length

In [75]:
data_it = PaddedDataIterator(X_train, y_train)
d = data_it.next_batch(5)
print('Input sequences\n', d[0])
print('Input sequences labels\n', d[1], d[1].shape)
print('Input sequences lengths\n', d[2])


('Input sequences\n', <tf.Tensor 'Const_23:0' shape=(5, 50) dtype=float32>)
('Input sequences labels\n', array([[  4.07000000e+02,   8.73000000e+02,   4.80000000e+01,
          4.32000000e+02,   2.00000000e+00,   1.16000000e+02,
          4.82000000e+02,   2.00000000e+00,   2.88000000e+02,
          6.00000000e+00,   4.79000000e+02,   0.00000000e+00,
          2.20000000e+01,   6.28400000e+03,   1.67000000e+02,
          5.00000000e+01,   2.00000000e+00,   8.00000000e+00,
          2.46900000e+03,   2.00000000e+00,   1.00000000e+00,
          1.97200000e+03,   6.62000000e+02,   2.00000000e+00,
          7.99900000e+03,   1.66400000e+03,   2.22000000e+02,
          4.20000000e+01,   2.89100000e+03,   3.70000000e+01,
          1.22000000e+02,   1.00000000e+00,   1.56000000e+02,
          0.00000000e+00,   2.80000000e+01,   7.51000000e+02,
          7.00000000e+00,   2.22200000e+03,   1.00000000e+00,
          1.66400000e+03,   2.22000000e+02,   2.10000000e+01,
          9.20000000e+01,  

In [76]:
# Take another look at the iterator
d2 = data_it.next_batch(5)
print('Input sequences\n', d2[0])
print('Input sequences labels[0]\n', d2[1].shape)
print('Input sequences lengths\n', d2[2], d2[3])

('Input sequences\n', <tf.Tensor 'Const_24:0' shape=(5, 52) dtype=float32>)
('Input sequences labels[0]\n', (5, 52))
('Input sequences lengths\n', [6, 52, 12, 26, 26], 52)


### Build the Graph for dynamic LSTM

In [127]:
def reset_graph():
    if 'sess' in globals() and sess:
        sess.close()
    tf.reset_default_graph()

    
def build_LSTM_graph(vocab_size, num_classes, state_size = 1000, batch_size = 256):
    '''
    Build LSMT graph
    Return unscaled logits in shape [batch_size, time_steps, num_classes]
    '''
    reset_graph()
    # Placeholders
    x = tf.placeholder(tf.int32, [batch_size, None]) # [batch_size, num_steps]
    y = tf.placeholder(tf.int32, [batch_size])
    seqlen = tf.placeholder(tf.int32, [batch_size])
    max_len = tf.placeholder(tf.int32, [])
    
    # Parameters
    # Wout: [state_size, num_classes]
    # b: [num_classes]
    weights = {
        'out': tf.Variable(tf.random_normal([state_size, num_classes]))
    }
    biases = {
        'out': tf.Variable(tf.random_normal([num_classes]))
    }
    
    
    # Embedding layer
    embeddings = tf.get_variable('embedding_matrix', [vocab_size, state_size])
    rnn_inputs = tf.nn.embedding_lookup(embeddings, x)
    
    # LSTM
    lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(state_size, forget_bias=1.0)
    outputs, state = tf.nn.dynamic_rnn(cell = lstm_cell, 
                                       inputs = rnn_inputs, 
                                       sequence_length=seqlen,
                                       dtype=tf.float32)
    
    # Softmax layer
    # outputs in shape [batch_size, time_steps, state_size]
    # y in shape [batch_size, time_steps]
    outputs = tf.reshape(outputs, [-1, state_size]) # (B*T, state_size)
    y = tf.reshape(y, [-1]) # y in shape (B*T)
    logits = tf.matmul(outputs, weights['out']) + biases['out']
#     logits = tf.reshape(logits, [batch_size, -1, num_classes]) # (B, T, num_classes)
    
    # Unstack outputs to get a list of "time_steps" of (batch_size, state_size)
#     outputs_list = tf.unstack(outputs, max_len, axis=1)
#     logits_list = [] # [time_steps, batch_size, num_classes]
#     for output in outputs_list:
#         logits_list.append(tf.matmul(output, weights['out']) + biases['out']) # (batch_size, num_classes)
#     logits = tf.transpose(logits_list, (1, 0, 2)) # now logits are in shape [batch_size, time_steps, num_classes]
    
    # Create a mask to remove extra costs caused by paddings
    weights = tf.sequence_mask(seqlen, max_len, dtype=tf.float32) # weights in shape of [batch_size, time_steps]
    return {
        'logits': logits, # (B*T, num_classes)
        'y': y,
        'weights': weights,
    }

### Training the model

In [128]:
# Define loss and optimizer

# vocabulary_size = 8000
state_size = 1000
num_classes = vocabulary_size

# Learning Parameters
learning_rate = 0.01
training_steps = 10000
batch_size = 256
display_step = 1000

g = build_LSTM_graph(vocabulary_size, num_classes, state_size, batch_size)
loss = tf.contrib.seq2seq.sequence_loss(
    tf.expand_dims(g['logits'], 0), # (1, B*T, num_classes)
    tf.expand_dims(g['y'], 0), # (1, B*T)
    g['weights'],
    average_across_timesteps=False,
    average_across_batch=True)
#total_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=g['logits'], labels=g['y']) # y :[B*T]

# Mask out the padding
#total_loss = 

optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(loss)

# Evaluate model
# correct_pred = tf.equal(tf.argmax(pred,1), tf.argmax(y,1))
# accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

# Initialize the variables (i.e. assign their default value)
init = tf.global_variables_initializer()

In [132]:
data_itr = PaddedDataIterator(X_train, y_train)
# Start training
with tf.Session() as sess:

    # Run the initializer
    sess.run(init)

    for step in range(1, training_steps+1):
        batch_x, batch_y, batch_seqlen, max_len = data_itr.next_batch(batch_size)
        # Run optimization op (backprop)
        sess.run(optimizer, feed_dict={x: batch_x
                                       })
        if step % display_step == 0 or step == 1:
            # Calculate batch accuracy & loss
            loss = sess.run([accuracy, loss], feed_dict={x: batch_x, y: batch_y,
                                                seqlen: batch_seqlen})
            print("Step " + str(step) + ", Minibatch Loss= " + \
                  "{:.6f}".format(loss))

    print("Optimization Finished!")

    # Calculate accuracy
#     test_data = testset.data
#     test_label = testset.labels
#     test_seqlen = testset.seqlen
#     print("Testing Accuracy:", \
#         sess.run(accuracy, feed_dict={x: test_data, y: test_label,
#                                       seqlen: test_seqlen}))

TypeError: Cannot interpret feed_dict key as Tensor: Tensor Tensor("Placeholder:0", shape=(256, ?), dtype=int32) is not an element of this graph.