In [1]:
import os
import re
import collections

import nltk
nltk.download('brown') # reference: http://www.nltk.org/nltk_data/
from nltk.corpus import brown

[nltk_data] Downloading package brown to /Users/guo/nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [2]:
import numpy as np
import tensorflow as tf

In [3]:
words = brown.words()
sents = brown.sents()

In [4]:
print(len(words))
print(len(set(words)))
print(len(sents))

1161192
56057
57340


In [5]:
def _contains_letter(string):
    if re.search(re.compile(r'\w'), string): # if string contains an alphanumeric character
        return True
    return False

In [6]:
def cleanse(words, sents):
    sents_cleansed = []
    for sent in sents:
        sent_cleansed = [word for word in sent if _contains_letter(word)] # filter out punctuations
        if len(sent_cleansed) > 0: # filter out empty lists
            sent_cleansed[0] = sent_cleansed[0].lower() # lower the first letter in a sentence
            sents_cleansed.append(sent_cleansed)
    
    words_cleansed = []
    for sent in sents_cleansed:
        for word in sent:
            words_cleansed.append(word)
    return words_cleansed, sents_cleansed

In [7]:
# words_cleansed, sents_cleansed = cleanse(words, sents)

In [8]:
# print(len(words_cleansed))
# print(len(sents_cleansed))
# print(sents_cleansed[0]) # first sentence

In [9]:
def map_words_sents(words, sents):
    # select vocabulary
    vocab = {}
    vocab['UNK'] = 0
    word_count_sorted = sorted(collections.Counter(words).items(), key=lambda item: item[1])
    for item in word_count_sorted:
        if item[1] > 3: # if word frequency < 3
            vocab[item[0]] = len(vocab)
        else:
            vocab['UNK'] += 1
    vocab_reversed = dict(zip(vocab.values(), vocab.keys()))
    
    # map word to index number
    sents_mapped = []
    for sent in sents:
        sents_mapped.append([vocab.get(word, 0) for word in sent])
    return sents_mapped, vocab, vocab_reversed

In [10]:
sents_mapped, vocab, vocab_reversed = map_words_sents(words, sents)

In [11]:
n = 5 # order of the model

In [12]:
print(len(sents_mapped))
print(len(vocab))

57340
17905


In [13]:
def generate_data(sents_mapped, n, vocab_size, training=True):
    data = [] # sets of n-word sequence
    labels = [] # sets of 1-word prediction
    
    for sent in sents_mapped:
        beginning_index = 0
        end_index = n
        while end_index < len(sent):
            feed = sent[beginning_index:end_index]
            target = sent[end_index]
            data.append(feed)
            arr = np.zeros((vocab_size)) # one-hot encoding
            arr[target] = 1
            labels.append(arr)
            end_index += 1
            beginning_index += 1
            
    return data, labels

data, labels = generate_data(sents_mapped, n, len(vocab), training=True)

In [14]:
print(len(data))
print(len(labels))

881239
881239


In [15]:
def generator(data, labels, training=True):
    if training:
        i = 0
        end = int(len(data)*0.8)
    else:
        i = int(len(data)*0.8) + 1
        end = len(data) - 1
            
    while True:
        yield np.array(data[i]), np.array(labels[i])
        if i == end: 
            i = 0
        else:
            i += 1

In [16]:
def batch_generator(generator_specified, batch_size):
    while True:
        data_batch = []
        target_batch = []
        
        for _ in range(batch_size):
            data_single, label_single = next(generator_specified)
            data_batch.append(data_single)
            target_batch.append(label_single)
            
        yield np.array(data_batch), np.array(target_batch)

In [17]:
training_data = generator(data, labels, training=True)
print(next(training_data))
batch_training_data = batch_generator(generator_specified=generator(data, labels, training=True), batch_size=2)
print(next(batch_training_data))

(array([17890, 12025, 16609, 12290,     1]), array([0., 0., 0., ..., 0., 0., 0.]))
(array([[17890, 12025, 16609, 12290,     1],
       [12025, 16609, 12290,     1, 17845]]), array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]]))


In [18]:
V = len(vocab) # vocabulary size
m = 60 # embedding size
h = 50

In [21]:
# model: y = b + Wx + Utanh(d + Hx)
graph = tf.Graph()
with graph.as_default(), tf.device('/cpu:0'):
    with tf.name_scope('inputs'):
        x = tf.placeholder(tf.int32, shape=[n])
        y = tf.placeholder(tf.int32, shape=[V])

    with tf.name_scope('embeddings'):
        embeddings = tf.Variable(tf.random_uniform([V, m], -1.0, 1.0))
        embed = tf.nn.embedding_lookup(embeddings, x)

    with tf.name_scope('H'):
        H = tf.Variable(tf.truncated_normal([h, n*m], stddev=1.0/np.sqrt(n*m)))
    with tf.name_scope('U'):
        U = tf.Variable(tf.truncated_normal([V, h], stddev=1.0/np.sqrt(h)))
    with tf.name_scope('d'):
        d = tf.Variable(tf.zeros([h, 1]))
    with tf.name_scope('b'):
        b = tf.Variable(tf.zeros([V, 1]))
    with tf.name_scope('W'):
        W = tf.Variable(tf.truncated_normal([V, n*m], stddev=1.0/np.sqrt(n*m)))
        
    with tf.name_scope('logits'):
        logits = b + tf.matmul(W, tf.reshape(embed, [-1, 1])) + tf.matmul(U, tf.tanh(d + tf.matmul(H, tf.reshape(embed, [-1, 1]))))

    with tf.name_scope('loss'):
        loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=tf.reshape(y, [1, -1]), logits=tf.reshape(logits, [1,-1])))
        prob = tf.nn.softmax(logits=logits, axis=0)

    tf.summary.scalar('loss', loss)
    
    with tf.name_scope('optimizer'): 
        optimizer = tf.train.AdamOptimizer(0.1).minimize(loss)
    
    # merge all summaries
    summary_merged = tf.summary.merge_all()
    
    # create a saver
    saver = tf.train.Saver()

In [22]:
%%time
# create the directory for TensorBoard variables if there is not
log_dir = 'log'
if not os.path.exists(log_dir):
    os.makedirs(log_dir)
    
num_steps = 30000

with tf.Session(graph=graph) as session:
    writer = tf.summary.FileWriter(log_dir, session.graph)
    
    # initialize variables
    tf.global_variables_initializer().run()
    total_loss = 0
    perplexity_exponent = 0
    
    for step in np.arange(num_steps):
        data_training, label = next(training_data)

        # collect runtime statistics
        run_metadata = tf.RunMetadata()
        
        _, loss_step, prob_step, summary = session.run([optimizer, loss, prob, summary_merged], 
                                                        feed_dict={x:data_training, y:label},
                                                        run_metadata=run_metadata)
        
        # record summaries
        writer.add_summary(summary, step)
        if step == (num_steps - 1):
            writer.add_run_metadata(run_metadata, 'step {}'.format(step))
            
        total_loss += loss_step
#         perplexity_exponent += np.log(prob_step[np.argmax(label)][0])
        
        if step % 2000 == 0 and step > 0:
            print('average loss at step ', step, ':', loss_step)
#             print('perplexity at step', step, ':', np.exp(-perplexity_exponent/step))
    
    # save the model
    saver.save(session, os.path.join(log_dir, 'model.ckpt'))
    writer.close()



average loss at step  2000 : 680.19434
perplexity at step 2000 : inf
average loss at step  4000 : 443.40604
perplexity at step 4000 : inf
average loss at step  6000 : 377.32315
perplexity at step 6000 : inf
average loss at step  8000 : 427.8276
perplexity at step 8000 : inf
average loss at step  10000 : 134.6681
perplexity at step 10000 : inf
average loss at step  12000 : 0.0
perplexity at step 12000 : inf
average loss at step  14000 : 802.5867
perplexity at step 14000 : inf
average loss at step  16000 : 1991.791
perplexity at step 16000 : inf
average loss at step  18000 : 382.6027
perplexity at step 18000 : inf
average loss at step  20000 : 773.2907
perplexity at step 20000 : inf
average loss at step  22000 : 869.0157
perplexity at step 22000 : inf
average loss at step  24000 : 916.0696
perplexity at step 24000 : inf
average loss at step  26000 : 1612.2839
perplexity at step 26000 : inf
average loss at step  28000 : 406.86835
perplexity at step 28000 : inf
CPU times: user 1h 22min 34s