In [2]:
import os
import re
import sys
import time
import pickle
import collections

import nltk
nltk.download('brown') # reference: http://www.nltk.org/nltk_data/
import numpy as np
import tensorflow as tf
from tqdm import tqdm_notebook

In [None]:
words = nltk.corpus.brown.words()
sents = nltk.corpus.brown.sents()
sents_mapped, vocab, vocab_reversed = map_words_sents(words, sents)

print('sentence num:', len(sents))
print('words num:', len(words))
print('vocab size:', len(vocab))

# reduce memory
del words
del sents

In [11]:
def map_words_sents(words, sents):
    # select vocabulary, all punctuations included
    vocab = {}
    vocab['UNK'] = 0
    word_count_sorted = sorted(collections.Counter(words).items(), key=lambda item: item[1])
    for item in word_count_sorted:
        if item[1] > 3: # if word frequency < 3
            vocab[item[0]] = len(vocab)
        else:
            vocab['UNK'] += 1
    vocab_reversed = dict(zip(vocab.values(), vocab.keys()))
    
    # map word to index number
    sents_mapped = []
    for sent in sents:
        sents_mapped.append([vocab.get(word, 0) for word in sent])
    return np.array(sents_mapped), vocab, vocab_reversed


n = 5 # order of the model
def generate_data(sents_mapped, n):
    data = [] # sets of n-word sequence
    labels = [] # sets of 1-word prediction
    
    for sent in sents_mapped:
        beginning_index = 0
        end_index = n - 1
        sent_len = len(sent)
        
        # skip too short sentence
        if sent_len < n:
            continue
            
        while end_index < sent_len:
            feed = sent[beginning_index:end_index]
            target = sent[end_index]
            data.append(feed)
            labels.append(target)
            end_index += 1
            beginning_index += 1
            
    return np.array(data), np.array(labels)

In [12]:
'''try reading saved data, if no data found, generate data'''
try:
    with open('data.pickle', 'rb') as file:
        data = pickle.load(file)
    
    with open('labels.pickle', 'rb') as file:
        labels = pickle.load(file)
except:


    # check variable sizes
    print('sents_mapped size: {:.3} MB'.format(sys.getsizeof(sents_mapped) / 1024**2))
    print('vocab size: {:.3} MB'.format(sys.getsizeof(vocab) / 1024**2))
    print('vocab_reversed size: {:.3} MB'.format(sys.getsizeof(vocab_reversed) / 1024**2))

    start_time = time.time()
    data, labels = generate_data(sents_mapped, n)
    print('\ngenerated data in {:.4} s'.format(time.time()-start_time))

    print('\ndata len:', len(data))
    print('label len:', len(labels))

    # check variable sizes
    print('data size: {:.3} MB'.format(sys.getsizeof(data) / 1024**2))
    print('labels size: {:.3} MB'.format(sys.getsizeof(labels) / 1024**2))

    # save generated data
    with open('data.pickle', 'wb') as file:
        pickle.dump(data, file, protocol=pickle.HIGHEST_PROTOCOL)

    with open('labels.pickle', 'wb') as file:
        pickle.dump(labels, file, protocol=pickle.HIGHEST_PROTOCOL)

    # reduce memory
    del sents_mapped

[nltk_data] Downloading package brown to /Users/guo/nltk_data...
[nltk_data]   Package brown is already up-to-date!


sentence num: 57340
words num: 1161192
vocab size: 17905
sents_mapped size: 0.438 MB
vocab size: 0.563 MB
vocab_reversed size: 0.563 MB

generated data in 1.481 s

data len: 935344
label len: 935344
data size: 28.5 MB
labels size: 7.14 MB


In [None]:
def _contains_letter(string):
    if re.search(re.compile(r'\w'), string): # if string contains an alphanumeric character
        return True
    return False

def cleanse(words, sents):
    sents_cleansed = []
    for sent in sents:
        sent_cleansed = [word for word in sent if _contains_letter(word)] # filter out punctuations
        if len(sent_cleansed) > 0: # filter out empty lists
            sent_cleansed[0] = sent_cleansed[0].lower() # lower the first letter in a sentence
            sents_cleansed.append(sent_cleansed)
    
    words_cleansed = []
    for sent in sents_cleansed:
        for word in sent:
            words_cleansed.append(word)
    return words_cleansed, sents_cleansed

# words_cleansed, sents_cleansed = cleanse(words, sents)

# print(len(words_cleansed))
# print(len(sents_cleansed))
# print(sents_cleansed[0]) # first sentence

In [None]:
batch_size = 128

In [20]:
def generator(data, labels, vocab_size, training=True):
    if training:
        i = 0
        end = int(len(data)*0.8)
    else:
        i = int(len(data)*0.8) + 1
        end = len(data) - 1
            
    while True:
        data_batch = []
        labels_batch = []
        
        for _ in range(batch_size):
            label_one_hot_encoded = np.zeros((vocab_size))
            label_one_hot_encoded[labels[i]] = 1
            
            data_batch.append(data[i])
            labels_batch.append(label_one_hot_encoded)
            if i == end: 
                i = 0
            else:
                i += 1
            
        yield np.array(data_batch), np.array(labels_batch)

In [21]:
training_data = generator(data, labels, len(vocab), training=True)

In [68]:
training_steps = len(data[:int(len(data)*0.8)])//128 + 1

In [24]:
print(next(training_data)[0].shape)
print(next(training_data)[1].shape)

(128, 4)
(128, 17905)


In [69]:
print(training_steps)

5846


In [71]:
# the following graph contains MLP 1, MLP 5, MLP 7, and MLP 9 loss optimizations
# MLP 1 model: y = b + Wx + Utanh(d + Hx) where MLP 1 is d + Hx

batch_size = 128
h = 50
V = len(vocab) # vocabulary size
m = 60 # embedding size
weight_decay = 10**(-4)

graph = tf.Graph()
with graph.as_default(), tf.device('/cpu:0'):
    with tf.name_scope('inputs'):
        words = tf.placeholder(tf.int32, shape=[batch_size, n-1])
        y = tf.placeholder(tf.int32, shape=[batch_size, V])
        epsilon_t = tf.placeholder(tf.float64, shape=None)

    with tf.name_scope('parameters'):
        C = tf.Variable(tf.random_uniform([V, m], -1.0, 1.0))
        x = tf.transpose(tf.reshape(tf.nn.embedding_lookup(C, words), [-1, (n-1)*m])) # [(n-1)*m, batch_size]
        H = tf.Variable(tf.truncated_normal([h, (n-1)*m], stddev=1.0/np.sqrt((n-1)*m)))
        U = tf.Variable(tf.truncated_normal([V, h], stddev=1.0/np.sqrt(h)))
        W = tf.Variable(tf.truncated_normal([V, (n-1)*m], stddev=1.0/np.sqrt((n-1)*m)))
       
        b = tf.Variable(tf.zeros([V, batch_size]))
        d = tf.Variable(tf.zeros([h, batch_size]))
        d2 = tf.Variable(tf.zeros([h, batch_size]))
        d3 = tf.Variable(tf.zeros([h, batch_size]))
        d4 = tf.Variable(tf.zeros([h, batch_size]))
        d5 = tf.Variable(tf.zeros([h, batch_size]))
        d6 = tf.Variable(tf.zeros([h, batch_size]))
        d7 = tf.Variable(tf.zeros([h, batch_size]))
        d8 = tf.Variable(tf.zeros([h, batch_size]))
        hid2 = tf.Variable(tf.truncated_normal([h, h], stddev=1.0/np.sqrt(h)))
        hid3 = tf.Variable(tf.truncated_normal([h, h], stddev=1.0/np.sqrt(h)))
        hid4 = tf.Variable(tf.truncated_normal([h, h], stddev=1.0/np.sqrt(h)))
        hid5 = tf.Variable(tf.truncated_normal([h, h], stddev=1.0/np.sqrt(h)))
        hid6 = tf.Variable(tf.truncated_normal([h, h], stddev=1.0/np.sqrt(h)))
        hid7 = tf.Variable(tf.truncated_normal([h, h], stddev=1.0/np.sqrt(h)))
        hid8 = tf.Variable(tf.truncated_normal([h, h], stddev=1.0/np.sqrt(h)))

    with tf.name_scope('direct_connections'):
        Wx = tf.matmul(W, x)

    with tf.name_scope('MLPs'):
        MLP1 = d + tf.matmul(H, x)
        MLP2 = d2 + tf.matmul(hid2, tf.tanh(MLP1))
        MLP3 = d3 + tf.matmul(hid3, tf.tanh(MLP2))
        MLP4 = d4 + tf.matmul(hid4, tf.tanh(MLP3))
        MLP5 = d5 + tf.matmul(hid5, tf.tanh(MLP4))
        MLP6 = d6 + tf.matmul(hid6, tf.tanh(MLP5))
        MLP7 = d7 + tf.matmul(hid7, tf.tanh(MLP6))
        MLP8 = d8 + tf.matmul(hid8, tf.tanh(MLP7))

    with tf.name_scope('MLP_logits'):
        MLP1_logits = b + Wx + tf.matmul(U, tf.tanh(MLP1))
        MLP5_logits = b + Wx + tf.matmul(U, tf.tanh(MLP5))
        MLP7_logits = b + Wx + tf.matmul(U, tf.tanh(MLP7))
        MLP9_logits = b + tf.matmul(U, tf.tanh(MLP8)) # no direct connections

    MLP1_prob = tf.nn.softmax(logits=MLP1_logits, axis=0)
    MLP5_prob = tf.nn.softmax(logits=MLP5_logits, axis=0)
    MLP7_prob = tf.nn.softmax(logits=MLP7_logits, axis=0)
    MLP9_prob = tf.nn.softmax(logits=MLP9_logits, axis=0)

    with tf.name_scope('MLP_losses'):
        MLP1_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=y, logits=tf.transpose(MLP1_logits)))
        MLP5_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=y, logits=tf.transpose(MLP5_logits)))
        MLP7_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=y, logits=tf.transpose(MLP7_logits)))
        MLP9_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=y, logits=tf.transpose(MLP9_logits)))

    tf.summary.scalar('MLP1_loss', MLP1_loss)
    tf.summary.scalar('MLP5_loss', MLP5_loss)
    tf.summary.scalar('MLP7_loss', MLP7_loss)
    tf.summary.scalar('MLP9_loss', MLP9_loss)

    with tf.name_scope('optimizers'):
        Custom_Optimizer = tf.contrib.opt.extend_with_decoupled_weight_decay(tf.train.GradientDescentOptimizer)
        MLP1_optimizer = Custom_Optimizer(weight_decay=weight_decay, learning_rate=epsilon_t).minimize(MLP1_loss)
        MLP5_optimizer = Custom_Optimizer(weight_decay=weight_decay, learning_rate=epsilon_t).minimize(MLP5_loss)
        MLP7_optimizer = Custom_Optimizer(weight_decay=weight_decay, learning_rate=epsilon_t).minimize(MLP7_loss)
        MLP9_optimizer = Custom_Optimizer(weight_decay=weight_decay, learning_rate=epsilon_t).minimize(MLP9_loss)
    #         optimizer = tf.train.GradientDescentOptimizer(learning_rate=epsilon_t).minimize(loss)

    # merge all summaries
    summary_merged = tf.summary.merge_all()

(128, 4, 60)
x: (240, 128)
W: (17905, 240)
Wx: (17905, 128)
(17905, 128)


In [31]:
# # the following graph contains MLP 3 with h = 0 only
# # MLP 3 model: y = b + Wx + Utanh(d + x) where MLP 1 is d + Hx

# MLP3_graph = tf.Graph()
# with MLP3_graph.as_default(), tf.device('/cpu:0'):
#     with tf.name_scope('inputs'):
#         words = tf.placeholder(tf.int32, shape=[n-1])
#         y = tf.placeholder(tf.int32, shape=[V])
#         epsilon_t = tf.placeholder(tf.float64, shape=None)
    
#     with tf.name_scope('parameters'):
#         C = tf.Variable(tf.random_uniform([V, m], -1.0, 1.0))
#         x = tf.reshape(tf.nn.embedding_lookup(C, words), [-1, 1])
#         W = tf.Variable(tf.truncated_normal([V, (n-1)*m], stddev=1.0/np.sqrt((n-1)*m)))
#         M1 = tf.Variable(tf.truncated_normal([V, V], stddev=1.0/np.sqrt(V)))
#         M2 = tf.Variable(tf.truncated_normal([V, V], stddev=1.0/np.sqrt(V)))
#         b1 = tf.Variable(tf.zeros([V, 1]))
#         b2 = tf.Variable(tf.zeros([V, 1]))
#         b3 = tf.Variable(tf.zeros([V, 1]))
    
#     with tf.name_scope('direct_connections'):
#         Wx = tf.matmul(W, x)
        
#     with tf.name_scope('MLPs'):
#         MLP1 = b1 + tf.matmul(W, x)
#         MLP2 = b2 + tf.matmul(M1, MLP1) 
#         MLP3 = tf.matmul(M2, MLP2)
   
#     with tf.name_scope('MLP_logits'):
#         MLP3_logits = b3 + Wx + MLP3
        
#     MLP3_prob = tf.nn.softmax(logits=MLP3_logits, axis=0)

#     with tf.name_scope('MLP_losses'):
#         MLP3_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=tf.reshape(y, [1, -1]), logits=tf.reshape(MLP3_logits, [1,-1])))
    
#     tf.summary.scalar('MLP3_loss', MLP3_loss)
    
#     with tf.name_scope('optimizers'):
#         Custom_Optimizer = tf.contrib.opt.extend_with_decoupled_weight_decay(tf.train.GradientDescentOptimizer)
#         MLP3_optimizer = Custom_Optimizer(weight_decay=weight_decay, learning_rate=epsilon_t).minimize(MLP3_loss)
    
#     # merge all summaries
#     summary_merged = tf.summary.merge_all()

In [70]:
# create the directory for TensorBoard variables if there is not
log_dir = 'log'
if not os.path.exists(log_dir):
    os.makedirs(log_dir)
    
h = 50
V = len(vocab) # vocabulary size
m = 60 # embedding size
epsilon_0 = 10**(-3)
r = 10**(-8) # decrease factor
# total number of parameters updates (from W, U, H, d, b, and words vectors from C) per training step 
t = V*(n-1)*m + V*h + h*(n-1)*m + h + V + m*(n-1)
weight_decay = 10**(-4)

num_epochs = 10
num_steps = training_steps
# num_steps = 10000
parameter_updates = 0

# model = Model(num_hidden_units=h, vocab_size=V, embedding_size=60, weight_decay=10**(-4))

# with tf.Session(graph=model.graph) as session:
with tf.Session(graph=graph) as session:
    saver = tf.train.Saver()
    writer = tf.summary.FileWriter(log_dir, session.graph)
    
    # initialize variables
    tf.global_variables_initializer().run()
    total_loss = 0
    perplexity_exponent = 0
    
    learning_rate = epsilon_0
    total_steps = 0
    
    for epoch in np.arange(num_epochs):
        print('epoch:', epoch)
        for step in tqdm_notebook(np.arange(num_steps)):
            data_training, label = next(training_data)        

            # collect runtime statistics
            run_metadata = tf.RunMetadata()

#             _, loss_step, prob_step, summary = session.run([model.MLP1_optimizer, model.MLP1_loss, model.MLP1_prob, model.summary_merged], 
#                                                             feed_dict={model.words:data_training, model.y:label, model.epsilon_t:learning_rate},
#                                                             run_metadata=run_metadata)

            _, loss_step, prob_step, summary = session.run([MLP1_optimizer, MLP1_loss, MLP1_prob, summary_merged], 
                                                            feed_dict={words:data_training, y:label, epsilon_t:learning_rate},
                                                            run_metadata=run_metadata)
    
#             _, loss_step, prob_step, summary = session.run([MLP9_optimizer, MLP9_loss, MLP9_prob, summary_merged], 
#                                                             feed_dict={words:data_training, y:label, epsilon_t:learning_rate},
#                                                             run_metadata=run_metadata)
            
            total_steps += 1
            learning_rate = epsilon_0/(1+r*t)
            parameter_updates += t
            total_loss += loss_step
            perplexity_exponent += np.log(prob_step[np.argmax(label)][0])

            # record summaries
            writer.add_summary(summary, total_steps)
            if step == (num_steps - 1):
                writer.add_run_metadata(run_metadata, 'epoch{} step {}'.format(epoch, step))

            if step % 100 == 0 and step > 0:
                print('average loss at step ', total_steps, ':', total_loss/total_steps)
                print('perplexity at step', total_steps, ':', np.exp(-perplexity_exponent/total_steps))
    
    # save the model
    saver.save(session, os.path.join(log_dir, 'model.ckpt'))
    writer.close()

epoch: 0


HBox(children=(IntProgress(value=0, max=5846), HTML(value='')))

average loss at step  101 : 9.99301556313392
perplexity at step 101 : 22577.47595513648
average loss at step  201 : 9.99446975651072
perplexity at step 201 : 22464.632117021498
average loss at step  301 : 9.984981812511963
perplexity at step 301 : 22311.646257790126
average loss at step  401 : 9.980940511993637
perplexity at step 401 : 21593.64062865344
average loss at step  501 : 9.974622254362126
perplexity at step 501 : 21681.989167805154
average loss at step  601 : 9.969165421166952
perplexity at step 601 : 21510.71802480444
average loss at step  701 : 9.965722688084492
perplexity at step 701 : 21608.421843367403
average loss at step  801 : 9.960917947890607
perplexity at step 801 : 21427.861973125644
average loss at step  901 : 9.957094569317377
perplexity at step 901 : 21189.58352241609
average loss at step  1001 : 9.95290907994136
perplexity at step 1001 : 21170.06233245293
average loss at step  1101 : 9.948271572968832
perplexity at step 1101 : 20971.8999461581
average loss at 

KeyboardInterrupt: 