In [61]:
import gensim                     # implements word2vec model infrastructure and provides interfacing APIs 
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.test.utils import get_tmpfile
import warnings
warnings.filterwarnings('ignore')

from __future__ import absolute_import
from __future__ import print_function
from __future__ import division

import json, os, re, shutil, sys, time
from importlib import reload
import collections, itertools
import unittest
from IPython.display import display, HTML

# NLTK for NLP utils and corpora
import nltk

# NumPy and TensorFlow
import numpy as np
import tensorflow as tf
assert(tf.__version__.startswith("1."))

# Helper libraries
from rnn.w266_common import utils, vocabulary, tf_embed_viz

# Your code
import rnn.rnnlm as rnnlm;reload(rnnlm)
import rnn.rnnlm_test as rnnlm_test;reload(rnnlm_test)

<module 'rnn.rnnlm_test' from '/Users/jeylee/MIDS/W266/smartThesaurus/Code/rnn/rnnlm_test.py'>

In [62]:
# Load the pre-trained word2vec embeddings (binary-format file) through gensim.
word2vec_vectors = '../pretrained/GoogleNews-vectors-negative300.bin'

w2v = gensim.models.KeyedVectors.load_word2vec_format(
    word2vec_vectors,
    binary=True,
)

In [63]:
# Load the pre-trained GloVe embeddings.
# The GloVe text file is first converted to word2vec format (written to a
# temporary file), and that converted copy is what gets loaded.
glove_vectors = '../pretrained/glove.twitter.27B.200d.txt'
tmp_file = get_tmpfile("test_word2vec.txt")

glove2word2vec(glove_vectors, tmp_file)
glove = gensim.models.KeyedVectors.load_word2vec_format(tmp_file)

In [64]:
# Cosine similarity of two word pairs under each embedding model.
pair1 = ['minor','small']
pair2 = ['minor','major']
word_pairs = (pair1, pair2)

# word2vec scores for both pairs
cos_dist1_w, cos_dist2_w = (w2v.similarity(first, second) for first, second in word_pairs)

print('word2vec cosine similarity of {}: {}'.format(pair1, cos_dist1_w))
print('word2vec cosine similarity of {}: {}'.format(pair2, cos_dist2_w))

# GloVe scores for the same pairs
cos_dist1_g, cos_dist2_g = (glove.similarity(first, second) for first, second in word_pairs)

print('\nGloVe cosine similarity of {}: {}'.format(pair1, cos_dist1_g))
print('GloVe cosine similarity of {}: {}'.format(pair2, cos_dist2_g))

word2vec cosine similarity of ['minor', 'small']: 0.3416362404823303
word2vec cosine similarity of ['minor', 'major']: 0.47539088129997253

GloVe cosine similarity of ['minor', 'small']: 0.42706066370010376
GloVe cosine similarity of ['minor', 'major']: 0.7037895321846008


In [65]:
# Embedding vector of the first word of pair1 under each model:
# print its shape, then elements 1..19 as a sample.
query_word = pair1[0]
subset_header = "\nPrinting a subset of the whole vector for the word '{}':".format(query_word)

vec_pair1_0_w = w2v.get_vector(query_word)
print("word2vec Vector embedding dimension: ", vec_pair1_0_w.shape)
print(subset_header)
print(vec_pair1_0_w[1:20])

vec_pair1_0_g = glove.get_vector(query_word)
print("\nGloVe vector embedding dimension: ", vec_pair1_0_g.shape)
print(subset_header)
print(vec_pair1_0_g[1:20])

word2vec Vector embedding dimension:  (300,)

Printing a subset of the whole vector for the word 'minor':
[ 0.06640625 -0.00228882  0.00402832 -0.28710938 -0.21972656  0.34765625
 -0.00494385 -0.01757812  0.12988281 -0.15917969 -0.15527344 -0.16992188
  0.06933594 -0.14257812 -0.07958984  0.16992188  0.12109375  0.125
 -0.06494141]

GloVe vector embedding dimension:  (200,)

Printing a subset of the whole vector for the word 'minor':
[-0.81608999 -0.10689    -0.53272998 -0.20412    -0.37599     0.12386
 -0.12322    -0.80023998 -0.017576    0.30317    -0.068888   -1.09749997
 -0.56645     0.37650999 -0.46614999 -0.42359    -0.076921   -0.012701
 -0.0067806 ]


In [66]:
# Nearest neighbours of a query word, looked up by word in each model.
n_similar = 15
thisWord = 'major'

print("Most similar {} words (by word) for '{}' by word2vec model:".format(n_similar, thisWord))
w2v_neighbours = w2v.similar_by_word(thisWord, n_similar)
display(w2v_neighbours)

print("\nMost similar {} words (by word) for '{}' by GloVe model:".format(n_similar, thisWord))
glove_neighbours = glove.similar_by_word(thisWord, n_similar)
display(glove_neighbours)

Most similar 15 words (by word) for 'major' by word2vec model:


[('biggest', 0.6572940349578857),
 ('significant', 0.619140088558197),
 ('big', 0.6057686805725098),
 ('main', 0.5380213260650635),
 ('key', 0.5354758501052856),
 ('huge', 0.5329675674438477),
 ('signficant', 0.5157025456428528),
 ('amajor', 0.49914824962615967),
 ('largest', 0.49542921781539917),
 ('greatest', 0.49444860219955444),
 ('Major', 0.4887048304080963),
 ('massive', 0.4786102771759033),
 ('minor', 0.47539088129997253),
 ('substantial', 0.46729934215545654),
 ('monumental', 0.46554118394851685)]


Most similar 15 words (by word) for 'major' by GloVe model:


[('minor', 0.703789472579956),
 ('huge', 0.6762630939483643),
 ('massive', 0.655586838722229),
 ('big', 0.6330057382583618),
 ('biggest', 0.6215412616729736),
 ('another', 0.6144846081733704),
 ('third', 0.6137520670890808),
 ('any', 0.6084322333335876),
 ('serious', 0.6081491112709045),
 ('issues', 0.6023705005645752),
 ('first', 0.5963584780693054),
 ('having', 0.5878738164901733),
 ('two', 0.5866069197654724),
 ('other', 0.5805503129959106),
 ('many', 0.5805253982543945)]

## RNN

In [91]:
# Re-import the model code, build all three sub-graphs, and write the graph
# to TF_GRAPHDIR through a summary FileWriter.
reload(rnnlm)

TF_GRAPHDIR = "/Users/jeylee/mids/W266/tmp/w266/a3_graph"

# Start from an empty log directory; a missing directory is not an error.
shutil.rmtree(TF_GRAPHDIR, ignore_errors=True)

lm = rnnlm.RNNLM(V=10000, H=200, num_layers=2)
for build_step in (lm.BuildCoreGraph, lm.BuildTrainGraph, lm.BuildSamplerGraph):
    build_step()

summary_writer = tf.summary.FileWriter(TF_GRAPHDIR, lm.graph)

# Reload the model and its tests, then run the three test cases.
reload(rnnlm)
reload(rnnlm_test)
utils.run_tests(rnnlm_test, ["TestRNNLMCore", "TestRNNLMTrain", "TestRNNLMSampler"])

test_shapes_embed (rnn.rnnlm_test.TestRNNLMCore) ... ok
test_shapes_output (rnn.rnnlm_test.TestRNNLMCore) ... ok
test_shapes_recurrent (rnn.rnnlm_test.TestRNNLMCore) ... ok
test_shapes_train (rnn.rnnlm_test.TestRNNLMTrain) ... ok
test_shapes_sample (rnn.rnnlm_test.TestRNNLMSampler) ... ok

----------------------------------------------------------------------
Ran 5 tests in 2.049s

OK


In [92]:
def run_epoch(lm, session, batch_iterator,
              train=False, verbose=False,
              tick_s=10, learning_rate=None):
    """Run one pass over ``batch_iterator`` and return the mean per-batch loss.

    Args:
        lm: model object exposing the tensors/ops used below (``input_w_``,
            ``target_y_``, ``initial_h_``, ``final_h_``, ``learning_rate_``,
            ``use_dropout_``, ``loss_`` and, when training, ``train_step_``
            and ``train_loss_``).
        session: object whose ``run(fetches, feed_dict)`` evaluates them.
        batch_iterator: iterable of ``(w, y)`` batches; ``w.size`` is counted
            as the number of words in the batch.
        train: if True, run ``lm.train_step_`` against ``lm.train_loss_`` with
            dropout enabled; otherwise evaluate ``lm.loss_`` with a no-op in
            place of the update.
        verbose: if True, print a progress line once at least ``tick_s``
            seconds have passed since the previous one.
        tick_s: minimum number of seconds between progress lines.
        learning_rate: value fed to ``lm.learning_rate_``; must not be None.

    Returns:
        The sum of the per-batch loss values divided by the number of batches.

    Raises:
        AssertionError: if ``learning_rate`` is None.
        ZeroDivisionError: if ``batch_iterator`` yields no batches.
    """
    assert learning_rate is not None
    start_time = time.time()
    tick_time = start_time  # time of the most recent progress line
    total_cost = 0.0  # running sum of the per-batch loss values
    total_batches = 0
    total_words = 0

    if train:
        train_op = lm.train_step_
        use_dropout = True
        loss = lm.train_loss_
    else:
        train_op = tf.no_op()
        use_dropout = False  # no dropout at test time
        loss = lm.loss_  # true loss, if train_loss is an approximation

    # The same three fetches are evaluated for every batch, so build the list
    # once: the loss value, the final recurrent state (fed back in as the next
    # batch's initial state), and the update op (a no-op when not training).
    ops = [loss, lm.final_h_, train_op]

    for i, (w, y) in enumerate(batch_iterator):
        # At first batch in epoch, get a clean initial state.
        if i == 0:
            h = session.run(lm.initial_h_, {lm.input_w_: w})

        feed_dict = {
            lm.input_w_: w,
            lm.target_y_: y,
            lm.initial_h_: h,
            lm.learning_rate_: learning_rate,
            lm.use_dropout_: use_dropout
        }
        cost, h, _ = session.run(ops, feed_dict=feed_dict)

        total_cost += cost
        total_batches = i + 1
        total_words += w.size  # w.size = batch_size * max_time

        ##
        # Print average loss-so-far for epoch
        # If using train_loss_, this may be an underestimate.
        if verbose and (time.time() - tick_time >= tick_s):
            avg_cost = total_cost / total_batches
            avg_wps = total_words / (time.time() - start_time)
            print("[batch {:d}]: seen {:d} words at {:.1f} wps, loss = {:.3f}".format(
                i, total_words, avg_wps, avg_cost))
            tick_time = time.time()  # reset time ticker

    return total_cost / total_batches

def score_dataset(lm, session, ids, name="Data"):
    """Run a non-training epoch over ``ids``, print loss and perplexity, return the loss.

    ``name`` is the label printed in front of the scores. The value returned
    is whatever ``run_epoch`` returns for the pass; the printed perplexity is
    ``exp`` of that value.
    """
    # Scoring uses larger batches than training to speed things up.
    batches = utils.rnnlm_batch_generator(ids, batch_size=100, max_time=100)
    eval_settings = dict(learning_rate=0.0, train=False,
                         verbose=False, tick_s=3600)
    cost = run_epoch(lm, session, batches, **eval_settings)
    perplexity = np.exp(cost)
    print("{:s}: avg. loss: {:.03f}  (perplexity: {:.02f})".format(name, cost, perplexity))
    return cost

In [93]:
# Reload the model and test modules, then run the toy-model test against the
# run_epoch / score_dataset implementations defined in this notebook.
reload(rnnlm)
reload(rnnlm_test)

th = rnnlm_test.RunEpochTester("test_toy_model")
th.setUp()
th.injectCode(run_epoch, score_dataset)

test_runner = unittest.TextTestRunner(verbosity=2)
test_runner.run(th)

test_toy_model (rnn.rnnlm_test.RunEpochTester) ... 

[batch 143]: seen 7200 words at 7149.5 wps, loss = 0.638
[batch 307]: seen 15400 words at 7655.6 wps, loss = 0.392
[batch 469]: seen 23500 words at 7799.3 wps, loss = 0.296
[batch 634]: seen 31750 words at 7906.6 wps, loss = 0.245
[batch 788]: seen 39450 words at 7863.2 wps, loss = 0.215
[batch 949]: seen 47500 words at 7890.5 wps, loss = 0.193
[batch 1112]: seen 55650 words at 7919.1 wps, loss = 0.176
[batch 1255]: seen 62800 words at 7817.2 wps, loss = 0.166
[batch 1417]: seen 70900 words at 7847.5 wps, loss = 0.154
[batch 1580]: seen 79050 words at 7876.9 wps, loss = 0.145
[batch 1744]: seen 87250 words at 7904.1 wps, loss = 0.138
[batch 1909]: seen 95500 words at 7929.0 wps, loss = 0.132
[batch 2074]: seen 103750 words at 7953.1 wps, loss = 0.126
[batch 2238]: seen 111950 words at 7969.7 wps, loss = 0.122
[batch 2402]: seen 120150 words at 7983.0 wps, loss = 0.117
[batch 2566]: seen 128350 words at 7995.7 wps, loss = 0.114
[batch 2730]: seen 136550 words at 8005.7 wps, loss = 0.111

ok

----------------------------------------------------------------------
Ran 1 test in 26.875s

OK


<unittest.runner.TextTestResult run=1 errors=0 failures=0>

In [94]:
# Load the "brown" corpus with a V-word vocabulary and a 0.8 train/test split.
V = 10000
corpus_parts = utils.load_corpus("brown", split=0.8, V=V, shuffle=42)
vocab, train_ids, test_ids = corpus_parts

[nltk_data] Downloading package brown to /Users/jeylee/nltk_data...
[nltk_data]   Package brown is already up-to-date!
Vocabulary: 10,000 types
Loaded 57,340 sentences (1.16119e+06 tokens)
Training set: 45,872 sentences (924,077 tokens)
Test set: 11,468 sentences (237,115 tokens)


In [95]:
# --- Training parameters ---
max_time = 25
batch_size = 100
learning_rate = 0.01
num_epochs = 10

# --- Model parameters (passed as keyword arguments to rnnlm.RNNLM) ---
model_params = {
    "V": vocab.size,
    "H": 200,
    "softmax_ns": 200,
    "num_layers": 2,
}

# --- Checkpoint locations: per-epoch checkpoints and the final trained model ---
TF_SAVEDIR = "/Users/jeylee/mids/W266/tmp/w266/a3_model"
checkpoint_filename = os.path.join(TF_SAVEDIR, "rnnlm")
trained_filename = os.path.join(TF_SAVEDIR, "rnnlm_trained")

In [96]:
# Will print status every this many seconds
print_interval = 5

lm = rnnlm.RNNLM(**model_params)

# Seed RNG for repeatability. The graph-level seed only influences ops that
# are created after it is set, so it has to be applied to lm.graph before the
# Build* calls add the model's ops.
# NOTE(review): assumes lm.graph already exists once the constructor returns.
with lm.graph.as_default():
    tf.set_random_seed(42)

lm.BuildCoreGraph()
lm.BuildTrainGraph()

# Explicitly add global initializer and variable saver to LM graph
with lm.graph.as_default():
    initializer = tf.global_variables_initializer()
    saver = tf.train.Saver()

# Clear the old checkpoint directory and recreate it empty
shutil.rmtree(TF_SAVEDIR, ignore_errors=True)
if not os.path.isdir(TF_SAVEDIR):
    os.makedirs(TF_SAVEDIR)

with tf.Session(graph=lm.graph) as session:
    session.run(initializer)

    for epoch in range(1, num_epochs + 1):
        t0_epoch = time.time()
        bi = utils.rnnlm_batch_generator(train_ids, batch_size, max_time)
        print("[epoch {:d}] Starting epoch {:d}".format(epoch, epoch))

        # Run a training epoch. Progress lines stay off (verbose=False);
        # print_interval only takes effect if verbose is switched on.
        run_epoch(lm, session, bi, learning_rate=learning_rate, train=True,
                  verbose=False, tick_s=print_interval)

        print("[epoch {:d}] Completed in {:s}".format(epoch, utils.pretty_timedelta(since=t0_epoch)))

        # Save a checkpoint
        saver.save(session, checkpoint_filename, global_step=epoch)

        ##
        # score_dataset will run a forward pass over the entire dataset
        # and report perplexity scores. This can be slow (around 1/2 to
        # 1/4 as long as a full epoch), so you may want to comment it out
        # to speed up training on a slow machine. Be sure to run it at the
        # end to evaluate your score.
        print("[epoch {:d}]".format(epoch), end=" ")
        score_dataset(lm, session, train_ids, name="Train set")
        print("[epoch {:d}]".format(epoch), end=" ")
        score_dataset(lm, session, test_ids, name="Test set")
        print("")

    # Save final model
    saver.save(session, trained_filename)

[epoch 1] Starting epoch 1
[epoch 1] Completed in 0:01:37
[epoch 1] Train set: avg. loss: 5.396  (perplexity: 220.47)
[epoch 1] Test set: avg. loss: 5.434  (perplexity: 229.02)

[epoch 2] Starting epoch 2
[epoch 2] Completed in 0:01:42
[epoch 2] Train set: avg. loss: 5.219  (perplexity: 184.80)
[epoch 2] Test set: avg. loss: 5.275  (perplexity: 195.45)

[epoch 3] Starting epoch 3
[epoch 3] Completed in 0:01:50
[epoch 3] Train set: avg. loss: 5.103  (perplexity: 164.56)
[epoch 3] Test set: avg. loss: 5.173  (perplexity: 176.39)

[epoch 4] Starting epoch 4
[epoch 4] Completed in 0:02:15
[epoch 4] Train set: avg. loss: 5.030  (perplexity: 152.97)
[epoch 4] Test set: avg. loss: 5.112  (perplexity: 165.95)

[epoch 5] Starting epoch 5
[epoch 5] Completed in 0:01:48
[epoch 5] Train set: avg. loss: 4.970  (perplexity: 143.96)
[epoch 5] Test set: avg. loss: 5.064  (perplexity: 158.21)

[epoch 6] Starting epoch 6
[epoch 6] Completed in 0:01:40
[epoch 6] Train set: avg. loss: 4.926  (perplexity: 

In [97]:
def score_seq(lm, session, seq, vocab):
    """Score a list of words with the language model.

    ``seq`` is wrapped in "<s>" / "</s>" markers, canonicalized against the
    vocabulary and converted to ids. The return value is the negated
    ``lm.loss_`` for that sequence, which the original author describes as
    the total log-probability of the sequence.
    """
    tokens = ["<s>"] + seq + ["</s>"]
    canonical = utils.canonicalize_words(tokens, wordset=vocab.word_to_id)
    padded_ids = vocab.words_to_ids(canonical)

    # Inputs are every id but the last; targets are the same ids shifted by one.
    # Both are reshaped into a single-row batch.
    w = np.reshape(padded_ids[:-1], [1,-1])
    y = np.reshape(padded_ids[1:],  [1,-1])
    h = session.run(lm.initial_h_, {lm.input_w_: w})

    # NOTE(review): run_epoch feeds lm.use_dropout_, while this feeds
    # lm.dropout_keep_prob_ -- confirm the model exposes both.
    feed_dict = {
        lm.input_w_: w,
        lm.target_y_: y,
        lm.initial_h_: h,
        lm.dropout_keep_prob_: 1.0,
    }
    loss_value = session.run(lm.loss_, feed_dict)
    # log(P(seq)) = -1*loss
    return -1*loss_value

def load_and_score(inputs, sort=False):
    """Load the trained model, score the given word sequences, and print them.

    Args:
        inputs: either one tokenized sentence (a list of str/bytes tokens) or
            a list of tokenized sentences. A single sentence is wrapped into
            a one-element list. An empty list scores nothing.
        sort: if True, results are ordered from highest to lowest score.

    Returns:
        The list of ``(score, words)`` tuples, in the order they were
        printed, so callers can keep the ranking instead of only reading it
        from stdout.
    """
    lm = rnnlm.RNNLM(**model_params)
    lm.BuildCoreGraph()

    with lm.graph.as_default():
        saver = tf.train.Saver()

    with tf.Session(graph=lm.graph) as session:
        # Load the trained model
        saver.restore(session, trained_filename)

        # A flat list of tokens is one sentence. The emptiness check keeps
        # inputs[0] from raising IndexError on an empty list.
        if inputs and isinstance(inputs[0], (str, bytes)):
            inputs = [inputs]

        # Actually run scoring
        results = [(score_seq(lm, session, words, vocab), words)
                   for words in inputs]

    # Sort if requested
    if sort:
        results = sorted(results, reverse=True)

    # Print results
    for score, words in results:
        print("\"{:s}\" : {:.02f}".format(" ".join(words), score))

    return results

In [100]:
# Score a single example sentence, tokenized on whitespace.
# The trailing semicolon keeps the notebook from echoing the call's value.
sents = ["There's going to be major changes to the company next year."]
tokenized_sents = [sentence.split() for sentence in sents]
load_and_score(tokenized_sents);

INFO:tensorflow:Restoring parameters from /Users/jeylee/mids/W266/tmp/w266/a3_model/rnnlm_trained
"There's going to be major changes to the company next year." : -6.38


In [99]:
# Nearest neighbours of thisWord queried through similar_by_vector.
# NOTE(review): thisWord (a string) is passed here rather than an embedding
# vector, and the recorded neighbours are identical to the by-word query
# earlier in the notebook -- confirm whether get_vector(thisWord) was intended.
print("Most similar {} words (by vector) for '{}' by word2vec model:".format(n_similar, thisWord))
w2v_by_vector = w2v.similar_by_vector(thisWord, n_similar)
display(w2v_by_vector)

print("\nMost similar {} words (by vector) for '{}' by GloVe model:".format(n_similar, thisWord))
glove_by_vector = glove.similar_by_vector(thisWord, n_similar)
display(glove_by_vector)

Most similar 15 words (by vector) for 'major' by word2vec model:


[('biggest', 0.6572940349578857),
 ('significant', 0.619140088558197),
 ('big', 0.6057686805725098),
 ('main', 0.5380213260650635),
 ('key', 0.5354758501052856),
 ('huge', 0.5329675674438477),
 ('signficant', 0.5157025456428528),
 ('amajor', 0.49914824962615967),
 ('largest', 0.49542921781539917),
 ('greatest', 0.49444860219955444),
 ('Major', 0.4887048304080963),
 ('massive', 0.4786102771759033),
 ('minor', 0.47539088129997253),
 ('substantial', 0.46729934215545654),
 ('monumental', 0.46554118394851685)]


Most similar 15 words (by vector) for 'major' by GloVe model:


[('minor', 0.703789472579956),
 ('huge', 0.6762630939483643),
 ('massive', 0.655586838722229),
 ('big', 0.6330057382583618),
 ('biggest', 0.6215412616729736),
 ('another', 0.6144846081733704),
 ('third', 0.6137520670890808),
 ('any', 0.6084322333335876),
 ('serious', 0.6081491112709045),
 ('issues', 0.6023705005645752),
 ('first', 0.5963584780693054),
 ('having', 0.5878738164901733),
 ('two', 0.5866069197654724),
 ('other', 0.5805503129959106),
 ('many', 0.5805253982543945)]

In [132]:
# Re-rank embedding neighbours of targetWord by how well each one fits the
# sentence according to the trained RNN language model.
targetSentence = 'There is going to be a major change in the company next year.'
targetWord = 'major'

# Query each embedding model once; the same neighbour lists are displayed
# and then used to build the candidate sentences.
word2vec_candidates = w2v.similar_by_vector(targetWord, n_similar)
glove_candidates = glove.similar_by_vector(targetWord, n_similar)

print("### Most similar {} words (by vector) for '{}' by word2vec model:".format(n_similar, targetWord))
display(word2vec_candidates)
print("\n### Most similar {} words (by vector) for '{}' by GloVe model:".format(n_similar, targetWord))
display(glove_candidates)

print("\n### Most similar {} words for '{}' by word2vec + RNN model:\n".format(n_similar, targetWord))
# Substitute each candidate for targetWord (not a hard-coded literal), so
# changing targetWord above is enough to re-run the cell for another word.
sentences_word2vec = [targetSentence.replace(targetWord, candidate)
                      for candidate, _similarity in word2vec_candidates]
word2vec_rnn_candidates = load_and_score([s.split() for s in sentences_word2vec], sort=True)

print("\n### Most similar {} words for '{}' by GloVe + RNN model:\n".format(n_similar, targetWord))
sentences_glove = [targetSentence.replace(targetWord, candidate)
                   for candidate, _similarity in glove_candidates]
glove_rnn_candidates = load_and_score([s.split() for s in sentences_glove], sort=True)

### Most similar 15 words (by vector) for 'major' by word2vec model:


[('biggest', 0.6572940349578857),
 ('significant', 0.619140088558197),
 ('big', 0.6057686805725098),
 ('main', 0.5380213260650635),
 ('key', 0.5354758501052856),
 ('huge', 0.5329675674438477),
 ('signficant', 0.5157025456428528),
 ('amajor', 0.49914824962615967),
 ('largest', 0.49542921781539917),
 ('greatest', 0.49444860219955444),
 ('Major', 0.4887048304080963),
 ('massive', 0.4786102771759033),
 ('minor', 0.47539088129997253),
 ('substantial', 0.46729934215545654),
 ('monumental', 0.46554118394851685)]


### Most similar 15 words (by vector) for 'major' by GloVe model:


[('minor', 0.703789472579956),
 ('huge', 0.6762630939483643),
 ('massive', 0.655586838722229),
 ('big', 0.6330057382583618),
 ('biggest', 0.6215412616729736),
 ('another', 0.6144846081733704),
 ('third', 0.6137520670890808),
 ('any', 0.6084322333335876),
 ('serious', 0.6081491112709045),
 ('issues', 0.6023705005645752),
 ('first', 0.5963584780693054),
 ('having', 0.5878738164901733),
 ('two', 0.5866069197654724),
 ('other', 0.5805503129959106),
 ('many', 0.5805253982543945)]


### Most similar 15 words for 'major' by word2vec + RNN model:

INFO:tensorflow:Restoring parameters from /Users/jeylee/mids/W266/tmp/w266/a3_model/rnnlm_trained
"There is going to be a signficant change in the company next year." : -4.70
"There is going to be a monumental change in the company next year." : -4.70
"There is going to be a amajor change in the company next year." : -4.70
"There is going to be a big change in the company next year." : -4.93
"There is going to be a Major change in the company next year." : -4.98
"There is going to be a huge change in the company next year." : -5.01
"There is going to be a significant change in the company next year." : -5.03
"There is going to be a substantial change in the company next year." : -5.03
"There is going to be a massive change in the company next year." : -5.06
"There is going to be a greatest change in the company next year." : -5.09
"There is going to be a main change in the company next year." : -5.10
"There is going to be