In [3]:
import gensim                     # implements word2vec model infrastructure and provides interfacing APIs 
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.test.utils import get_tmpfile
import warnings
warnings.filterwarnings('ignore')

from __future__ import absolute_import
from __future__ import print_function
from __future__ import division

import json, os, re, shutil, sys, time
from importlib import reload
import collections, itertools
import unittest
from IPython.display import display, HTML

# NLTK for NLP utils and corpora
import nltk

# NumPy and TensorFlow
import numpy as np
import tensorflow as tf
assert(tf.__version__.startswith("1."))

# Helper libraries
from rnn.w266_common import utils, vocabulary, tf_embed_viz

# Your code
import rnn.rnnlm as rnnlm;reload(rnnlm)
import rnn.rnnlm_test as rnnlm_test;reload(rnnlm_test)

<module 'rnn.rnnlm_test' from '/Users/jeylee/MIDS/W266/smartThesaurus/Code/rnn/rnnlm_test.py'>

In [4]:
# load pre-trained word2vec model
# The vectors are read from a binary word2vec-format file (binary=True) at a
# path relative to the notebook's working directory. The resulting `w2v`
# object is used by the comparison cells that follow.
word2vec_vectors = '../pretrained/GoogleNews-vectors-negative300.bin'

w2v = gensim.models.KeyedVectors.load_word2vec_format(word2vec_vectors, binary=True)

In [5]:
# load pre-trained GloVe model
# The GloVe text file is first converted by glove2word2vec into a temporary
# word2vec-format file (tmp_file), which is then loaded as KeyedVectors into
# `glove`.
glove_vectors = '../pretrained/glove.twitter.27B.200d.txt'
tmp_file = get_tmpfile("test_word2vec.txt")

# NOTE(review): the conversion is repeated every time this cell is run.
glove2word2vec(glove_input_file=glove_vectors, word2vec_output_file=tmp_file)
glove = gensim.models.KeyedVectors.load_word2vec_format(tmp_file)

In [6]:
# Compare how each embedding model rates two word pairs.
# Note: the cos_dist* variables hold the value returned by .similarity(),
# i.e. a similarity score rather than a distance.
pair1 = ['minor', 'small']
pair2 = ['minor', 'major']

# word2vec: score both pairs, then report them.
cos_dist1_w, cos_dist2_w = (w2v.similarity(*pair) for pair in (pair1, pair2))

print('word2vec cosine similarity of {}: {}'.format(pair1, cos_dist1_w))
print('word2vec cosine similarity of {}: {}'.format(pair2, cos_dist2_w))

# GloVe: same two pairs, same reporting format.
cos_dist1_g, cos_dist2_g = (glove.similarity(*pair) for pair in (pair1, pair2))

print('\nGloVe cosine similarity of {}: {}'.format(pair1, cos_dist1_g))
print('GloVe cosine similarity of {}: {}'.format(pair2, cos_dist2_g))

word2vec cosine similarity of ['minor', 'small']: 0.3416362404823303
word2vec cosine similarity of ['minor', 'major']: 0.47539088129997253

GloVe cosine similarity of ['minor', 'small']: 0.42706066370010376
GloVe cosine similarity of ['minor', 'major']: 0.7037895321846008


In [7]:
# vector representation of the word
# Look up the embedding of pair1[0] in each model, then print its shape and a
# slice of its values. The slice [1:20] covers indices 1 through 19, so the
# element at index 0 is not shown.
vec_pair1_0_w = w2v.get_vector(pair1[0])
print("word2vec Vector embedding dimension: ",vec_pair1_0_w.shape)
print("\nPrinting a subset of the whole vector for the word '{}':".format(pair1[0]))
print(vec_pair1_0_w[1:20])

# Same lookup and report for the GloVe model.
vec_pair1_0_g = glove.get_vector(pair1[0])
print("\nGloVe vector embedding dimension: ",vec_pair1_0_g.shape)
print("\nPrinting a subset of the whole vector for the word '{}':".format(pair1[0]))
print(vec_pair1_0_g[1:20])

word2vec Vector embedding dimension:  (300,)

Printing a subset of the whole vector for the word 'minor':
[ 0.06640625 -0.00228882  0.00402832 -0.28710938 -0.21972656  0.34765625
 -0.00494385 -0.01757812  0.12988281 -0.15917969 -0.15527344 -0.16992188
  0.06933594 -0.14257812 -0.07958984  0.16992188  0.12109375  0.125
 -0.06494141]

GloVe vector embedding dimension:  (200,)

Printing a subset of the whole vector for the word 'minor':
[-0.81608999 -0.10689    -0.53272998 -0.20412    -0.37599     0.12386
 -0.12322    -0.80023998 -0.017576    0.30317    -0.068888   -1.09749997
 -0.56645     0.37650999 -0.46614999 -0.42359    -0.076921   -0.012701
 -0.0067806 ]


In [8]:
# most similar words - by word
# For each embedding model, display the n_similar entries returned by
# similar_by_word for thisWord.
n_similar = 15
thisWord = 'major'

print("Most similar {} words (by word) for '{}' by word2vec model:".format(n_similar, thisWord))
display(w2v.similar_by_word(thisWord, n_similar))
print("\nMost similar {} words (by word) for '{}' by GloVe model:".format(n_similar, thisWord))
display(glove.similar_by_word(thisWord, n_similar))

Most similar 15 words (by word) for 'major' by word2vec model:


[('biggest', 0.6572940349578857),
 ('significant', 0.619140088558197),
 ('big', 0.6057686805725098),
 ('main', 0.5380213260650635),
 ('key', 0.5354758501052856),
 ('huge', 0.5329675674438477),
 ('signficant', 0.5157025456428528),
 ('amajor', 0.49914824962615967),
 ('largest', 0.49542921781539917),
 ('greatest', 0.49444860219955444),
 ('Major', 0.4887048304080963),
 ('massive', 0.4786102771759033),
 ('minor', 0.47539088129997253),
 ('substantial', 0.46729934215545654),
 ('monumental', 0.46554118394851685)]


Most similar 15 words (by word) for 'major' by GloVe model:


[('minor', 0.703789472579956),
 ('huge', 0.6762630939483643),
 ('massive', 0.655586838722229),
 ('big', 0.6330057382583618),
 ('biggest', 0.6215412616729736),
 ('another', 0.6144846081733704),
 ('third', 0.6137520670890808),
 ('any', 0.6084322333335876),
 ('serious', 0.6081491112709045),
 ('issues', 0.6023705005645752),
 ('first', 0.5963584780693054),
 ('having', 0.5878738164901733),
 ('two', 0.5866069197654724),
 ('other', 0.5805503129959106),
 ('many', 0.5805253982543945)]

## RNN

In [10]:
# Re-import rnnlm so that edits made to the module on disk are picked up.
reload(rnnlm)

# NOTE(review): absolute, user-specific path; adjust before running elsewhere.
TF_GRAPHDIR = "/Users/jeylee/mids/W266/tmp/w266/a3_graph"

# Clear old log directory.
shutil.rmtree(TF_GRAPHDIR, ignore_errors=True)

# Construct a model and build its core, training and sampler graphs.
lm = rnnlm.RNNLM(V=10000, H=200, num_layers=2)
lm.BuildCoreGraph()
lm.BuildTrainGraph()
lm.BuildSamplerGraph()

# Attach a summary FileWriter for lm.graph, logging into TF_GRAPHDIR.
summary_writer = tf.summary.FileWriter(TF_GRAPHDIR, lm.graph)

# Reload both modules again, then run the three named test classes from
# rnnlm_test.
reload(rnnlm); reload(rnnlm_test)
utils.run_tests(rnnlm_test, ["TestRNNLMCore", "TestRNNLMTrain", "TestRNNLMSampler"])

test_shapes_embed (rnn.rnnlm_test.TestRNNLMCore) ... ok
test_shapes_output (rnn.rnnlm_test.TestRNNLMCore) ... ok
test_shapes_recurrent (rnn.rnnlm_test.TestRNNLMCore) ... ok
test_shapes_train (rnn.rnnlm_test.TestRNNLMTrain) ... ok
test_shapes_sample (rnn.rnnlm_test.TestRNNLMSampler) ... ok

----------------------------------------------------------------------
Ran 5 tests in 1.963s

OK


In [11]:
def run_epoch(lm, session, batch_iterator,
              train=False, verbose=False,
              tick_s=10, learning_rate=None):
    """Run the model once over every batch and return the mean per-batch loss.

    Args:
      lm: model object exposing input_w_, target_y_, initial_h_, final_h_,
        learning_rate_, use_dropout_ and loss_; when train=True it must also
        expose train_loss_ and train_step_.
      session: object whose run(fetches, feed_dict) evaluates those handles.
      batch_iterator: iterable of (w, y) batches; w must have a `.size`
        attribute (used for the words-per-second report).
      train: if True, fetch lm.train_step_ with every batch, report
        lm.train_loss_ and feed use_dropout=True. Otherwise report lm.loss_,
        feed use_dropout=False and fetch no training op.
      verbose: if True, print a status line whenever at least tick_s seconds
        have passed since the previous one.
      tick_s: minimum number of seconds between status lines.
      learning_rate: value fed to lm.learning_rate_; must not be None.

    Returns:
      The sum of the per-batch loss values divided by the number of batches.

    Raises:
      AssertionError: if learning_rate is None.
      ZeroDivisionError: if batch_iterator yields no batches.
    """
    assert(learning_rate is not None)
    start_time = time.time()
    tick_time = start_time  # for showing status
    total_cost = 0.0  # running sum of the per-batch loss values
    total_batches = 0
    total_words = 0

    # Choose what to fetch. In evaluation mode no training op is fetched at
    # all: creating a placeholder op (e.g. tf.no_op()) here would add a node
    # to whichever graph is the default on every call, and would fail when
    # that graph is not the session's graph.
    if train:
        use_dropout = True
        loss = lm.train_loss_
        ops = [loss, lm.final_h_, lm.train_step_]
    else:
        use_dropout = False  # no dropout at test time
        loss = lm.loss_  # true loss, if train_loss is an approximation
        ops = [loss, lm.final_h_]

    for i, (w, y) in enumerate(batch_iterator):
        # At first batch in epoch, get a clean initial state.
        if i == 0:
            h = session.run(lm.initial_h_, {lm.input_w_: w})

        feed_dict = {
            lm.input_w_: w,
            lm.target_y_: y,
            lm.initial_h_: h,
            lm.learning_rate_: learning_rate,
            lm.use_dropout_: use_dropout
        }
        # The final state of this batch becomes the initial state of the next.
        fetched = session.run(ops, feed_dict=feed_dict)
        cost, h = fetched[0], fetched[1]

        total_cost += cost
        total_batches = i + 1
        total_words += w.size  # w.size = batch_size * max_time

        ##
        # Print average loss-so-far for epoch
        # If using train_loss_, this may be an underestimate.
        if verbose and (time.time() - tick_time >= tick_s):
            avg_cost = total_cost / total_batches
            avg_wps = total_words / (time.time() - start_time)
            print("[batch {:d}]: seen {:d} words at {:.1f} wps, loss = {:.3f}".format(
                i, total_words, avg_wps, avg_cost))
            tick_time = time.time()  # reset time ticker

    return total_cost / total_batches

def score_dataset(lm, session, ids, name="Data"):
    """Evaluate `lm` on `ids`, print the average loss and perplexity, return the loss.

    Batches are produced with batch_size=100 and max_time=100 (larger than
    the training settings, to speed scoring up) and passed through
    run_epoch in evaluation mode (train=False, learning_rate=0.0).
    The perplexity that is printed is exp(average loss).
    """
    batches = utils.rnnlm_batch_generator(ids, batch_size=100, max_time=100)
    avg_loss = run_epoch(lm, session, batches,
                         train=False, learning_rate=0.0,
                         verbose=False, tick_s=3600)
    perplexity = np.exp(avg_loss)
    print("{:s}: avg. loss: {:.03f}  (perplexity: {:.02f})".format(name, avg_loss, perplexity))
    return avg_loss

In [12]:
# Reload the modules, then run the "test_toy_model" case of RunEpochTester.
reload(rnnlm); reload(rnnlm_test)
th = rnnlm_test.RunEpochTester("test_toy_model")
# injectCode passes this notebook's run_epoch and score_dataset to the test case.
th.setUp(); th.injectCode(run_epoch, score_dataset)
unittest.TextTestRunner(verbosity=2).run(th)

test_toy_model (rnn.rnnlm_test.RunEpochTester) ... 

[batch 143]: seen 7200 words at 7186.1 wps, loss = 0.740
[batch 305]: seen 15300 words at 7631.6 wps, loss = 0.474
[batch 469]: seen 23500 words at 7807.0 wps, loss = 0.366
[batch 627]: seen 31400 words at 7825.3 wps, loss = 0.305
[batch 790]: seen 39550 words at 7885.0 wps, loss = 0.263
[batch 930]: seen 46550 words at 7736.6 wps, loss = 0.235
[batch 1062]: seen 53150 words at 7568.6 wps, loss = 0.216
[batch 1216]: seen 60850 words at 7581.0 wps, loss = 0.198
[batch 1353]: seen 67700 words at 7497.8 wps, loss = 0.186
[batch 1487]: seen 74400 words at 7415.3 wps, loss = 0.176
[batch 1627]: seen 81400 words at 7374.7 wps, loss = 0.166
[batch 1729]: seen 86500 words at 7181.9 wps, loss = 0.160
[batch 1855]: seen 92800 words at 7111.0 wps, loss = 0.153
[batch 1963]: seen 98200 words at 6981.7 wps, loss = 0.148
[batch 2032]: seen 101650 words at 6742.3 wps, loss = 0.145
[batch 2117]: seen 105900 words at 6584.8 wps, loss = 0.141
[batch 2261]: seen 113100 words at 6619.7 wps, loss = 0.136
[

ok

----------------------------------------------------------------------
Ran 1 test in 37.111s

OK


<unittest.runner.TextTestResult run=1 errors=0 failures=0>

In [13]:
# Load the dataset
# Loads the "brown" corpus through utils.load_corpus, which returns the
# vocabulary plus the train and test id sequences. V is passed as the
# vocabulary size and split=0.8 as the train fraction.
# NOTE(review): shuffle=42 looks like a shuffle seed — confirm in utils.load_corpus.
V = 10000
vocab, train_ids, test_ids = utils.load_corpus("brown", split=0.8, V=V, shuffle=42)

[nltk_data] Downloading package brown to /Users/jeylee/nltk_data...
[nltk_data]   Package brown is already up-to-date!
Vocabulary: 10,000 types
Loaded 57,340 sentences (1.16119e+06 tokens)
Training set: 45,872 sentences (924,077 tokens)
Test set: 11,468 sentences (237,115 tokens)


In [14]:
# Training parameters
# max_time and batch_size are passed to utils.rnnlm_batch_generator by the
# training cell; learning_rate and num_epochs drive its epoch loop.
max_time = 25
batch_size = 100
learning_rate = 0.01
num_epochs = 10

# Model parameters
# Keyword arguments for rnnlm.RNNLM(**model_params); V is taken from the
# loaded vocabulary so the model matches the corpus.
model_params = dict(V=vocab.size, 
                    H=200, 
                    softmax_ns=200,
                    num_layers=2)

# Checkpoint locations: per-epoch checkpoints use checkpoint_filename, the
# final model uses trained_filename (also read back by load_and_score).
# NOTE(review): absolute, user-specific path; adjust before running elsewhere.
TF_SAVEDIR = "/Users/jeylee/mids/W266/tmp/w266/a3_model"
checkpoint_filename = os.path.join(TF_SAVEDIR, "rnnlm")
trained_filename = os.path.join(TF_SAVEDIR, "rnnlm_trained")

In [162]:
# Train the language model for num_epochs epochs, checkpointing and scoring
# the train and test sets after every epoch, then save the final model.

# Will print status every this many seconds
# NOTE(review): print_interval is not used below; run_epoch is called with
# verbose=False and a literal tick_s=10, so no per-batch status is printed.
print_interval = 5

lm = rnnlm.RNNLM(**model_params)
lm.BuildCoreGraph()
lm.BuildTrainGraph()

# Explicitly add global initializer and variable saver to LM graph
with lm.graph.as_default():
    initializer = tf.global_variables_initializer()
    saver = tf.train.Saver()
    
# Clear old log directory
# This deletes every previously saved checkpoint under TF_SAVEDIR before
# the directory is recreated.
shutil.rmtree(TF_SAVEDIR, ignore_errors=True)
if not os.path.isdir(TF_SAVEDIR):
    os.makedirs(TF_SAVEDIR)

with tf.Session(graph=lm.graph) as session:
    # Seed RNG for repeatability
    # NOTE(review): the graph has already been built at this point; ops
    # created earlier may not pick up a graph-level seed set here — confirm
    # against the tf.set_random_seed documentation.
    tf.set_random_seed(42)

    session.run(initializer)

    for epoch in range(1,num_epochs+1):
        t0_epoch = time.time()
        # A fresh batch generator is created for every epoch.
        bi = utils.rnnlm_batch_generator(train_ids, batch_size, max_time)
        print("[epoch {:d}] Starting epoch {:d}".format(epoch, epoch))
        #### YOUR CODE HERE ####
        # Run a training epoch.
        # The average loss returned by run_epoch is discarded here; losses
        # are reported by the score_dataset calls below.
        
        run_epoch(lm, session, bi, learning_rate=learning_rate, train=True, verbose=False, tick_s=10)
        
        #### END(YOUR CODE) ####
        print("[epoch {:d}] Completed in {:s}".format(epoch, utils.pretty_timedelta(since=t0_epoch)))
    
        # Save a checkpoint
        saver.save(session, checkpoint_filename, global_step=epoch)
    
        ##
        # score_dataset will run a forward pass over the entire dataset
        # and report perplexity scores. This can be slow (around 1/2 to 
        # 1/4 as long as a full epoch), so you may want to comment it out
        # to speed up training on a slow machine. Be sure to run it at the 
        # end to evaluate your score.
        print("[epoch {:d}]".format(epoch), end=" ")
        score_dataset(lm, session, train_ids, name="Train set")
        print("[epoch {:d}]".format(epoch), end=" ")
        score_dataset(lm, session, test_ids, name="Test set")
        print("")
    
    # Save final model
    saver.save(session, trained_filename)

[epoch 1] Starting epoch 1
[epoch 1] Completed in 0:01:30
[epoch 1] Train set: avg. loss: 5.396  (perplexity: 220.55)
[epoch 1] Test set: avg. loss: 5.437  (perplexity: 229.64)

[epoch 2] Starting epoch 2
[epoch 2] Completed in 0:01:29
[epoch 2] Train set: avg. loss: 5.210  (perplexity: 183.05)
[epoch 2] Test set: avg. loss: 5.267  (perplexity: 193.86)

[epoch 3] Starting epoch 3
[epoch 3] Completed in 0:01:31
[epoch 3] Train set: avg. loss: 5.118  (perplexity: 166.93)
[epoch 3] Test set: avg. loss: 5.186  (perplexity: 178.80)

[epoch 4] Starting epoch 4
[epoch 4] Completed in 0:01:32
[epoch 4] Train set: avg. loss: 5.032  (perplexity: 153.30)
[epoch 4] Test set: avg. loss: 5.115  (perplexity: 166.49)

[epoch 5] Starting epoch 5
[epoch 5] Completed in 0:01:32
[epoch 5] Train set: avg. loss: 4.972  (perplexity: 144.37)
[epoch 5] Test set: avg. loss: 5.066  (perplexity: 158.53)

[epoch 6] Starting epoch 6
[epoch 6] Completed in 0:01:33
[epoch 6] Train set: avg. loss: 4.921  (perplexity: 

In [15]:
def score_seq(lm, session, seq, vocab):
    """Score a sequence of words. Returns total log-probability.

    `seq` is a list of tokens. It is wrapped in "<s>" / "</s>" markers,
    canonicalized against the vocabulary and converted to ids. The model is
    then run once over the whole sequence as a single-row batch, and the
    negated value of lm.loss_ is returned.
    """
    tokens = ["<s>"] + seq + ["</s>"]
    canonical = utils.canonicalize_words(tokens, wordset=vocab.word_to_id)
    ids = vocab.words_to_ids(canonical)

    # Inputs are every id but the last; targets are the same ids shifted by one.
    inputs = np.reshape(ids[:-1], [1, -1])
    targets = np.reshape(ids[1:], [1, -1])

    start_state = session.run(lm.initial_h_, {lm.input_w_: inputs})
    feeds = {
        lm.input_w_: inputs,
        lm.target_y_: targets,
        lm.initial_h_: start_state,
        lm.dropout_keep_prob_: 1.0,
    }
    # log(P(seq)) = -1*loss
    loss = session.run(lm.loss_, feeds)
    return -1 * loss

def load_and_score(inputs, sort=True):
    """Load the trained model and score the given words.

    Args:
      inputs: either one sentence given as a list of tokens (str or bytes),
        or a list of such sentences. An empty `inputs` yields an empty
        result without building the graph or touching the checkpoint.
      sort: if True, results are sorted in descending order (highest score
        first); otherwise they keep the input order.

    Returns:
      A list of (score, sentence) tuples, where score is the value returned
      by score_seq and sentence is the tokens joined by single spaces.

    Relies on the notebook-level names model_params, trained_filename and
    vocab being defined before it is called.
    """
    # Nothing to score: return before the expensive model construction and
    # checkpoint restore (inputs[0] below would otherwise raise IndexError).
    if len(inputs) == 0:
        return []

    # A single sentence (flat list of tokens) becomes a one-sentence batch.
    if isinstance(inputs[0], (str, bytes)):
        inputs = [inputs]

    lm = rnnlm.RNNLM(**model_params)
    lm.BuildCoreGraph()

    with lm.graph.as_default():
        saver = tf.train.Saver()

    with tf.Session(graph=lm.graph) as session:
        # Load the trained model
        saver.restore(session, trained_filename)

        # Actually run scoring
        results = [(score_seq(lm, session, words, vocab), " ".join(words))
                   for words in inputs]

    # Sort if requested
    if sort:
        results = sorted(results, reverse=True)

    return results

In [98]:
simlex_synonyms = [['child','a man as simple as a child','youngster'],['author','The author shows promise of better things','writer'],['book','a freak copy of a book','work'],['dictionary','The dictionary is interleaved with a sheet of blank paper','wordbook'],['champion','Last years champion gained the lead in the race and won it','winner'],['air','How many times a day should we air our ad','wind'],['victory','The spiritual strength is just the motive power of victory','win'],['bath','Do you actually share the bath with other people','wash'],['roam','A user of the selected mail system may not roam','wander'],['crucial','The speed of the internet is crucial in your job search','vital'],['essential','Sleep and good food are essential to health','vital'],['winner','I do not know who the winner was','victor'],['car','I am looking for a used car','vehicle'],['disappear','Memories do not disappear They lap over each other','vanish'],['sick','He stayed and helped his sick friend all night','unwell'],['exotic','The tree right in front of us looks exotic','unusual'],['fragile','She looks soft but she is not fragile','unstable'],['restless','He was strangely restless at that time','unsettled'],['illegal','Do not ask him to do something illegal','unlawful'],['stupid','It would be really stupid for me to do this','unintelligent'],['sad','Do not be sad Life is like that','unhappy'],['rough','A car is bouncing along the rough road','uneven'],['wisdom','After 18 years of age wisdom teeth begin to emerge','understanding'],['comprehend','They did not comprehend the significance of his remark','understand'],['student','One student split on his friend','undergraduate'],['simple','a man as simple as a child','uncomplicated'],['pipe','I do not know how to dance to her pipe','tube'],['fact','As a matter of fact my mother wants to sell her car','truth'],['honest','I judge him to be a very honest man','trustworthy'],['belief','I was confirmed in my belief with the lapse of 
time','trust'],['confidence','Self confidence is the most important key to success','trust'],['succeed','I hope I shall succeed this time','triumph'],['journey','The day when I start for my journey is drawing near','trip'],['business','He works as a business consultant','trade'],['hard','I am hard at work on my next presentation','tough'],['finger','People pointed the finger at the mayor','touch'],['date','What is the date today over there','time'],['wood','A drill is lying on top of some wood','timber'],['attention','A loud noise diverted everyones attention from their work','thinking'],['narrow','No one accepted his narrow minded political views','thin'],['word','Word has it that they are not together anymore','term'],['muscle','They tend to lose some muscle mass every year','tendon'],['storm','A storm hit the country and killed a lot of people','tempest'],['inform','Why did not you inform me of this','tell'],['cab','Most people often have to share a cab with other passengers','taxi'],['assignment','You must hand in your assignment on time','task'],['discussion','After a long discussion they finally could abide the issue','talk'],['speak','The dog is so to speak a member of the family','talk'],['story','Let me hear the story or Tell me the story','tale'],['steal','Can I steal a minute of your time','take'],['nail','I want to nail it down','tack'],['lover','a lover of books','sweetheart'],['certain','You can be certain I will never do it again','sure'],['agony','The man cries in great agony','suffering'],['pain','She was in a lot of pain','suffering'],['log','People are sitting there like bumps on a log','stump'],['pupil','Each pupil has his own desk','student'],['anatomy','Define the anatomy to be imaged','structure'],['river','The man is fishing by the river','stream'],['bizarre','She died under bizarre circumstances no one knows how','strange'],['weird','He is charming and weird at the same time','strange'],['belly','I do not want a beer belly like my 
father','stomach'],['rod','This steel rod is straight as a back leg of a dog','stick'],['root','Lets eliminate the problem root and branch','stem'],['remain','Only the really bad people will decide to remain criminals','stay'],['politician','The politician nixed out one day','statesman'],['condition','the circumstances which condition our lives','state'],['declare','I wish to declare that I am certain of success','state'],['say','I suppose but they say it is very expensive','state'],['begin','After 18 years of age wisdom teeth begin to emerge','start'],['groom','The father is handing her to the groom','stableman'],['wife','Bring your wife and children too','spouse'],['ball','You can hear a description of the ball game on the radio','sphere'],['wide','A wide prospect burst upon my view','spacious'],['noise','The elevator stopped working with a loud noise','sound'],['army','the effective strength of an army','soldiers'],['nose','His guilt is as plain as the nose on his face','snout'],['tiny','Tiny plants float on the water and are a food source','small'],['kill','I will kill him five times before he hits the ground','slay'],['head','I worked in the head office of Bloomberg for seven years','skull'],['circumstance','They are friends that treat each other without circumstance','situation'],['easy','No we are still working on it It is not an easy task','simple'],['illness','His absence from work was because of his illness','sickness'],['beach','The man and woman are walking on a beach','shore'],['vessel','This vessel holds a lot of water','ship'],['harsh','She was harsh to her children','severe'],['formal','Use a colon after the salutation of a formal letter','serious'],['appointment','What time is my next appointment','selection'],['apparent','It is apparent that he wrote the letter himself','seeming'],['chapter','the final chapter of a book','section'],['afraid','The man is afraid of the mouse','scared'],['read','I usually read books but I watch TV at other 
times','scan'],['moon','Is it the sun or the moon','satellite'],['greet','She did not want to greet him so she looked the other way','salute'],['unhappy','He thinks the people will be unhappy in the future','sad'],['king','The King holds dominion over the people of his nation','ruler'],['prince','The crown prince is in training for becoming king one day','ruler'],['princess','Now be happy as a beautiful princess night and day','ruler'],['destroy','You want to destroy yourself you do it on your own','ruin'],['destruction','The pomps and vanities will bring the destruction','ruin'],['course','She led a one week course last year','route'],['cabin','Emergency exits are located on both sides of the cabin','room'],['sly','His lips were spread in a sly smile','roguish'],['street','The main street is busy all the times of the year','road'],['wealth','There are a lot of people who gained sudden wealth','riches'],['save','Save it for a rainy day','rescue'],['withdraw','They threatened to withdraw from the talks','remove'],['area','The farmers in this area are very busy at this time of year','region'],['advise','I advise you to leave the student problems severely alone','recommend'],['accept','Kindly accept a copy of my work just out','receive'],['reject','This scheme is so diabolical that I must reject it','rebuff'],['argument','He carefully dealt with a very sensitive argument','reason'],['logic','There is a jump in the logic of his opinion','reason'],['accomplish','I would accomplish so much more that way','realize'],['choice','The college offers a wide choice of courses','range'],['rain','Rain in Christmas time is visit of an angel','rainfall'],['anger','She drives like Jeho in a fit of anger','rage'],['fast','Driving too fast puts people in danger','quick'],['argue','I argue with my brother all the time','quarrel'],['heroine','Her reputation as a heroine grew and grew','protagonist'],['make','make a thing a matter of conscience','produce'],['jail','After all he landed up 
in jail','prison'],['code','Please reenter your personal code number','principles'],['leader','The Khmer Rouge leader Pol Pot died eight years ago','principal'],['keep','He wanted to keep out of it','prevent'],['assume','The matter threatens to assume serious proportions','presume'],['bias','Cultural bias has many problems that must be solved','prejudice'],['strong','Competition for such jobs will be strong','powerful'],['purse','Oh no I think I left my purse behind','pouch'],['jar','Collect change in a jar for a whole month','pot'],['job','We buy our supplies in job lots','position'],['situation','It looks like you are in a very tough situation','position'],['lake','measure the magnitude of a lake','pond'],['abundance','North America is a land of abundance','plenty'],['happy','No family can be happy without harmony among its members','pleased'],['delightful','a delightful situation for a house','pleasant'],['nice','It is nice but I do not want one','pleasant'],['portray','Ueno will portray a substitute high school teacher','play'],['plate','The woman is putting food on her plate','platter'],['arrange','Did you arrange those circus tickets for next week','plan'],['put','It is time to put a period to the matter','place'],['choose','I had to choose only one person among those people','pick'],['body','the economy of the human body','physique'],['doctor','There is a doctor on call 24 hours a day','physician'],['telephone','A person calls you on the telephone','phone'],['people','the numerous voice of the people','persons'],['actor','The actor is on the bill','performer'],['do','When do you show the movie','perform'],['give','give a person a fair hearing','perform'],['mountain','The people are on top of a mountain','peak'],['top','The people are on top of a mountain','peak'],['gut','I thought I would bust a gut laughing','paunch'],['game','So exciting to watch the soccer game with my friends','pastime'],['aisle','A man and beautiful woman walked down the 
aisle','passageway'],['alley','The troops passed into their alley','passage'],['celebration','I think this calls for a celebration','party'],['wall','The car dashed into a wall','partition'],['atom','He has not an ounce an atom of conscience','particle'],['molecule','They join together into one giant molecule','particle'],['limb','He lost his leg in an accident and wears an artificial limb','part'],['hand','The gentleman offered his hand to the lady','palm'],['couple','He has been with this company only for a couple of years','pair'],['adversary','A boxer sprang at his adversary','opponent'],['attitude','I perceived a slight change in his attitude','opinion'],['door','I felt my way to the door','opening'],['crime','He was caught as a party to the crime','offence'],['strange','Fate works in a strange way','odd'],['sea','I worked my way up along the East Sea from there','ocean'],['noticeable','This threat had a noticeable effect on the crowd','obvious'],['idea','print an idea on the mind','notion'],['north','a house with a north aspect','northern'],['racket','Hold the racket so like this','noise'],['sunset','The view of the sunset from here is out of sight','nightfall'],['paper','There is no use in trying to paper over your faults','newspaper'],['recent','His novels have gained in popularity over recent years','new'],['forget','Do not forget that I am always by you','neglect'],['denial','My father is in serious denial about it','negation'],['necessary','Oh I did not know that was necessary','needed'],['require','Highland games only require sports skills','need'],['horse','The match was horse and horse','nag'],['crowd','He pushed his way through a crowd','multitude'],['cup','The World Cup amused the people across the country','mug'],['go','That hat doesn not go very well with the dress','move'],['hill','There is a station right over the hill','mount'],['second','This reader is too difficult for second year grade pupils','moment'],['new','The Chinese New Year is a 15 
day holiday','modern'],['modest','You really should not be so modest all the time','moderate'],['cloud','a cloud of war','mist'],['strength','the effective strength of an army','might'],['formula','“Very truly yours” is a formula used in letters','method'],['way','We need it in a kind of way','method'],['letter','How many days of the week start with the letter T','message'],['deserve','He doee not deserve to be told off','merit'],['dinner','Serving breakfast lunch and dinner seven days a week','meal'],['learn','The important thing is to learn from my mistakes','master'],['target','Please enter the distinguished name of the target OU','mark'],['polite','He is generous and you know very polite','mannerly'],['style','You can customize the style and wording of the product name','manner'],['administration','There is a thorough change in the administration','management'],['guy','Do not trust that guy man I think he is a narc','man'],['man','a man as simple as a child','male'],['creator','No one but the Creator understands their internal logic','maker'],['insane','His family sent their insane relative to live in a madhouse','mad'],['oil','Oil prices reached their peak last year','lubricate'],['laden','Osama bin Laden was the world most wanted terrorist','loaded'],['water','A man urged on people the need of water','liquid'],['mouth','My mouth felt as dry as a bone or My mouth felt parched','lips'],['leg','That is a stiff price or It costs me an arm and a leg','limb'],['blood','I need to let blood to test your immune system','lifeblood'],['reduce','The new technique will reduce the cost of production','lessen'],['captain','He was promoted to captain because he worked fine','leader'],['anarchy','At that time all was anarchy in China','lawlessness'],['big','The boy from the country looked gawky in the big city','large'],['great','His visit of state was a great event','large'],['tongue','The boy stuck his tongue in his cheek before his friends','language'],['woman','The woman 
is looking up to her friend','lady'],['boy','The man and boy are tuning the guitar','lad'],['sharp','The streets are very dangerous Look sharp','keen'],['trick','I will never miss a trick','joke'],['elbow','He busted up his elbow playing tennis','joint'],['task','No We are still working on it It is not an easy task','job'],['danger','He are in danger of losing his family members','jeopardy'],['island','He was landed on a lonely island','isle'],['intelligence','credit a person with rare intelligence','intellect'],['teacher','Do not talk back to your teacher','instructor'],['encourage','Bush would encourage added supplies in many ways','inspire'],['mad','It was a mad house in my office today','insane'],['ask','I wanted to ask you something','inquire'],['population','The population in this neighborhood increases day by day','inhabitants'],['multiply','Some bacteria multiply by cell division','increase'],['cheek','The boy stuck his tongue in his cheek before his friends','impudence'],['value','People do not know the value of health till they lose it','importance'],['beg','Do not be like this please I beg you','implore'],['reflection','The girl was lost in reflection','image'],['vision','He is a man of broad vision','image'],['disease','He is way ahead of me in disease research','illness'],['dog','A wolf is a member of the dog family','hound'],['aggression','I was able to hit with precision and aggression today','hostility'],['house','The school house is spick and span inside and out','home'],['container','He is pouring water from the container','holder'],['sky','The storm blew the house sky high','heavens'],['trial','It is just a process of trial and error','hearing'],['listen','You know what happens when you do not listen','hear'],['chief','the chief attraction of the day','head'],['bad','Be careful not to use bad language to people','harmful'],['difficult','This reader is too difficult for second year grade pupils','hard'],['hound','Defend justice as a hound of 
law','harass'],['occur','Since the Earth is rotating two tides occur each day','happen'],['cancer','The US death rate from cancer increased last year','growth'],['floor','Tony spilled the water on the floor','ground'],['take','You should not take too much exercise','grip'],['sorrow','Time blunts the edge of sorrow','grief'],['seed','After the seed fell on good soil it yields plenty of fruit','grain'],['monster','Another game shows a monster eating a person','giant'],['acquire','I hope the rumor does not acquire currency','get'],['receive','Others do not receive wages for several months at a time','get'],['lady','The gentleman offered his hand to the lady','gentlewoman'],['collect','Collect change in a jar for a whole month','gather'],['boundary','The neighbors had a violent dispute on the boundary','frontier'],['hysteria','Wherever he appeared he excited hysteria','frenzy'],['bread','This food is the best thing since sliced bread','food'],['diet','Using diet pills is not a safe way to lose weight','food'],['meat','The price of meat is high','food'],['page','Please turn to page 5 in your copy of the report','folio'],['elastic','When stretched a rubber band produces an elastic force','flexible'],['navy','The US Navy played a vital role in World War II','fleet'],['apartment','My home is an apartment in Manhattan','flat'],['gun','Shortly afterward a burst of machine gun fire was heard','firearm'],['locate','I can locate the lost car','find'],['dirty','Dirty oil can hurt an engine','filthy'],['movie','When do you show the movie','film'],['battle','The first blow is half the battle','fight'],['bring','Bring your wife and children too','fetch'],['cat','The cat slept for two hours','feline'],['pretend','I do not like people who pretend to be altruistic','feign'],['emotion','Music is a tool to express emotion','feeling'],['article','Newspaper reported correction article under fire','feature'],['bold','If I may be so bold as to say','fearless'],['quick','I grabbed a quick 
bite for lunch','fast'],['sense','The music fits the sense of the words like a glove','faculty'],['reality','The cooperative is a working reality','fact'],['eye','It is Friday and time for Eye on Hollywood','eyeball'],['phrase','The journalist turned a phrase for the journal','expression'],['milk','He drank a glass of milk','exploit'],['clarify','Identify clarify and extend Explicit Needs','explain'],['forgive','I will forgive you this time but do not let it happen again','excuse'],['fever','I need something to bring down the fever','excitement'],['terrific','Ladies and gentleman we have a terrific sale today','excellent'],['wonderful','Well whomever I am just glad I had such a wonderful meal','excellent'],['inspect','Do not expect what you do not inspect','examine'],['investigate','Mr Lagos says that his government may also investigate','examine'],['proof','People say the proof of the pudding is in the eating','evidence'],['band','It is a very family kind of band','ensemble'],['huge','I need you to do me a huge favor','enormous'],['worker','The worker is using a shovel','employee'],['heart','I love Tom in my heart','emotions'],['long','over a long term of years','elongated'],['comfort','People like to live in comfort','ease'],['world','The World Cup amused the people across the country','earth'],['death','What was the profession of him at the time of his death','dying'],['home','He beat his brother home from school','dwelling'],['evening','If we leave Friday evening we can come back Sunday evening','dusk'],['dreary','It was a dreary day cold and without sunshine','dull'],['physician','Social worker Physician Other service provider','doctor'],['different','The velocity of sound in air and water is different','dissimilar'],['send','How many copies do we need to send','dispatch'],['find','We find most of our people through employment agencies','discover'],['science','I felt interested in studying science','discipline'],['vanish','Should I just vanish to some unknown 
place','disappear'],['mud','Your coat is covered with mud','dirt'],['size','There are a lot of small apples of about the same size','dimensions'],['despair','drive a person to despair','despondency'],['joy','It was the time of year for joy','delight'],['please','O you may do as you please','delight'],['god','Do you ever pray to God','deity'],['overcome','I think we have to overcome the pollution problem first','defeat'],['depth','It is a question beyond my depth','deepness'],['action','Please refrain your action without system','deed'],['shrink','I do not shrink from this responsibility','decrease'],['deck','There was a dog below deck','decorate'],['conclude','Did they conclude upon an arrangement with each other','decide'],['night','I have night duty several times a month','darkness'],['father','My father reached the age of sixty this year','daddy'],['loop','The news blows my mind or The news knocks me for a loop','curve'],['modern','I stopped listening to modern music','current'],['mob','The police soon reduced the mob to order','crowd'],['impatient','Do not fuss about it so much or Do not be so impatient','cross'],['animal','Do you know what year of the animal you were born','creature'],['accident','The last thing I need is an accident','crash'],['cattle','The farmer is leading the cattle','cows'],['shelter','They sought shelter at my house','cover'],['sofa','How much does this sofa cost','couch'],['right','Drinking is all right as long as you do not do it to excess','correct'],['imitate','Speak naturally; do not try to imitate some actor','copy'],['vehicle','What is the license number of the vehicle','conveyance'],['carry','I always carry my important papers on my person','convey'],['compare','Shall I compare thee to a summers day','contrast'],['satisfy','Study hard to satisfy the examiners','content'],['pollution','Pollution will be a very big problem in the future','contamination'],['box','I want to send this box by third class mail','container'],['pot','Boil 
a lot of water in a pot','container'],['communicate','I use music to communicate with them','contact'],['communication','Computers are widely used as communication tools','contact'],['contemplate','All day he did nothing but contemplate','consider'],['gather','You can gather a lot if you scrape the barrel','congregate'],['agree','I am afraid I do not agree with that','concur'],['understand','I understand you re leaving us at the end of the month','comprehend'],['fraternity','The fraternity may bid five new men','companionship'],['friend','One student split on his friend','companion'],['arrive','Be sure you arrive at work on time every morning','come'],['tower','They are timing a race up the tower','column'],['clothes','His clothes are covered with crud from working on his car','clothing'],['priest','The priest talked about love for one another in his homily','clergyman'],['baby','An extra member was added to his family; he adopted a baby','child'],['smart','Buying a clunker like that was not such a smart thing to do','chic'],['verify','Use control charts to verify stability','check'],['inexpensive','Maybe we will find some really inexpensive clothes','cheap'],['fee','I put into any money to pay for your tuition fee','charge'],['tax','A heavy commodity tax is levied on gasoline','charge'],['room','A group of people packed in to the room','chamber'],['confident','Do not be too confident of yourself','certain'],['hole','The carpenter bored out a hole through a thick board','cavity'],['create','Huge fans are used to create strong winds','cause'],['money','Every moment is precious or Time is money','cash'],['marijuana','We can smoke marijuana any time in Vietnam','cannabis'],['cottage','Love in a cottage is also included in a happy life','cabin'],['corporation','What is true of the Deebay Shippin Corporation','business'],['insect','It eats the insect at a later time','bug'],['violent','Do not ever resort to violent means','brutal'],['deliver','Sports can also deliver 
the addictive qualities of a drug','bring'],['short','Two years is not a short period','brief'],['nerve','Eggs are helpful for people who have nerve damage','bravery'],['employer','The employer eats his workers alive','boss'],['dull','The party was a dull affair','boring'],['flower','Youth is the flower of life or Youth is a treasure','bloom'],['large','We are currently working on a large scale project','big'],['curve','He used to fog it in but now he focuses on a curve ball','bend'],['stomach','You cannot work on an empty stomach','belly'],['think','I am afraid not or I do not think so','believe'],['opinion','That is a matter of opinion','belief'],['presence','What could be considered evidence of a spirits presence','being'],['bed','There is a sick bed within the enclosure of this school','bedstead'],['get','The costs of college get higher every year','become'],['rhythm','The rhythm of the music quickens','beat'],['shoulder','lay hand upon a shoulder','bear'],['ray','There is still a ray of hope for his recovery','beam'],['shore','I lived near the shore last year','beach'],['bowl','Fill the second bowl with warm water','basin'],['song','Her first number was a song from a popular musical','ballad'],['medium','Kim stroke a happy medium at the debate competition','average'],['beauty','That car is a real beauty','attractiveness'],['beautiful','Boston is so beautiful this time of year','attractive'],['effort','The game needs some effort to master','attempt'],['try','Lets try the same time but next week Okay','attempt'],['club','expel a member from a club','association'],['helper','Tells the helper application to quit','assistant'],['appoint','I will ask the emperor to appoint you in my place','assign'],['guilty','Do not decide whether he is innocent or guilty yet','ashamed'],['organize','It is a mess you need to organize your ideas better','arrange'],['come','Not anymore Two more people want to come','approach'],['machine','The man left his machine in the 
street','appliance'],['plead','plead with a creditor for a longer time','appeal'],['proclaim','Proclaim liberty throughout all the land','announce'],['beast','A horse was used as a beast of draft','animal'],['fun','The party is a lot of fun','amusement'],['friendly','I am on friendly terms with him','amiable'],['liquor','It is not good to mix different kinds of liquor','alcohol'],['bubble','Look at the table in the bubble','air ball'],['goal','I feel my way around achieving my goal','aim'],['pact','The peace pact brought the war to an end','agreement'],['old','They spend more time online than 17 year old boys do','aged'],['era','The Jiang era does not begin in normal times','age'],['attach','I attach no importance to what he says','affix'],['determine','How burglars determine the best time to rob a home','affect'],['plane','Their plane will arrive first thing in the morning','aeroplane'],['acknowledge','Please acknowledge receipt of this document by signing','admit'],['fresh','As usual she looked fresh and full of energy','additional'],['activity','The office was a beehive of activity','action'],['behave','Do not behave like a hog','act'],['achieve','There is no easy way to achieve the goal','accomplish'],['school','He was a year ahead of me in school','academy'],['capability','I am sorry but this work is above my capability','ability'],['competence','The police has competence over that state','ability'],['achieve','There is no easy way to achieve aim','accomplish'],['school','He was a year ahead of me in school','academy'],['capability','I am sorry but this work is above my capability','ability'],['competence','The police has competence over that state','ability']]
# Sanity check: number of [target, sentence, gold synonym] triples in the evaluation set.
# NOTE(review): the final four triples of the list (achieve, school, capability,
# competence) repeat the four just before them (the 'achieve' sentence differs
# slightly, the other three are exact copies), so those items are counted twice.
print(len(simlex_synonyms))

427


In [97]:
# Suppress TensorFlow's default INFO logging so the progress dots stay readable.
tf.logging.set_verbosity(tf.logging.ERROR)
n_candidate = 20  # nearest neighbours requested per target word; also the miss penalty


def _embedding_average_rank(model, label, examples, topn):
    """Return the mean rank of the gold synonym among a model's nearest neighbours.

    Args:
        model: object exposing ``similar_by_word(word, topn=...)`` whose results
            carry the neighbour word at index 0 (here: the gensim vectors
            ``w2v`` / ``glove``).
        label: text printed once before the per-example progress dots.
        examples: sequence of entries where index 0 is the target word and
            index 2 is the gold synonym (index 1, the sentence, is unused here).
        topn: number of neighbours to request. An example whose gold synonym is
            not among them is charged a rank of ``topn``; this replaces the
            previously hard-coded 20 so the penalty follows ``n_candidate``.

    Returns:
        Sum of per-example ranks divided by ``len(examples)`` (lower is better).
    """
    print(label, end='')
    rank_total = 0
    for example in examples:
        print('.', end='')
        target, gold = example[0], example[2]
        neighbours = model.similar_by_word(target, topn=topn)
        # 1-based position of the gold synonym, or the miss penalty if absent.
        rank_total += next(
            (position
             for position, neighbour in enumerate(neighbours, start=1)
             if neighbour[0] == gold),
            topn,
        )
    return rank_total / len(examples)


avg_rank_word2vec = _embedding_average_rank(w2v, "word2vec ", simlex_synonyms, n_candidate)
avg_rank_glove = _embedding_average_rank(glove, "gloVe ", simlex_synonyms, n_candidate)

def _rnn_pos_average_rank(model, label, examples, topn):
    """Mean gold-synonym rank after RNN re-scoring and part-of-speech filtering.

    For every example the target word is swapped for each of the model's
    ``topn`` nearest neighbours, the resulting sentences are re-ordered by
    ``load_and_score(..., sort=True)``, and candidates whose POS tag at the
    target position differs from the target's own tag are dropped. The rank is
    the 1-based position of the gold synonym among the surviving candidates.

    Changes relative to the previous inline loops:
      * A POS-mismatched candidate is now skipped outright. Before, the counter
        was decremented *before* the gold check, so a gold synonym with a
        mismatched tag was credited with a rank one too low (0 when it came
        first). Such a gold synonym is now a miss, because the filter removed it.
      * Substitution is done per token instead of ``str.replace`` so the target
        is no longer rewritten when it merely occurs inside a longer word.
      * The miss penalty is ``topn`` rather than a hard-coded 20.
      * Examples whose target is not a token of the sentence are scored as a
        miss directly instead of re-scoring ``topn`` unchanged sentences.

    Args:
        model: object exposing ``similar_by_word(word, topn=...)`` whose results
            carry the neighbour word at index 0.
        label: text printed once before the per-example progress dots.
        examples: sequence of entries with target word at index 0, context
            sentence at index 1 and gold synonym at index 2.
        topn: number of neighbours to request; also the rank charged on a miss.

    Returns:
        Sum of per-example ranks divided by ``len(examples)`` (lower is better).
    """
    print(label, end='')
    rank_total = 0
    for example in examples:
        print('.', end='')
        target, sentence, gold = example[0], example[1], example[2]
        tokens = sentence.split(' ')

        # Locate the target and its POS tag; the last occurrence wins, as before.
        # NOTE(review): matching is case-sensitive, so a target that only appears
        # capitalised (e.g. 'collect' in 'Collect change in a jar ...') is never
        # located and the example is scored as a miss.
        target_loc, target_pos = None, None
        for token_idx, (token, tag) in enumerate(nltk.pos_tag(tokens)):
            if token == target:
                target_loc, target_pos = token_idx, tag

        rank = topn  # miss penalty unless the gold synonym survives below
        if target_loc is not None:
            neighbours = model.similar_by_word(target, topn=topn)
            # One sentence per neighbour, replacing whole tokens only.
            variants = [
                ' '.join(neighbour[0] if token == target else token for token in tokens)
                for neighbour in neighbours
            ]
            # Assumes load_and_score returns entries whose index 1 is the sentence
            # string, ordered best-first when sort=True - TODO confirm against its
            # definition; this is how the previous code consumed the result.
            rescored = load_and_score([variant.split() for variant in variants], sort=True)

            position = 1
            for entry in rescored:
                candidate, candidate_pos = nltk.pos_tag(entry[1].split(' '))[target_loc]
                if candidate_pos != target_pos:
                    continue  # filtered out: does not occupy a rank position
                if candidate == gold:
                    rank = position
                    break
                position += 1

        rank_total += rank
    return rank_total / len(examples)


avg_rank_word2vec_rnn = _rnn_pos_average_rank(
    w2v, "word2vec + rnn + pos ", simlex_synonyms, n_candidate)
avg_rank_glove_rnn = _rnn_pos_average_rank(
    glove, "glove + rnn + pos ", simlex_synonyms, n_candidate)


# Report each system's mean gold-synonym rank, rounded to three decimals.
rank_report = (
    ("Average Rank for word2Vec:", avg_rank_word2vec),
    ("Average Rank for glove:", avg_rank_glove),
    ("Average Rank for word2vec + rnn:", avg_rank_word2vec_rnn),
    ("Average Rank for glove + rnn:", avg_rank_glove_rnn),
)
for heading, mean_rank in rank_report:
    print(heading, round(mean_rank, 3))

word2vec ...........................................................................................................................................................................................................................................................................................................................................................................................................................................gloVe ...........................................................................................................................................................................................................................................................................................................................................................................................................................................word2vec + rnn + pos ..............................................................................................................