In [25]:
import gensim                     # implements word2vec model infrastructure and provides interfacing APIs 
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.test.utils import get_tmpfile
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline
import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
from tensorflow.examples.tutorials.mnist import input_data
from tensorflow.contrib.tensorboard.plugins import projector
import os
from scipy import spatial
from collections import defaultdict
tf.logging.set_verbosity(tf.logging.ERROR)
import datetime
import timeit 

# NLTK for NLP utils and corpora
import nltk


In [26]:
elmo = hub.Module("https://tfhub.dev/google/elmo/2", trainable=True)

In [27]:
# load pre-trained word2vec model
word2vec_vectors = '../pretrained/GoogleNews-vectors-negative300.bin'
w2v = gensim.models.KeyedVectors.load_word2vec_format(word2vec_vectors, binary=True)

In [90]:
thisWord = 'delay'
thisSentence = 'i want this work finished without delay'

In [98]:
# most similar words - by word
n_similar = 20
synonyms = w2v.similar_by_word(thisWord, n_similar)

synonyn_candidate = []
for words in synonyms:
    synonyn_candidate.append(words[0])

print("Most similar {} words (by word) for '{}' by word2vec model:".format(n_similar, thisWord))
print(synonyn_candidate)

Most similar 20 words (by word) for 'delay' by word2vec model:
['delayed', 'delays', 'delaying', 'postponement', 'postpone', 'postponing', 'Delays', 'Delaying', 'postponment', 'postponed', 'Delayed', 'stalling', 'Postponing', 'lateness', 'postponements', 'defer', 'inordinate_delay', 'indefinite_postponement', 'Lengthy_delays', 'glitch']


In [92]:
sentence_pos_tag = nltk.pos_tag(thisSentence.split(' '))
word_loc = 0
word_pos = ''
for word_idx, word in enumerate(sentence_pos_tag):
    if(word[0] == thisWord):
        word_loc = word_idx
        word_pos = word[1]

print (thisSentence)
print (thisWord,' : ', word_pos)

i want this work finished without delay
delay  :  NN


In [93]:
synonyn_candidate_filtered = []

print (thisSentence)
for candidate in synonyn_candidate:
    newSentence = thisSentence.replace(thisWord, candidate)
    sentence_pos_tag = nltk.pos_tag(newSentence.split(' '))
    print(sentence_pos_tag[word_loc][0], ' : ', sentence_pos_tag[word_loc][1])
    if sentence_pos_tag[word_loc][1] == word_pos:
        synonyn_candidate_filtered.append(sentence_pos_tag[word_loc][0])

print('\nFiltered by POS (count : ',len(synonyn_candidate_filtered),')',synonyn_candidate_filtered)

i want this work finished without delay
delayed  :  NNS
delays  :  NNS
delaying  :  VBG
postponement  :  NN
postpone  :  NN
postponing  :  VBG
Delays  :  NNS
Delaying  :  VBG
postponment  :  NN
postponed  :  VBN
Delayed  :  NNP
stalling  :  VBG
Postponing  :  VBG
lateness  :  NN
postponements  :  NNS
defer  :  NN
inordinate_delay  :  NN
indefinite_postponement  :  NN
Lengthy_delays  :  NNS
glitch  :  NN

Filtered by POS (count :  8 ) ['postponement', 'postpone', 'postponment', 'lateness', 'defer', 'inordinate_delay', 'indefinite_postponement', 'glitch']


In [95]:
sentence_split = thisSentence.split()
embeddings = elmo(sentence_split, signature="default", as_dict=True)["elmo"]

init = tf.initialize_all_variables()
sess = tf.Session()
sess.run(init)
main_embed = sess.run(embeddings)
print(main_embed[word_loc])

distances = []
for idx, word in enumerate(synonyn_candidate_filtered):
    sentence_temp = thisSentence.replace(thisWord, word)
    sentence_split = sentence_temp.split()
    embeddings = elmo(sentence_split, signature="default", as_dict=True)["elmo"]
    
    init = tf.initialize_all_variables()
    sess = tf.Session()
    sess.run(init)
    embed = sess.run(embeddings)
    
    distance = 1 - spatial.distance.cosine(main_embed[word_loc], embed[word_loc])
    distances.append(((thisWord, word), distance))
    
    print(word)
    
print(distances)

[[-0.7310336   0.6544951   0.88699144 ...  0.00710861  0.25345138
   0.56790507]]
postponement
postpone
postponment
lateness
defer
inordinate_delay
indefinite_postponement
glitch
[(('delay', 'postponement'), 0.7414836193508368), (('delay', 'postpone'), 0.7060910486134683), (('delay', 'postponment'), 0.6268072842975166), (('delay', 'lateness'), 0.5315769360929473), (('delay', 'defer'), 0.6143004897106897), (('delay', 'inordinate_delay'), 0.43277463573228236), (('delay', 'indefinite_postponement'), 0.4909549409076789), (('delay', 'glitch'), 0.5327293626157836)]


In [96]:
distances_sorted = sorted(distances, key=lambda x: x[1], reverse=True)

print (thisSentence)
for dist in distances_sorted:
    print (dist[0][1], ': ', round(dist[1],3))

i want this work finished without delay
postponement :  0.741
postpone :  0.706
postponment :  0.627
defer :  0.614
glitch :  0.533
lateness :  0.532
indefinite_postponement :  0.491
inordinate_delay :  0.433
