In [25]:
import gensim                     # implements word2vec model infrastructure and provides interfacing APIs 
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.test.utils import get_tmpfile
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline
import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
from tensorflow.examples.tutorials.mnist import input_data
from tensorflow.contrib.tensorboard.plugins import projector
import os
from scipy import spatial
from collections import defaultdict
tf.logging.set_verbosity(tf.logging.ERROR)
import datetime
import timeit 

# NLTK for NLP utils and corpora
import nltk


In [26]:
elmo = hub.Module("https://tfhub.dev/google/elmo/2", trainable=True)

In [27]:
# load pre-trained word2vec model
word2vec_vectors = '../pretrained/GoogleNews-vectors-negative300.bin'
w2v = gensim.models.KeyedVectors.load_word2vec_format(word2vec_vectors, binary=True)

In [47]:
thisWord = 'major'
thisSentence = 'there is going to be a major change in the organization'

In [29]:
# most similar words - by word
n_similar = 20
synonyms = w2v.similar_by_word(thisWord, n_similar)

synonyn_candidate = []
for words in synonyms:
    synonyn_candidate.append(words[0])

print("Most similar {} words (by word) for '{}' by word2vec model:".format(n_similar, thisWord))
print(synonyn_candidate)

Most similar 20 words (by word) for 'major' by word2vec model:
['biggest', 'significant', 'big', 'main', 'key', 'huge', 'signficant', 'amajor', 'largest', 'greatest', 'Major', 'massive', 'minor', 'substantial', 'monumental', 'notable', 'signifcant', 'big_gest', 'MAJOR', 'leading']


In [31]:
sentence_pos_tag = nltk.pos_tag(thisSentence.split(' '))
word_loc = 0
word_pos = ''
for word_idx, word in enumerate(sentence_pos_tag):
    if(word[0] == thisWord):
        word_loc = word_idx
        word_pos = word[1]

print (thisSentence)
print (thisWord,' : ', word_pos)

there is going to be a major change in the organization
major  :  JJ


In [40]:
synonyn_candidate_filtered = []

print (thisSentence)
for candidate in synonyn_candidate:
    newSentence = thisSentence.replace(thisWord, candidate)
    sentence_pos_tag = nltk.pos_tag(newSentence.split(' '))
    print(sentence_pos_tag[word_loc][0], ' : ', sentence_pos_tag[word_loc][1])
    if sentence_pos_tag[word_loc][1] == word_pos:
        synonyn_candidate_filtered.append(sentence_pos_tag[word_loc][0])

print('\nFiltered by POS (count : ',len(synonyn_candidate_filtered),')',synonyn_candidate_filtered)

there is going to be a major change in the organization
biggest  :  JJS
significant  :  JJ
big  :  JJ
main  :  JJ
key  :  JJ
huge  :  JJ
signficant  :  JJ
amajor  :  JJ
largest  :  JJS
greatest  :  JJS
Major  :  JJ
massive  :  JJ
minor  :  JJ
substantial  :  JJ
monumental  :  JJ
notable  :  JJ
signifcant  :  JJ
big_gest  :  JJ
MAJOR  :  NNP
leading  :  JJ

Filtered by POS (count :  16 ) ['significant', 'big', 'main', 'key', 'huge', 'signficant', 'amajor', 'Major', 'massive', 'minor', 'substantial', 'monumental', 'notable', 'signifcant', 'big_gest', 'leading']


In [48]:
sentence_split = thisSentence.split()
embeddings = elmo(sentence_split, signature="default", as_dict=True)["elmo"]

init = tf.initialize_all_variables()
sess = tf.Session()
sess.run(init)
main_embed = sess.run(embeddings)
print(main_embed[word_loc])

distances = []
for idx, word in enumerate(synonyn_candidate_filtered):
    sentence_temp = thisSentence.replace(thisWord, word)
    sentence_split = sentence_temp.split()
    embeddings = elmo(sentence_split, signature="default", as_dict=True)["elmo"]
    
    init = tf.initialize_all_variables()
    sess = tf.Session()
    sess.run(init)
    embed = sess.run(embeddings)
    
    distance = spatial.distance.cosine(main_embed[word_loc], embed[word_loc])
    distances.append(((thisWord, word), distance))
    
    print(word)
    
print(distances)





[[ 0.14566283 -0.061473    0.8619133  ...  0.19311848 -0.06004271
   0.12941864]]
significant
big
main
key
huge
signficant
amajor
Major
massive
minor
substantial
monumental
notable
signifcant
big_gest
leading
[(('major', 'significant'), 0.31172402414365585), (('major', 'big'), 0.3044875228571017), (('major', 'main'), 0.2945632668917523), (('major', 'key'), 0.3255786236565861), (('major', 'huge'), 0.3680712580929485), (('major', 'signficant'), 0.35003877692299257), (('major', 'amajor'), 0.3477929437018048), (('major', 'Major'), 0.3064411874801779), (('major', 'massive'), 0.33748969969266407), (('major', 'minor'), 0.31591775857293414), (('major', 'substantial'), 0.3898059151436981), (('major', 'monumental'), 0.4440521204330301), (('major', 'notable'), 0.4499687931305507), (('major', 'signifcant'), 0.4894332179219061), (('major', 'big_gest'), 0.5712501729401156), (('major', 'leading'), 0.49074622776537846)]
