In [1]:
# file imports

import numpy as np
import matplotlib.pyplot as plt
import gensim
import gensim.downloader
import os
from scipy.optimize import linprog
import nltk


In [None]:
#nltk.download('stopwords')
#nltk.download('punkt')
# nltk.download('wordnet')
from nltk.corpus import stopwords

## To check the performance with normalization later

In [None]:
from nltk.stem import WordNetLemmatizer
  
lemmatizer = WordNetLemmatizer()
  
print("rocks :", lemmatizer.lemmatize("rocks"))
print("corpora :", lemmatizer.lemmatize("corpora"))
  
# a denotes adjective in "pos"
print("better :", lemmatizer.lemmatize("better", pos ="a"))

rocks : rock
corpora : corpus
better : good


In [8]:
import os
dir = os.listdir('../test')
dir

['.ipynb_checkpoints',
 'wmd.ipynb',
 'google300w2v.kv',
 'Miniconda3-latest-Linux-x86_64.sh',
 'google300w2v.kv.vectors.npy',
 'test_first_jupyteronada.ipynb']

## We will check two WMDs, one with normalized word2vecs and one without.

In [44]:
## WMD base code:

def wasserstein_distance(p, q, D):
    """Find Wasserstein distance through linear programming
    p.shape=[m], q.shape=[n], D.shape=[m, n]
    
    suppose doc1 has m words and doc2 has n words, then an mxn array would be formed, 
    having distance of each word in doc1 to that of doc2.
    
    
    
    p.sum()=1, q.sum()=1, p∈[0,1], q∈[0,1]
    """
    A_eq = [] # a list which will later be converted to array after appending.
    for i in range(len(p)):
        A = np.zeros_like(D) # a 2d array made with the shape of D.  
        A[i, :] = 1 # to make summation over "i" of Tij = pi, ie total / sum of outflow
        ## from one word is equal to its pi (normalized bag of word/ frequency/density)
        ## ex : T1,1 + T1,2 + T1,3 + 0 T2,1 + 0 T2,2 + 0 T2,3 = P1 and so on for every i,
        ## ie for each word in the doc1
        
        A_eq.append(A.reshape(-1)) ## reshape(-1) flatens and then appending in A_eq.
        ## A_eq will be (m+n)x(m.n)
    
    for i in range(len(q)):
        A = np.zeros_like(D)
        A[:, i] = 1 ## summation over "j" this time, so this time for different rows, 
        ## over a column "j" which refers to doc2, ie total incoming flow = qj density
        A_eq.append(A.reshape(-1))
    A_eq = np.array(A_eq)
    print(A_eq.shape,A_eq)
    b_eq = np.concatenate([p, q])
    D = D.reshape(-1)
    result = linprog(D, A_eq=A_eq[:-1], b_eq=b_eq[:-1]) ## removing redundant to make 
    ## solution more robust.
    return result.fun, result.x  ## fun returns the final optimized value, x returns each value of xi,j that is the array




def word_mover_distance(x, y):
    """ Reference implementation of WMD (Word Mover's Distance)
    x.shape=[m,d], y.shape=[n,d]
    """
    p = np.ones(x.shape[0]) / x.shape[0]
    q = np.ones(y.shape[0]) / y.shape[0]
    D = np.sqrt(np.square(x[:, None] - y[None, :]).sum(axis=2)) ## programmers sought used mean instead of sum
    #print('D shape:', D.shape)
    print('D : ', D)
    return wasserstein_distance(p, q, D)
 

 


In [2]:
# Download the "word2vec-google-news-300" embeddings
w2v_emb = gensim.downloader.load('word2vec-google-news-300')

In [3]:
from gensim.models import KeyedVectors
w2v_emb.save("google300w2v.kv")

In [4]:
new_model =  KeyedVectors.load('google300w2v.kv', mmap='r')
new_model

<gensim.models.keyedvectors.KeyedVectors at 0x7fd3d6eecd00>

In [7]:
a = np.array(new_model['obama'])
a

array([-1.23535156e-01,  7.22656250e-02,  1.71875000e-01,  4.02343750e-01,
       -1.25976562e-01, -3.02734375e-01, -4.49218750e-02, -1.52587891e-02,
        1.71875000e-01, -4.34570312e-02, -1.32812500e-01, -4.58984375e-01,
       -5.27343750e-01,  1.39648438e-01, -1.23535156e-01,  4.21875000e-01,
       -1.46484375e-01,  4.12597656e-02,  2.99072266e-02, -3.57421875e-01,
        4.44335938e-02, -9.37500000e-02,  4.25781250e-01, -1.54296875e-01,
       -2.29492188e-01,  3.00781250e-01, -4.27734375e-01, -8.00781250e-02,
        3.16406250e-01, -2.02148438e-01,  1.03515625e-01,  2.06054688e-01,
       -3.18359375e-01, -9.08203125e-02, -5.81054688e-02,  9.96093750e-02,
       -2.25585938e-01,  6.29882812e-02,  2.00195312e-01,  1.67968750e-01,
        2.33154297e-02, -2.45117188e-01,  3.67187500e-01,  8.39843750e-02,
        1.21093750e-01,  1.58203125e-01,  3.59375000e-01, -3.32031250e-01,
       -2.59765625e-01, -3.36914062e-02, -1.35742188e-01, -9.03320312e-02,
        1.94335938e-01,  

In [6]:
w2v_emb['obama']

array([-1.23535156e-01,  7.22656250e-02,  1.71875000e-01,  4.02343750e-01,
       -1.25976562e-01, -3.02734375e-01, -4.49218750e-02, -1.52587891e-02,
        1.71875000e-01, -4.34570312e-02, -1.32812500e-01, -4.58984375e-01,
       -5.27343750e-01,  1.39648438e-01, -1.23535156e-01,  4.21875000e-01,
       -1.46484375e-01,  4.12597656e-02,  2.99072266e-02, -3.57421875e-01,
        4.44335938e-02, -9.37500000e-02,  4.25781250e-01, -1.54296875e-01,
       -2.29492188e-01,  3.00781250e-01, -4.27734375e-01, -8.00781250e-02,
        3.16406250e-01, -2.02148438e-01,  1.03515625e-01,  2.06054688e-01,
       -3.18359375e-01, -9.08203125e-02, -5.81054688e-02,  9.96093750e-02,
       -2.25585938e-01,  6.29882812e-02,  2.00195312e-01,  1.67968750e-01,
        2.33154297e-02, -2.45117188e-01,  3.67187500e-01,  8.39843750e-02,
        1.21093750e-01,  1.58203125e-01,  3.59375000e-01, -3.32031250e-01,
       -2.59765625e-01, -3.36914062e-02, -1.35742188e-01, -9.03320312e-02,
        1.94335938e-01,  

In [6]:
from gensim import models

w = models.KeyedVectors.load_word2vec_format(
    'https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz', binary=True)


In [14]:
obama = w['obama']
president = w['president']
eucdist = np.linalg.norm(president-obama)
eucdist

4.782619

In [4]:
obama = norm_w2vemb['Obama']
president = norm_w2vemb['President']

dist = np.linalg.norm(obama - president)
dist

1.1746119

In [38]:
band = w2v_emb['band']
president = w2v_emb['president']

dist22 = np.linalg.norm(band - president)
dist22

3.7117977

In [41]:
band = w2v_emb['Band']
president = w2v_emb['President']

dist22 = np.linalg.norm(band - president)
dist22

4.042637

In [39]:
obama = w2v_emb['obama']
president = w2v_emb['president']

dist11 = np.linalg.norm(obama - president)
dist11

4.782619

In [40]:
obama = w2v_emb['Obama']
president = w2v_emb['President']

dist11 = np.linalg.norm(obama - president)
dist11

3.3538148

In [11]:
xx = np.dot(np.array([0.5,0.5]).T,np.array([0.5,0.5]))
xx

0.5

In [7]:
np.dot(w['obama'].T,w2v_emb['obama'])

18.45192

In [8]:
from scipy import spatial

dataSetI = w['obama']
dataSetII = w2v_emb['obama']
result = 1 - spatial.distance.cosine(dataSetI, dataSetII)

In [9]:
result

1

In [4]:
w2v_emb['obama']

array([-1.23535156e-01,  7.22656250e-02,  1.71875000e-01,  4.02343750e-01,
       -1.25976562e-01, -3.02734375e-01, -4.49218750e-02, -1.52587891e-02,
        1.71875000e-01, -4.34570312e-02, -1.32812500e-01, -4.58984375e-01,
       -5.27343750e-01,  1.39648438e-01, -1.23535156e-01,  4.21875000e-01,
       -1.46484375e-01,  4.12597656e-02,  2.99072266e-02, -3.57421875e-01,
        4.44335938e-02, -9.37500000e-02,  4.25781250e-01, -1.54296875e-01,
       -2.29492188e-01,  3.00781250e-01, -4.27734375e-01, -8.00781250e-02,
        3.16406250e-01, -2.02148438e-01,  1.03515625e-01,  2.06054688e-01,
       -3.18359375e-01, -9.08203125e-02, -5.81054688e-02,  9.96093750e-02,
       -2.25585938e-01,  6.29882812e-02,  2.00195312e-01,  1.67968750e-01,
        2.33154297e-02, -2.45117188e-01,  3.67187500e-01,  8.39843750e-02,
        1.21093750e-01,  1.58203125e-01,  3.59375000e-01, -3.32031250e-01,
       -2.59765625e-01, -3.36914062e-02, -1.35742188e-01, -9.03320312e-02,
        1.94335938e-01,  

In [8]:
w['obama']

array([-1.23535156e-01,  7.22656250e-02,  1.71875000e-01,  4.02343750e-01,
       -1.25976562e-01, -3.02734375e-01, -4.49218750e-02, -1.52587891e-02,
        1.71875000e-01, -4.34570312e-02, -1.32812500e-01, -4.58984375e-01,
       -5.27343750e-01,  1.39648438e-01, -1.23535156e-01,  4.21875000e-01,
       -1.46484375e-01,  4.12597656e-02,  2.99072266e-02, -3.57421875e-01,
        4.44335938e-02, -9.37500000e-02,  4.25781250e-01, -1.54296875e-01,
       -2.29492188e-01,  3.00781250e-01, -4.27734375e-01, -8.00781250e-02,
        3.16406250e-01, -2.02148438e-01,  1.03515625e-01,  2.06054688e-01,
       -3.18359375e-01, -9.08203125e-02, -5.81054688e-02,  9.96093750e-02,
       -2.25585938e-01,  6.29882812e-02,  2.00195312e-01,  1.67968750e-01,
        2.33154297e-02, -2.45117188e-01,  3.67187500e-01,  8.39843750e-02,
        1.21093750e-01,  1.58203125e-01,  3.59375000e-01, -3.32031250e-01,
       -2.59765625e-01, -3.36914062e-02, -1.35742188e-01, -9.03320312e-02,
        1.94335938e-01,  

In [2]:
## normalised embeddings 
norm_w2vemb = gensim.downloader.load('word2vec-google-news-300')
norm_w2vemb.init_sims(replace=True)

  norm_w2vemb.init_sims(replace=True)


In [8]:
type(norm_w2vemb)

gensim.models.keyedvectors.KeyedVectors

In [9]:
norm_w2vemb['president']

array([-5.18716220e-03, -4.61915620e-02,  5.55800907e-02,  1.11254072e-02,
       -8.91910214e-03, -9.42608342e-02,  3.71785760e-02, -5.93355037e-02,
        9.76407006e-02, -6.94751143e-02, -4.16850671e-02,  1.43644493e-02,
       -6.12132102e-02,  2.12415471e-03,  4.71304171e-02,  8.29945952e-02,
        2.40346342e-02,  2.30957810e-02, -4.52527106e-02, -7.54837692e-02,
       -8.93787965e-02,  6.98506534e-02,  1.73687786e-02, -1.14540057e-02,
        4.99469750e-02, -8.41212198e-02, -3.11699156e-02, -6.04621246e-02,
       -5.89599609e-02, -6.38419986e-02, -4.09339853e-02, -4.52996511e-03,
       -9.50119123e-02,  4.99469750e-02,  1.63735941e-01,  6.19642902e-03,
       -1.56788435e-02,  5.74577972e-02, -3.72607232e-04,  5.63311726e-02,
       -2.55367979e-02, -4.05584462e-02,  4.31872346e-02,  6.87240288e-02,
       -7.77370185e-02, -9.01298746e-02, -5.14491387e-02,  3.22965384e-02,
       -7.09772781e-02,  9.68896225e-02, -8.68438929e-03,  1.87770575e-02,
       -2.37529781e-02,  

In [10]:
print("Available models in gensim: ", list(gensim.downloader.info()['models'].keys()), "\n")


Available models in gensim:  ['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis'] 



In [9]:
print("Total number of embeddings : ", len(w2v_emb),"\n")
print("Type of word embeddings :", type(w2v_emb) , "\n" )
print(" Length of each embedding : ", len(w2v_emb["President"]), "\n")

Total number of embeddings :  3000000 

Type of word embeddings : <class 'gensim.models.keyedvectors.KeyedVectors'> 

 Length of each embedding :  300 



## Now we will notice that w2vec will have different embeddings for capital and lower case 
## ie Obama and obama


### And for this reason, before doing wmd, we will lower case, and remove the stop words etc.

In [11]:
w2v_emb["president"]

array([-1.34887695e-02, -1.20117188e-01,  1.44531250e-01,  2.89306641e-02,
       -2.31933594e-02, -2.45117188e-01,  9.66796875e-02, -1.54296875e-01,
        2.53906250e-01, -1.80664062e-01, -1.08398438e-01,  3.73535156e-02,
       -1.59179688e-01,  5.52368164e-03,  1.22558594e-01,  2.15820312e-01,
        6.25000000e-02,  6.00585938e-02, -1.17675781e-01, -1.96289062e-01,
       -2.32421875e-01,  1.81640625e-01,  4.51660156e-02, -2.97851562e-02,
        1.29882812e-01, -2.18750000e-01, -8.10546875e-02, -1.57226562e-01,
       -1.53320312e-01, -1.66015625e-01, -1.06445312e-01, -1.17797852e-02,
       -2.47070312e-01,  1.29882812e-01,  4.25781250e-01,  1.61132812e-02,
       -4.07714844e-02,  1.49414062e-01, -9.68933105e-04,  1.46484375e-01,
       -6.64062500e-02, -1.05468750e-01,  1.12304688e-01,  1.78710938e-01,
       -2.02148438e-01, -2.34375000e-01, -1.33789062e-01,  8.39843750e-02,
       -1.84570312e-01,  2.51953125e-01, -2.25830078e-02,  4.88281250e-02,
       -6.17675781e-02,  

In [11]:
w2v_emb["President"]

array([-0.04321289, -0.01080322,  0.21582031,  0.12402344, -0.02441406,
       -0.26953125, -0.08789062, -0.15527344,  0.11669922, -0.296875  ,
       -0.16503906,  0.08984375, -0.18847656,  0.06787109,  0.26953125,
        0.11132812,  0.02832031,  0.01257324, -0.08447266,  0.00646973,
       -0.15136719,  0.15917969,  0.13769531, -0.09033203,  0.11132812,
       -0.17285156,  0.14941406, -0.39257812, -0.03564453,  0.02062988,
       -0.04248047, -0.20605469, -0.19628906,  0.04907227,  0.30273438,
       -0.07666016,  0.11083984,  0.25976562, -0.06689453,  0.15527344,
       -0.11376953,  0.01159668,  0.18554688,  0.23632812, -0.09179688,
       -0.21582031, -0.21484375, -0.0189209 , -0.23828125,  0.02026367,
       -0.04589844,  0.12011719, -0.08837891,  0.07910156,  0.23828125,
        0.13867188, -0.19433594,  0.00787354,  0.09619141, -0.15234375,
        0.04711914,  0.16796875, -0.07324219,  0.26757812, -0.00427246,
        0.12695312,  0.01782227,  0.0279541 ,  0.265625  ,  0.18

## See example of Obama and obama dot product

In [12]:
np.dot((w2v_emb["Obama"].reshape(-1,1)).T,w2v_emb["obama"].reshape(-1,1))

array([[7.1928496]], dtype=float32)

In [13]:
np.dot((w2v_emb["President"].reshape(-1,1)).T,w2v_emb["president"].reshape(-1,1))

array([[5.258883]], dtype=float32)

## Most similar method provided by gensim model

In [9]:
w2v_emb.most_similar("Facebook", topn=5)

[('social_networking', 0.7837298512458801),
 ('FaceBook', 0.7794208526611328),
 ('Twitter', 0.7665740251541138),
 ('facebook', 0.7563533186912537),
 ('Facebook.com', 0.7391056418418884)]

In [10]:
w2v_emb.most_similar("president", topn=5)

[('President', 0.800627589225769),
 ('chairman', 0.6708744764328003),
 ('vice_president', 0.6700226068496704),
 ('chief_executive', 0.6691274642944336),
 ('CEO', 0.6590125560760498)]

In [17]:
## pre processing:

## The sentences

sent1 = "Obama speaks to the media in Illinois."
sent2 = "The President greets the press in Chicago."
sent3 = "The band gave a concert in Japan."

In [15]:
## creating preprocessing pipeline :

def sentence_preprocess(sentence,lowercase = 1, strip_punctuation = 1, remove_stopwords = 1, removedigit = 1):
    ''' 1 : True, 0 : False : Lowercase, Strip puncutation, Remove Stopwords, removedigit'''

    stop_words = list(stopwords.words('english'))

    if lowercase == 1:
        sentence = sentence.lower()

    sentence_words = nltk.word_tokenize(sentence)

    if strip_punctuation == 1 and removedigit == 1:
        sentence_words = [word for word in sentence_words if word.isalpha()] 
        


    if remove_stopwords == 1:
        sentence_words = [word for word in sentence_words if not word in stop_words]



    return sentence_words

In [20]:
sent1 = sentence_preprocess(sent1)
sent1

sent2 = sentence_preprocess(sent2)
sent2

sent3 = sentence_preprocess(sent3)
sent3

['obama', 'speaks', 'media', 'illinois']

In [18]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

## Now we will find and arrange word2vec embeddings in the p and q matrices. ## We will write functions for that:



In [23]:
def find_embdMatrix(sentence, normalized = False ):
    sent_mtx = []
    if normalized == True:
       embd_model = norm_w2vemb
    else:
        embd_model = w2v_emb

    for word in sentence:
        word_emb = embd_model[word]
        sent_mtx.append(word_emb)
    
    sent_mtx = np.array(sent_mtx).reshape(len(sentence),-1)

    return sent_mtx
    


In [24]:
sent1_mtx = find_embdMatrix(sent1)

sent1_mtx.shape

sent2_mtx = find_embdMatrix(sent2)

sent2_mtx.shape

In [30]:
diss, T = word_mover_distance(sent1_mtx,sent2_mtx)
diss.shape, T.shape

D shape: (4, 4)
D :  [[4.782619  5.5453153 4.6934934 4.0367475]
 [3.7898932 3.4098573 3.376263  4.7602735]
 [3.366701  4.215941  2.1343176 4.4567947]
 [4.357834  4.937704  4.119726  3.169699 ]]
(8, 16) [[1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1.]
 [1. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0.]
 [0. 1. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 1.]]


((), (16,))

In [None]:
(3.3741232173489615,
 array([2.50000000e-01, 3.18531834e-11, 2.14184326e-11, 2.52857048e-11,
        4.70801233e-12, 1.81424091e-11, 5.88324554e-11, 2.50000000e-01,
        6.85027770e-11, 3.21133789e-11, 2.50000000e-01, 2.74871660e-11,
        5.34556353e-12, 2.50000000e-01, 4.78527543e-11, 2.89110126e-11]))

In [31]:
T

array([2.50000000e-01, 3.21134168e-11, 2.74869757e-11, 6.85027734e-11,
       4.78524010e-11, 2.50000000e-01, 2.89107605e-11, 5.34554451e-12,
       5.88326089e-11, 1.81424359e-11, 2.50000000e-01, 4.70800505e-12,
       2.14182972e-11, 3.18531822e-11, 2.52855123e-11, 2.50000000e-01])

In [26]:
diss ## matched from gensim. though it didnt run, but same as its documentation :https://radimrehurek.com/gensim_3.8.3/auto_examples/tutorials/run_wmd.html

3.3741232173489553

In [35]:
sent3_matx = find_embdMatrix(sent3)

In [36]:
diss_23,T_23 = word_mover_distance(sent3_matx,sent2_mtx)
diss_23

D shape: (4, 4)
D :  [[3.711798  4.5337734 3.5188146 4.5288696]
 [3.5704765 4.218369  3.4494445 4.6724405]
 [3.7947109 4.4849935 3.5508246 4.813723 ]
 [4.6758094 5.3404236 4.4132013 4.0542464]]
(8, 16) [[1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1.]
 [1. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0.]
 [0. 1. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 1.]]


3.8838095425961106

In [32]:
T_23

array([2.49999999e-01, 1.65290348e-09, 8.68768224e-10, 3.48500939e-10,
       2.45479045e-09, 2.49999997e-01, 2.60203116e-09, 3.36383301e-10,
       8.32333691e-11, 3.40554976e-09, 2.49999998e-01, 3.28668471e-10,
       3.32148933e-10, 3.34752276e-10, 3.46653327e-10, 2.50000001e-01])

In [32]:
sent4 = "Obama speaks in illinois"
sent4 = sentence_preprocess(sent4)
sent4

['obama', 'speaks', 'illinois']

In [42]:
sent4_mtx = find_embdMatrix(sent4)
sent4_mtx.shape

(3, 300)

In [46]:
diss_42,T_42 = word_mover_distance(sent4_mtx,sent2_mtx)

D :  [[4.782619  5.5453153 4.6934934 4.0367475]
 [3.7898932 3.4098573 3.376263  4.7602735]
 [4.357834  4.937704  4.119726  3.169699 ]]
(7, 12) [[1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 1. 1. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1.]
 [1. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0.]
 [0. 1. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 1.]]


In [48]:
T_42, diss_42

(array([2.50000000e-01, 9.63200057e-14, 8.33333333e-02, 5.08275992e-14,
        2.97859459e-13, 2.50000000e-01, 8.33333333e-02, 1.36859757e-14,
        3.24819324e-15, 5.98003243e-14, 8.33333333e-02, 2.50000000e-01]),
 3.8563340107673683)

In [54]:

from pyemd import emd
distance = w2v_emb.wmdistance(sent1, sent2)
distance

ModuleNotFoundError: No module named 'pyemd'

In [57]:
 D = np.sqrt(np.square(w2v_emb['obama']- w2v_emb['president'].sum))
 D

TypeError: unsupported operand type(s) for -: 'float' and 'builtin_function_or_method'

In [58]:
4.782619/np.sqrt(300)

NameError: name 'sqrt' is not defined