#### Cosine distance

In [1]:
from scipy.spatial import distance

In [2]:
distance.cosine()

TypeError: cosine() missing 2 required positional arguments: 'u' and 'v'

#### Gensim embeding

In [3]:
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec

path = get_tmpfile("word2vec.model")

model = Word2Vec(common_texts, size=100, window=5, min_count=1, workers=4)
model.save("word2vec.model")

In [4]:
from gensim.models import KeyedVectors

In [6]:
!wget -c https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz

--2020-10-29 10:46:35--  https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.114.205
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.114.205|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1647046227 (1.5G) [application/x-gzip]
Saving to: ‘GoogleNews-vectors-negative300.bin.gz’


2020-10-29 10:47:38 (24.8 MB/s) - ‘GoogleNews-vectors-negative300.bin.gz’ saved [1647046227/1647046227]



In [9]:
!gzip -d GoogleNews-vectors-negative300.bin.gz

In [10]:
!mv GoogleNews-vectors-negative300.bin ..

In [11]:
model = KeyedVectors.load_word2vec_format('/home/ubuntu/GoogleNews-vectors-negative300.bin', binary=True)


In [12]:
text_for_analysis = \
    'The most innovative technology lives hand in hand with great design'

In [13]:
vectors = [model[x] for x in text_for_analysis.split(' ')]

In [14]:
vectors[0].shape

(300,)

In [15]:
distance.cosine( model['Harbor'], model['sea'])

0.6884577870368958

In [16]:
distance.cosine( model['Harbor'], model['Barcelona'])

0.8935506716370583

In [17]:
distance.cosine( model['Harbor'], model['ship'])

0.7504943758249283

In [18]:
distance.cosine( model['boat'], model['ship'])

0.38310855627059937

In [19]:
distance.cosine( model['boat'], model['goat'])

0.8759516403079033

In [21]:
for i,vector in enumerate(vectors):
    print(distance.cosine(vector,model['art']), text_for_analysis.split(' ')[i])

1.0291901398450136 The
0.8965152725577354 most
0.7164721190929413 innovative
0.7040139734745026 technology
0.9470963552594185 lives
0.9176338538527489 hand
0.8761060982942581 in
0.9176338538527489 hand
0.9169945791363716 with
0.8278181999921799 great
0.6652568876743317 design


In [39]:
distance.cosine(model['Spain'], model['Germany'])

0.4597814679145813

In [43]:
distance.cosine(model['Spain'], model['France'])

0.3935779929161072

In [41]:
distance.cosine(model['Spain'], model['Portugal'])

0.2779642343521118

In [16]:
len(vectors)

9

In [21]:
vectors[0].shape

(300,)

#### Exercise 3: try context aware embedings with ELMO

In [8]:
!ls /home/ubuntu/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5

elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5


In [23]:
from allennlp.commands.elmo import ElmoEmbedder
elmo = ElmoEmbedder(weight_file\
                        =' /home/ubuntu/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5')

ModuleNotFoundError: No module named 'allennlp'

In [None]:
vectors_elmo = elmo.embed_sentence(
        'I ate an apple for breakfast'.split(" ")
)

In [28]:
" ".join( ["I", "ate", "an", "apple", "for", "breakfast"])

'I ate an apple for breakfast'

In [17]:
tokens = ["I", "ate", "an", "apple", "for", "breakfast"]
vectors_elmo = elmo.embed_sentence(tokens)

In [19]:
vectors_elmo

array([[[-6.79333657e-02, -2.52559334e-01, -1.03589855e-01, ...,
         -8.72765258e-02, -8.70569646e-02, -1.18727833e-02],
        [-7.98218846e-02, -1.86040878e-01,  2.00441986e-01, ...,
          5.48470989e-02,  2.88392484e-01, -1.89526588e-01],
        [-3.64798903e-01, -6.85286894e-02,  5.60156107e-02, ...,
         -4.17151377e-02, -1.18192822e-01, -1.74395293e-02],
        [-3.60579431e-01,  4.20438260e-01,  3.40165555e-01, ...,
          4.02825892e-01,  1.83227465e-01, -5.59873432e-02],
        [ 5.95672391e-02,  7.17996806e-02,  2.72936746e-03, ...,
         -3.90242599e-02,  2.69145537e-02, -3.22407395e-01],
        [-5.85217118e-01, -3.92053604e-01,  3.08201492e-01, ...,
          6.06921196e-01, -2.73451954e-01, -1.35265872e-01]],

       [[-1.77582875e-01, -3.56286913e-01, -2.12286919e-01, ...,
          6.09414279e-02, -2.31909454e-02, -7.69036412e-02],
        [-2.52683073e-01,  2.57170200e-01,  3.14080417e-01, ...,
         -1.77171379e-01, -8.70924369e-02,  8.63923

In [20]:
vectors_elmo.shape

(3, 6, 1024)

In [26]:
vectors_elmo = elmo.embed_sentence(["I", "just", "need", "to", "fix",  "the", "bug", "in", "the", "code", "today"])
target_word_sense_0 = vectors_elmo[2][6] # the bug in the code vector


vectors2 = elmo.embed_sentence(["I", "just", "saw", "a", "bug", "in", "the", "sunny", "forest"])
target_word_sense_1 = vectors2[2][4] # the bug in the forest vector
distance.cosine(target_word_sense_0, target_word_sense_1)

0.3587740659713745

In [33]:
vectors3 = elmo.embed_sentence(["I", "just", "saw", "a", "bug", "in", "the", "code"])
target_word_sense_2 = vectors3[2][4] 
distance.cosine(target_word_sense_0, target_word_sense_2)

0.20674782991409302

In [34]:
distance.cosine(target_word_sense_0, target_word_sense_0)

0.0

In [None]:
import scipy

In [None]:

vectors2 = elmo.embed_sentence(["I", "ate", "a", "carrot", "for", 
"breakfast"])
scipy.spatial.distance.cosine(vectors[2][3], vectors2[2][3])