## Here we first load the GloVe vectors into a dictionary - `embeddings_index`
`embeddings_index['banana']` gives the 100-dimensional vector for the word `'banana'`.

The variable `GLOVE_DIR` points to the directory that contains the text file with the vectors (`glove.6B.100d.txt`). If you do not have it yet, the archive can be downloaded from http://nlp.stanford.edu/data/glove.6B.zip and unpacked on disk.

In [2]:
import os
import numpy as np
# Directory that holds the GloVe text files. Defaults to the original local
# path; set the GLOVE_DIR environment variable to use a different location
# without editing the notebook.
GLOVE_DIR = os.environ.get('GLOVE_DIR', '/home/harshit/Documents/glove.6B/')

print('Indexing word vectors.')
# Maps each word (str) to its coefficient vector (float32 numpy array).
embeddings_index = {}
# `with` closes the file even if a line fails to parse, and the explicit
# UTF-8 encoding avoids depending on the platform's default text encoding.
with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'), encoding='utf-8') as f:
    for line in f:
        # Each line is whitespace separated: the word first, then its coefficients.
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Indexing word vectors.
Found 400000 word vectors.


In [12]:
# Display the stored float32 coefficient vector for a single word as a sanity check.
embeddings_index['banana']

array([-0.34028  ,  0.46436  , -0.083324 ,  0.20186  , -0.17831  ,
       -0.4663   ,  0.61793  ,  0.30129  ,  0.5728   , -0.34783  ,
       -0.9216   ,  0.30484  ,  0.30382  ,  0.58035  ,  0.12112  ,
        0.77288  ,  1.1547   , -0.576    ,  0.51471  ,  0.21552  ,
        0.21106  ,  0.67875  ,  1.1962   ,  0.11142  ,  0.50809  ,
        1.1873   ,  0.035288 , -0.88952  ,  0.042803 , -0.36714  ,
        0.37993  ,  0.61945  ,  1.0194   , -0.95084  , -0.0072258,
        0.69454  ,  0.38692  , -0.18544  ,  0.2885   , -0.81279  ,
       -0.46473  , -0.82623  ,  0.42778  , -0.14064  ,  0.30173  ,
        0.074418 , -0.40044  ,  0.33969  , -0.62917  , -0.054449 ,
       -0.78469  ,  0.2354   , -0.78359  ,  0.74708  , -0.31074  ,
       -0.07038  , -0.34623  ,  0.33849  ,  0.89621  ,  0.30288  ,
        0.012978 ,  0.020869 , -0.14436  , -0.40914  ,  0.16651  ,
       -0.88124  , -0.078419 ,  0.048156 ,  0.27032  , -0.81761  ,
        0.027778 ,  0.62487  ,  0.1549   , -0.15838  ,  0.0886

## Let's find the top 10 words that are closest to 'compute'

In [3]:
# Query vector and its Euclidean norm; the norm is computed once, outside the scan.
u = embeddings_index['compute']
norm_u = np.linalg.norm(u)

# One (word, cosine similarity) pair per vocabulary entry, in dictionary order.
# Cosine similarity = dot(u, v) / |u| / |v|.
similarity = [
    (token, np.dot(u, vec) / norm_u / np.linalg.norm(vec))
    for token, vec in embeddings_index.items()
]

print(len(similarity))

400000


In [4]:
# Rank all (word, cosine) pairs from most to least similar, then show the first ten.
by_cosine_desc = sorted(similarity, key=lambda pair: pair[1], reverse=True)
by_cosine_desc[:10]

[('compute', 1.0),
 ('calculate', 0.7222063),
 ('algorithm', 0.64410573),
 ('computed', 0.6136235),
 ('algorithms', 0.61343825),
 ('equivalently', 0.59991395),
 ('formula_1', 0.5970425),
 ('formula_2', 0.5948518),
 ('formula_3', 0.593129),
 ('formula_5', 0.5920933)]

## Now let's do vector algebra.

### First we subtract the vector for `france` from the vector for `paris`. The result can be thought of as a vector pointing from a country to its capital. Then we add the vector for `nepal`. Let's see whether the result points to Nepal's capital.

In [20]:
# Build the analogy vector: (paris - france) + nepal.
capital_offset = embeddings_index['paris'] - embeddings_index['france']
output = capital_offset + embeddings_index['nepal']
norm_out = np.linalg.norm(output)

# Cosine similarity of the analogy vector against every vocabulary entry,
# stored as (word, cosine) pairs in dictionary order.
similarity = [
    (token, np.dot(output, vec) / norm_out / np.linalg.norm(vec))
    for token, vec in embeddings_index.items()
]

print(len(similarity))

400000


In [21]:
# Rank all (word, cosine) pairs from most to least similar, then show the first seven.
by_cosine_desc = sorted(similarity, key=lambda pair: pair[1], reverse=True)
by_cosine_desc[:7]

[('kathmandu', 0.8039164),
 ('nepal', 0.75638914),
 ('katmandu', 0.70364904),
 ('dhaka', 0.6556245),
 ('nepali', 0.6497056),
 ('delhi', 0.64691),
 ('bangkok', 0.6085232)]