# **Pre-trained Word Embeddings**

Download the GloVe word embeddings from Stanford.  

In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip -q glove.6B.zip

Read the embeddings into a dictionary

In [None]:
import numpy as np

def read_embeddings(n=1000, path='glove.6B.100d.txt'):
    """Load up to ``n`` word vectors from a GloVe-style text file.

    Every non-blank line is expected to contain a word followed by the
    components of its vector, all separated by single spaces.

    Args:
        n: Maximum number of embeddings to read, counted from the top of the
            file. ``None`` reads the whole file; zero or a negative value
            yields an empty dictionary.
        path: Location of the embeddings file. Defaults to the 100-dimensional
            file used throughout this notebook.

    Returns:
        A dictionary where ``embedding[w]`` is the embedding of string ``w``
        as a 1-D ``float32`` NumPy array.

    Raises:
        OSError: If ``path`` cannot be opened.
        ValueError: If a vector component cannot be parsed as a number.
    """
    embedding = {}
    # Asking for nothing must return nothing (the old loop always read one line).
    if n is not None and n <= 0:
        return embedding
    count = 0
    with open(path, encoding="utf8") as f:
        for line in f:
            ls = line.rstrip().split(" ")
            # Skip blank lines and lines with no vector, so they neither create
            # a bogus entry nor use up part of the requested budget.
            if len(ls) < 2:
                continue
            embedding[ls[0]] = np.array([np.float32(x) for x in ls[1:]])
            count += 1
            if n is not None and count >= n:
                break
    return embedding

# Upper bound on how many lines (words) of the embeddings file are loaded.
vocabulary_size = 200000        
# Global word -> vector dictionary used by every cell below.
embedding = read_embeddings(vocabulary_size)

Define similarity metric

In [None]:
def cosine_similarity(w1, w2, emb):
    """Return the cosine of the angle between the embeddings of two words.

    Args:
        w1: First word; must be a key of ``emb``.
        w2: Second word; must be a key of ``emb``.
        emb: Mapping from word to its embedding vector.

    Returns:
        The dot product of the two vectors divided by each vector's
        Euclidean norm in turn.
    """
    vec_a = emb[w1]
    vec_b = emb[w2]
    inner = np.dot(vec_a, vec_b)
    # Divide by the norms one after the other (not by their product).
    return inner / np.linalg.norm(vec_a) / np.linalg.norm(vec_b)

Embeddings even capture the similarity among universities!

In [None]:
words = ['utep', 'princeton', 'harvard', 'nmsu']
# Visit every ordered pair of words (a word paired with itself included)
# and print one similarity line per pair.
for w1, w2 in ((a, b) for a in words for b in words):
    print('cosine_similarity({},{})={:4.3f}'.format(
        w1, w2, cosine_similarity(w1, w2, embedding)))

And countries:

In [None]:
words = ['mexico', 'russia', 'spain', 'china', 'japan', 'guatemala', 'poland']
# Same pairwise table as before, this time over country names.
for w1, w2 in ((a, b) for a in words for b in words):
    print('cosine_similarity({},{})={:4.3f}'.format(
        w1, w2, cosine_similarity(w1, w2, embedding)))

Similarity among book characters:

In [None]:
W1 = ['buck', 'dracula', 'holmes', 'frankenstein']
W2 = ['dog', 'vampire', 'detective', 'monster']
# Compare each word of W1 with each word of W2 and print the raw similarity.
for w1, w2 in ((a, b) for a in W1 for b in W2):
    print(w1, w2, cosine_similarity(w1, w2, embedding))

**Exercise:** Write a function that uses word embeddings to solve word analogy problems such as:

France is to Paris as Spain is to: 
*    Washington
*   Berlin
*   Moscow
*   Madrid


**Idea:** the vector connecting Paris's embedding to France's embedding is nearly parallel to the vector connecting Madrid's embedding to Spain's embedding.

Thus, if E[w] is the embedding of word w, 

E['france'] - E['paris'] ~= E['spain'] - E['madrid'] 

E['madrid'] ~= E['paris'] - E['france'] + E['spain']

Thus we expect E['paris'] - E['france'] + E['spain'] to be closer to E['madrid'] than to E['washington'], E['berlin'], or E['moscow'] in terms of Euclidean distance. (The implementation below compares squared Euclidean distances, which produces the same ranking.)

In [None]:
def word_analogy(pair, w, options, emb):
    """Complete the analogy ``pair[0] : pair[1] :: w : ?`` from ``options``.

    The offset from ``pair[0]``'s vector to ``pair[1]``'s vector is added to
    the vector of ``w``; the option whose vector has the smallest squared
    Euclidean distance to that point is selected (first one on ties).

    Args:
        pair: Two-element sequence of words defining the relation.
        w: Word the relation is applied to.
        options: Candidate answers; every one must be a key of ``emb``.
        emb: Mapping from word to its embedding array.

    Returns:
        The sentence ``'<pair[0]> is to <pair[1]> as <w> is to <answer>'``.

    Side effects:
        Prints the list of squared distances, one per option.
    """
    offset = emb[pair[1]] - emb[pair[0]]
    target = emb[w] + offset
    sq_dists = [np.sum((target - emb[candidate]) ** 2) for candidate in options]
    print('Distances:', sq_dists)
    best = options[np.argmin(np.array(sq_dists))]
    return '{} is to {} as {} is to {}'.format(pair[0], pair[1], w, best)


In [None]:
# Analogy query: france : paris :: spain : ?  (answer picked from `options`).
pair = ['france','paris']
w = 'spain'
options = ['washington','berlin','moscow','madrid']
# word_analogy prints the per-option distances itself; print() shows the returned sentence.
print(word_analogy(pair,w,options,embedding))

In [None]:
# Analogy query: woman : queen :: man : ?  (answer picked from `options`).
pair = ['woman','queen']
w = 'man'
options = ['president','lord','minister','politician','king']
print(word_analogy(pair,w,options,embedding))

In [None]:
# Analogy query: princess : queen :: prince : ?  (answer picked from `options`).
pair = ['princess','queen']
w = 'prince'
options = ['president','lord','minister','politician','king']
print(word_analogy(pair,w,options,embedding))

In [None]:
# Analogy query: algorithm : program :: recipe : ?  (answer picked from `options`).
pair = ['algorithm','program']
w = 'recipe'
options = ['food','restaurant','taco','banana']
print(word_analogy(pair,w,options,embedding))


In [None]:
# Analogy query: cat : dog :: tiger : ?  (answer picked from `options`).
pair = ['cat','dog']
w = 'tiger'
options = ['lion','wolf','coyote','whale','dolphin']
print(word_analogy(pair,w,options,embedding))


In [None]:
# Analogy query: old : new :: fast : ?  (answer picked from `options`).
pair = ['old','new']
w = 'fast'
options = ['slow','planet','berlin','earth','king','german']
print(word_analogy(pair,w,options,embedding))

In [None]:
# Analogy query: toe : foot :: finger : ?  (answer picked from `options`).
pair = ['toe','foot']
w = 'finger'
options = ['slow','planet','hand','earth','king','german']
print(word_analogy(pair,w,options,embedding))

In [None]:
# Analogy query: positive : negative :: proton : ?  (answer picked from `options`).
pair = ['positive','negative']
w = 'proton'
options = ['neutron','electron','atom','molecule']
print(word_analogy(pair,w,options,embedding))

In [None]:
# Analogy query: vulnerability : exploit :: food : ?  (answer picked from `options`).
pair = ['vulnerability','exploit']
w = 'food'
options = ['eat', 'neutron','electron','atom','molecule']
print(word_analogy(pair,w,options,embedding))

In [None]:
# Analogy query: federer : tennis :: armstrong : ?  (answer picked from `options`).
# Note that `options` also contains 'tennis', the second word of the pair itself.
pair = ['federer','tennis']
w = 'armstrong'
options = ['tennis', 'soccer','baseball','football','cycling']
print(word_analogy(pair,w,options,embedding))