In [35]:
import spacy

nlp = spacy.load('en_core_web_lg')

## Word2Vec
Word2Vec is a technique for embedding words in a vector of numbers.


In [36]:
doc = nlp('dogs are the best.')
print(f'Shape: {doc.vector.shape}')
doc.vector

Shape: (300,)


array([-2.63023090e+00, -1.44064039e-01, -2.87560010e+00, -9.58000004e-01,
        6.34711599e+00,  1.81727195e+00, -2.13021231e+00,  3.74393988e+00,
       -4.01097953e-01, -8.92248034e-01,  7.67081976e+00,  2.34689999e+00,
       -5.68435192e+00,  1.63542008e+00,  4.29126406e+00,  2.65903997e+00,
       -5.76235414e-01, -9.24070001e-01, -1.19463965e-01, -6.29390049e+00,
        1.71580088e+00, -1.94941783e+00, -2.66502976e+00, -2.97551990e+00,
       -1.85918009e+00,  1.46602595e+00, -2.17178011e+00, -1.36144006e+00,
       -8.78052115e-01,  1.18097997e+00,  3.28314829e+00, -7.96960056e-01,
       -4.79652023e+00, -7.15170002e+00, -2.68129015e+00,  9.29199904e-02,
       -7.71052003e-01,  3.14497995e+00,  3.59101987e+00,  2.20619392e+00,
        2.29066801e+00,  3.84905577e+00,  1.83126199e+00,  6.30881339e-02,
       -3.43605995e+00,  2.61152601e+00,  1.49480021e+00, -3.63001895e+00,
       -3.32800001e-01,  2.76520014e-01, -6.39757991e-01,  4.56192017e+00,
        1.36009812e+00, -

In [37]:
word = nlp('dog')
print(f'Shape: {doc.vector.shape}')
doc.vector

Shape: (300,)


array([-2.63023090e+00, -1.44064039e-01, -2.87560010e+00, -9.58000004e-01,
        6.34711599e+00,  1.81727195e+00, -2.13021231e+00,  3.74393988e+00,
       -4.01097953e-01, -8.92248034e-01,  7.67081976e+00,  2.34689999e+00,
       -5.68435192e+00,  1.63542008e+00,  4.29126406e+00,  2.65903997e+00,
       -5.76235414e-01, -9.24070001e-01, -1.19463965e-01, -6.29390049e+00,
        1.71580088e+00, -1.94941783e+00, -2.66502976e+00, -2.97551990e+00,
       -1.85918009e+00,  1.46602595e+00, -2.17178011e+00, -1.36144006e+00,
       -8.78052115e-01,  1.18097997e+00,  3.28314829e+00, -7.96960056e-01,
       -4.79652023e+00, -7.15170002e+00, -2.68129015e+00,  9.29199904e-02,
       -7.71052003e-01,  3.14497995e+00,  3.59101987e+00,  2.20619392e+00,
        2.29066801e+00,  3.84905577e+00,  1.83126199e+00,  6.30881339e-02,
       -3.43605995e+00,  2.61152601e+00,  1.49480021e+00, -3.63001895e+00,
       -3.32800001e-01,  2.76520014e-01, -6.39757991e-01,  4.56192017e+00,
        1.36009812e+00, -

## Identifying similarities
The advantage of embedding words as vectors is that we can perform mathematical operation. Cosine similarity is  one such operation that compute the cosine distance between two words vectors.
With 1 being the highest value and 0 the lowest.

In [38]:
def print_similarity(doc):
    """Print similarities between all the words in the doc."""
    for token1 in doc:
        for token2 in doc:
            print(f'{token1.text} - {token2.text}: {token1.similarity(token2)}')
        print()

In [39]:
word1, word2 = nlp('king'), nlp('queen')
print('king and queen similarity:', word1.similarity(word2))

king and queen similarity: 0.6108841628588695


In [40]:
doc = nlp('king queen kings princess')
print_similarity(doc)

king - king: 1.0
king - queen: 0.6108841896057129
king - kings: 0.8352225422859192
king - princess: 0.6533219218254089

queen - king: 0.6108841896057129
queen - queen: 1.0
queen - kings: 0.510179340839386
queen - princess: 0.7273048758506775

kings - king: 0.8352225422859192
kings - queen: 0.510179340839386
kings - kings: 1.0
kings - princess: 0.5621244311332703

princess - king: 0.6533219218254089
princess - queen: 0.7273048758506775
princess - kings: 0.5621244311332703
princess - princess: 1.0



### Display similarites in a table

In [41]:
# For brevity, assign each token a name
a,b,c, d = doc

# Display as a Markdown table (this only works in Jupyter!)
from IPython.display import Markdown, display
display(Markdown(f'<table><tr><th></th><th>{a.text}</th><th>{b.text}</th><th>{c.text}</th></tr>\
<tr><td>**{a.text}**</td><td>{a.similarity(a):{.4}}</td><td>{b.similarity(a):{.4}}</td><td>{c.similarity(a):{.4}}</td></tr>\
<tr><td>**{b.text}**</td><td>{a.similarity(b):{.4}}</td><td>{b.similarity(b):{.4}}</td><td>{c.similarity(b):{.4}}</td></tr>\
<tr><td>**{c.text}**</td><td>{a.similarity(c):{.4}}</td><td>{b.similarity(c):{.4}}</td><td>{c.similarity(c):{.4}}</td></tr>\
<tr><td>**{d.text}**</td><td>{a.similarity(d):{.4}}</td><td>{b.similarity(d):{.4}}</td><td>{c.similarity(d):{.4}}</td></tr>'))

<table><tr><th></th><th>king</th><th>queen</th><th>kings</th></tr><tr><td>**king**</td><td>1.0</td><td>0.6109</td><td>0.8352</td></tr><tr><td>**queen**</td><td>0.6109</td><td>1.0</td><td>0.5102</td></tr><tr><td>**kings**</td><td>0.8352</td><td>0.5102</td><td>1.0</td></tr><tr><td>**princess**</td><td>0.6533</td><td>0.7273</td><td>0.5621</td></tr>

### Similar Opposites
Sometimes, words are different in English but very similar in NLP. It's quite fascinating to see how similar 'love' and 'hate'.

In [42]:
doc = nlp('like love hate')
print_similarity(doc)

like - like: 1.0
like - love: 0.5212638974189758
like - hate: 0.5065141320228577

love - like: 0.5212638974189758
love - love: 1.0
love - hate: 0.5708349943161011

hate - like: 0.5065141320228577
hate - love: 0.5708349943161011
hate - hate: 1.0



### Word outside of vocabulary
Words that do not belong to the vocabulary have value 0.

In [49]:
doc = nlp('dog cat dogat')

for token in doc:
    print(token.text, token.has_vector, token.vector_norm, token.is_oov)

dog True 75.254234 False
cat True 63.188496 False
dogat False 0.0 True


## Vector arithmetic
We can calculate new vectors but adding or subtracting related vectors.
A famous example is `king - man + woman = queen`.
Let's see how we can calculate that.

In [50]:
from scipy.spatial.distance import cosine

def cosine_similarity(x, y):
    return 1 - cosine(x, y)

In [54]:
nlp.vocab['king'].vector

array([-1.1296e-01, -4.1865e+00, -1.8453e+00,  3.0781e-01,  2.4956e+00,
        9.6267e-01, -1.8161e+00,  4.4655e+00, -2.8210e+00,  9.7090e-01,
        1.3542e+01,  4.3195e-01, -5.3098e+00,  4.7098e+00,  2.9030e+00,
        1.5588e+00,  6.0064e+00, -3.0345e+00,  1.0626e+00, -7.7197e-01,
       -5.4771e+00, -9.7380e-01, -4.4345e+00,  5.8367e+00,  2.4302e+00,
       -3.9408e+00, -9.1862e-01, -4.9124e+00,  1.4591e+00, -7.2772e-01,
        3.4957e+00, -4.0077e+00, -1.8354e+00, -4.1052e+00,  4.9211e+00,
       -9.7053e-01,  1.9223e+00,  5.2605e+00,  1.6086e+00,  7.1328e-01,
       -1.2146e+00, -1.9869e+00,  8.0265e-01,  2.9298e+00,  7.2985e-01,
       -6.2892e-01, -1.7082e+00,  1.9893e+00,  4.7529e-01,  3.2264e+00,
       -3.9215e+00,  4.6556e+00,  1.3475e+00, -1.0979e+00, -3.0365e+00,
        1.5815e+00,  2.2835e+00, -4.0616e+00,  2.5730e+00,  4.0618e+00,
        9.5438e-01, -6.2563e+00,  5.6463e+00, -3.8933e+00,  4.4076e+00,
        2.0517e+00, -6.6906e+00, -6.9448e+00,  6.0371e+00,  9.30

In [55]:
# Get thr words vector
king = nlp.vocab['king'].vector
man = nlp.vocab['man'].vector
woman = nlp.vocab['woman'].vector

# Now we find the closest vector in the vocabulary to the result of "man" - "woman" + "queen"
new_vector = king - man + woman
computed_similarities = []

for word in nlp.vocab:
    # Ignore words without vectors and mixed-case words:
    if word.has_vector:
        if word.is_lower:
            if word.is_alpha:
                similarity = cosine_similarity(new_vector, word.vector)
                computed_similarities.append((word, similarity))

computed_similarities = sorted(computed_similarities, key=lambda item: -item[1])

print([w[0].text for w in computed_similarities[:10]])

['king', 'kings', 'princess', 'queen', 'the', 'and', 'that', 'where', 'she', 'they']
