# Word vectors in spaCy

In [1]:
from __future__ import unicode_literals
import spacy
nlp = spacy.load('en_core_web_md')

In [2]:
# Read the book inside a context manager so the file handle is closed as soon
# as the text is in memory, instead of being left open until garbage collection.
with open("pg345") as book_file:
    book_text = book_file.read()
# Parse the full text with the loaded spaCy pipeline.
doc = nlp(book_text)

In [3]:
# Distinct surface forms of the purely alphabetic tokens in the document;
# this is the candidate pool searched by the nearest-neighbour queries below.
tokens = list({token.text for token in doc if token.is_alpha})

In [4]:
# Look up the vector that the loaded model's vocabulary stores for "cheese".
nlp.vocab['cheese'].vector

array([-5.5252e-01,  1.8894e-01,  6.8737e-01, -1.9789e-01,  7.0575e-02,
        1.0075e+00,  5.1789e-02, -1.5603e-01,  3.1941e-01,  1.1702e+00,
       -4.7248e-01,  4.2867e-01, -4.2025e-01,  2.4803e-01,  6.8194e-01,
       -6.7488e-01,  9.2401e-02,  1.3089e+00, -3.6278e-02,  2.0098e-01,
        7.6005e-01, -6.6718e-02, -7.7794e-02,  2.3844e-01, -2.4351e-01,
       -5.4164e-01, -3.3540e-01,  2.9805e-01,  3.5269e-01, -8.0594e-01,
       -4.3611e-01,  6.1535e-01,  3.4212e-01, -3.3603e-01,  3.3282e-01,
        3.8065e-01,  5.7427e-02,  9.9918e-02,  1.2525e-01,  1.1039e+00,
        3.6678e-02,  3.0490e-01, -1.4942e-01,  3.2912e-01,  2.3300e-01,
        4.3395e-01,  1.5666e-01,  2.2778e-01, -2.5830e-02,  2.4334e-01,
       -5.8136e-02, -1.3486e-01,  2.4521e-01, -3.3459e-01,  4.2839e-01,
       -4.8181e-01,  1.3403e-01,  2.6049e-01,  8.9933e-02, -9.3770e-02,
        3.7672e-01, -2.9558e-02,  4.3841e-01,  6.1212e-01, -2.5720e-01,
       -7.8506e-01,  2.3880e-01,  1.3399e-01, -7.9315e-02,  7.05

In [5]:
def vec(s):
    """Return the vector stored in the loaded model's vocabulary for string ``s``."""
    entry = nlp.vocab[s]
    return entry.vector

In [6]:
import numpy as np
from numpy import dot
from numpy.linalg import norm

# cosine similarity
def cosine(v1, v2):
    """Return the cosine similarity of two vectors.

    Args:
        v1: first vector (anything ``numpy.dot`` / ``numpy.linalg.norm`` accept,
            e.g. a list of numbers or a numpy array).
        v2: second vector, same length as ``v1``.

    Returns:
        ``dot(v1, v2) / (norm(v1) * norm(v2))``, or ``0.0`` when either norm is
        not strictly positive (e.g. an all-zero vector), which avoids a
        division by zero.
    """
    # Compute each norm exactly once and reuse it for both the guard and the
    # quotient; the early returns keep the original short-circuit behaviour
    # (v2 is never inspected when v1 already fails the check).
    norm1 = norm(v1)
    if not norm1 > 0:
        return 0.0
    norm2 = norm(v2)
    if not norm2 > 0:
        return 0.0
    return dot(v1, v2) / (norm1 * norm2)

In [16]:
def addv(coord1, coord2):
    """Add two vectors component by component and return the sums as a list.

    Components are paired positionally; pairing stops at the end of the
    shorter input, so surplus trailing components of the longer one are ignored.
    """
    sums = []
    for left, right in zip(coord1, coord2):
        sums.append(left + right)
    return sums
addv([10, 1], [5, 2])

[15, 3]

In [8]:
def spacy_closest(token_list, vec_to_check, n=10):
    """Return the ``n`` strings from ``token_list`` most similar to a vector.

    Each candidate string is looked up with ``vec`` and scored by its cosine
    similarity to ``vec_to_check``; candidates are returned best match first.
    """
    def similarity_to_target(token):
        return cosine(vec_to_check, vec(token))

    ranked = sorted(token_list, key=similarity_to_target, reverse=True)
    return ranked[:n]

In [14]:
def subtractv(coord1, coord2):
    """Subtract ``coord2`` from ``coord1`` component by component, as a list.

    Components are paired positionally; pairing stops at the end of the
    shorter input, so surplus trailing components of the longer one are ignored.
    """
    differences = []
    for left, right in zip(coord1, coord2):
        differences.append(left - right)
    return differences
subtractv([10, 1], [5, 2])

[5, -1]

In [10]:
def meanv(coords):
    """Return the element-wise mean of a sequence of vectors as a list of floats.

    The output has as many components as ``coords[0]``; every item is assumed
    to have that same length. An empty ``coords`` raises ``IndexError``.
    """
    totals = [0] * len(coords[0])
    # Accumulate each component's running sum across all vectors.
    for vector in coords:
        for idx, value in enumerate(vector):
            totals[idx] += value
    count = len(coords)
    return [float(total) / count for total in totals]
meanv([[0, 1], [2, 2], [4, 3]])

[2.0, 2.0]

In [7]:
# Sanity check: a related pair (dog/puppy) should score a higher cosine
# similarity than an unrelated pair (trousers/octopus).
cosine(vec('dog'), vec('puppy')) > cosine(vec('trousers'), vec('octopus'))

True

In [9]:
# Which tokens from the document have vectors closest to that of "basketball"?
spacy_closest(tokens, vec("basketball"))

['tennis',
 'coach',
 'game',
 'teams',
 'junior',
 'Junior',
 'Team',
 'school',
 'boys',
 'puma']

In [11]:
# Tokens closest to the element-wise mean of the "day" and "night" vectors.
spacy_closest(tokens, meanv([vec("day"), vec("night")]))

['night',
 'day',
 'Day',
 'evening',
 'Evening',
 'Morning',
 'morning',
 'afternoon',
 'nights',
 'Nights']

In [12]:
# Baseline for the subtraction in the next cell: tokens closest to "wine".
spacy_closest(tokens, vec("wine"))

['wine',
 'sparkling',
 'beer',
 'corked',
 'jug',
 'bottle',
 'Drink',
 'drink',
 'fruit',
 'bottles']

In [15]:
# Vector subtraction: tokens closest to "wine" minus "alcohol".
spacy_closest(tokens, subtractv(vec("wine"), vec("alcohol")))

['wine',
 'sparkling',
 'graceful',
 'exquisite',
 'fabulous',
 'splendid',
 'magnificent',
 'marvellous',
 'delightful',
 'banquet']

In [17]:
# Baseline for the addition in the next cell: tokens closest to "water".
spacy_closest(tokens, vec("water"))

['water',
 'shallows',
 'waters',
 'salt',
 'Salt',
 'pond',
 'dry',
 'liquid',
 'ocean',
 'reef']

In [18]:
# Vector addition: tokens closest to "water" plus "frozen".
spacy_closest(tokens, addv(vec("water"), vec("frozen")))

['water',
 'cold',
 'ice',
 'salt',
 'Salt',
 'dry',
 'fresh',
 'liquid',
 'boiling',
 'bubbling']

In [19]:
# Baseline for the analogy in the next cell: tokens closest to "grass".
spacy_closest(tokens, vec("grass"))

['Pampas',
 'grass',
 'grassy',
 'lawn',
 'elms',
 'foliage',
 'trees',
 'boughs',
 'greens',
 'garden']

In [20]:
# analogy: blue is to sky as X is to grass
# Take the difference vector blue - sky, add it to the "grass" vector, and
# list the tokens closest to the result as candidates for X.
blue_to_sky = subtractv(vec("blue"), vec("sky"))
spacy_closest(tokens, addv(blue_to_sky, vec("grass")))

['Pampas',
 'grass',
 'grassy',
 'green',
 'Green',
 'GREEN',
 'yellow',
 'red',
 'Red',
 'violet']

## Sentence similarity

In [21]:
def sentvec(s):
    """Return one vector for the string ``s``: the mean of its token vectors.

    ``s`` is parsed with the loaded pipeline and the per-token vectors are
    averaged with ``meanv``, so the result is a plain list of floats. If
    parsing yields no tokens, ``meanv`` raises ``IndexError``.
    """
    token_vectors = []
    for token in nlp(s):
        token_vectors.append(token.vector)
    return meanv(token_vectors)

In [22]:
# Collect the document's sentences into a list for the similarity search below.
sentences = list(doc.sents)

In [23]:
def spacy_closest_sent(space, input_str, n=10):
    """Return the ``n`` sentences in ``space`` most similar to ``input_str``.

    ``input_str`` is turned into a vector with ``sentvec``. Each candidate in
    ``space`` (an iterable of tokens exposing ``.vector``) is represented by
    the mean of its token vectors and scored by cosine similarity against
    that input vector; candidates are returned best match first.
    """
    input_vec = sentvec(input_str)

    def similarity_to_input(sentence):
        sentence_vec = np.mean([token.vector for token in sentence], axis=0)
        return cosine(sentence_vec, input_vec)

    ranked = sorted(space, key=similarity_to_input, reverse=True)
    return ranked[:n]

In [25]:
# Print the document sentences most similar to the query, separated by rules.
# NOTE(review): the query text is ungrammatical ("love drink"); it is kept
# verbatim because the recorded output below was produced from it.
for sent in spacy_closest_sent(sentences, "I love drink wine with cheese."):
    print(sent.text)
    print("---")

This, with some cheese
and a salad and a bottle of old Tokay, of which I had two glasses, was
my supper.
---
I left Quincey lying down
after having a glass of wine, and told the cook to get ready a good
breakfast.
---
Do not wait for me.--D." I set to and
enjoyed a hearty meal.
---
a chicken done up some way with red pepper, which was
very good but thirsty.
---
We get hot soup, or coffee, or tea; and
off we go.
---
" I never liked garlic before, but to-night it is delightful!

---
I believe they went to the trouble of putting an
extra amount of garlic into our food; and I can't abide garlic.
---
Drink it off, like a good
child.
---
I dined on what they
called "robber steak"--bits of bacon, onion, and beef, seasoned with red
pepper, and strung on sticks and roasted over the fire, in the simple
style of the London cat's meat!
---
I
saw it drip with the fresh blood!"
---


In [27]:
# Print the document sentences most similar to the query, separated by rules.
for sent in spacy_closest_sent(sentences, "I went to war seven years ago"):
    print(sent.text)
    print("---")

what we called Bersicker was one of three grey ones that came from
Norway to Jamrach's, which we bought off him four years ago.
---
When I saw him four days ago down at his
own place he looked queer.
---
Just after I had taken my place I heard a distant
clock strike twelve, and in time came one and two.
---
Only ten days ago

---
"You will be grieved to hear that Mrs. Westenra died five days ago, and
that Lucy died the day before yesterday.
---
None of it that I noticed was
less than three hundred years old.
---
Jonathan was holding me by the arm, the way he used to in old days
before I went to school.
---
I
took my way to Paddington, where I arrived about fifteen minutes before
the train came in.


---
What happened was this: Two nights ago
---
Fifty years ago a series
of great fires took place, which made terrible havoc on five separate
occasions.
---
