In [5]:
!pip install -U spaCy
!python -m spacy download en_core_web_lg

Collecting numpy>=1.19.0 (from spaCy)
  Downloading numpy-2.2.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
Downloading numpy-2.2.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.1/16.1 MB[0m [31m27.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.26.4
    Uninstalling numpy-1.26.4:
      Successfully uninstalled numpy-1.26.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gensim 4.3.3 requires numpy<2.0,>=1.18.5, but you have numpy 2.2.5 which is incompatible.[0m[31m
[0mSuccessfully installed numpy-2.2.5

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m 

# spaCy and word embeddings

Init spaCy model. We need the large model (lg) for retrieving word embeddings.

In [6]:
import spacy
nlp = spacy.load('en_core_web_lg')

Word Embeddings
------------------

spaCy comes shipped with a Word Vector model as well. We’ll need to download a larger model for that: *(python -m spacy download en_core_web_lg)*

The vectors are attached to spaCy objects: Token, Lexeme (a sort of unnatached token, part of the vocabulary), Span and Doc. The multi-token objects average its constituent vectors.

Here are a few properties word vectors have:
1. If two words are similar, they appear in similar contexts
2. Word vectors are computed taking into account the context (surrounding words)
3. Given the two previous observations, similar words should have similar word vectors
4. Using vectors we can derive relationships (relatedness) between words

Let’s see how we can access the embedding of a word in spaCy:

In [7]:
print(nlp.vocab['cat'].vector)

[-0.15067   -0.024468  -0.23368   -0.23378   -0.18382    0.32711
 -0.22084   -0.28777    0.12759    1.1656    -0.64163   -0.098455
 -0.62397    0.010431  -0.25653    0.31799    0.037779   1.1904
 -0.17714   -0.2595    -0.31461    0.038825  -0.15713   -0.13484
  0.36936   -0.30562   -0.40619   -0.38965    0.3686     0.013963
 -0.6895     0.004066  -0.1367     0.32564    0.24688   -0.14011
  0.53889   -0.80441   -0.1777    -0.12922    0.16303    0.14917
 -0.068429  -0.33922    0.18495   -0.082544  -0.46892    0.39581
 -0.13742   -0.35132    0.22223   -0.144     -0.048287   0.3379
 -0.31916    0.20526    0.098624  -0.23877    0.045338   0.43941
  0.030385  -0.013821  -0.093273  -0.18178    0.19438   -0.3782
  0.70144    0.16236    0.0059111  0.024898  -0.13613   -0.11425
 -0.31598   -0.14209    0.028194   0.5419    -0.42413   -0.599
  0.24976   -0.27003    0.14964    0.29287   -0.31281    0.16543
 -0.21045   -0.4408     1.2174     0.51236    0.56209    0.14131
  0.092514   0.71396   -0.02

There’s a really famous example of word embedding math (man:wonam=?:queen):

man - woman = ? - queen =>

man - woman + queen = ?.

It sounds pretty crazy to be true, so let’s test that out:

In [8]:
from scipy import spatial

cosine_similarity = lambda x, y: 1 - spatial.distance.cosine(x, y)

man = nlp.vocab['man'].vector
woman = nlp.vocab['woman'].vector
queen = nlp.vocab['queen'].vector

wset = ("man", "woman" , "queen")

# We now need to find the closest vector in the vocabulary to the result of "man" - "woman" + "queen"
maybe_king = man - woman + queen
computed_similarities = []

for word in nlp.vocab:
    # Ignore words without vectors
    if not word.has_vector or word in wset:
        continue

    similarity = cosine_similarity(maybe_king, word.vector)
    computed_similarities.append((word, similarity))

computed_similarities = sorted(computed_similarities, key=lambda item: -item[1])
print([w[0].text for w in computed_similarities[:10]])

from numpy import dot
from numpy.linalg import norm

print(dot(nlp.vocab['king'].vector, maybe_king)/(norm(nlp.vocab['king'].vector)*norm(maybe_king)))

['Mr', 'Mr.', 'He', 'he', 'cuz', 'Cuz', 'Let', 'let', 'u', 'Jr']
0.77161425


Computing Similarity
---------------------------

Based on the word embeddings, spaCy offers a similarity interface for all of it’s building blocks: Token, Span, Doc and Lexeme. Here’s how to use that similarity interface:

In [9]:
apple = nlp.vocab['apple']
dog = nlp.vocab['dog']
fruit = nlp.vocab['fruit']
animal = nlp.vocab['animal']

print("sim(dog, animal) =",dog.similarity(animal))
print("sim(dog, fruit) =", dog.similarity(fruit))
print("sim(apple, fruit) = ", apple.similarity(fruit))
print("sim(apple, animal) = ", apple.similarity(animal))

sim(dog, animal) = 0.6618534326553345
sim(dog, fruit) = 0.23552848398685455
sim(apple, fruit) =  0.6306183338165283
sim(apple, animal) =  0.26336556673049927


Let’s now use this technique on entire texts:

In [10]:
target = nlp("Cats are beautiful animals.")

doc1 = nlp("Dogs are awesome.")
doc2 = nlp("Some gorgeous creatures are felines.")
doc3 = nlp("Dolphins are swimming mammals.")

print(target.similarity(doc1))
print(target.similarity(doc2))
print(target.similarity(doc3))

0.8901766262114666
0.9115828449161616
0.7822956752876101


In [11]:
# Doc has its vector.
doc1.vector

array([-2.69004732e-01,  1.42533004e-01, -1.35690004e-01, -4.21249270e-01,
       -4.80500236e-03,  9.15274918e-02,  8.62575024e-02, -1.54831260e-01,
        3.10377479e-02,  2.01034999e+00, -7.26649985e-02,  3.04599226e-01,
       -2.65364945e-02,  1.02129750e-01, -7.98537433e-02, -6.15537539e-02,
        3.76442447e-02,  1.28187501e+00, -2.82263249e-01, -1.09444499e-01,
       -1.59755006e-01, -5.85690029e-02,  1.01359256e-01, -3.43629509e-01,
        5.34317568e-02,  1.05778247e-01, -1.71460003e-01, -1.87604249e-01,
        1.13147259e-01, -1.22040994e-01, -6.65290058e-02, -1.10341348e-01,
       -7.80415013e-02,  1.91448495e-01,  9.98925120e-02,  1.14207000e-01,
        2.58234978e-01, -1.68851495e-01, -6.82290047e-02,  1.23302005e-01,
       -1.00777246e-01,  4.64907587e-02,  6.60987496e-02, -5.69574982e-02,
        4.70714942e-02,  6.86054975e-02, -2.95626253e-01,  8.05705041e-02,
        1.03599489e-01, -7.28434995e-02, -6.48620054e-02,  9.56825018e-02,
        1.22112751e-01, -

Word embeddings for Italian.

In [12]:
!python -m spacy download it_core_news_lg

Collecting it-core-news-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/it_core_news_lg-3.8.0/it_core_news_lg-3.8.0-py3-none-any.whl (567.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m567.9/567.9 MB[0m [31m25.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: it-core-news-lg
Successfully installed it-core-news-lg-3.8.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('it_core_news_lg')


In [13]:
nlp = spacy.load('it_core_news_lg')

In [14]:
mela = nlp.vocab['mela']
cane = nlp.vocab['cane']
frutta = nlp.vocab['frutta']
animale = nlp.vocab['animale']

print("sim(cane, animale) =", cane.similarity(animale))
print("sim(cane, frutta) =", cane.similarity(frutta))
print("sim(mela, frutta) = ", mela.similarity(frutta))
print("sim(mela, animale) = ", mela.similarity(animale))

sim(cane, animale) = 0.6316403746604919
sim(cane, frutta) = 0.10757876187562943
sim(mela, frutta) =  0.594420313835144
sim(mela, animale) =  0.22128893435001373


In [15]:
target = nlp("I gatti sono animali meravigliosi.")

doc1 = nlp("I cani sono fantastici.")
doc2 = nlp("I felini sono creature meravigliose.")
doc3 = nlp("I delfini sono mammiferi che nuotano.")

print(target.similarity(doc1))
print(target.similarity(doc2))
print(target.similarity(doc3))

0.9819733247316478
0.9638328369011564
0.9288913013436094
