In [1]:
pip install spacy



In [2]:
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl (33.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.5/33.5 MB[0m [31m44.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: en-core-web-md
Successfully installed en-core-web-md-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [1]:
import spacy
nlp = spacy.load("en_core_web_md")
#get vector from single word(Token)
doc = nlp("dog cat elephent ken")
for token in doc:
  print(token.text,token.shape)
  print("vector:", token.has_vector, "OVV:", token.is_oov)


dog 4088098365541558500
vector: True OVV: False
cat 4088098365541558500
vector: True OVV: False
elephent 13110060611322374290
vector: False OVV: True
ken 4088098365541558500
vector: True OVV: False


In [2]:
doc[0].vector.shape

(300,)

In [4]:
base_token = nlp("bread")
base_token.vector.shape

(300,)

In [5]:
doc = nlp("bread butter hotdog tea car human calculator")
for token in doc:
  print(f"{token.text} <--> {base_token.text}:", token.similarity(base_token))
#

bread <--> bread: 1.0
butter <--> bread: 0.7028381824493408
hotdog <--> bread: 0.622492790222168
tea <--> bread: 0.29417943954467773
car <--> bread: 0.14248128235340118
human <--> bread: 0.20022818446159363
calculator <--> bread: 0.006849278230220079


In [8]:
import numpy as np

def most_similar(word, nlp, n=5):
    query_vec = nlp.vocab[word].vector
    scores = {}
    for w in nlp.vocab:
        if w.has_vector and w.is_lower and w.is_alpha:
            scores[w.text] = np.dot(query_vec, w.vector) / (w.vector_norm * nlp.vocab[word].vector_norm + 1e-8)
    return sorted(scores, key=scores.get, reverse=True)[:n]

print(most_similar("king", nlp))

['king', 'do', 'why', 'all', 'it']


In [9]:
# king - man + woman ≈ queen
king  = nlp.vocab["king"].vector
man   = nlp.vocab["man"].vector
woman = nlp.vocab["woman"].vector

result = king - man + woman

# Find closest word to result
from numpy.linalg import norm

def closest(vec, vocab):
    best, best_score = None, -1
    for w in vocab:
        if w.has_vector and w.is_lower and w.is_alpha:
            score = np.dot(vec, w.vector) / (norm(vec) * w.vector_norm + 1e-8)
            if score > best_score:
                best, best_score = w.text, score
    return best

print(closest(result, nlp.vocab))  # likely "queen"

woman


In [10]:
from sklearn.metrics.pairwise import cosine_similarity

# Get the vector for 'queen'
queen = nlp.vocab["queen"].vector

# Reshape vectors for cosine_similarity function (expects 2D arrays)
similarity = cosine_similarity([result], [queen])

print(f"Cosine similarity between (king - man + woman) and queen: {similarity[0][0]}")

Cosine similarity between (king - man + woman) and queen: 0.4811784029006958
