## Word2Vec with spaCy

In [1]:
# Download spaCy's large English pipeline (a ~588 MB wheel) so that
# "en_core_web_lg" can be loaded with spacy.load later in the notebook.
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.5.0/en_core_web_lg-3.5.0-py3-none-any.whl (587.7 MB)
                                              0.0/587.7 MB ? eta -:--:--
                                              0.3/587.7 MB 5.2 MB/s eta 0:01:53
                                              0.5/587.7 MB 5.6 MB/s eta 0:01:45
                                              0.6/587.7 MB 5.1 MB/s eta 0:01:56
                                              0.7/587.7 MB 4.2 MB/s eta 0:02:19
                                              0.8/587.7 MB 3.9 MB/s eta 0:02:33
                                              0.9/587.7 MB 3.4 MB/s eta 0:02:53
                                              1.0/587.7 MB 3.2 MB/s eta 0:03:03
                                              1.1/587.7 MB 3.1 MB/s eta 0:03:13
                                              1.2/587.7 MB 3.0 MB/s eta 0:03:19
                              

In [3]:
import spacy

# Word vectors take up a lot of space, so the small model (en_core_web_sm) does
# not include them. To get word vectors, install the medium or large English
# model; this notebook uses the large one.
# Make sure you have run "python -m spacy download en_core_web_lg" before
# loading the model here.
nlp = spacy.load("en_core_web_lg")

In [4]:
# Parse four space-separated words, then report for each token whether it has a
# vector and whether it is flagged as out-of-vocabulary (OOV).
doc = nlp("dog cat banana kem")

for token in doc:
    print(token.text, "Vector:", token.has_vector, "OOV:", token.is_oov)

dog Vector: True OOV: False
cat Vector: True OOV: False
banana Vector: True OOV: False
kem Vector: False OOV: True


In [5]:
# Shape of the vector attached to the first token of `doc`.
doc[0].vector.shape

(300,)

In [6]:
# Despite its name, `base_token` is the result of calling `nlp` on the one-word
# text "bread" (the same kind of object as `doc`), not an individual token.
base_token = nlp("bread")
# Shape of the vector exposed by that parsed text.
base_token.vector.shape

(300,)

In [7]:
# Compare every word below against the "bread" reference held in `base_token`,
# which must already have been defined by an earlier cell.
# NOTE: this rebinds the notebook-level name `doc` to a new parsed text.
doc = nlp("bread sandwich burger car tiger human wheat")

for token in doc:
    print(f"{token.text} <-> {base_token.text}:", token.similarity(base_token))

bread <-> bread: 1.0
sandwich <-> bread: 0.6341067010130894
burger <-> bread: 0.47520687769584247
car <-> bread: 0.06451533308853552
tiger <-> bread: 0.04764611675903374
human <-> bread: 0.2151154210812192
wheat <-> bread: 0.6150360888607199


In [8]:
def print_similarity(base_word: str, words_to_compare: str, pipeline=None) -> None:
    """Print the similarity of each word in ``words_to_compare`` to ``base_word``.

    Args:
        base_word: Text of the reference word. It is parsed as a whole by the
            pipeline and every token is compared against that parsed result.
        words_to_compare: Text containing the words to compare. Each token the
            pipeline yields for this text is scored separately.
        pipeline: Optional callable that turns text into an object exposing
            ``.text`` and iterating over tokens that expose ``.text`` and
            ``.similarity(other)``. When omitted, the notebook-level ``nlp``
            pipeline is used, so existing two-argument calls behave as before.

    Returns:
        None. One line per token is printed in the form
        ``<token> <-> <base>:  <score>``.
    """
    if pipeline is None:
        # Looked up at call time, so the helper may be defined before `nlp`
        # has been loaded and only needs it when no pipeline is supplied.
        pipeline = nlp

    # The parsed reference is a whole document rather than a single token.
    base_doc = pipeline(base_word)
    for token in pipeline(words_to_compare):
        print(f"{token.text} <-> {base_doc.text}: ", token.similarity(base_doc))

In [9]:
# Score each of the listed words against the reference word "iphone".
print_similarity("iphone", "apple samsung iphone dog kitten")

apple <-> iphone:  0.4387907401919904
samsung <-> iphone:  0.670859081425417
iphone <-> iphone:  1.0
dog <-> iphone:  0.08211864228011527
kitten <-> iphone:  0.10222317834969896


In [12]:
# Look up the vectors for four words directly from the pipeline's vocabulary.
king = nlp.vocab["king"].vector
man = nlp.vocab["man"].vector
woman = nlp.vocab["woman"].vector
# `queen` is not used in this cell; the cosine-similarity cell compares it
# against `result`.
queen = nlp.vocab["queen"].vector

# Vector arithmetic for the analogy: king - man + woman.
result = king - man + woman
result

array([ 1.9392200e+00, -2.3115001e+00, -1.3863000e+00, -1.9133999e+00,
        4.1749401e+00, -1.5401300e+00, -3.8272700e+00,  5.0291996e+00,
       -2.4454002e+00,  2.0851002e+00,  1.6605499e+01, -1.3788500e+00,
       -5.7085404e+00,  2.7210798e+00,  6.6530025e-01,  3.4804001e+00,
        1.0497000e+00, -1.1281996e+00, -6.6435003e-01, -3.5216696e+00,
       -8.0680294e+00, -3.8434997e+00, -4.4948001e+00,  8.7943001e+00,
       -6.3383985e-01, -4.8098001e+00, -1.2955203e+00, -6.1078286e-01,
        4.1610003e-01, -4.1724200e+00,  3.7961500e+00, -5.5350199e+00,
       -1.4319000e+00, -4.7633996e+00,  3.7440000e+00, -1.2749730e+00,
        3.1816001e+00,  1.0476298e+00,  1.0784001e+00, -3.0779200e+00,
       -1.2711000e+00, -3.6251001e+00, -2.7258501e+00,  4.7676001e+00,
        1.5000498e+00,  2.5363998e+00,  9.6959996e-01,  2.8748999e+00,
        2.6771998e+00,  1.8741999e+00, -5.3535199e+00,  3.7624002e+00,
       -5.4443008e-01, -2.8594000e+00, -2.3983500e+00,  7.5615001e-01,
      

In [11]:
from sklearn.metrics.pairwise import cosine_similarity

# Requires `result` and `queen` from the king/man/woman cell to be defined first.
# Each vector is wrapped in a list to form a single-row 2-D input, so the
# similarity comes back as a 1x1 array rather than a bare float.
cosine_similarity([result], [queen])

array([[0.61780137]], dtype=float32)