In [1]:
import spacy
from scipy import spatial

In [3]:
! spacy download en_core_web_md

Defaulting to user installation because normal site-packages is not writeable
Collecting en-core-web-md==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.2.0/en_core_web_md-3.2.0-py3-none-any.whl (45.7 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.7/45.7 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0mm
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.2.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [4]:
nlp = spacy.load('en_core_web_md')
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [5]:
# md model --> 685k keys, 20k unique vectors (300 dimensions)
# lg model --> 685k keys, 685k unique vectors (300 dimensions)

print(len(nlp.vocab))
print(len(nlp.vocab.vectors))

773
20000


In [6]:
nlp(u'lion').vector.shape

(300,)

In [7]:
tokens = nlp(u'cat lion pet')

for t1 in tokens:
    for t2 in tokens:
        print(t1.text,t2.text,t1.similarity(t2))

cat cat 1.0
cat lion 0.5265437960624695
cat pet 0.7505456805229187
lion cat 0.5265437960624695
lion lion 1.0
lion pet 0.39923766255378723
pet cat 0.7505456805229187
pet lion 0.39923766255378723
pet pet 1.0


In [14]:
tokens = nlp(u'dog cat horse prince')
for t in tokens:
    print(t.text,t.has_vector,t.vector_norm,t.is_oov)

dog True 7.0336733 False
cat True 6.6808186 False
horse True 6.760544 False
prince True 6.5258965 False


In [15]:
# Words to Vectors
king = nlp(u'king').vector
man = nlp(u'man').vector
woman = nlp(u'woman').vector

In [16]:
# creating the new vector

new_vector = king-man+woman
new_vector

array([ 5.14087021e-01, -2.78459996e-01,  2.42767006e-01,  4.54899669e-02,
       -2.59425014e-01, -3.19999963e-01,  3.23920012e-01, -6.71030045e-01,
       -9.98499990e-02,  1.91499996e+00, -5.68080008e-01, -2.74451017e-01,
       -1.49906695e-01,  8.01083148e-02, -2.34764010e-01, -1.10950008e-01,
       -1.02593988e-01,  8.53819966e-01, -2.68564999e-01,  3.85140002e-01,
       -1.36149988e-01,  6.35029972e-01, -7.62044966e-01, -2.52770007e-01,
       -6.75969958e-01,  3.89851004e-01, -2.89680034e-01,  1.75860003e-01,
       -5.16229987e-01,  5.21373034e-01, -1.89909995e-01,  6.73759937e-01,
        1.17550008e-01, -4.69896019e-01,  5.88999987e-01,  1.29447982e-01,
       -5.71900010e-01, -5.47450066e-01, -4.84210014e-01,  5.85503951e-02,
        4.82379973e-01, -2.86769986e-01, -2.01718003e-01, -4.74729985e-01,
        3.43068987e-01, -2.28827983e-01, -1.76439017e-01,  6.05450034e-01,
        2.07139999e-01, -2.89762974e-01, -7.63288975e-01,  4.37090009e-01,
       -2.06220001e-01, -

In [17]:
cosine_similarity = lambda vec1,vec2 : 1-spatial.distance.cosine(vec1,vec2)

In [18]:
similarities = []

for word in nlp.vocab:
    if word.has_vector and word.is_alpha and word.is_lower:
        similarities.append((cosine_similarity(new_vector,word.vector),word.text))

In [19]:
# as we can observe that for a vector like king-man+woman we obviously expect a queen and it 
#proves to be successful in getting that

for similarity,word in  sorted(similarities,reverse=True)[:10]:
    print(word)

king
prince
woman
she
lion
who
when
sharma
dare
horse
