In [39]:
import spacy

In [40]:
!python -m spacy download en_core_web_lg # Download the en_core_web_lg model


Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [41]:
nlp = spacy.load("en_core_web_lg")
nlp

<spacy.lang.en.English at 0x7fa2b67fee00>

In [42]:
nlp(u'lion').vector

array([  1.2746  ,   0.46242 ,  -1.1829  ,  -5.2661  ,  -2.7128  ,
         1.8521  ,  -0.94273 ,   2.1865  ,   6.503   ,   0.6704  ,
         1.5361  ,   2.5992  ,  -0.36233 ,   4.3965  ,  -6.5644  ,
         1.6141  ,  -1.2897  ,   2.1184  ,  -0.63654 ,  -3.4572  ,
        -4.3771  ,   4.2074  ,  -3.6411  ,  -0.97214 ,   1.3253  ,
        -2.3125  ,  -3.6531  ,  -2.8398  ,   2.7913  ,  -1.53    ,
        -2.9984  ,  -2.6357  ,   0.50615 ,  -2.6925  ,   4.3401  ,
        -5.6017  ,   0.045691,   4.3832  ,  -0.19535 ,  -1.0751  ,
         0.32172 ,   2.4395  ,   4.6638  ,   3.4471  ,  -3.3847  ,
        -1.8238  ,   0.70212 ,   0.58557 ,   5.0032  ,  -3.1072  ,
         1.2364  ,   7.4595  ,   0.057368,   1.0111  ,  -1.0827  ,
         0.69113 ,   2.8009  ,  -3.4383  ,  -1.0599  ,  -2.2627  ,
        -5.149   ,  -5.0636  ,   3.1405  ,   1.0793  ,  -0.72892 ,
        -3.9939  ,  -0.69551 ,  -0.55767 ,   3.2555  ,  -2.9449  ,
         4.7114  ,   1.6388  ,   1.3828  ,   1.4255  ,  -3.233

In [43]:
# Vector components for the string lion

In [44]:
nlp(u' I work at TD insurance').vector.shape

(300,)

In [45]:
# Let's try to identify similar vectors

# The best way to expose vector relationships is
# through the dot_similarity method of the actual document tokens.


In [46]:
tokens = nlp(u'lion cat pet')




In [47]:
for token1 in tokens:
    for token2 in tokens:
        print(token1.text,token2.text,token1.similarity(token2))




lion lion 1.0
lion cat 0.3854507803916931
lion pet 0.20031584799289703
cat lion 0.3854507803916931
cat cat 1.0
cat pet 0.732966423034668
pet lion 0.20031584799289703
pet cat 0.732966423034668
pet pet 1.0


In [48]:
tokens = nlp(u'Mrugesh Maya Malav Minaxi')

for token1 in tokens:
    for token2 in tokens:
        print(token1.text,token2.text,token1.similarity(token2))


Mrugesh Mrugesh 1.0
Mrugesh Maya 0.0
Mrugesh Malav 0.0
Mrugesh Minaxi 0.0
Maya Mrugesh 0.0
Maya Maya 1.0
Maya Malav 0.0
Maya Minaxi 0.0
Malav Mrugesh 0.0
Malav Maya 0.0
Malav Malav 1.0
Malav Minaxi 0.0
Minaxi Mrugesh 0.0
Minaxi Maya 0.0
Minaxi Malav 0.0
Minaxi Minaxi 1.0


  print(token1.text,token2.text,token1.similarity(token2))


In [49]:
# So we have to keep in mind that
# words which have opposite meaning but that often appear
# in the same context may actually have similar vector as well.

tokens= nlp(u'like love hate')

for token1 in tokens:
    for token2 in tokens:
        print(token1.text,token2.text,token1.similarity(token2))



like like 1.0
like love 0.5212638974189758
like hate 0.5065140724182129
love like 0.5212638974189758
love love 1.0
love hate 0.5708349943161011
hate like 0.5065140724182129
hate love 0.5708349943161011
hate hate 1.0


In [50]:
nlp.vocab.vectors

<spacy.vectors.Vectors at 0x7fa288a756c0>

In [51]:
len(nlp.vocab.vectors)

514157

In [52]:
nlp.vocab.vectors.shape

(514157, 300)

In [53]:

tokens= nlp(u'dog cat nargle')

for token in tokens:
    print(token.text,token.has_vector,token.vector_norm,token.is_oov)


dog True 75.254234 False
cat True 63.188496 False
nargle False 0.0 True


In [54]:

tokens= nlp(u'dog cat Joshi')

for token in tokens:
    print(token.text,token.has_vector,token.vector_norm,token.is_oov)


dog True 75.254234 False
cat True 63.188496 False
Joshi True 35.95677 False


In [55]:
# vector arithmetic

from scipy import spatial


In [56]:
cosine_similarity = lambda vec1,vec2: 1- spatial.distance.cosine(vec1,vec2)


In [57]:
king = nlp(u'king').vector
man = nlp(u'man').vector
woman = nlp(u'woman').vector

In [58]:
# king - man + woman -> brand new vector similar to queen,princess,highness


new_vector = king - man + woman



In [59]:
computed_similarities=[]

# for all words in vocab

for word in nlp.vocab:
    if word.has_vector:
        if word.is_lower:
            if word.is_alpha:
                similarity = cosine_similarity(new_vector,word.vector)
                computed_similarities.append((word,similarity))


In [60]:
# descending order

computed_similarities= sorted(computed_similarities,key=lambda item:-item[1])


In [61]:
computed_similarities[1:10]

[(<spacy.lexeme.Lexeme at 0x7fa2830f7740>, 0.3899004980014287),
 (<spacy.lexeme.Lexeme at 0x7fa2830f6f00>, 0.38483578122586826),
 (<spacy.lexeme.Lexeme at 0x7fa2830ff380>, 0.3385923203793766),
 (<spacy.lexeme.Lexeme at 0x7fa2830f58c0>, 0.3244562535098119),
 (<spacy.lexeme.Lexeme at 0x7fa2830f5200>, 0.3206636961864012),
 (<spacy.lexeme.Lexeme at 0x7fa2830f5380>, 0.30994718486145534),
 (<spacy.lexeme.Lexeme at 0x7fa2830f5440>, 0.3054207314031283),
 (<spacy.lexeme.Lexeme at 0x7fa2830f78c0>, 0.2983730534226481),
 (<spacy.lexeme.Lexeme at 0x7fa28378ad80>, 0.29441292236539585)]

In [62]:
print([t[0].text for t in computed_similarities[:10]])

['king', 'and', 'that', 'where', 'she', 'they', 'woman', 'there', 'should', 'these']
