In [1]:
import gensim.downloader as api
# gensim.downloader (imported as `api`) loads pre-trained embedding models by name.
# Gensim itself is widely used in industry and academia for text analysis and modeling.


In [2]:
# Load Google's pre-trained Word2Vec embeddings ("word2vec-google-news-300").
# - Trained on the Google News corpus (about 100 billion words).
# - Each word is represented as a 300-dimensional vector; the "300" in the
#   model name is that vector size.
# - The vectors capture the semantic meaning of words and can be used for NLP
#   tasks such as text classification, similarity analysis and clustering.
# - Using the pre-trained vectors as a starting point saves the time and
#   compute needed to train an embedding model from scratch.
# - The model is large (about 1.6 GB).
# NOTE(review): api.load presumably downloads the model on first use and
# caches it locally - confirm against the gensim downloader docs.
wv = api.load("word2vec-google-news-300")



In [3]:
# Sanity check: comparing a word with itself gives the top score, 1.0.
wv.similarity(w1="great", w2="great")

1.0

In [4]:
# Two closely related words: the score is high but clearly below 1.0.
wv.similarity(w1="great", w2="good")

0.729151

In [5]:
# Nearest neighbours of "good" (similar words).
# Note: antonyms such as 'bad' and 'lousy' appear next to synonyms in the
# recorded result, so a high score here does not imply the same meaning.
wv.most_similar("good")

[('great', 0.7291510105133057),
 ('bad', 0.7190051078796387),
 ('terrific', 0.6889115571975708),
 ('decent', 0.6837348937988281),
 ('nice', 0.6836092472076416),
 ('excellent', 0.644292950630188),
 ('fantastic', 0.6407778263092041),
 ('better', 0.6120728254318237),
 ('solid', 0.5806034803390503),
 ('lousy', 0.5764201879501343)]

In [6]:
# Same pair as the earlier great/good cell with the arguments swapped:
# the recorded score is identical, i.e. the measure is symmetric.
wv.similarity(w1="good", w2="great")

0.729151

In [7]:
# Nearest neighbours of a less common word (similar words).
# The vocabulary also contains multi-word phrases joined with underscores,
# e.g. 'lip_smacking' in the recorded result.
wv.most_similar("delectable")

[('delicious', 0.8363204002380371),
 ('scrumptious', 0.8109372854232788),
 ('tasty', 0.7385421991348267),
 ('yummy', 0.7123230695724487),
 ('delightful', 0.6968866586685181),
 ('sumptuous', 0.6954765319824219),
 ('luscious', 0.6944749355316162),
 ('delectable_desserts', 0.686908483505249),
 ('palate_pleasing', 0.6720318794250488),
 ('lip_smacking', 0.6578160524368286)]

In [8]:
# Nearest neighbours of "dog": the plural, breeds and other pets.
# Vocabulary entries keep their capitalisation ('German_shepherd', 'Rottweiler').
wv.most_similar("dog")

[('dogs', 0.8680489659309387),
 ('puppy', 0.8106428384780884),
 ('pit_bull', 0.780396044254303),
 ('pooch', 0.7627377510070801),
 ('cat', 0.7609456777572632),
 ('golden_retriever', 0.7500902414321899),
 ('German_shepherd', 0.7465174198150635),
 ('Rottweiler', 0.7437614798545837),
 ('beagle', 0.7418621778488159),
 ('pup', 0.740691065788269)]

In [9]:
# Classic analogy query: king - man + woman -> ?
# Words in `positive` contribute towards the match and words in `negative`
# against it; topn=5 keeps the five best candidates ('queen' ranks first).
wv.most_similar(positive=['king', 'woman'], negative=['man'], topn=5)

[('queen', 0.7118192911148071),
 ('monarch', 0.6189674139022827),
 ('princess', 0.5902431011199951),
 ('crown_prince', 0.5499460697174072),
 ('prince', 0.5377321243286133)]

In [10]:
# Capital/country analogy: france - paris + berlin -> ?
# NOTE(review): every token here is lowercase, yet the vocabulary appears to
# be case-sensitive (capitalised entries such as 'Rottweiler' show up in other
# results), and the recorded scores are low (about 0.5). The capitalised
# tokens 'France', 'Berlin', 'Paris' are probably the better-trained entries -
# try them and confirm.
wv.most_similar(positive=['france', 'berlin'], negative=['paris'], topn=5)

[('germany', 0.5094344019889832),
 ('european', 0.4865044951438904),
 ('german', 0.4714890718460083),
 ('austria', 0.46964019536972046),
 ('swedish', 0.46451830863952637)]

In [11]:
# Odd-one-out query: three tech companies and an animal; 'cat' is returned.
# NOTE(review): the recorded run also printed a warning fragment pointing at a
# vstack(...) call inside gensim - presumably a deprecation warning from the
# installed gensim/numpy versions; confirm and upgrade if so.
wv.doesnt_match(["facebook", "cat", "google", "microsoft"])

  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


'cat'

In [12]:
# Reverse of the previous example: three animals and one company name;
# 'google' is returned.
wv.doesnt_match(["dog", "cat", "google", "mouse"])

'google'

In [13]:
# No clear outlier in this list, yet a word ('banana') is still returned.
# Treat doesnt_match results on homogeneous lists with caution.
wv.doesnt_match(["tomato", "banana", "peach", "apple","pear"])

'banana'

In [14]:
# Three ghost-related words and one unrelated noun; 'hoodlum' is returned.
wv.doesnt_match(["spectre", "apparition", "hoodlum", "phantom"])



'hoodlum'

In [16]:
# Four verbs; 'listen', the only one that is not a form of movement, is returned.
# NOTE(review): the prompt numbers jump from 14 to 16 before this cell, so an
# execution is missing from the saved notebook (deleted cell or a re-run).
# Restart the kernel and run all cells to confirm nothing relies on hidden state.
wv.doesnt_match(["listen","swim","walk","climb"])

'listen'

Gensim: GloVe

In [17]:
# Load a second set of pre-trained vectors through the same downloader API
# that was used for word2vec-google-news-300.
# NOTE(review): this is not a like-for-like replacement for the Google News
# model - the name indicates GloVe vectors trained on Twitter text with only
# 25 dimensions (vs. 300 above); confirm against the gensim-data model list.
glv = api.load("glove-twitter-25")



In [18]:
# Same query as for the Google News vectors, for comparison.
# In the recorded result every score is above 0.93 and generic words such as
# 'too' and 'this' appear, so these neighbours are far less discriminating.
glv.most_similar("good")

[('too', 0.9648016095161438),
 ('day', 0.9533665180206299),
 ('well', 0.9503172039985657),
 ('nice', 0.9438973069190979),
 ('better', 0.9425961375236511),
 ('fun', 0.9418926239013672),
 ('much', 0.9413353204727173),
 ('this', 0.9387556314468384),
 ('hope', 0.9383507370948792),
 ('great', 0.9378515481948853)]

In [19]:
# The word list is built by splitting a space-separated string.
# Three meals and one food item; 'cereal' is returned.
glv.doesnt_match("breakfast cereal dinner lunch".split())

'cereal'

In [20]:
# Same word list as the Google News odd-one-out cell; both models return 'cat'.
glv.doesnt_match("facebook cat google microsoft".split())

'cat'

In [21]:
# Three fruits and one unrelated noun; 'human' is returned.
glv.doesnt_match("banana grapes orange human".split())

'human'