In [32]:
import numpy as np
import matplotlib.pyplot as plt
from gensim.models import word2vec, Word2Vec

In [33]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [34]:
datafile = './data/text8'

sentences = word2vec.Text8Corpus(datafile)

In [35]:
model = Word2Vec( vector_size=150, window=10, min_count=5, workers=14)

2023-08-21 18:10:52,677 : INFO : Word2Vec lifecycle event {'params': 'Word2Vec<vocab=0, vector_size=150, alpha=0.025>', 'datetime': '2023-08-21T18:10:52.677138', 'gensim': '4.3.1', 'python': '3.10.6 (main, May 29 2023, 11:10:38) [GCC 11.3.0]', 'platform': 'Linux-6.0.12-76060006-generic-x86_64-with-glibc2.35', 'event': 'created'}


In [36]:
model.build_vocab(sentences)
model.train(sentences, epochs=10, total_examples=model.corpus_count)

2023-08-21 18:10:52,702 : INFO : collecting all words and their counts
2023-08-21 18:10:52,704 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types


2023-08-21 18:11:07,933 : INFO : PROGRESS: at sentence #10000, processed 100000000 words, keeping 694463 word types
2023-08-21 18:11:11,896 : INFO : collected 833184 word types from a corpus of 124301826 raw words and 12431 sentences
2023-08-21 18:11:11,896 : INFO : Creating a fresh vocabulary
2023-08-21 18:11:12,491 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=5 retains 218316 unique words (26.20% of original 833184, drops 614868)', 'datetime': '2023-08-21T18:11:12.491169', 'gensim': '4.3.1', 'python': '3.10.6 (main, May 29 2023, 11:10:38) [GCC 11.3.0]', 'platform': 'Linux-6.0.12-76060006-generic-x86_64-with-glibc2.35', 'event': 'prepare_vocab'}
2023-08-21 18:11:12,491 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=5 leaves 123353509 word corpus (99.24% of original 124301826, drops 948317)', 'datetime': '2023-08-21T18:11:12.491793', 'gensim': '4.3.1', 'python': '3.10.6 (main, May 29 2023, 11:10:38) [GCC 11.3.0]', 'platform': 'Linux-6.0.12-76060006-gen

(881624839, 1243018260)

In [37]:
model.save('./data/t8.model')

2023-08-21 18:20:40,703 : INFO : Word2Vec lifecycle event {'fname_or_handle': './data/t8.model', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2023-08-21T18:20:40.703126', 'gensim': '4.3.1', 'python': '3.10.6 (main, May 29 2023, 11:10:38) [GCC 11.3.0]', 'platform': 'Linux-6.0.12-76060006-generic-x86_64-with-glibc2.35', 'event': 'saving'}
2023-08-21 18:20:40,704 : INFO : storing np array 'vectors' to ./data/t8.model.wv.vectors.npy
2023-08-21 18:20:40,794 : INFO : storing np array 'syn1neg' to ./data/t8.model.syn1neg.npy
2023-08-21 18:20:40,885 : INFO : not storing attribute cum_table
2023-08-21 18:20:40,967 : INFO : saved ./data/t8.model


###### TODO: try both cbow and skipgram algo



In [64]:
wvecs = model.wv

wvecs.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)[0]

('queen', 0.7425088286399841)

In [65]:
wvecs.most_similar_cosmul(positive=['woman', 'king'], negative=['man'])

[('queen', 1.0131878852844238),
 ('consort', 0.9256384372711182),
 ('isabella', 0.9086795449256897),
 ('berengaria', 0.9038677215576172),
 ('princess', 0.8987451791763306),
 ('empress', 0.898198664188385),
 ('sibylla', 0.8961239457130432),
 ('monarch', 0.8903862833976746),
 ('consorts', 0.8881845474243164),
 ('juliana', 0.8738452196121216)]

In [66]:
wvecs['computer']

array([ 0.49592537, -3.1891572 , -2.2457151 , -5.8747573 , -2.0041273 ,
        1.4226991 , -3.628802  ,  1.1579313 , -5.33434   ,  3.532866  ,
       -0.27772924,  3.308482  , -2.8225465 ,  0.39876872,  2.7010603 ,
       -2.4277253 ,  2.0706992 ,  1.9089468 ,  2.6764374 , -2.283338  ,
        0.9065073 , -0.63707906,  2.2425954 ,  0.54302025,  1.8835143 ,
       -0.9078359 ,  1.1590532 , -5.4599595 , -0.8773132 , -1.5050745 ,
        0.01741553, -1.5671031 , -0.3230297 ,  3.8371875 ,  0.868768  ,
       -2.066443  ,  2.402219  , -1.946584  , -1.7279466 , -1.2619343 ,
        3.05435   , -1.3148477 , -1.732287  ,  1.1543812 ,  0.48508435,
       -1.84521   ,  1.9169743 ,  3.8433475 , -1.231384  ,  0.7861585 ,
        0.6148784 , -1.8372176 ,  3.358682  ,  3.75235   ,  1.7750186 ,
       -2.4798787 , -0.926995  , -0.8599408 ,  0.82124317, -0.7231485 ,
        5.6949677 , -1.5136521 ,  0.8121045 , -1.4240032 ,  0.10093074,
       -1.7910693 , -3.0969243 , -0.6231855 , -3.6005256 , -4.45

In [67]:
wvecs.doesnt_match("breakfast cereal dinner lunch".split())

'cereal'

In [68]:
wvecs.most_similar(positive=['cricket', 'tendulkar'], negative = ['australia'])

[('sachin', 0.7389860153198242),
 ('gavaskar', 0.6543283462524414),
 ('dravid', 0.6493920683860779),
 ('odi', 0.6422939300537109),
 ('cricinfo', 0.6287909746170044),
 ('sourav', 0.6099755764007568),
 ('odis', 0.5819308161735535),
 ('kapil', 0.5752612948417664),
 ('muttiah', 0.5735766291618347),
 ('ganguly', 0.5718796849250793)]

In [69]:
synonyms = wvecs.most_similar("beautiful")
print("Synonyms for 'beautiful': ", synonyms)

Synonyms for 'beautiful':  [('lovely', 0.7275329828262329), ('delightful', 0.7041609883308411), ('charming', 0.6669224500656128), ('magnificent', 0.650934636592865), ('picturesque', 0.6506274938583374), ('gorgeous', 0.6429691910743713), ('wonderful', 0.6405431032180786), ('beauties', 0.638332188129425), ('prettiest', 0.6351163983345032), ('handsome', 0.6178432106971741)]


In [70]:
antonym = wvecs.most_similar(positive=["strong", "weak"], negative=["powerful"])
print("Antonym of 'strong': ", antonym)


Antonym of 'strong':  [('slight', 0.47883889079093933), ('mild', 0.45128118991851807), ('insufficient', 0.44061610102653503), ('aggressiveness', 0.4397987723350525), ('lack', 0.43119075894355774), ('tight', 0.4286508560180664), ('stubborn', 0.42268553376197815), ('minimal', 0.4187222421169281), ('moderate', 0.4084620475769043), ('considerable', 0.4068261682987213)]


In [71]:

# Word Similarity
similarity_ranking = wvecs.most_similar("dog")
print("Word similarity ranking: ", similarity_ranking)

similarity_score = wvecs.similarity("computer", "keyboard")
print("Similarity between 'computer' and 'keyboard': ", similarity_score)


Word similarity ranking:  [('hound', 0.7799950838088989), ('dogs', 0.7793302536010742), ('sheepdog', 0.7477481961250305), ('cat', 0.7372485399246216), ('puppy', 0.6995719075202942), ('rottweiler', 0.6956620216369629), ('canine', 0.6654011607170105), ('spaniels', 0.66372150182724), ('wag', 0.6586538553237915), ('shaggy', 0.6467984914779663)]
Similarity between 'computer' and 'keyboard':  0.40872315


In [72]:

# Word Composition
running_relationship = wvecs.most_similar(positive=["ran", "running"], negative=["ran"])
print("Running relationship: ", running_relationship)

talk_relationship = wvecs.most_similar(positive=["talked", "talking"], negative=["talked"])
print("Talking relationship: ", talk_relationship)


Running relationship:  [('run', 0.7612755298614502), ('runs', 0.6255279779434204), ('pulled', 0.5429465174674988), ('haul', 0.5408431887626648), ('jumps', 0.515490710735321), ('walking', 0.5149667263031006), ('jumping', 0.513019323348999), ('pulling', 0.5053175687789917), ('slipping', 0.4904412627220154), ('stretch', 0.488147497177124)]
Talking relationship:  [('reminiscing', 0.6332584619522095), ('watching', 0.5662696957588196), ('joking', 0.547974169254303), ('telling', 0.5378240942955017), ('chatting', 0.533144474029541), ('dumb', 0.5303834080696106), ('fantasize', 0.5276933908462524), ('fantasizing', 0.5248972177505493), ('conversation', 0.517418622970581), ('inquiring', 0.5148041248321533)]


In [75]:
# Contextual Understanding

context_words = wvecs.most_similar("science")
print("around 'science': ", context_words)

ocean_context = wvecs.most_similar("ocean")
print("around 'ocean': ", ocean_context)



around 'science':  [('sciences', 0.671763002872467), ('humanities', 0.6314123272895813), ('scientific', 0.613213837146759), ('physics', 0.6086891293525696), ('cybernetics', 0.60279381275177), ('psychology', 0.596961259841919), ('philosophy', 0.5845029950141907), ('biology', 0.5778851509094238), ('neuroscience', 0.5759519934654236), ('mathematics', 0.5725533962249756)]
around 'ocean':  [('oceans', 0.7444000840187073), ('oceanic', 0.6311589479446411), ('sea', 0.6176193356513977), ('waters', 0.6018313765525818), ('atlantic', 0.5843362808227539), ('coast', 0.5809761881828308), ('seafloor', 0.5807109475135803), ('reef', 0.5706163644790649), ('seamount', 0.5674899816513062), ('seabed', 0.5631421804428101)]


In [None]:
# Outliers and Oddities
odd_word = wvecs.doesnt_match(["apple", "banana", "cherry", "potato"])
print("Odd word out: ", odd_word)

distant_word = wvecs.doesnt_match(["apple", "banana", "chicken", "potato"])
print("Distant word out: ", distant_word)


Odd word out:  apple
Distant word out:  apple
