# Try out a pretrained word2vec model

In [2]:
import gensim.downloader as api
# Identifier of the pretrained embedding set requested from the gensim downloader.
# NOTE(review): the first call presumably fetches a large archive -- confirm
# before running on a slow or metered connection.
PRETRAINED_MODEL_NAME = 'word2vec-google-news-300'
wv = api.load(PRETRAINED_MODEL_NAME)

In [3]:
# Peek at the first ten entries of the pretrained vocabulary.
pretrained_vocab = wv.index_to_key
pretrained_vocab_size = len(pretrained_vocab)
for index, word in enumerate(pretrained_vocab[:10]):
    print(f"word #{index}/{pretrained_vocab_size} is {word}")


word #0/3000000 is </s>
word #1/3000000 is in
word #2/3000000 is for
word #3/3000000 is that
word #4/3000000 is is
word #5/3000000 is on
word #6/3000000 is ##
word #7/3000000 is The
word #8/3000000 is with
word #9/3000000 is said


In [4]:
# Score 'car' against words that are progressively less related to it.
anchor_word = 'car'
comparison_words = [
    'minivan',    # a minivan is a kind of car
    'bicycle',    # still a wheeled vehicle
    'airplane',   # ok, no wheels, but still a vehicle
    'cereal',     # ... and so on
    'communism',
]
pairs = [(anchor_word, other_word) for other_word in comparison_words]
for left, right in pairs:
    score = wv.similarity(left, right)
    print(f"{left}|{right}|{score}")

car|minivan|0.6907036900520325
car|bicycle|0.5364484190940857
car|airplane|0.42435577511787415
car|cereal|0.13924746215343475
car|communism|0.05820293352007866


In [5]:
# Answer analogy questions of the form  a - b + c = ?   (b:a::c:?)
# Each entry lists the words passed as `positive` (a, c) and as `negative` (b).
pretrained_analogies = [
    # king + woman - man = ?        man:woman::king:?
    (['king', 'woman'], ['man']),
    # wv['Timbers'] - wv['Portland'] + wv['Seattle'] = ?
    (['Timbers', 'Seattle'], ['Portland']),
    # wv['Einstein'] - wv['physics'] + wv['classical_music'] = ?
    (['Einstein', 'classical_music'], ['physics']),
]
for positive_words, negative_words in pretrained_analogies:
    answers = wv.most_similar(positive=positive_words, negative=negative_words, topn=2)
    print(answers)



[('queen', 0.7118193507194519), ('monarch', 0.6189674139022827)]
[('Sounders', 0.627106249332428), ('Sounders_FC', 0.6018840670585632)]
[('Mozart', 0.5742169618606567), ('Beethoven', 0.5615471601486206)]


# Train your own word2vec model using gensim

In [1]:
# prepare sentence corpus from jsonl file
import os
import ujson
# import spacy
# nlp = spacy.load("en_core_web_sm")

def convert_sentences_from_jsonl_file(
    filepath: str,
    output_path: str = "data/mahabharat_gutenberg_lemmatized_sents.txt",
    nlp=None,
):
    """Write one lemmatized sentence per line from a JSONL file of sections.

    Each non-blank input line must be a JSON object with a "paragraphs" key
    holding a list of paragraph strings. Every paragraph is run through the
    ``nlp`` pipeline; sentences longer than four tokens are reduced to the
    space-joined lemmas of their non-stop-word, non-punctuation tokens and
    appended to ``output_path``.

    The function is a no-op when ``output_path`` already exists, so the
    (slow) conversion only happens once.

    Args:
        filepath: Path of the input JSONL file.
        output_path: Destination text file; defaults to the corpus path used
            by the rest of this notebook.
        nlp: Object exposing ``pipe(texts)`` that yields documents with a
            ``sents`` attribute (a spaCy pipeline). When omitted, spaCy's
            "en_core_web_sm" model is loaded on demand.

    Raises:
        Whatever the file reads, JSON parsing or the pipeline raise; in that
        case no output file is left behind.
    """
    if os.path.exists(output_path):
        return

    if nlp is None:
        # Loaded lazily so the model is only paid for when the corpus actually
        # has to be built (the cached-output path above never needs it).
        import spacy
        nlp = spacy.load("en_core_web_sm")

    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Write to a temporary file and rename it only on success. Otherwise a
    # failure midway would leave a partial/empty output file behind, and the
    # existence check above would make every later call silently skip the work.
    tmp_path = f"{output_path}.tmp"
    try:
        with open(tmp_path, "w") as fp_write, open(filepath) as fp_r:
            for line in fp_r:  # stream the file instead of loading it whole
                if not line.strip():
                    continue  # tolerate blank lines in the JSONL input
                section = ujson.loads(line)
                for p_doc in nlp.pipe(section["paragraphs"]):
                    for sent in p_doc.sents:
                        if len(sent) <= 4:
                            continue  # drop very short sentences
                        sent_lemma = " ".join(
                            token.lemma_
                            for token in sent
                            if not token.is_stop and not token.is_punct
                        )
                        if sent_lemma:  # nothing left after filtering -> skip
                            fp_write.write(f"{sent_lemma}\n")
        os.replace(tmp_path, output_path)
    except BaseException:
        # Also covers a kernel interrupt: never leave the temp file around.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise

# Build the lemmatized sentence file from the JSONL source; the function
# returns immediately when its output file already exists.
convert_sentences_from_jsonl_file("data/mahabharat_gutenberg.jsonl")

In [2]:
import re
# Matches every character that is not an ASCII letter.
al_regex = re.compile(r"[^a-zA-Z]")


class MyCorpus:
    """Re-iterable corpus that yields sentences as lists of str tokens.

    Every line of the corpus file is one sentence. Each whitespace-separated
    token is stripped of all non-ASCII-letter characters; tokens that end up
    empty (pure digits or punctuation) are dropped, and sentences left with no
    tokens are skipped, so no empty-string "word" reaches the model.

    Args:
        corpus_path: Text file with one sentence per line. Defaults to the
            lemmatized corpus written earlier in this notebook.
    """

    def __init__(self, corpus_path="data/mahabharat_gutenberg_lemmatized_sents.txt"):
        self.corpus_path = corpus_path

    def __iter__(self):
        # The file is reopened on every call so the corpus can be iterated
        # more than once.
        with open(self.corpus_path) as fp:
            for line in fp:  # stream lazily rather than readlines()
                cleaned = (al_regex.sub('', token) for token in line.split())
                tokens = [token for token in cleaned if token]
                if tokens:
                    yield tokens


In [3]:
import time
import gensim.models

# Train a Word2Vec model on the lemmatized corpus and report how long it took.
sentences = MyCorpus()

# gensim word2vec defaults
# self, sentences=None, corpus_file=None, vector_size=100, alpha=0.025, window=5, min_count=5,
# max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001,
# sg=0, hs=0, negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=hash, epochs=5, null_word=0,
# trim_rule=None, sorted_vocab=1, batch_words=MAX_WORDS_IN_BATCH, compute_loss=False, callbacks=(),
# comment=None, max_final_vocab=None, shrink_windows=True,

# perf_counter() is the clock meant for measuring elapsed time; time.time()
# is wall-clock time and can jump if the system clock is adjusted.
s_time = time.perf_counter()
mahabharat_model = gensim.models.Word2Vec(
    sentences=sentences,
    vector_size=100,
    alpha=0.025,
    window=5,
    epochs=5,
)
print(f"Time taken to train word2vec model: {time.perf_counter() - s_time} seconds")

Time taken to train word2vec model: 9.266835451126099 seconds


In [4]:
# Keep a handle on the trained vectors and peek at the first ten vocabulary entries.
my_wv = mahabharat_model.wv
my_vocab = my_wv.index_to_key
my_vocab_size = len(my_vocab)
for index, word in enumerate(my_vocab[:10]):
    print(f"word #{index}/{my_vocab_size} is {word}")

word #0/16456 is o
word #1/16456 is thou
word #2/16456 is say
word #3/16456 is king
word #4/16456 is great
word #5/16456 is man
word #6/16456 is thee
word #7/16456 is son
word #8/16456 is art
word #9/16456 is thy


In [5]:
# Print the ten nearest neighbours (words only, scores dropped) for a few query words.
query_words = ["Krishna", "mace", "Bhishma", "Drona", "Pandu"]
for word in query_words:
    neighbours = my_wv.most_similar(word, topn=10)
    top_similars = [neighbour for neighbour, _score in neighbours]
    print(f"{word}: {top_similars}")

Krishna: ['Vasudeva', 'Govinda', 'Kesava', 'Hrishikesa', 'Dhananjaya', 'Panchala', 'Janardana', 'exalt', 'Mahadeva', 'Yudhishthira']
mace: ['lance', 'discus', 'spiked', 'club', 'sword', 'axis', 'spear', 'dart', 'spike', 'uplifted']
Bhishma: ['Vaisampayana', 'sringa', 'Kanika', 'XI', 'religiously', 'Sauti', 'reharnesse', 'Upamanyu', 'Volume', 'Vamadeva']
Drona: ['Karna', 'pupil', 'Kripa', 'Aswatthaman', 'Bharadwaja', 'Phalguna', 'Salya', 'Arjuna', 'Vikarna', 'Vibhatsu']
Pandu: ['Kunti', 'Pritha', 'Vikartana', 'Anukampaka', 'Santanu', 'Vishwamitra', 'Anadhristi', 'Saradwat', 'Bidula', 'Radha']


In [64]:
# Analogy queries against the Mahabharata vectors. Each entry lists the words
# passed as `positive` and as `negative` to most_similar().
mahabharat_analogies = [
    # wv['Arjuna'] - wv['Krishna'] + wv['Duryodhana'] = ?
    (['Arjuna', 'Duryodhana'], ['Krishna']),
    # wv['Drona'] + wv['Arjuna'] - wv['Drupada'] = ?
    # NOTE(review): this does not follow the a - b + c pattern of the other
    # analogies; if wv['Drona'] - wv['Arjuna'] + wv['Drupada'] was intended,
    # the query should be positive=['Drona', 'Drupada'], negative=['Arjuna'].
    (['Drona', 'Arjuna'], ['Drupada']),
]
for positive_words, negative_words in mahabharat_analogies:
    answers = my_wv.most_similar(positive=positive_words, negative=negative_words, topn=2)
    print(answers)

# note: due to the small size of the corpus, analogies won't be as good as with the GoogleNews model

[('Bhimasena', 0.5622424483299255), ('Karna', 0.5439724326133728)]
[('Karna', 0.6152733564376831), ('Gandiva', 0.5495764017105103)]


# Train word2vec model from scratch

# References
1. [Paper: Efficient Estimation of Word Representations in Vector Space](https://arxiv.org/pdf/1301.3781.pdf)
2. [The Illustrated Word2vec by Jay Alammar](https://jalammar.github.io/illustrated-word2vec/)
3. [Word2Vec Tutorial - The Skip-Gram Model](http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/)
4. [Tensorflow-word2vec tutorial](https://www.tensorflow.org/text/tutorials/word2vec)