In [18]:
import logging
import multiprocessing
import os

from gensim.models import Word2Vec
from gensim.models.callbacks import CallbackAny2Vec

# Enable gensim logging
logging.basicConfig(
    format="%(levelname)s - %(asctime)s: %(message)s",
    datefmt="%H:%M:%S",
    level=logging.INFO,
)


class W2VLossLogger(CallbackAny2Vec):
    """Callback to print loss after each epoch
    use by passing model.train(..., callbacks=[W2VLossLogger()])
    """

    def __init__(self):
        self.epoch = 0

    def on_epoch_end(self, model):
        loss = model.get_latest_training_loss()

        if self.epoch == 0:
            print("Loss after epoch {}: {}".format(self.epoch, loss))
        else:
            print(
                "Loss after epoch {}: {}".format(
                    self.epoch, loss - self.loss_previous_step
                )
            )
        self.epoch += 1
        self.loss_previous_step = loss


def train_w2v_model(
    sentences,
    output_file,
    window,
    embedding_dim,
    epochs,
    min_word_count,
):
    

    """Train a word2vec model based on given sentences.
    Args:
        sentences list[list[str]]: List of sentences. Each element contains a list with the words
            in the current sentence
        output_file (str): Path to save the trained w2v model
        window (int): w2v context size
        embedding_dim (int): w2v vector dimension
        epochs (int): How many epochs should the training run
        min_word_count (int): Ignore words that appear less than min_word_count times
    """
    workers = multiprocessing.cpu_count()
    
    # TODO: Instantiate gensim.models.Word2Vec class
    model = Word2Vec(sentences=sentences, vector_size=embedding_dim, window=window, min_count=min_word_count, workers=multiprocessing.cpu_count())
    model.build_vocab(sentences, progress_per=10000)
    # TODO: Build model vocabulary using sentences
    # TODO: Train word2vec model
    model.train(sentences, total_examples=model.corpus_count,epochs=epochs)
    # Save trained model
    model.save(output_file)
    # model.save(output_file)

    

    return model


if __name__ == "__main__":
    # read data/gutenberg.txt in the expected format
    f=open("tokenized.txt","r")
    sentences =eval(f.read())
    
   
    output_file = "gutenberg_w2v.100d.model"
    window = 5
    embedding_dim = 100
    epochs = 1000
    min_word_count = 1

    gutenberg_w2v =train_w2v_model(
        sentences,
        output_file,
        window,
        embedding_dim,
        epochs,
        min_word_count)

    




INFO - 16:33:19: collecting all words and their counts
INFO - 16:33:19: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 16:33:19: PROGRESS: at sentence #10000, processed 113516 words, keeping 6065 word types
INFO - 16:33:19: PROGRESS: at sentence #20000, processed 227311 words, keeping 8492 word types
INFO - 16:33:19: PROGRESS: at sentence #30000, processed 324672 words, keeping 9841 word types
INFO - 16:33:19: PROGRESS: at sentence #40000, processed 427445 words, keeping 11779 word types
INFO - 16:33:19: PROGRESS: at sentence #50000, processed 537208 words, keeping 13110 word types
INFO - 16:33:19: PROGRESS: at sentence #60000, processed 647741 words, keeping 14312 word types
INFO - 16:33:19: PROGRESS: at sentence #70000, processed 751694 words, keeping 16043 word types
INFO - 16:33:19: PROGRESS: at sentence #80000, processed 847983 words, keeping 16837 word types
INFO - 16:33:19: PROGRESS: at sentence #90000, processed 957413 words, keeping 17332 word types
I

INFO - 16:33:25: deleting the raw counts dictionary of 41465 items
INFO - 16:33:25: sample=0.001 downsamples 56 most-common words
INFO - 16:33:25: Word2Vec lifecycle event {'msg': 'downsampling leaves estimated 1561117.1381724994 word corpus (73.1%% of prior 2135928)', 'datetime': '2023-04-02T16:33:25.813357', 'gensim': '4.3.1', 'python': '3.10.7 (main, Mar 10 2023, 10:47:39) [GCC 12.2.0]', 'platform': 'Linux-5.19.0-38-generic-x86_64-with-glibc2.36', 'event': 'prepare_vocab'}
INFO - 16:33:26: estimated required memory for 41465 words and 100 dimensions: 53904500 bytes
INFO - 16:33:26: resetting layer weights
INFO - 16:33:26: Word2Vec lifecycle event {'update': False, 'trim_rule': 'None', 'datetime': '2023-04-02T16:33:26.046544', 'gensim': '4.3.1', 'python': '3.10.7 (main, Mar 10 2023, 10:47:39) [GCC 12.2.0]', 'platform': 'Linux-5.19.0-38-generic-x86_64-with-glibc2.36', 'event': 'build_vocab'}
INFO - 16:33:26: Word2Vec lifecycle event {'msg': 'training model with 8 workers on 41465 voca

INFO - 16:34:16: EPOCH 54: training on 2135928 raw words (1561756 effective words) took 1.0s, 1530101 effective words/s
INFO - 16:34:17: EPOCH 55 - PROGRESS: at 97.98% examples, 1510215 words/s, in_qsize 4, out_qsize 1
INFO - 16:34:17: EPOCH 55: training on 2135928 raw words (1561051 effective words) took 1.0s, 1531861 effective words/s
INFO - 16:34:18: EPOCH 56 - PROGRESS: at 98.91% examples, 1544399 words/s, in_qsize 2, out_qsize 1
INFO - 16:34:18: EPOCH 56: training on 2135928 raw words (1560995 effective words) took 1.0s, 1549858 effective words/s
INFO - 16:34:19: EPOCH 57: training on 2135928 raw words (1561376 effective words) took 0.9s, 1735452 effective words/s
INFO - 16:34:20: EPOCH 58 - PROGRESS: at 96.34% examples, 1503499 words/s, in_qsize 7, out_qsize 1
INFO - 16:34:20: EPOCH 58: training on 2135928 raw words (1561387 effective words) took 1.0s, 1531337 effective words/s
INFO - 16:34:20: EPOCH 59: training on 2135928 raw words (1561170 effective words) took 0.9s, 1786041 e

INFO - 16:35:13: EPOCH 116: training on 2135928 raw words (1561018 effective words) took 0.9s, 1690869 effective words/s
INFO - 16:35:14: EPOCH 117: training on 2135928 raw words (1560581 effective words) took 0.9s, 1760259 effective words/s
INFO - 16:35:15: EPOCH 118: training on 2135928 raw words (1561220 effective words) took 0.9s, 1791303 effective words/s
INFO - 16:35:16: EPOCH 119: training on 2135928 raw words (1560747 effective words) took 0.8s, 1842165 effective words/s
INFO - 16:35:16: EPOCH 120: training on 2135928 raw words (1561150 effective words) took 0.9s, 1813429 effective words/s
INFO - 16:35:17: EPOCH 121: training on 2135928 raw words (1561266 effective words) took 0.9s, 1762406 effective words/s
INFO - 16:35:18: EPOCH 122: training on 2135928 raw words (1561129 effective words) took 0.9s, 1793702 effective words/s
INFO - 16:35:19: EPOCH 123: training on 2135928 raw words (1561563 effective words) took 0.9s, 1712236 effective words/s
INFO - 16:35:20: EPOCH 124: trai

INFO - 16:36:13: EPOCH 181: training on 2135928 raw words (1561710 effective words) took 1.0s, 1632313 effective words/s
INFO - 16:36:14: EPOCH 182: training on 2135928 raw words (1561469 effective words) took 0.9s, 1766363 effective words/s
INFO - 16:36:15: EPOCH 183: training on 2135928 raw words (1560608 effective words) took 0.9s, 1728821 effective words/s
INFO - 16:36:15: EPOCH 184: training on 2135928 raw words (1561393 effective words) took 0.9s, 1785243 effective words/s
INFO - 16:36:16: EPOCH 185: training on 2135928 raw words (1561719 effective words) took 0.9s, 1835989 effective words/s
INFO - 16:36:17: EPOCH 186: training on 2135928 raw words (1561533 effective words) took 0.9s, 1812254 effective words/s
INFO - 16:36:18: EPOCH 187: training on 2135928 raw words (1561790 effective words) took 0.9s, 1782743 effective words/s
INFO - 16:36:19: EPOCH 188: training on 2135928 raw words (1560813 effective words) took 0.9s, 1790097 effective words/s
INFO - 16:36:20: EPOCH 189: trai

INFO - 16:37:10: EPOCH 245 - PROGRESS: at 86.73% examples, 1380891 words/s, in_qsize 14, out_qsize 1
INFO - 16:37:11: EPOCH 245: training on 2135928 raw words (1560929 effective words) took 1.1s, 1426641 effective words/s
INFO - 16:37:12: EPOCH 246: training on 2135928 raw words (1561308 effective words) took 1.0s, 1597404 effective words/s
INFO - 16:37:13: EPOCH 247 - PROGRESS: at 72.72% examples, 1156473 words/s, in_qsize 14, out_qsize 1
INFO - 16:37:13: EPOCH 247: training on 2135928 raw words (1560660 effective words) took 1.3s, 1212431 effective words/s
INFO - 16:37:14: EPOCH 248 - PROGRESS: at 95.28% examples, 1480413 words/s, in_qsize 9, out_qsize 0
INFO - 16:37:14: EPOCH 248: training on 2135928 raw words (1560993 effective words) took 1.0s, 1505752 effective words/s
INFO - 16:37:15: EPOCH 249: training on 2135928 raw words (1560928 effective words) took 1.0s, 1613767 effective words/s
INFO - 16:37:16: EPOCH 250 - PROGRESS: at 95.80% examples, 1498873 words/s, in_qsize 8, out_q

INFO - 16:38:06: EPOCH 302: training on 2135928 raw words (1560895 effective words) took 0.9s, 1752902 effective words/s
INFO - 16:38:07: EPOCH 303: training on 2135928 raw words (1560372 effective words) took 0.9s, 1772739 effective words/s
INFO - 16:38:08: EPOCH 304: training on 2135928 raw words (1561322 effective words) took 1.0s, 1612531 effective words/s
INFO - 16:38:09: EPOCH 305: training on 2135928 raw words (1560963 effective words) took 0.9s, 1707307 effective words/s
INFO - 16:38:10: EPOCH 306: training on 2135928 raw words (1560806 effective words) took 0.9s, 1772425 effective words/s
INFO - 16:38:10: EPOCH 307: training on 2135928 raw words (1561135 effective words) took 0.9s, 1829124 effective words/s
INFO - 16:38:11: EPOCH 308: training on 2135928 raw words (1560855 effective words) took 0.9s, 1825647 effective words/s
INFO - 16:38:12: EPOCH 309: training on 2135928 raw words (1561369 effective words) took 0.9s, 1796866 effective words/s
INFO - 16:38:13: EPOCH 310: trai

INFO - 16:39:06: EPOCH 370: training on 2135928 raw words (1560726 effective words) took 0.9s, 1810502 effective words/s
INFO - 16:39:06: EPOCH 371: training on 2135928 raw words (1560983 effective words) took 0.9s, 1771452 effective words/s
INFO - 16:39:07: EPOCH 372: training on 2135928 raw words (1561361 effective words) took 0.9s, 1773122 effective words/s
INFO - 16:39:08: EPOCH 373: training on 2135928 raw words (1560784 effective words) took 0.9s, 1809968 effective words/s
INFO - 16:39:09: EPOCH 374: training on 2135928 raw words (1560766 effective words) took 0.9s, 1800077 effective words/s
INFO - 16:39:10: EPOCH 375: training on 2135928 raw words (1560954 effective words) took 0.9s, 1811598 effective words/s
INFO - 16:39:11: EPOCH 376: training on 2135928 raw words (1561512 effective words) took 0.9s, 1812706 effective words/s
INFO - 16:39:12: EPOCH 377: training on 2135928 raw words (1561557 effective words) took 0.9s, 1745899 effective words/s
INFO - 16:39:13: EPOCH 378: trai

INFO - 16:40:04: EPOCH 438: training on 2135928 raw words (1561233 effective words) took 0.8s, 1842597 effective words/s
INFO - 16:40:05: EPOCH 439: training on 2135928 raw words (1560787 effective words) took 0.8s, 1840205 effective words/s
INFO - 16:40:06: EPOCH 440: training on 2135928 raw words (1561145 effective words) took 0.8s, 1842318 effective words/s
INFO - 16:40:07: EPOCH 441: training on 2135928 raw words (1560856 effective words) took 0.8s, 1839228 effective words/s
INFO - 16:40:08: EPOCH 442: training on 2135928 raw words (1561782 effective words) took 0.8s, 1860634 effective words/s
INFO - 16:40:08: EPOCH 443: training on 2135928 raw words (1560714 effective words) took 0.8s, 1846136 effective words/s
INFO - 16:40:09: EPOCH 444: training on 2135928 raw words (1560684 effective words) took 0.8s, 1862597 effective words/s
INFO - 16:40:10: EPOCH 445: training on 2135928 raw words (1561511 effective words) took 0.8s, 1872605 effective words/s
INFO - 16:40:11: EPOCH 446: trai

INFO - 16:41:03: EPOCH 506: training on 2135928 raw words (1561168 effective words) took 0.9s, 1767034 effective words/s
INFO - 16:41:04: EPOCH 507: training on 2135928 raw words (1560975 effective words) took 0.9s, 1781958 effective words/s
INFO - 16:41:05: EPOCH 508: training on 2135928 raw words (1560885 effective words) took 0.9s, 1738452 effective words/s
INFO - 16:41:06: EPOCH 509: training on 2135928 raw words (1560980 effective words) took 0.9s, 1783714 effective words/s
INFO - 16:41:07: EPOCH 510: training on 2135928 raw words (1561788 effective words) took 0.9s, 1800371 effective words/s
INFO - 16:41:08: EPOCH 511: training on 2135928 raw words (1561837 effective words) took 0.9s, 1794591 effective words/s
INFO - 16:41:09: EPOCH 512: training on 2135928 raw words (1560938 effective words) took 0.9s, 1763303 effective words/s
INFO - 16:41:09: EPOCH 513: training on 2135928 raw words (1560785 effective words) took 0.9s, 1814144 effective words/s
INFO - 16:41:10: EPOCH 514: trai

INFO - 16:42:04: EPOCH 574: training on 2135928 raw words (1561093 effective words) took 0.9s, 1749361 effective words/s
INFO - 16:42:04: EPOCH 575: training on 2135928 raw words (1561787 effective words) took 0.9s, 1756178 effective words/s
INFO - 16:42:05: EPOCH 576: training on 2135928 raw words (1561632 effective words) took 0.9s, 1767612 effective words/s
INFO - 16:42:06: EPOCH 577: training on 2135928 raw words (1560851 effective words) took 0.9s, 1774957 effective words/s
INFO - 16:42:07: EPOCH 578: training on 2135928 raw words (1561439 effective words) took 0.9s, 1764903 effective words/s
INFO - 16:42:08: EPOCH 579: training on 2135928 raw words (1561565 effective words) took 0.9s, 1747991 effective words/s
INFO - 16:42:09: EPOCH 580: training on 2135928 raw words (1561968 effective words) took 0.9s, 1791896 effective words/s
INFO - 16:42:10: EPOCH 581: training on 2135928 raw words (1560913 effective words) took 0.9s, 1793679 effective words/s
INFO - 16:42:11: EPOCH 582: trai

INFO - 16:43:04: EPOCH 641: training on 2135928 raw words (1561323 effective words) took 0.9s, 1811019 effective words/s
INFO - 16:43:04: EPOCH 642: training on 2135928 raw words (1561418 effective words) took 0.9s, 1779869 effective words/s
INFO - 16:43:05: EPOCH 643: training on 2135928 raw words (1561513 effective words) took 0.9s, 1724591 effective words/s
INFO - 16:43:06: EPOCH 644: training on 2135928 raw words (1560634 effective words) took 0.9s, 1724124 effective words/s
INFO - 16:43:07: EPOCH 645: training on 2135928 raw words (1560997 effective words) took 0.9s, 1742031 effective words/s
INFO - 16:43:08: EPOCH 646: training on 2135928 raw words (1561695 effective words) took 1.0s, 1633028 effective words/s
INFO - 16:43:09: EPOCH 647: training on 2135928 raw words (1561396 effective words) took 0.9s, 1764914 effective words/s
INFO - 16:43:10: EPOCH 648: training on 2135928 raw words (1560660 effective words) took 1.0s, 1621052 effective words/s
INFO - 16:43:11: EPOCH 649: trai

INFO - 16:44:04: EPOCH 709: training on 2135928 raw words (1561184 effective words) took 0.9s, 1806024 effective words/s
INFO - 16:44:05: EPOCH 710: training on 2135928 raw words (1561350 effective words) took 0.9s, 1764056 effective words/s
INFO - 16:44:06: EPOCH 711: training on 2135928 raw words (1561340 effective words) took 0.9s, 1776599 effective words/s
INFO - 16:44:07: EPOCH 712: training on 2135928 raw words (1561494 effective words) took 0.9s, 1778853 effective words/s
INFO - 16:44:08: EPOCH 713: training on 2135928 raw words (1560511 effective words) took 0.9s, 1760273 effective words/s
INFO - 16:44:08: EPOCH 714: training on 2135928 raw words (1560825 effective words) took 0.9s, 1771943 effective words/s
INFO - 16:44:09: EPOCH 715: training on 2135928 raw words (1561364 effective words) took 0.9s, 1798835 effective words/s
INFO - 16:44:10: EPOCH 716: training on 2135928 raw words (1561651 effective words) took 0.9s, 1782528 effective words/s
INFO - 16:44:11: EPOCH 717: trai

INFO - 16:45:03: EPOCH 775: training on 2135928 raw words (1561464 effective words) took 0.9s, 1729847 effective words/s
INFO - 16:45:04: EPOCH 776: training on 2135928 raw words (1561271 effective words) took 0.9s, 1768261 effective words/s
INFO - 16:45:05: EPOCH 777: training on 2135928 raw words (1561958 effective words) took 0.9s, 1793033 effective words/s
INFO - 16:45:06: EPOCH 778: training on 2135928 raw words (1560530 effective words) took 0.9s, 1792036 effective words/s
INFO - 16:45:07: EPOCH 779: training on 2135928 raw words (1560936 effective words) took 0.9s, 1750273 effective words/s
INFO - 16:45:08: EPOCH 780: training on 2135928 raw words (1560952 effective words) took 0.9s, 1780956 effective words/s
INFO - 16:45:09: EPOCH 781: training on 2135928 raw words (1561107 effective words) took 0.9s, 1789815 effective words/s
INFO - 16:45:10: EPOCH 782: training on 2135928 raw words (1561062 effective words) took 0.9s, 1756647 effective words/s
INFO - 16:45:11: EPOCH 783: trai

INFO - 16:46:01: EPOCH 839: training on 2135928 raw words (1560994 effective words) took 0.9s, 1749250 effective words/s
INFO - 16:46:02: EPOCH 840: training on 2135928 raw words (1561024 effective words) took 1.0s, 1586436 effective words/s
INFO - 16:46:03: EPOCH 841: training on 2135928 raw words (1561063 effective words) took 1.0s, 1594086 effective words/s
INFO - 16:46:04: EPOCH 842: training on 2135928 raw words (1560526 effective words) took 0.9s, 1726190 effective words/s
INFO - 16:46:05: EPOCH 843: training on 2135928 raw words (1560754 effective words) took 1.0s, 1586819 effective words/s
INFO - 16:46:06: EPOCH 844: training on 2135928 raw words (1561156 effective words) took 0.9s, 1737342 effective words/s
INFO - 16:46:07: EPOCH 845: training on 2135928 raw words (1560859 effective words) took 0.9s, 1788312 effective words/s
INFO - 16:46:08: EPOCH 846: training on 2135928 raw words (1560721 effective words) took 0.9s, 1811049 effective words/s
INFO - 16:46:09: EPOCH 847: trai

INFO - 16:46:57: EPOCH 897: training on 2135928 raw words (1561616 effective words) took 0.9s, 1754452 effective words/s
INFO - 16:46:58: EPOCH 898: training on 2135928 raw words (1561120 effective words) took 0.9s, 1753514 effective words/s
INFO - 16:46:58: EPOCH 899: training on 2135928 raw words (1560762 effective words) took 0.9s, 1763197 effective words/s
INFO - 16:46:59: EPOCH 900: training on 2135928 raw words (1560644 effective words) took 0.9s, 1790324 effective words/s
INFO - 16:47:00: EPOCH 901: training on 2135928 raw words (1561671 effective words) took 0.9s, 1755146 effective words/s
INFO - 16:47:01: EPOCH 902: training on 2135928 raw words (1561055 effective words) took 0.9s, 1748817 effective words/s
INFO - 16:47:02: EPOCH 903: training on 2135928 raw words (1561714 effective words) took 0.9s, 1768891 effective words/s
INFO - 16:47:03: EPOCH 904: training on 2135928 raw words (1561171 effective words) took 0.9s, 1771755 effective words/s
INFO - 16:47:04: EPOCH 905: trai

INFO - 16:47:57: EPOCH 965: training on 2135928 raw words (1561098 effective words) took 0.9s, 1776467 effective words/s
INFO - 16:47:58: EPOCH 966: training on 2135928 raw words (1560774 effective words) took 0.9s, 1755078 effective words/s
INFO - 16:47:59: EPOCH 967: training on 2135928 raw words (1561377 effective words) took 0.9s, 1781762 effective words/s
INFO - 16:48:00: EPOCH 968: training on 2135928 raw words (1560808 effective words) took 0.9s, 1762656 effective words/s
INFO - 16:48:01: EPOCH 969: training on 2135928 raw words (1561322 effective words) took 0.9s, 1776425 effective words/s
INFO - 16:48:02: EPOCH 970: training on 2135928 raw words (1561097 effective words) took 0.9s, 1768257 effective words/s
INFO - 16:48:03: EPOCH 971: training on 2135928 raw words (1561605 effective words) took 0.9s, 1767250 effective words/s
INFO - 16:48:04: EPOCH 972: training on 2135928 raw words (1562074 effective words) took 0.9s, 1754420 effective words/s
INFO - 16:48:04: EPOCH 973: trai

In [20]:
gutenberg_w2v.wv.most_similar(positive=["bible"])


[('cruse', 0.3911580741405487),
 ('story', 0.3839758336544037),
 ('official', 0.37177857756614685),
 ('loop', 0.3678949475288391),
 ('dreamer', 0.3666142523288727),
 ('wrist', 0.36367979645729065),
 ('admirant', 0.34465309977531433),
 ('piasters', 0.3422392010688782),
 ('cyn', 0.3418917655944824),
 ('hearth', 0.34067288041114807)]

In [19]:
gutenberg_w2v.wv.most_similar(positive=["book"])


[('written', 0.48968249559402466),
 ('letter', 0.4876655042171478),
 ('temple', 0.4826296269893646),
 ('xxvi', 0.4754643440246582),
 ('xxv', 0.4714440703392029),
 ('xxviii', 0.4701104462146759),
 ('xxxi', 0.4691607356071472),
 ('xxiii', 0.4676828980445862),
 ('xxvii', 0.46753865480422974),
 ('pen', 0.46510955691337585)]

In [22]:
gutenberg_w2v.wv.most_similar(positive=["bank"])


[('top', 0.5600375533103943),
 ('ground', 0.53549724817276),
 ('wall', 0.5250206589698792),
 ('table', 0.5072520971298218),
 ('floor', 0.49955102801322937),
 ('side', 0.4951784312725067),
 ('road', 0.44858473539352417),
 ('hill', 0.447895884513855),
 ('edge', 0.44256141781806946),
 ('bottom', 0.4280133545398712)]

In [23]:
gutenberg_w2v.wv.most_similar(positive=["water"])

[('waters', 0.6415269374847412),
 ('river', 0.5259552597999573),
 ('wine', 0.49791309237480164),
 ('fire', 0.49540627002716064),
 ('blood', 0.47519922256469727),
 ('sea', 0.4743334650993347),
 ('fish', 0.4742337167263031),
 ('frogs', 0.46917083859443665),
 ('hole', 0.4668067693710327),
 ('wood', 0.45990630984306335)]

In [27]:
v = gutenberg_w2v("grills") - gutenberg_w2v("queens") + gutenberg_w2v("kings")

TypeError: 'Word2Vec' object is not callable

In [None]:
v = w2v("good") – w2v("taller") + w2v("tall")

In [None]:
v = w2v("france") – w2v("paris") + w2v("london")