In [2]:
import logging
import multiprocessing
import os

from gensim.models import Word2Vec
from gensim.models.callbacks import CallbackAny2Vec

# Enable gensim logging
logging.basicConfig(
    format="%(levelname)s - %(asctime)s: %(message)s",
    datefmt="%H:%M:%S",
    level=logging.INFO,
)


class W2VLossLogger(CallbackAny2Vec):
    """Callback to print loss after each epoch
    use by passing model.train(..., callbacks=[W2VLossLogger()])
    """

    def __init__(self):
        self.epoch = 0

    def on_epoch_end(self, model):
        loss = model.get_latest_training_loss()

        if self.epoch == 0:
            print("Loss after epoch {}: {}".format(self.epoch, loss))
        else:
            print(
                "Loss after epoch {}: {}".format(
                    self.epoch, loss - self.loss_previous_step
                )
            )
        self.epoch += 1
        self.loss_previous_step = loss


def train_w2v_model(
    sentences,
    output_file,
    window,
    embedding_dim,
    epochs,
    min_word_count,
):
    

    """Train a word2vec model based on given sentences.
    Args:
        sentences list[list[str]]: List of sentences. Each element contains a list with the words
            in the current sentence
        output_file (str): Path to save the trained w2v model
        window (int): w2v context size
        embedding_dim (int): w2v vector dimension
        epochs (int): How many epochs should the training run
        min_word_count (int): Ignore words that appear less than min_word_count times
    """
    workers = multiprocessing.cpu_count()
    
    # TODO: Instantiate gensim.models.Word2Vec class
    model = Word2Vec(sentences=sentences, vector_size=embedding_dim, window=window, min_count=min_word_count, workers=multiprocessing.cpu_count())
    model.build_vocab(sentences, progress_per=10000)
    # TODO: Build model vocabulary using sentences
    # TODO: Train word2vec model
    model.train(sentences, total_examples=model.corpus_count,epochs=epochs)
    # Save trained model
    model.save(output_file)
    # model.save(output_file)

    

    return model


if __name__ == "__main__":
    # read data/gutenberg.txt in the expected format
    f=open("tokenized.txt","r")
    sentences =eval(f.read())
    
   
    output_file = "gutenberg_w2v.hundd.model"
    window = 5
    embedding_dim = 100
    epochs = 1000
    min_word_count = 1

    gutenberg_w2v =train_w2v_model(
        sentences,
        output_file,
        window,
        embedding_dim,
        epochs,
        min_word_count)

    




INFO - 10:34:45: collecting all words and their counts
INFO - 10:34:45: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 10:34:45: PROGRESS: at sentence #10000, processed 113516 words, keeping 6065 word types
INFO - 10:34:45: PROGRESS: at sentence #20000, processed 227311 words, keeping 8492 word types
INFO - 10:34:45: PROGRESS: at sentence #30000, processed 324672 words, keeping 9841 word types
INFO - 10:34:45: PROGRESS: at sentence #40000, processed 427445 words, keeping 11779 word types
INFO - 10:34:45: PROGRESS: at sentence #50000, processed 537208 words, keeping 13110 word types
INFO - 10:34:45: PROGRESS: at sentence #60000, processed 647741 words, keeping 14312 word types
INFO - 10:34:45: PROGRESS: at sentence #70000, processed 751694 words, keeping 16043 word types
INFO - 10:34:45: PROGRESS: at sentence #80000, processed 847983 words, keeping 16837 word types
INFO - 10:34:45: PROGRESS: at sentence #90000, processed 957413 words, keeping 17332 word types
I

INFO - 10:34:51: Word2Vec lifecycle event {'msg': 'effective_min_count=1 leaves 2135928 word corpus (100.00% of original 2135928, drops 0)', 'datetime': '2023-04-03T10:34:51.792720', 'gensim': '4.3.1', 'python': '3.10.7 (main, Mar 10 2023, 10:47:39) [GCC 12.2.0]', 'platform': 'Linux-5.19.0-38-generic-x86_64-with-glibc2.36', 'event': 'prepare_vocab'}
INFO - 10:34:51: deleting the raw counts dictionary of 41465 items
INFO - 10:34:51: sample=0.001 downsamples 56 most-common words
INFO - 10:34:51: Word2Vec lifecycle event {'msg': 'downsampling leaves estimated 1561117.1381724994 word corpus (73.1%% of prior 2135928)', 'datetime': '2023-04-03T10:34:51.955980', 'gensim': '4.3.1', 'python': '3.10.7 (main, Mar 10 2023, 10:47:39) [GCC 12.2.0]', 'platform': 'Linux-5.19.0-38-generic-x86_64-with-glibc2.36', 'event': 'prepare_vocab'}
INFO - 10:34:52: estimated required memory for 41465 words and 100 dimensions: 53904500 bytes
INFO - 10:34:52: resetting layer weights
INFO - 10:34:52: Word2Vec lifecy

INFO - 10:35:38: EPOCH 54: training on 2135928 raw words (1561549 effective words) took 0.8s, 1842824 effective words/s
INFO - 10:35:39: EPOCH 55: training on 2135928 raw words (1560980 effective words) took 0.8s, 1858158 effective words/s
INFO - 10:35:40: EPOCH 56: training on 2135928 raw words (1560709 effective words) took 0.8s, 1861996 effective words/s
INFO - 10:35:40: EPOCH 57: training on 2135928 raw words (1560765 effective words) took 0.8s, 1865338 effective words/s
INFO - 10:35:41: EPOCH 58: training on 2135928 raw words (1560596 effective words) took 0.8s, 1901713 effective words/s
INFO - 10:35:42: EPOCH 59: training on 2135928 raw words (1560725 effective words) took 0.8s, 1894080 effective words/s
INFO - 10:35:43: EPOCH 60: training on 2135928 raw words (1562172 effective words) took 0.9s, 1733108 effective words/s
INFO - 10:35:44: EPOCH 61: training on 2135928 raw words (1561007 effective words) took 0.9s, 1769117 effective words/s
INFO - 10:35:45: EPOCH 62: training on 2

INFO - 10:36:36: EPOCH 123: training on 2135928 raw words (1561356 effective words) took 0.8s, 1859200 effective words/s
INFO - 10:36:37: EPOCH 124: training on 2135928 raw words (1561008 effective words) took 0.9s, 1836332 effective words/s
INFO - 10:36:38: EPOCH 125: training on 2135928 raw words (1560998 effective words) took 0.8s, 1869857 effective words/s
INFO - 10:36:39: EPOCH 126: training on 2135928 raw words (1561131 effective words) took 0.8s, 1850909 effective words/s
INFO - 10:36:40: EPOCH 127: training on 2135928 raw words (1561099 effective words) took 0.9s, 1834051 effective words/s
INFO - 10:36:40: EPOCH 128: training on 2135928 raw words (1559954 effective words) took 0.8s, 1871676 effective words/s
INFO - 10:36:41: EPOCH 129: training on 2135928 raw words (1561612 effective words) took 0.9s, 1817049 effective words/s
INFO - 10:36:42: EPOCH 130: training on 2135928 raw words (1561420 effective words) took 0.8s, 1850873 effective words/s
INFO - 10:36:43: EPOCH 131: trai

INFO - 10:37:34: EPOCH 191: training on 2135928 raw words (1561361 effective words) took 0.8s, 1871890 effective words/s
INFO - 10:37:35: EPOCH 192: training on 2135928 raw words (1561262 effective words) took 0.9s, 1747578 effective words/s
INFO - 10:37:36: EPOCH 193: training on 2135928 raw words (1559919 effective words) took 0.8s, 1850894 effective words/s
INFO - 10:37:37: EPOCH 194: training on 2135928 raw words (1561111 effective words) took 0.8s, 1847248 effective words/s
INFO - 10:37:37: EPOCH 195: training on 2135928 raw words (1560918 effective words) took 0.8s, 1853794 effective words/s
INFO - 10:37:38: EPOCH 196: training on 2135928 raw words (1561125 effective words) took 0.8s, 1841543 effective words/s
INFO - 10:37:39: EPOCH 197: training on 2135928 raw words (1561426 effective words) took 0.8s, 1845505 effective words/s
INFO - 10:37:40: EPOCH 198: training on 2135928 raw words (1560162 effective words) took 0.8s, 1892607 effective words/s
INFO - 10:37:41: EPOCH 199: trai

INFO - 10:38:32: EPOCH 259: training on 2135928 raw words (1560377 effective words) took 0.8s, 1855253 effective words/s
INFO - 10:38:33: EPOCH 260: training on 2135928 raw words (1560851 effective words) took 0.9s, 1826189 effective words/s
INFO - 10:38:34: EPOCH 261: training on 2135928 raw words (1561348 effective words) took 0.9s, 1830828 effective words/s
INFO - 10:38:34: EPOCH 262: training on 2135928 raw words (1561737 effective words) took 0.8s, 1890833 effective words/s
INFO - 10:38:35: EPOCH 263: training on 2135928 raw words (1560641 effective words) took 0.8s, 1896744 effective words/s
INFO - 10:38:36: EPOCH 264: training on 2135928 raw words (1561346 effective words) took 0.8s, 1843904 effective words/s
INFO - 10:38:37: EPOCH 265: training on 2135928 raw words (1561410 effective words) took 0.9s, 1813087 effective words/s
INFO - 10:38:38: EPOCH 266: training on 2135928 raw words (1560989 effective words) took 0.8s, 1859893 effective words/s
INFO - 10:38:39: EPOCH 267: trai

INFO - 10:39:28: EPOCH 323: training on 2135928 raw words (1561641 effective words) took 0.8s, 1844192 effective words/s
INFO - 10:39:28: EPOCH 324: training on 2135928 raw words (1561466 effective words) took 0.8s, 1901831 effective words/s
INFO - 10:39:29: EPOCH 325: training on 2135928 raw words (1561166 effective words) took 0.9s, 1828417 effective words/s
INFO - 10:39:30: EPOCH 326: training on 2135928 raw words (1561492 effective words) took 0.9s, 1795545 effective words/s
INFO - 10:39:31: EPOCH 327: training on 2135928 raw words (1560507 effective words) took 0.8s, 1874554 effective words/s
INFO - 10:39:32: EPOCH 328: training on 2135928 raw words (1560853 effective words) took 0.8s, 1857154 effective words/s
INFO - 10:39:33: EPOCH 329: training on 2135928 raw words (1560509 effective words) took 0.9s, 1824563 effective words/s
INFO - 10:39:34: EPOCH 330: training on 2135928 raw words (1561426 effective words) took 0.9s, 1835184 effective words/s
INFO - 10:39:35: EPOCH 331: trai

INFO - 10:40:26: EPOCH 391: training on 2135928 raw words (1560994 effective words) took 0.9s, 1798574 effective words/s
INFO - 10:40:26: EPOCH 392: training on 2135928 raw words (1561032 effective words) took 0.9s, 1825083 effective words/s
INFO - 10:40:27: EPOCH 393: training on 2135928 raw words (1560834 effective words) took 0.8s, 1857343 effective words/s
INFO - 10:40:28: EPOCH 394: training on 2135928 raw words (1561604 effective words) took 0.8s, 1854902 effective words/s
INFO - 10:40:29: EPOCH 395: training on 2135928 raw words (1561088 effective words) took 0.8s, 1882203 effective words/s
INFO - 10:40:30: EPOCH 396: training on 2135928 raw words (1560426 effective words) took 0.9s, 1827065 effective words/s
INFO - 10:40:31: EPOCH 397: training on 2135928 raw words (1561109 effective words) took 0.8s, 1866942 effective words/s
INFO - 10:40:32: EPOCH 398: training on 2135928 raw words (1561356 effective words) took 0.9s, 1830727 effective words/s
INFO - 10:40:32: EPOCH 399: trai

INFO - 10:41:23: EPOCH 459: training on 2135928 raw words (1560989 effective words) took 0.8s, 1887653 effective words/s
INFO - 10:41:24: EPOCH 460: training on 2135928 raw words (1561714 effective words) took 0.8s, 1838052 effective words/s
INFO - 10:41:25: EPOCH 461: training on 2135928 raw words (1561140 effective words) took 0.9s, 1831423 effective words/s
INFO - 10:41:26: EPOCH 462: training on 2135928 raw words (1560887 effective words) took 0.8s, 1860777 effective words/s
INFO - 10:41:27: EPOCH 463: training on 2135928 raw words (1560662 effective words) took 0.9s, 1822262 effective words/s
INFO - 10:41:28: EPOCH 464: training on 2135928 raw words (1561337 effective words) took 0.8s, 1861302 effective words/s
INFO - 10:41:28: EPOCH 465: training on 2135928 raw words (1560965 effective words) took 0.8s, 1842711 effective words/s
INFO - 10:41:29: EPOCH 466: training on 2135928 raw words (1561288 effective words) took 0.8s, 1849542 effective words/s
INFO - 10:41:30: EPOCH 467: trai

INFO - 10:42:21: EPOCH 525: training on 2135928 raw words (1560541 effective words) took 0.8s, 1872088 effective words/s
INFO - 10:42:21: EPOCH 526: training on 2135928 raw words (1560939 effective words) took 0.8s, 1871822 effective words/s
INFO - 10:42:22: EPOCH 527: training on 2135928 raw words (1561362 effective words) took 0.8s, 1859995 effective words/s
INFO - 10:42:23: EPOCH 528: training on 2135928 raw words (1561663 effective words) took 0.8s, 1844980 effective words/s
INFO - 10:42:24: EPOCH 529: training on 2135928 raw words (1560409 effective words) took 0.9s, 1814654 effective words/s
INFO - 10:42:25: EPOCH 530: training on 2135928 raw words (1561273 effective words) took 0.8s, 1855744 effective words/s
INFO - 10:42:26: EPOCH 531: training on 2135928 raw words (1560821 effective words) took 0.8s, 1863782 effective words/s
INFO - 10:42:26: EPOCH 532: training on 2135928 raw words (1560731 effective words) took 0.8s, 1870703 effective words/s
INFO - 10:42:27: EPOCH 533: trai

INFO - 10:43:19: EPOCH 593: training on 2135928 raw words (1561424 effective words) took 0.9s, 1834714 effective words/s
INFO - 10:43:19: EPOCH 594: training on 2135928 raw words (1561582 effective words) took 0.8s, 1844419 effective words/s
INFO - 10:43:20: EPOCH 595: training on 2135928 raw words (1562326 effective words) took 0.8s, 1887190 effective words/s
INFO - 10:43:21: EPOCH 596: training on 2135928 raw words (1560970 effective words) took 0.8s, 1873388 effective words/s
INFO - 10:43:22: EPOCH 597: training on 2135928 raw words (1561346 effective words) took 0.8s, 1863879 effective words/s
INFO - 10:43:23: EPOCH 598: training on 2135928 raw words (1561487 effective words) took 0.9s, 1826537 effective words/s
INFO - 10:43:24: EPOCH 599: training on 2135928 raw words (1560914 effective words) took 0.9s, 1799871 effective words/s
INFO - 10:43:25: EPOCH 600: training on 2135928 raw words (1561416 effective words) took 0.8s, 1858061 effective words/s
INFO - 10:43:25: EPOCH 601: trai

INFO - 10:44:16: EPOCH 659: training on 2135928 raw words (1560625 effective words) took 0.8s, 1839903 effective words/s
INFO - 10:44:17: EPOCH 660: training on 2135928 raw words (1561265 effective words) took 0.8s, 1885510 effective words/s
INFO - 10:44:18: EPOCH 661: training on 2135928 raw words (1561168 effective words) took 0.9s, 1795381 effective words/s
INFO - 10:44:19: EPOCH 662: training on 2135928 raw words (1560656 effective words) took 0.8s, 1850434 effective words/s
INFO - 10:44:19: EPOCH 663: training on 2135928 raw words (1560960 effective words) took 0.8s, 1861389 effective words/s
INFO - 10:44:20: EPOCH 664: training on 2135928 raw words (1560420 effective words) took 0.9s, 1822640 effective words/s
INFO - 10:44:21: EPOCH 665: training on 2135928 raw words (1562312 effective words) took 0.9s, 1819625 effective words/s
INFO - 10:44:22: EPOCH 666: training on 2135928 raw words (1560697 effective words) took 0.8s, 1843324 effective words/s
INFO - 10:44:23: EPOCH 667: trai

INFO - 10:45:14: EPOCH 727: training on 2135928 raw words (1561662 effective words) took 0.9s, 1813815 effective words/s
INFO - 10:45:15: EPOCH 728: training on 2135928 raw words (1561659 effective words) took 0.8s, 1848105 effective words/s
INFO - 10:45:16: EPOCH 729: training on 2135928 raw words (1561708 effective words) took 0.8s, 1847064 effective words/s
INFO - 10:45:17: EPOCH 730: training on 2135928 raw words (1561318 effective words) took 0.9s, 1797932 effective words/s
INFO - 10:45:18: EPOCH 731: training on 2135928 raw words (1561297 effective words) took 0.9s, 1827820 effective words/s
INFO - 10:45:19: EPOCH 732: training on 2135928 raw words (1561508 effective words) took 0.8s, 1893681 effective words/s
INFO - 10:45:19: EPOCH 733: training on 2135928 raw words (1561459 effective words) took 0.8s, 1848920 effective words/s
INFO - 10:45:20: EPOCH 734: training on 2135928 raw words (1561507 effective words) took 0.8s, 1852141 effective words/s
INFO - 10:45:21: EPOCH 735: trai

INFO - 10:46:08: EPOCH 776 - PROGRESS: at 72.29% examples, 1142169 words/s, in_qsize 14, out_qsize 1
INFO - 10:46:08: EPOCH 776: training on 2135928 raw words (1561391 effective words) took 1.3s, 1162499 effective words/s
INFO - 10:46:09: EPOCH 777 - PROGRESS: at 76.12% examples, 1207819 words/s, in_qsize 15, out_qsize 0
INFO - 10:46:09: EPOCH 777: training on 2135928 raw words (1560766 effective words) took 1.3s, 1212799 effective words/s
INFO - 10:46:10: EPOCH 778 - PROGRESS: at 67.74% examples, 1069438 words/s, in_qsize 14, out_qsize 1
INFO - 10:46:11: EPOCH 778: training on 2135928 raw words (1561204 effective words) took 1.4s, 1126906 effective words/s
INFO - 10:46:12: EPOCH 779 - PROGRESS: at 78.34% examples, 1252239 words/s, in_qsize 15, out_qsize 0
INFO - 10:46:12: EPOCH 779: training on 2135928 raw words (1561425 effective words) took 1.3s, 1244215 effective words/s
INFO - 10:46:13: EPOCH 780 - PROGRESS: at 63.35% examples, 986212 words/s, in_qsize 15, out_qsize 0
INFO - 10:46

INFO - 10:46:56: EPOCH 815: training on 2135928 raw words (1561819 effective words) took 1.0s, 1605075 effective words/s
INFO - 10:46:57: EPOCH 816: training on 2135928 raw words (1561054 effective words) took 1.0s, 1574122 effective words/s
INFO - 10:46:58: EPOCH 817: training on 2135928 raw words (1560596 effective words) took 1.0s, 1622318 effective words/s
INFO - 10:46:59: EPOCH 818 - PROGRESS: at 88.64% examples, 1402181 words/s, in_qsize 15, out_qsize 0
INFO - 10:46:59: EPOCH 818: training on 2135928 raw words (1561715 effective words) took 1.1s, 1447522 effective words/s
INFO - 10:47:00: EPOCH 819: training on 2135928 raw words (1561703 effective words) took 1.0s, 1605674 effective words/s
INFO - 10:47:01: EPOCH 820: training on 2135928 raw words (1560784 effective words) took 1.0s, 1635990 effective words/s
INFO - 10:47:02: EPOCH 821 - PROGRESS: at 84.18% examples, 1353586 words/s, in_qsize 15, out_qsize 0
INFO - 10:47:02: EPOCH 821: training on 2135928 raw words (1560305 effec

INFO - 10:47:43: EPOCH 859 - PROGRESS: at 83.12% examples, 1335553 words/s, in_qsize 14, out_qsize 1
INFO - 10:47:43: EPOCH 859: training on 2135928 raw words (1561154 effective words) took 1.1s, 1359428 effective words/s
INFO - 10:47:44: EPOCH 860 - PROGRESS: at 77.91% examples, 1222745 words/s, in_qsize 15, out_qsize 2
INFO - 10:47:45: EPOCH 860: training on 2135928 raw words (1561236 effective words) took 1.3s, 1240325 effective words/s
INFO - 10:47:46: EPOCH 861 - PROGRESS: at 77.00% examples, 1219101 words/s, in_qsize 15, out_qsize 0
INFO - 10:47:46: EPOCH 861: training on 2135928 raw words (1561540 effective words) took 1.2s, 1275828 effective words/s
INFO - 10:47:47: EPOCH 862 - PROGRESS: at 78.78% examples, 1248825 words/s, in_qsize 14, out_qsize 1
INFO - 10:47:47: EPOCH 862: training on 2135928 raw words (1561377 effective words) took 1.2s, 1269733 effective words/s
INFO - 10:47:48: EPOCH 863 - PROGRESS: at 79.21% examples, 1261590 words/s, in_qsize 15, out_qsize 0
INFO - 10:4

INFO - 10:48:31: EPOCH 903: training on 2135928 raw words (1561518 effective words) took 1.2s, 1331314 effective words/s
INFO - 10:48:32: EPOCH 904 - PROGRESS: at 80.51% examples, 1269910 words/s, in_qsize 15, out_qsize 0
INFO - 10:48:33: EPOCH 904: training on 2135928 raw words (1560829 effective words) took 1.2s, 1318942 effective words/s
INFO - 10:48:34: EPOCH 905 - PROGRESS: at 80.93% examples, 1277937 words/s, in_qsize 14, out_qsize 1
INFO - 10:48:34: EPOCH 905: training on 2135928 raw words (1560954 effective words) took 1.2s, 1330743 effective words/s
INFO - 10:48:35: EPOCH 906 - PROGRESS: at 68.93% examples, 1060477 words/s, in_qsize 16, out_qsize 2
INFO - 10:48:35: EPOCH 906: training on 2135928 raw words (1561049 effective words) took 1.5s, 1037172 effective words/s
INFO - 10:48:36: EPOCH 907 - PROGRESS: at 81.81% examples, 1301478 words/s, in_qsize 15, out_qsize 0
INFO - 10:48:37: EPOCH 907: training on 2135928 raw words (1560900 effective words) took 1.2s, 1325650 effective

INFO - 10:49:17: EPOCH 941: training on 2135928 raw words (1560597 effective words) took 1.4s, 1143231 effective words/s
INFO - 10:49:18: EPOCH 942 - PROGRESS: at 75.27% examples, 1194682 words/s, in_qsize 15, out_qsize 0
INFO - 10:49:18: EPOCH 942: training on 2135928 raw words (1561030 effective words) took 1.3s, 1233880 effective words/s
INFO - 10:49:19: EPOCH 943 - PROGRESS: at 66.01% examples, 1031503 words/s, in_qsize 16, out_qsize 0
INFO - 10:49:20: EPOCH 943: training on 2135928 raw words (1561479 effective words) took 1.5s, 1067930 effective words/s
INFO - 10:49:21: EPOCH 944 - PROGRESS: at 75.69% examples, 1195530 words/s, in_qsize 15, out_qsize 0
INFO - 10:49:21: EPOCH 944: training on 2135928 raw words (1560653 effective words) took 1.3s, 1244335 effective words/s
INFO - 10:49:22: EPOCH 945 - PROGRESS: at 81.81% examples, 1302828 words/s, in_qsize 15, out_qsize 0
INFO - 10:49:22: EPOCH 945: training on 2135928 raw words (1561128 effective words) took 1.2s, 1337519 effective

INFO - 10:50:02: EPOCH 978: training on 2135928 raw words (1560824 effective words) took 1.1s, 1430150 effective words/s
INFO - 10:50:03: EPOCH 979 - PROGRESS: at 86.75% examples, 1387468 words/s, in_qsize 15, out_qsize 1
INFO - 10:50:04: EPOCH 979: training on 2135928 raw words (1562187 effective words) took 1.1s, 1408849 effective words/s
INFO - 10:50:05: EPOCH 980 - PROGRESS: at 87.38% examples, 1393253 words/s, in_qsize 15, out_qsize 0
INFO - 10:50:05: EPOCH 980: training on 2135928 raw words (1561862 effective words) took 1.1s, 1429518 effective words/s
INFO - 10:50:06: EPOCH 981 - PROGRESS: at 80.94% examples, 1292648 words/s, in_qsize 15, out_qsize 0
INFO - 10:50:06: EPOCH 981: training on 2135928 raw words (1561361 effective words) took 1.2s, 1293042 effective words/s
INFO - 10:50:07: EPOCH 982 - PROGRESS: at 59.80% examples, 930019 words/s, in_qsize 15, out_qsize 0
INFO - 10:50:07: EPOCH 982: training on 2135928 raw words (1561720 effective words) took 1.5s, 1017518 effective 

In [3]:
from gensim.models import Word2Vec
from gensim.models.callbacks import CallbackAny2Vec

gutenberg_w2v = Word2Vec.load("gutenberg_w2v.hundd.model")

In [43]:
gutenberg_w2v.wv.most_similar(["bible"])


[('wrist', 0.4070826470851898),
 ('buzite', 0.3894822895526886),
 ('dreamer', 0.3729250133037567),
 ('cobbler', 0.35319337248802185),
 ('cable', 0.35270583629608154),
 ('cassius', 0.3514133393764496),
 ('auger', 0.3499419093132019),
 ('priesthood', 0.34642016887664795),
 ('loe', 0.34116289019584656),
 ('headship', 0.3361479640007019)]

In [42]:
gutenberg_w2v.wv.most_similar(["book"])


[('xxiii', 0.5053890943527222),
 ('xxxi', 0.5027292370796204),
 ('xxix', 0.5025994777679443),
 ('xxviii', 0.5024734139442444),
 ('xxv', 0.4990857243537903),
 ('xxvii', 0.4965112507343292),
 ('xxvi', 0.4925084710121155),
 ('raze', 0.490346759557724),
 ('temple', 0.4875237047672272),
 ('written', 0.4845070540904999)]

In [40]:
gutenberg_w2v.wv.most_similar(["bank"])


[('table', 0.5287594199180603),
 ('wall', 0.5040320754051208),
 ('ground', 0.5038182735443115),
 ('top', 0.5032145977020264),
 ('floor', 0.4864599108695984),
 ('side', 0.4803113639354706),
 ('bed', 0.450342059135437),
 ('hill', 0.4464374780654907),
 ('river', 0.4414878189563751),
 ('pool', 0.4404714107513428)]

In [39]:
gutenberg_w2v.wv.most_similar(["water"])

[('waters', 0.6542330384254456),
 ('wine', 0.5172027945518494),
 ('river', 0.5064811110496521),
 ('hole', 0.49640560150146484),
 ('fire', 0.4884592294692993),
 ('wood', 0.4813365340232849),
 ('rivers', 0.47593218088150024),
 ('blood', 0.4702102243900299),
 ('ground', 0.46649661660194397),
 ('fowls', 0.46397456526756287)]

In [63]:
v = gutenberg_w2v.wv["good"] - gutenberg_w2v.wv["taller"] + gutenberg_w2v.wv["tall"]
gutenberg_w2v.wv.most_similar(v)

[('good', 0.48668214678764343),
 ('sandals', 0.4110839068889618),
 ('tall', 0.40284034609794617),
 ('tout', 0.3939869999885559),
 ('festive', 0.37680643796920776),
 ('handsome', 0.3653698265552521),
 ('salamander', 0.3586299419403076),
 ('test', 0.35852161049842834),
 ('renderest', 0.3573521673679352),
 ('paire', 0.35532742738723755)]

In [None]:
v = gutenberg_w2v.wv["grils"] - gutenberg_w2v.wv["kings"] + gutenberg_w2v.wv["queens"]
gutenberg_w2v.wv.most_similar(v)

In [62]:
v = gutenberg_w2v.wv["france"] - gutenberg_w2v.wv["paris"] + gutenberg_w2v.wv["london"]
gutenberg_w2v.wv.most_similar(v)

[('france', 0.6633819937705994),
 ('london', 0.5153282284736633),
 ('inferiorities', 0.39137589931488037),
 ('plan', 0.3810945153236389),
 ('highbury', 0.380890429019928),
 ('species', 0.38061437010765076),
 ('country', 0.369165301322937),
 ('siam', 0.3610007166862488),
 ('allusion', 0.35697129368782043),
 ('ambassadors', 0.35658982396125793)]

In [2]:
from gensim.models import KeyedVectors
google_model = KeyedVectors.load_word2vec_format('/home/brewed/Desktop/GoogleNews-vectors-negative300.bin.gz', binary=True,
limit=1000000)

In [38]:
google_model.most_similar(["bible"])

[('Bible', 0.736778199672699),
 ('bibles', 0.6052598357200623),
 ('Holy_Bible', 0.5989601612091064),
 ('scriptures', 0.574568510055542),
 ('scripture', 0.5697901844978333),
 ('New_Testament', 0.5638793110847473),
 ('Scripture', 0.5502957701683044),
 ('Scriptures', 0.5411645770072937),
 ('NRSV', 0.5341106057167053),
 ('Leviticus_##:##-##', 0.5247005224227905)]

In [37]:
google_model.most_similar(["book"])

[('tome', 0.7485830783843994),
 ('books', 0.7379177808761597),
 ('memoir', 0.7302926778793335),
 ('paperback_edition', 0.6868364214897156),
 ('autobiography', 0.6741527318954468),
 ('memoirs', 0.6505153179168701),
 ('Book', 0.6479282975196838),
 ('paperback', 0.6471226811408997),
 ('novels', 0.6341459155082703),
 ('hardback', 0.6283079981803894)]

In [36]:
google_model.most_similar(["bank"])

[('banks', 0.7440759539604187),
 ('banking', 0.690161406993866),
 ('Bank', 0.6698698401451111),
 ('lender', 0.6342284679412842),
 ('banker', 0.6092953085899353),
 ('depositors', 0.6031531691551208),
 ('mortgage_lender', 0.5797975659370422),
 ('depositor', 0.5716427564620972),
 ('BofA', 0.5714625120162964),
 ('Citibank', 0.5589520335197449)]

In [13]:
google_model.most_similar(["cwater"])

[('coochie', 0.6906902194023132),
 ('vajayjay', 0.6699857711791992),
 ('p_*_ssy', 0.6656262278556824),
 ('tushy', 0.6618760824203491),
 ('titties', 0.654757022857666),
 ('d_**_k', 0.6432459950447083),
 ('Ewwwww', 0.6393193602561951),
 ('dangly_bits', 0.6388564109802246),
 ('pubes', 0.6357187628746033),
 ('Urgh', 0.6354457139968872)]

In [74]:
v = google_model["girls"] - google_model["queen"] + google_model["kings"]
google_model.most_similar(v)

[('boys', 0.6931698322296143),
 ('girls', 0.6385126709938049),
 ('kings', 0.4957888424396515),
 ('men', 0.48680540919303894),
 ('teenagers', 0.4788475036621094),
 ('schoolboys', 0.45804113149642944),
 ('pee_wees', 0.44774994254112244),
 ('Mitey_Mite', 0.44012460112571716),
 ('kids', 0.4373849332332611),
 ('youngsters', 0.43566834926605225)]

In [72]:
v = google_model["good"] - google_model["taller"] + google_model["tall"]
google_model.most_similar(v)

[('good', 0.6434131860733032),
 ('great', 0.49164238572120667),
 ('bad', 0.4760521948337555),
 ('terrific', 0.46986129879951477),
 ('wonderful', 0.4452008605003357),
 ('nice', 0.4425136148929596),
 ('fantastic', 0.43418607115745544),
 ('decent', 0.4307934641838074),
 ('excellent', 0.41867733001708984),
 ('terrible', 0.4151829481124878)]

In [70]:
v = google_model["france"] - google_model["paris"] + google_model["london"]
google_model.most_similar(v)

[('london', 0.754153847694397),
 ('france', 0.7366582751274109),
 ('england', 0.600825309753418),
 ('europe', 0.5708170533180237),
 ('birmingham', 0.5392330884933472),
 ('european', 0.5275605916976929),
 ('newcastle', 0.5263600945472717),
 ('barcelona', 0.5107599496841431),
 ('africa', 0.510517418384552),
 ('spain', 0.5082812905311584)]

In [4]:
import numpy as np

voc = gutenberg_w2v.wv.index_to_key
# get vector size
dim = gutenberg_w2v.vector_size


# Convert to numpy 2d array (n_vocab x vector_size)
def to_embeddings_Matrix(model):  
    embedding_matrix = np.zeros((len(voc), model.vector_size))
    word2idx = {}
    for i in range(len(voc)):
        embedding_matrix[i] = model.wv[model.wv.index_to_key[i]] 
    return embedding_matrix


embeddings=to_embeddings_Matrix(gutenberg_w2v)


(41465, 100)


In [15]:
# Put it in data later
import csv
with open('../data/embeddings.tsv', 'w', newline='') as f_output:
    tsv_output = csv.writer(f_output, delimiter='\t')
    for embedding in embeddings:
        tsv_output.writerow(embedding)
    
with open('../data/metadata.tsv', 'w', newline='') as f_output:
    tsv_output = csv.writer(f_output)
    for voc_rows in voc:
        tsv_output.writerow([voc_rows])
  

In [83]:


import glob
import os
import re

import numpy as np
import sklearn
from sklearn.model_selection import train_test_split

SCRIPT_DIRECTORY = os.path.realpath(os.getcwd())

data_dir = os.path.join(SCRIPT_DIRECTORY, "../data/aclImdb")
train_dir = os.path.join(data_dir, "train")
test_dir = os.path.join(data_dir, "test")
pos_train_dir = os.path.join(train_dir, "pos")
neg_train_dir = os.path.join(train_dir, "neg")
pos_test_dir = os.path.join(test_dir, "pos")
neg_test_dir = os.path.join(test_dir, "neg")

# For memory limitations. These parameters fit in 8GB of RAM.
# If you have 16G of RAM you can experiment with the full dataset / W2V
MAX_NUM_SAMPLES = 5000
# Load first 1M word embeddings. This works because GoogleNews are roughly
# sorted from most frequent to least frequent.
# It may yield much worse results for other embeddings corpora
NUM_W2V_TO_LOAD = 1000000


SEED = 42

# Fix numpy random seed for reproducibility
np.random.seed(SEED)


def strip_punctuation(s):
    return re.sub(r"[^a-zA-Z\s]", " ", s)


def preprocess(s):
    return re.sub("\s+", " ", strip_punctuation(s).lower())


def tokenize(s):
    return s.split(" ")


def preproc_tok(s):
    return tokenize(preprocess(s))


def token_proc(t_corpus):
    data=[]
    for i,ind in enumerate(t_corpus):
        proc_t_corpus=preproc_tok(train_corpus[i])
        data.append(proc_t_corpus)
    
    return data


# def read_samples(folder, preprocess=lambda x: x):
#     samples = glob.iglob(os.path.join(folder, "*.txt"))
#     data = []

#     for i, sample in enumerate(samples):
#         if MAX_NUM_SAMPLES > 0 and i == MAX_NUM_SAMPLES:
#             break
#         with open(sample, "r") as fd:
#             x = [preprocess(l) for l in fd][0]
#             data.append(x)

#     return data


def create_corpus(pos, neg):
    corpus = np.array(pos + neg)
    y = np.array([1 for _ in pos] + [0 for _ in neg])
    indices = np.arange(y.shape[0])
    np.random.shuffle(indices)

    return list(corpus[indices]), list(y[indices])


def extract_nbow(corpus):
    """Extract neural bag of words representations"""
    freq = {}
    for item in corpus:
        if (item in freq):
            freq[item] += 1
        else:
            freq[item] = 1
 

    return freq

    raise NotImplementedError("Implement nbow extractor")


def train_sentiment_analysis(train_corpus, train_labels):
    """Train a sentiment analysis classifier using NBOW + Logistic regression"""
    raise NotImplementedError("Implement sentiment analysis training")


def evaluate_sentiment_analysis(classifier, test_corpus, test_labels):
    """Evaluate classifier in the test corpus and report accuracy"""
    raise NotImplementedError("Implement sentiment analysis evaluation")


if __name__ == "__main__":
    # TODO: read Imdb corpus
    pos_train=read_samples(pos_train_dir)
    neg_train=read_samples(neg_train_dir)
    
    
    pos_test=read_samples(pos_test_dir)
    neg_test=read_samples(neg_test_dir)
    
    corpus,labels = create_corpus(pos_train,neg_train)
#     train_corpus, train_labels = create_corpus(pos_train,neg_train)
#     test_corpus, test_labels = create_corpus(pos_test,neg_test)
#     nbow_corpus = extract_nbow(corpus)
    (
            train_corpus,
            test_corpus,
            train_labels,
            test_labels,
    ) = sklearn.model_selection.train_test_split(corpus, labels)
#         p_corpus=extract_nbow(corpus)
        # TODO: train / evaluate and report accuracy


In [84]:
def token_proc(t_corpus):
    data=[]
    for i,ind in enumerate(t_corpus):
        proc_t_corpus=preproc_tok(train_corpus[i])
        data.append(proc_t_corpus)
    
    return data
        
        

proc_train_corpus=token_proc(train_corpus)


In [85]:
# print(proc_train_corpus[1])

In [86]:
# print(extract_nbow(proc_train_corpus[1]))

In [87]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf=TfidfVectorizer(strip_accents=None,lowercase=False,preprocessor=None,tokenizer=token_proc,use_idf=True,norm='l2',smooth_idf=True)


In [113]:
import pandas as pd


df = pd.DataFrame (corpus)
df.columns=['text']
df.insert(1, "label", labels, True)
print(df)

                                                   text  label
0     Over the last 20 years the majority of British...      0
1     The plot intellect is about as light as feathe...      1
2     Personnaly I really loved this movie, and it p...      1
3     I've just purchased the restored version of a ...      1
4     ''Ranma ½" is my favorite anime by Rumiko Taka...      1
...                                                 ...    ...
9995  to be honest, i didn't watch all of the origin...      0
9996  It is enjoyable and fast-paced. <br /><br />Th...      0
9997  If this movie were any worse, it would have be...      0
9998  Turned out to be a classy production with what...      1
9999  A shaky hand-held camera was used, presumably ...      0

[10000 rows x 2 columns]


In [116]:
from sklearn.feature_extraction.text import CountVectorizer
y=df.label.values
count=CountVectorizer()

# corpus_list=[[i] for i in corpus]
# print(corpus_list[0])
# print(token_proc(corpus))
x=count.fit_transform(df.text)
# print(y)
# # tfidf.get_feature_names_out()
# print(count.get_feature_names())

In [117]:
 (
            train_corpus,
            test_corpus,
            train_labels,
            test_labels,
    ) = sklearn.model_selection.train_test_split(df.text, df.label)
#         p_corpus=extract_nbow(corpus)

In [118]:
print(train_corpus)

4143    This remake of the 1962 orginal film'o the boo...
6934    An idiotic dentist finds out that his wife has...
5777    This movie is about a female rape victim/comic...
5586    on the contrary to the person listed above me ...
2591    Holy crap! What a terrible, terrible Spanish t...
                              ...                        
7087    Saw in on TV late last night. Yeah, I can hear...
5629    I am dumbfounded that I actually sat and watch...
4139    According to IMDb Takashi Miike's Master of Ho...
7944    I swear when I first saw this movie,I cried my...
2156    The story of Cinderella is one of my favorites...
Name: text, Length: 7500, dtype: object


In [None]:
from sklearn.linear_model import LogisticRegressionCV

clf=LogisticRegressionCV(cv=5,scoring='accuracy',random_state=42,n_jobs=-1,verbose=3,max_iter=500).fit(train_corpus,train_corpus)

y_pred = clf.predict(test_corpus)