In [4]:
import logging
import multiprocessing
import os

from gensim.models import Word2Vec
from gensim.models.callbacks import CallbackAny2Vec

# Enable gensim logging
logging.basicConfig(
    format="%(levelname)s - %(asctime)s: %(message)s",
    datefmt="%H:%M:%S",
    level=logging.INFO,
)


class W2VLossLogger(CallbackAny2Vec):
    """Callback to print loss after each epoch
    use by passing model.train(..., callbacks=[W2VLossLogger()])
    """

    def __init__(self):
        self.epoch = 0

    def on_epoch_end(self, model):
        loss = model.get_latest_training_loss()

        if self.epoch == 0:
            print("Loss after epoch {}: {}".format(self.epoch, loss))
        else:
            print(
                "Loss after epoch {}: {}".format(
                    self.epoch, loss - self.loss_previous_step
                )
            )
        self.epoch += 1
        self.loss_previous_step = loss


def train_w2v_model(
    sentences,
    output_file,
    window,
    embedding_dim,
    epochs,
    min_word_count,
):
    

    """Train a word2vec model based on given sentences.
    Args:
        sentences list[list[str]]: List of sentences. Each element contains a list with the words
            in the current sentence
        output_file (str): Path to save the trained w2v model
        window (int): w2v context size
        embedding_dim (int): w2v vector dimension
        epochs (int): How many epochs should the training run
        min_word_count (int): Ignore words that appear less than min_word_count times
    """
    workers = multiprocessing.cpu_count()
    
    # TODO: Instantiate gensim.models.Word2Vec class
    model = Word2Vec(sentences=sentences, vector_size=embedding_dim, window=window, min_count=min_word_count, workers=multiprocessing.cpu_count())
    model.build_vocab(sentences, progress_per=10000)
    # TODO: Build model vocabulary using sentences
    # TODO: Train word2vec model
    model.train(sentences, total_examples=model.corpus_count,epochs=epochs)
    # Save trained model
    model.save(output_file)
    # model.save(output_file)

    

    return model
    




In [2]:
    # read data/gutenberg.txt in the expected format (tokenized)
    f=open("../data/tokenized.txt","r")
    sentences =eval(f.read())
    
   
    output_file = "gutenberg_w2v.hundd.model"
    window = 5
    embedding_dim = 100
    epochs = 1000
    min_word_count = 1

    
    #Initialize training of our Word2Vec model
    
    gutenberg_w2v =train_w2v_model(
        sentences,
        output_file,
        window,
        embedding_dim,
        epochs,
        min_word_count)


INFO - 10:16:32: collecting all words and their counts
INFO - 10:16:32: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 10:16:32: PROGRESS: at sentence #10000, processed 113516 words, keeping 6065 word types
INFO - 10:16:32: PROGRESS: at sentence #20000, processed 227311 words, keeping 8492 word types
INFO - 10:16:32: PROGRESS: at sentence #30000, processed 324672 words, keeping 9841 word types
INFO - 10:16:32: PROGRESS: at sentence #40000, processed 427445 words, keeping 11779 word types
INFO - 10:16:32: PROGRESS: at sentence #50000, processed 537208 words, keeping 13110 word types
INFO - 10:16:32: PROGRESS: at sentence #60000, processed 647741 words, keeping 14312 word types
INFO - 10:16:32: PROGRESS: at sentence #70000, processed 751694 words, keeping 16043 word types
INFO - 10:16:32: PROGRESS: at sentence #80000, processed 847983 words, keeping 16837 word types
INFO - 10:16:32: PROGRESS: at sentence #90000, processed 957413 words, keeping 17332 word types
I

INFO - 10:16:41: Word2Vec lifecycle event {'msg': 'effective_min_count=1 retains 41465 unique words (100.00% of original 41465, drops 0)', 'datetime': '2023-04-08T10:16:41.471941', 'gensim': '4.3.1', 'python': '3.10.7 (main, Mar 10 2023, 10:47:39) [GCC 12.2.0]', 'platform': 'Linux-5.19.0-38-generic-x86_64-with-glibc2.36', 'event': 'prepare_vocab'}
INFO - 10:16:41: Word2Vec lifecycle event {'msg': 'effective_min_count=1 leaves 2135928 word corpus (100.00% of original 2135928, drops 0)', 'datetime': '2023-04-08T10:16:41.472694', 'gensim': '4.3.1', 'python': '3.10.7 (main, Mar 10 2023, 10:47:39) [GCC 12.2.0]', 'platform': 'Linux-5.19.0-38-generic-x86_64-with-glibc2.36', 'event': 'prepare_vocab'}
INFO - 10:16:41: deleting the raw counts dictionary of 41465 items
INFO - 10:16:41: sample=0.001 downsamples 56 most-common words
INFO - 10:16:41: Word2Vec lifecycle event {'msg': 'downsampling leaves estimated 1561117.1381724994 word corpus (73.1%% of prior 2135928)', 'datetime': '2023-04-08T10:1

INFO - 10:17:22: EPOCH 28 - PROGRESS: at 78.78% examples, 1260407 words/s, in_qsize 12, out_qsize 3
INFO - 10:17:22: EPOCH 28: training on 2135928 raw words (1562135 effective words) took 1.2s, 1310786 effective words/s
INFO - 10:17:23: EPOCH 29 - PROGRESS: at 79.65% examples, 1264174 words/s, in_qsize 15, out_qsize 0
INFO - 10:17:23: EPOCH 29: training on 2135928 raw words (1560997 effective words) took 1.2s, 1298563 effective words/s
INFO - 10:17:24: EPOCH 30 - PROGRESS: at 73.12% examples, 1161485 words/s, in_qsize 15, out_qsize 0
INFO - 10:17:25: EPOCH 30: training on 2135928 raw words (1561045 effective words) took 1.4s, 1147408 effective words/s
INFO - 10:17:26: EPOCH 31 - PROGRESS: at 74.84% examples, 1194629 words/s, in_qsize 15, out_qsize 0
INFO - 10:17:26: EPOCH 31: training on 2135928 raw words (1560813 effective words) took 1.3s, 1174523 effective words/s
INFO - 10:17:27: EPOCH 32 - PROGRESS: at 76.57% examples, 1225134 words/s, in_qsize 15, out_qsize 0
INFO - 10:17:27: EPO

INFO - 10:18:14: EPOCH 65: training on 2135928 raw words (1561242 effective words) took 1.5s, 1071734 effective words/s
INFO - 10:18:15: EPOCH 66 - PROGRESS: at 56.20% examples, 871932 words/s, in_qsize 13, out_qsize 2
INFO - 10:18:15: EPOCH 66: training on 2135928 raw words (1560984 effective words) took 1.5s, 1018641 effective words/s
INFO - 10:18:16: EPOCH 67 - PROGRESS: at 57.99% examples, 907394 words/s, in_qsize 13, out_qsize 1
INFO - 10:18:17: EPOCH 67: training on 2135928 raw words (1562435 effective words) took 1.6s, 1000086 effective words/s
INFO - 10:18:18: EPOCH 68 - PROGRESS: at 69.33% examples, 1089089 words/s, in_qsize 15, out_qsize 0
INFO - 10:18:18: EPOCH 68: training on 2135928 raw words (1560887 effective words) took 1.4s, 1109965 effective words/s
INFO - 10:18:19: EPOCH 69 - PROGRESS: at 68.93% examples, 1077177 words/s, in_qsize 15, out_qsize 0
INFO - 10:18:20: EPOCH 69: training on 2135928 raw words (1561865 effective words) took 1.4s, 1108921 effective words/s
IN

INFO - 10:19:05: EPOCH 103 - PROGRESS: at 60.27% examples, 924624 words/s, in_qsize 15, out_qsize 0
INFO - 10:19:06: EPOCH 103: training on 2135928 raw words (1561281 effective words) took 1.6s, 968181 effective words/s
INFO - 10:19:07: EPOCH 104 - PROGRESS: at 52.58% examples, 817770 words/s, in_qsize 16, out_qsize 0
INFO - 10:19:08: EPOCH 104: training on 2135928 raw words (1561314 effective words) took 1.8s, 887457 effective words/s
INFO - 10:19:09: EPOCH 105 - PROGRESS: at 53.94% examples, 837678 words/s, in_qsize 15, out_qsize 0
INFO - 10:19:09: EPOCH 105: training on 2135928 raw words (1560890 effective words) took 1.8s, 863326 effective words/s
INFO - 10:19:11: EPOCH 106 - PROGRESS: at 49.80% examples, 770164 words/s, in_qsize 16, out_qsize 7
INFO - 10:19:11: EPOCH 106: training on 2135928 raw words (1560644 effective words) took 2.0s, 791423 effective words/s
INFO - 10:19:13: EPOCH 107 - PROGRESS: at 49.80% examples, 759466 words/s, in_qsize 14, out_qsize 1
INFO - 10:19:13: EPO

INFO - 10:19:56: EPOCH 140 - PROGRESS: at 80.07% examples, 1279265 words/s, in_qsize 15, out_qsize 0
INFO - 10:19:56: EPOCH 140: training on 2135928 raw words (1561155 effective words) took 1.2s, 1326431 effective words/s
INFO - 10:19:57: EPOCH 141 - PROGRESS: at 79.64% examples, 1273489 words/s, in_qsize 14, out_qsize 1
INFO - 10:19:57: EPOCH 141: training on 2135928 raw words (1561436 effective words) took 1.2s, 1303591 effective words/s
INFO - 10:19:58: EPOCH 142 - PROGRESS: at 78.78% examples, 1250529 words/s, in_qsize 15, out_qsize 0
INFO - 10:19:58: EPOCH 142: training on 2135928 raw words (1561082 effective words) took 1.2s, 1291700 effective words/s
INFO - 10:19:59: EPOCH 143 - PROGRESS: at 80.95% examples, 1300759 words/s, in_qsize 15, out_qsize 0
INFO - 10:19:59: EPOCH 143: training on 2135928 raw words (1560827 effective words) took 1.2s, 1322456 effective words/s
INFO - 10:20:00: EPOCH 144 - PROGRESS: at 80.95% examples, 1298962 words/s, in_qsize 15, out_qsize 0
INFO - 10:2

INFO - 10:20:40: EPOCH 177 - PROGRESS: at 81.82% examples, 1313278 words/s, in_qsize 15, out_qsize 0
INFO - 10:20:40: EPOCH 177: training on 2135928 raw words (1560922 effective words) took 1.2s, 1330985 effective words/s
INFO - 10:20:41: EPOCH 178 - PROGRESS: at 81.38% examples, 1302735 words/s, in_qsize 16, out_qsize 1
INFO - 10:20:41: EPOCH 178: training on 2135928 raw words (1560768 effective words) took 1.2s, 1341553 effective words/s
INFO - 10:20:42: EPOCH 179 - PROGRESS: at 80.51% examples, 1286572 words/s, in_qsize 14, out_qsize 1
INFO - 10:20:42: EPOCH 179: training on 2135928 raw words (1560553 effective words) took 1.2s, 1330553 effective words/s
INFO - 10:20:43: EPOCH 180 - PROGRESS: at 81.37% examples, 1307000 words/s, in_qsize 15, out_qsize 0
INFO - 10:20:44: EPOCH 180: training on 2135928 raw words (1561520 effective words) took 1.2s, 1343317 effective words/s
INFO - 10:20:45: EPOCH 181 - PROGRESS: at 78.78% examples, 1253986 words/s, in_qsize 13, out_qsize 2
INFO - 10:2

INFO - 10:21:24: EPOCH 214 - PROGRESS: at 80.08% examples, 1280952 words/s, in_qsize 15, out_qsize 0
INFO - 10:21:24: EPOCH 214: training on 2135928 raw words (1561313 effective words) took 1.2s, 1313310 effective words/s
INFO - 10:21:25: EPOCH 215 - PROGRESS: at 79.65% examples, 1277745 words/s, in_qsize 15, out_qsize 0
INFO - 10:21:25: EPOCH 215: training on 2135928 raw words (1561107 effective words) took 1.2s, 1303215 effective words/s
INFO - 10:21:26: EPOCH 216 - PROGRESS: at 80.95% examples, 1293834 words/s, in_qsize 14, out_qsize 1
INFO - 10:21:27: EPOCH 216: training on 2135928 raw words (1561568 effective words) took 1.2s, 1338939 effective words/s
INFO - 10:21:28: EPOCH 217 - PROGRESS: at 79.64% examples, 1278050 words/s, in_qsize 12, out_qsize 3
INFO - 10:21:28: EPOCH 217: training on 2135928 raw words (1561425 effective words) took 1.2s, 1324318 effective words/s
INFO - 10:21:29: EPOCH 218 - PROGRESS: at 79.21% examples, 1266073 words/s, in_qsize 15, out_qsize 0
INFO - 10:2

INFO - 10:22:08: EPOCH 251 - PROGRESS: at 81.37% examples, 1291887 words/s, in_qsize 15, out_qsize 0
INFO - 10:22:08: EPOCH 251: training on 2135928 raw words (1561049 effective words) took 1.2s, 1325702 effective words/s
INFO - 10:22:09: EPOCH 252 - PROGRESS: at 79.64% examples, 1274908 words/s, in_qsize 15, out_qsize 0
INFO - 10:22:09: EPOCH 252: training on 2135928 raw words (1561736 effective words) took 1.2s, 1323837 effective words/s
INFO - 10:22:10: EPOCH 253 - PROGRESS: at 80.95% examples, 1295162 words/s, in_qsize 14, out_qsize 1
INFO - 10:22:11: EPOCH 253: training on 2135928 raw words (1560981 effective words) took 1.2s, 1320075 effective words/s
INFO - 10:22:12: EPOCH 254 - PROGRESS: at 80.07% examples, 1259468 words/s, in_qsize 15, out_qsize 0
INFO - 10:22:12: EPOCH 254: training on 2135928 raw words (1560773 effective words) took 1.2s, 1314744 effective words/s
INFO - 10:22:13: EPOCH 255 - PROGRESS: at 80.51% examples, 1282688 words/s, in_qsize 15, out_qsize 0
INFO - 10:2

INFO - 10:22:52: EPOCH 288 - PROGRESS: at 80.08% examples, 1279917 words/s, in_qsize 15, out_qsize 0
INFO - 10:22:52: EPOCH 288: training on 2135928 raw words (1562078 effective words) took 1.2s, 1317128 effective words/s
INFO - 10:22:53: EPOCH 289 - PROGRESS: at 80.08% examples, 1284136 words/s, in_qsize 15, out_qsize 0
INFO - 10:22:54: EPOCH 289: training on 2135928 raw words (1560862 effective words) took 1.2s, 1313261 effective words/s
INFO - 10:22:55: EPOCH 290 - PROGRESS: at 79.65% examples, 1276746 words/s, in_qsize 16, out_qsize 0
INFO - 10:22:55: EPOCH 290: training on 2135928 raw words (1561558 effective words) took 1.2s, 1305452 effective words/s
INFO - 10:22:56: EPOCH 291 - PROGRESS: at 79.64% examples, 1275755 words/s, in_qsize 15, out_qsize 0
INFO - 10:22:56: EPOCH 291: training on 2135928 raw words (1560680 effective words) took 1.2s, 1318250 effective words/s
INFO - 10:22:57: EPOCH 292 - PROGRESS: at 81.37% examples, 1305410 words/s, in_qsize 16, out_qsize 0
INFO - 10:2

INFO - 10:23:36: EPOCH 325 - PROGRESS: at 81.81% examples, 1304830 words/s, in_qsize 16, out_qsize 0
INFO - 10:23:37: EPOCH 325: training on 2135928 raw words (1561540 effective words) took 1.2s, 1325131 effective words/s
INFO - 10:23:38: EPOCH 326 - PROGRESS: at 80.93% examples, 1299177 words/s, in_qsize 15, out_qsize 0
INFO - 10:23:38: EPOCH 326: training on 2135928 raw words (1561025 effective words) took 1.2s, 1323398 effective words/s
INFO - 10:23:39: EPOCH 327 - PROGRESS: at 78.34% examples, 1242975 words/s, in_qsize 16, out_qsize 0
INFO - 10:23:39: EPOCH 327: training on 2135928 raw words (1560850 effective words) took 1.2s, 1286228 effective words/s
INFO - 10:23:40: EPOCH 328 - PROGRESS: at 80.07% examples, 1275125 words/s, in_qsize 13, out_qsize 2
INFO - 10:23:40: EPOCH 328: training on 2135928 raw words (1561476 effective words) took 1.2s, 1314930 effective words/s
INFO - 10:23:41: EPOCH 329 - PROGRESS: at 78.78% examples, 1259373 words/s, in_qsize 13, out_qsize 2
INFO - 10:2

INFO - 10:24:21: EPOCH 362 - PROGRESS: at 80.08% examples, 1282663 words/s, in_qsize 16, out_qsize 1
INFO - 10:24:21: EPOCH 362: training on 2135928 raw words (1560874 effective words) took 1.2s, 1317830 effective words/s
INFO - 10:24:22: EPOCH 363 - PROGRESS: at 81.37% examples, 1297082 words/s, in_qsize 14, out_qsize 1
INFO - 10:24:22: EPOCH 363: training on 2135928 raw words (1561765 effective words) took 1.2s, 1329672 effective words/s
INFO - 10:24:23: EPOCH 364 - PROGRESS: at 79.21% examples, 1270774 words/s, in_qsize 14, out_qsize 1
INFO - 10:24:23: EPOCH 364: training on 2135928 raw words (1561604 effective words) took 1.2s, 1315125 effective words/s
INFO - 10:24:25: EPOCH 365 - PROGRESS: at 81.37% examples, 1302001 words/s, in_qsize 16, out_qsize 0
INFO - 10:24:25: EPOCH 365: training on 2135928 raw words (1561255 effective words) took 1.2s, 1328035 effective words/s
INFO - 10:24:26: EPOCH 366 - PROGRESS: at 81.38% examples, 1299260 words/s, in_qsize 16, out_qsize 0
INFO - 10:2

INFO - 10:25:05: EPOCH 399 - PROGRESS: at 81.38% examples, 1298168 words/s, in_qsize 15, out_qsize 0
INFO - 10:25:05: EPOCH 399: training on 2135928 raw words (1561483 effective words) took 1.2s, 1320924 effective words/s
INFO - 10:25:06: EPOCH 400 - PROGRESS: at 80.08% examples, 1285192 words/s, in_qsize 15, out_qsize 0
INFO - 10:25:06: EPOCH 400: training on 2135928 raw words (1561734 effective words) took 1.2s, 1307699 effective words/s
INFO - 10:25:07: EPOCH 401 - PROGRESS: at 79.65% examples, 1279497 words/s, in_qsize 14, out_qsize 1
INFO - 10:25:08: EPOCH 401: training on 2135928 raw words (1562174 effective words) took 1.2s, 1322572 effective words/s
INFO - 10:25:09: EPOCH 402 - PROGRESS: at 80.94% examples, 1289497 words/s, in_qsize 16, out_qsize 1
INFO - 10:25:09: EPOCH 402: training on 2135928 raw words (1560833 effective words) took 1.2s, 1323257 effective words/s
INFO - 10:25:10: EPOCH 403 - PROGRESS: at 80.95% examples, 1298422 words/s, in_qsize 15, out_qsize 0
INFO - 10:2

INFO - 10:25:49: EPOCH 436 - PROGRESS: at 80.94% examples, 1279722 words/s, in_qsize 15, out_qsize 0
INFO - 10:25:49: EPOCH 436: training on 2135928 raw words (1561296 effective words) took 1.2s, 1338808 effective words/s
INFO - 10:25:50: EPOCH 437 - PROGRESS: at 81.37% examples, 1306382 words/s, in_qsize 16, out_qsize 0
INFO - 10:25:51: EPOCH 437: training on 2135928 raw words (1561121 effective words) took 1.2s, 1326017 effective words/s
INFO - 10:25:52: EPOCH 438 - PROGRESS: at 78.34% examples, 1253470 words/s, in_qsize 16, out_qsize 2
INFO - 10:25:52: EPOCH 438: training on 2135928 raw words (1560765 effective words) took 1.2s, 1301741 effective words/s
INFO - 10:25:53: EPOCH 439 - PROGRESS: at 81.81% examples, 1313948 words/s, in_qsize 15, out_qsize 0
INFO - 10:25:53: EPOCH 439: training on 2135928 raw words (1560571 effective words) took 1.2s, 1343976 effective words/s
INFO - 10:25:54: EPOCH 440 - PROGRESS: at 80.94% examples, 1282479 words/s, in_qsize 15, out_qsize 0
INFO - 10:2

INFO - 10:26:33: EPOCH 473 - PROGRESS: at 78.35% examples, 1250822 words/s, in_qsize 15, out_qsize 0
INFO - 10:26:34: EPOCH 473: training on 2135928 raw words (1561808 effective words) took 1.2s, 1289436 effective words/s
INFO - 10:26:35: EPOCH 474 - PROGRESS: at 80.07% examples, 1275130 words/s, in_qsize 15, out_qsize 2
INFO - 10:26:35: EPOCH 474: training on 2135928 raw words (1561355 effective words) took 1.2s, 1313957 effective words/s
INFO - 10:26:36: EPOCH 475 - PROGRESS: at 80.95% examples, 1297053 words/s, in_qsize 15, out_qsize 0
INFO - 10:26:36: EPOCH 475: training on 2135928 raw words (1561454 effective words) took 1.2s, 1336236 effective words/s
INFO - 10:26:37: EPOCH 476 - PROGRESS: at 78.78% examples, 1261916 words/s, in_qsize 15, out_qsize 0
INFO - 10:26:37: EPOCH 476: training on 2135928 raw words (1561161 effective words) took 1.2s, 1308806 effective words/s
INFO - 10:26:38: EPOCH 477 - PROGRESS: at 80.08% examples, 1283009 words/s, in_qsize 15, out_qsize 0
INFO - 10:2

INFO - 10:27:17: EPOCH 510 - PROGRESS: at 82.25% examples, 1316754 words/s, in_qsize 16, out_qsize 0
INFO - 10:27:18: EPOCH 510: training on 2135928 raw words (1561622 effective words) took 1.2s, 1341472 effective words/s
INFO - 10:27:19: EPOCH 511 - PROGRESS: at 80.08% examples, 1284151 words/s, in_qsize 12, out_qsize 3
INFO - 10:27:19: EPOCH 511: training on 2135928 raw words (1560445 effective words) took 1.2s, 1315936 effective words/s
INFO - 10:27:20: EPOCH 512 - PROGRESS: at 80.51% examples, 1292909 words/s, in_qsize 15, out_qsize 0
INFO - 10:27:20: EPOCH 512: training on 2135928 raw words (1561170 effective words) took 1.2s, 1330778 effective words/s
INFO - 10:27:21: EPOCH 513 - PROGRESS: at 80.95% examples, 1291909 words/s, in_qsize 16, out_qsize 1
INFO - 10:27:21: EPOCH 513: training on 2135928 raw words (1561404 effective words) took 1.2s, 1324173 effective words/s
INFO - 10:27:22: EPOCH 514 - PROGRESS: at 79.65% examples, 1272625 words/s, in_qsize 14, out_qsize 1
INFO - 10:2

INFO - 10:28:01: EPOCH 547 - PROGRESS: at 80.51% examples, 1292564 words/s, in_qsize 16, out_qsize 0
INFO - 10:28:02: EPOCH 547: training on 2135928 raw words (1561646 effective words) took 1.2s, 1333576 effective words/s
INFO - 10:28:03: EPOCH 548 - PROGRESS: at 80.93% examples, 1288486 words/s, in_qsize 15, out_qsize 0
INFO - 10:28:03: EPOCH 548: training on 2135928 raw words (1561631 effective words) took 1.2s, 1324386 effective words/s
INFO - 10:28:04: EPOCH 549 - PROGRESS: at 79.64% examples, 1275361 words/s, in_qsize 16, out_qsize 1
INFO - 10:28:04: EPOCH 549: training on 2135928 raw words (1562065 effective words) took 1.2s, 1316846 effective words/s
INFO - 10:28:05: EPOCH 550 - PROGRESS: at 77.42% examples, 1235698 words/s, in_qsize 12, out_qsize 3
INFO - 10:28:05: EPOCH 550: training on 2135928 raw words (1561267 effective words) took 1.2s, 1294662 effective words/s
INFO - 10:28:06: EPOCH 551 - PROGRESS: at 81.37% examples, 1280566 words/s, in_qsize 14, out_qsize 2
INFO - 10:2

INFO - 10:28:46: EPOCH 584 - PROGRESS: at 81.37% examples, 1302905 words/s, in_qsize 16, out_qsize 0
INFO - 10:28:46: EPOCH 584: training on 2135928 raw words (1560034 effective words) took 1.2s, 1304094 effective words/s
INFO - 10:28:47: EPOCH 585 - PROGRESS: at 80.51% examples, 1288730 words/s, in_qsize 14, out_qsize 1
INFO - 10:28:47: EPOCH 585: training on 2135928 raw words (1561105 effective words) took 1.2s, 1326593 effective words/s
INFO - 10:28:48: EPOCH 586 - PROGRESS: at 80.08% examples, 1283183 words/s, in_qsize 15, out_qsize 0
INFO - 10:28:48: EPOCH 586: training on 2135928 raw words (1561055 effective words) took 1.2s, 1309464 effective words/s
INFO - 10:28:49: EPOCH 587 - PROGRESS: at 80.51% examples, 1281820 words/s, in_qsize 16, out_qsize 1
INFO - 10:28:49: EPOCH 587: training on 2135928 raw words (1560667 effective words) took 1.2s, 1328391 effective words/s
INFO - 10:28:50: EPOCH 588 - PROGRESS: at 79.64% examples, 1276665 words/s, in_qsize 16, out_qsize 3
INFO - 10:2

INFO - 10:29:30: EPOCH 621 - PROGRESS: at 81.38% examples, 1277631 words/s, in_qsize 15, out_qsize 0
INFO - 10:29:30: EPOCH 621: training on 2135928 raw words (1560229 effective words) took 1.2s, 1324861 effective words/s
INFO - 10:29:31: EPOCH 622 - PROGRESS: at 80.07% examples, 1278467 words/s, in_qsize 15, out_qsize 0
INFO - 10:29:31: EPOCH 622: training on 2135928 raw words (1561451 effective words) took 1.2s, 1329343 effective words/s
INFO - 10:29:32: EPOCH 623 - PROGRESS: at 79.64% examples, 1276643 words/s, in_qsize 16, out_qsize 0
INFO - 10:29:32: EPOCH 623: training on 2135928 raw words (1560936 effective words) took 1.2s, 1302247 effective words/s
INFO - 10:29:33: EPOCH 624 - PROGRESS: at 80.08% examples, 1284698 words/s, in_qsize 14, out_qsize 1
INFO - 10:29:34: EPOCH 624: training on 2135928 raw words (1560928 effective words) took 1.2s, 1330659 effective words/s
INFO - 10:29:35: EPOCH 625 - PROGRESS: at 80.51% examples, 1276467 words/s, in_qsize 15, out_qsize 0
INFO - 10:2

INFO - 10:30:14: EPOCH 658 - PROGRESS: at 80.52% examples, 1279781 words/s, in_qsize 15, out_qsize 0
INFO - 10:30:14: EPOCH 658: training on 2135928 raw words (1560591 effective words) took 1.2s, 1316930 effective words/s
INFO - 10:30:15: EPOCH 659 - PROGRESS: at 79.21% examples, 1263049 words/s, in_qsize 16, out_qsize 1
INFO - 10:30:15: EPOCH 659: training on 2135928 raw words (1560990 effective words) took 1.2s, 1304612 effective words/s
INFO - 10:30:16: EPOCH 660 - PROGRESS: at 79.64% examples, 1269751 words/s, in_qsize 14, out_qsize 1
INFO - 10:30:17: EPOCH 660: training on 2135928 raw words (1560966 effective words) took 1.2s, 1295676 effective words/s
INFO - 10:30:18: EPOCH 661 - PROGRESS: at 80.50% examples, 1283453 words/s, in_qsize 16, out_qsize 0
INFO - 10:30:18: EPOCH 661: training on 2135928 raw words (1561528 effective words) took 1.2s, 1318786 effective words/s
INFO - 10:30:19: EPOCH 662 - PROGRESS: at 79.21% examples, 1258601 words/s, in_qsize 15, out_qsize 0
INFO - 10:3

INFO - 10:30:58: EPOCH 695 - PROGRESS: at 81.37% examples, 1303195 words/s, in_qsize 15, out_qsize 0
INFO - 10:30:58: EPOCH 695: training on 2135928 raw words (1560417 effective words) took 1.2s, 1317702 effective words/s
INFO - 10:30:59: EPOCH 696 - PROGRESS: at 78.78% examples, 1263429 words/s, in_qsize 15, out_qsize 0
INFO - 10:31:00: EPOCH 696: training on 2135928 raw words (1561114 effective words) took 1.2s, 1298849 effective words/s
INFO - 10:31:01: EPOCH 697 - PROGRESS: at 81.81% examples, 1315473 words/s, in_qsize 15, out_qsize 0
INFO - 10:31:01: EPOCH 697: training on 2135928 raw words (1560682 effective words) took 1.2s, 1326935 effective words/s
INFO - 10:31:02: EPOCH 698 - PROGRESS: at 79.65% examples, 1274059 words/s, in_qsize 12, out_qsize 3
INFO - 10:31:02: EPOCH 698: training on 2135928 raw words (1561765 effective words) took 1.2s, 1317171 effective words/s
INFO - 10:31:03: EPOCH 699 - PROGRESS: at 80.09% examples, 1280280 words/s, in_qsize 16, out_qsize 0
INFO - 10:3

INFO - 10:31:42: EPOCH 732 - PROGRESS: at 80.51% examples, 1289544 words/s, in_qsize 15, out_qsize 0
INFO - 10:31:43: EPOCH 732: training on 2135928 raw words (1560952 effective words) took 1.2s, 1325863 effective words/s
INFO - 10:31:44: EPOCH 733 - PROGRESS: at 80.08% examples, 1278486 words/s, in_qsize 15, out_qsize 0
INFO - 10:31:44: EPOCH 733: training on 2135928 raw words (1561037 effective words) took 1.2s, 1312235 effective words/s
INFO - 10:31:45: EPOCH 734 - PROGRESS: at 81.37% examples, 1279075 words/s, in_qsize 16, out_qsize 1
INFO - 10:31:45: EPOCH 734: training on 2135928 raw words (1560946 effective words) took 1.2s, 1319862 effective words/s
INFO - 10:31:46: EPOCH 735 - PROGRESS: at 79.64% examples, 1274176 words/s, in_qsize 13, out_qsize 2
INFO - 10:31:46: EPOCH 735: training on 2135928 raw words (1561195 effective words) took 1.2s, 1314519 effective words/s
INFO - 10:31:47: EPOCH 736 - PROGRESS: at 80.07% examples, 1265118 words/s, in_qsize 15, out_qsize 0
INFO - 10:3

INFO - 10:32:27: EPOCH 769 - PROGRESS: at 79.65% examples, 1273270 words/s, in_qsize 16, out_qsize 2
INFO - 10:32:27: EPOCH 769: training on 2135928 raw words (1560850 effective words) took 1.2s, 1313169 effective words/s
INFO - 10:32:28: EPOCH 770 - PROGRESS: at 80.51% examples, 1284252 words/s, in_qsize 12, out_qsize 3
INFO - 10:32:28: EPOCH 770: training on 2135928 raw words (1560957 effective words) took 1.2s, 1330804 effective words/s
INFO - 10:32:29: EPOCH 771 - PROGRESS: at 81.37% examples, 1305712 words/s, in_qsize 15, out_qsize 0
INFO - 10:32:29: EPOCH 771: training on 2135928 raw words (1560804 effective words) took 1.2s, 1336137 effective words/s
INFO - 10:32:30: EPOCH 772 - PROGRESS: at 80.07% examples, 1278864 words/s, in_qsize 16, out_qsize 1
INFO - 10:32:30: EPOCH 772: training on 2135928 raw words (1561426 effective words) took 1.2s, 1312187 effective words/s
INFO - 10:32:31: EPOCH 773 - PROGRESS: at 80.08% examples, 1283777 words/s, in_qsize 16, out_qsize 0
INFO - 10:3

INFO - 10:33:11: EPOCH 806 - PROGRESS: at 79.65% examples, 1272275 words/s, in_qsize 14, out_qsize 1
INFO - 10:33:11: EPOCH 806: training on 2135928 raw words (1561019 effective words) took 1.2s, 1325782 effective words/s
INFO - 10:33:12: EPOCH 807 - PROGRESS: at 79.21% examples, 1262975 words/s, in_qsize 15, out_qsize 0
INFO - 10:33:12: EPOCH 807: training on 2135928 raw words (1560503 effective words) took 1.2s, 1282277 effective words/s
INFO - 10:33:13: EPOCH 808 - PROGRESS: at 79.64% examples, 1263520 words/s, in_qsize 14, out_qsize 1
INFO - 10:33:14: EPOCH 808: training on 2135928 raw words (1561219 effective words) took 1.2s, 1300704 effective words/s
INFO - 10:33:15: EPOCH 809 - PROGRESS: at 79.21% examples, 1259126 words/s, in_qsize 16, out_qsize 0
INFO - 10:33:15: EPOCH 809: training on 2135928 raw words (1560709 effective words) took 1.2s, 1303796 effective words/s
INFO - 10:33:16: EPOCH 810 - PROGRESS: at 79.64% examples, 1269017 words/s, in_qsize 14, out_qsize 1
INFO - 10:3

INFO - 10:33:55: EPOCH 843 - PROGRESS: at 79.64% examples, 1277853 words/s, in_qsize 13, out_qsize 2
INFO - 10:33:55: EPOCH 843: training on 2135928 raw words (1560981 effective words) took 1.2s, 1324358 effective words/s
INFO - 10:33:56: EPOCH 844 - PROGRESS: at 80.08% examples, 1275517 words/s, in_qsize 15, out_qsize 0
INFO - 10:33:57: EPOCH 844: training on 2135928 raw words (1561454 effective words) took 1.2s, 1309957 effective words/s
INFO - 10:33:58: EPOCH 845 - PROGRESS: at 78.78% examples, 1256372 words/s, in_qsize 15, out_qsize 0
INFO - 10:33:58: EPOCH 845: training on 2135928 raw words (1561121 effective words) took 1.2s, 1295479 effective words/s
INFO - 10:33:59: EPOCH 846 - PROGRESS: at 80.07% examples, 1278205 words/s, in_qsize 14, out_qsize 1
INFO - 10:33:59: EPOCH 846: training on 2135928 raw words (1561188 effective words) took 1.2s, 1328572 effective words/s
INFO - 10:34:00: EPOCH 847 - PROGRESS: at 80.08% examples, 1285333 words/s, in_qsize 14, out_qsize 1
INFO - 10:3

INFO - 10:34:40: EPOCH 880 - PROGRESS: at 80.51% examples, 1282562 words/s, in_qsize 15, out_qsize 0
INFO - 10:34:40: EPOCH 880: training on 2135928 raw words (1561659 effective words) took 1.2s, 1325455 effective words/s
INFO - 10:34:41: EPOCH 881 - PROGRESS: at 80.51% examples, 1276450 words/s, in_qsize 16, out_qsize 0
INFO - 10:34:41: EPOCH 881: training on 2135928 raw words (1562168 effective words) took 1.2s, 1324744 effective words/s
INFO - 10:34:42: EPOCH 882 - PROGRESS: at 80.08% examples, 1285663 words/s, in_qsize 15, out_qsize 0
INFO - 10:34:42: EPOCH 882: training on 2135928 raw words (1561639 effective words) took 1.2s, 1309478 effective words/s
INFO - 10:34:43: EPOCH 883 - PROGRESS: at 80.07% examples, 1265388 words/s, in_qsize 15, out_qsize 0
INFO - 10:34:43: EPOCH 883: training on 2135928 raw words (1561348 effective words) took 1.2s, 1313451 effective words/s
INFO - 10:34:44: EPOCH 884 - PROGRESS: at 79.21% examples, 1270424 words/s, in_qsize 15, out_qsize 0
INFO - 10:3

INFO - 10:35:24: EPOCH 917 - PROGRESS: at 81.82% examples, 1269850 words/s, in_qsize 15, out_qsize 0
INFO - 10:35:24: EPOCH 917: training on 2135928 raw words (1561480 effective words) took 1.2s, 1317161 effective words/s
INFO - 10:35:25: EPOCH 918 - PROGRESS: at 80.51% examples, 1286308 words/s, in_qsize 15, out_qsize 0
INFO - 10:35:25: EPOCH 918: training on 2135928 raw words (1561642 effective words) took 1.2s, 1324724 effective words/s
INFO - 10:35:26: EPOCH 919 - PROGRESS: at 77.91% examples, 1239868 words/s, in_qsize 15, out_qsize 0
INFO - 10:35:26: EPOCH 919: training on 2135928 raw words (1562024 effective words) took 1.2s, 1287759 effective words/s
INFO - 10:35:27: EPOCH 920 - PROGRESS: at 79.22% examples, 1270336 words/s, in_qsize 15, out_qsize 0
INFO - 10:35:28: EPOCH 920: training on 2135928 raw words (1561241 effective words) took 1.2s, 1313762 effective words/s
INFO - 10:35:29: EPOCH 921 - PROGRESS: at 78.34% examples, 1252412 words/s, in_qsize 16, out_qsize 1
INFO - 10:3

INFO - 10:36:08: EPOCH 954 - PROGRESS: at 79.64% examples, 1267285 words/s, in_qsize 15, out_qsize 0
INFO - 10:36:08: EPOCH 954: training on 2135928 raw words (1561387 effective words) took 1.2s, 1318771 effective words/s
INFO - 10:36:09: EPOCH 955 - PROGRESS: at 81.81% examples, 1304111 words/s, in_qsize 16, out_qsize 0
INFO - 10:36:10: EPOCH 955: training on 2135928 raw words (1561410 effective words) took 1.2s, 1331855 effective words/s
INFO - 10:36:11: EPOCH 956 - PROGRESS: at 80.51% examples, 1280206 words/s, in_qsize 13, out_qsize 2
INFO - 10:36:11: EPOCH 956: training on 2135928 raw words (1560965 effective words) took 1.2s, 1313368 effective words/s
INFO - 10:36:12: EPOCH 957 - PROGRESS: at 80.93% examples, 1297585 words/s, in_qsize 15, out_qsize 0
INFO - 10:36:12: EPOCH 957: training on 2135928 raw words (1560855 effective words) took 1.2s, 1309414 effective words/s
INFO - 10:36:13: EPOCH 958 - PROGRESS: at 80.95% examples, 1300888 words/s, in_qsize 15, out_qsize 0
INFO - 10:3

INFO - 10:36:53: EPOCH 991 - PROGRESS: at 79.22% examples, 1269019 words/s, in_qsize 14, out_qsize 3
INFO - 10:36:53: EPOCH 991: training on 2135928 raw words (1560862 effective words) took 1.2s, 1308319 effective words/s
INFO - 10:36:54: EPOCH 992 - PROGRESS: at 81.38% examples, 1296664 words/s, in_qsize 15, out_qsize 0
INFO - 10:36:54: EPOCH 992: training on 2135928 raw words (1559832 effective words) took 1.2s, 1334004 effective words/s
INFO - 10:36:55: EPOCH 993 - PROGRESS: at 80.94% examples, 1277644 words/s, in_qsize 15, out_qsize 0
INFO - 10:36:55: EPOCH 993: training on 2135928 raw words (1561120 effective words) took 1.2s, 1316156 effective words/s
INFO - 10:36:56: EPOCH 994 - PROGRESS: at 79.64% examples, 1272116 words/s, in_qsize 12, out_qsize 3
INFO - 10:36:56: EPOCH 994: training on 2135928 raw words (1561200 effective words) took 1.2s, 1316316 effective words/s
INFO - 10:36:57: EPOCH 995 - PROGRESS: at 80.08% examples, 1284289 words/s, in_qsize 16, out_qsize 0
INFO - 10:3

In [3]:
from gensim.models import Word2Vec
from gensim.models.callbacks import CallbackAny2Vec

gutenberg_w2v = Word2Vec.load("gutenberg_w2v.hundd.model")

INFO - 10:37:02: loading Word2Vec object from gutenberg_w2v.hundd.model
INFO - 10:37:02: loading wv recursively from gutenberg_w2v.hundd.model.wv.* with mmap=None
INFO - 10:37:02: setting ignored attribute cum_table to None
INFO - 10:37:03: Word2Vec lifecycle event {'fname': 'gutenberg_w2v.hundd.model', 'datetime': '2023-04-08T10:37:03.199430', 'gensim': '4.3.1', 'python': '3.10.7 (main, Mar 10 2023, 10:47:39) [GCC 12.2.0]', 'platform': 'Linux-5.19.0-38-generic-x86_64-with-glibc2.36', 'event': 'loaded'}


In [4]:
gutenberg_w2v.wv.most_similar(["bible"])


[('wrist', 0.4212106764316559),
 ('auger', 0.3718101978302002),
 ('drummers', 0.36828792095184326),
 ('alp', 0.36577168107032776),
 ('starless', 0.35932689905166626),
 ('loe', 0.35318851470947266),
 ('dreamer', 0.3490268886089325),
 ('stricter', 0.3463912308216095),
 ('theme', 0.3424712121486664),
 ('figured', 0.33751749992370605)]

In [5]:
gutenberg_w2v.wv.most_similar(["book"])


[('temple', 0.4853457510471344),
 ('written', 0.46804577112197876),
 ('letter', 0.45874887704849243),
 ('commandment', 0.434907466173172),
 ('xxviii', 0.43481698632240295),
 ('xxxi', 0.43277469277381897),
 ('xxv', 0.4307113587856293),
 ('law', 0.4304876923561096),
 ('xxvii', 0.4301656186580658),
 ('xxvi', 0.4266403615474701)]

In [6]:
gutenberg_w2v.wv.most_similar(["bank"])


[('wall', 0.5256211757659912),
 ('floor', 0.5242471098899841),
 ('ground', 0.5152381658554077),
 ('top', 0.5137965083122253),
 ('hill', 0.49940502643585205),
 ('river', 0.4871082305908203),
 ('side', 0.4861571788787842),
 ('table', 0.47034722566604614),
 ('bed', 0.46848219633102417),
 ('road', 0.4509953260421753)]

In [7]:
gutenberg_w2v.wv.most_similar(["water"])

[('waters', 0.648418664932251),
 ('river', 0.4997851252555847),
 ('wood', 0.4902501702308655),
 ('fire', 0.48621195554733276),
 ('ground', 0.4740127921104431),
 ('wine', 0.47030383348464966),
 ('hole', 0.4697657823562622),
 ('blood', 0.4655188322067261),
 ('streams', 0.46095797419548035),
 ('pondside', 0.45841994881629944)]

In [8]:
v = gutenberg_w2v.wv["tall"] - gutenberg_w2v.wv["taller"] + gutenberg_w2v.wv["good"]
gutenberg_w2v.wv.most_similar(v)

[('good', 0.5256189107894897),
 ('tout', 0.43754979968070984),
 ('decidedly', 0.3952230215072632),
 ('consigned', 0.38436630368232727),
 ('tall', 0.38100382685661316),
 ('chuckle', 0.37920597195625305),
 ('salamander', 0.3683614134788513),
 ('sandals', 0.36216801404953003),
 ('steadiness', 0.35800066590309143),
 ('annum', 0.3556828498840332)]

In [9]:
v = gutenberg_w2v.wv["girls"] - gutenberg_w2v.wv["queen"] + gutenberg_w2v.wv["kings"]
gutenberg_w2v.wv.most_similar(v)

[('kings', 0.6626361012458801),
 ('girls', 0.6021570563316345),
 ('men', 0.5352333784103394),
 ('cities', 0.5203460454940796),
 ('parts', 0.5036395788192749),
 ('women', 0.4714898467063904),
 ('children', 0.45983532071113586),
 ('platonists', 0.45207488536834717),
 ('transgressions', 0.44770491123199463),
 ('nations', 0.4398312568664551)]

In [10]:
v = gutenberg_w2v.wv["france"] - gutenberg_w2v.wv["paris"] + gutenberg_w2v.wv["london"]
gutenberg_w2v.wv.most_similar(v)

[('france', 0.6477273106575012),
 ('london', 0.4751870036125183),
 ('town', 0.4566294252872467),
 ('highbury', 0.4194543957710266),
 ('inferiorities', 0.39944344758987427),
 ('trenchers', 0.39931443333625793),
 ('schwartz', 0.3958463668823242),
 ('inn', 0.38943254947662354),
 ('nathanmelech', 0.3812830448150635),
 ('plan', 0.37123164534568787)]

In [167]:
from gensim.models import KeyedVectors

google_model = KeyedVectors.load_word2vec_format('/home/brewed/Desktop/GoogleNews-vectors-negative300.bin.gz', binary=True,
limit=1000000)

INFO - 14:57:36: loading projection weights from /home/brewed/Desktop/GoogleNews-vectors-negative300.bin.gz
INFO - 14:57:58: KeyedVectors lifecycle event {'msg': 'loaded (1000000, 300) matrix of type float32 from /home/brewed/Desktop/GoogleNews-vectors-negative300.bin.gz', 'binary': True, 'encoding': 'utf8', 'datetime': '2023-04-08T14:57:58.642307', 'gensim': '4.3.1', 'python': '3.10.7 (main, Mar 10 2023, 10:47:39) [GCC 12.2.0]', 'platform': 'Linux-5.19.0-38-generic-x86_64-with-glibc2.36', 'event': 'load_word2vec_format'}


In [110]:
google_model.most_similar(["bible"])

[('Bible', 0.736778199672699),
 ('bibles', 0.6052598357200623),
 ('Holy_Bible', 0.5989601612091064),
 ('scriptures', 0.574568510055542),
 ('scripture', 0.5697901844978333),
 ('New_Testament', 0.5638793110847473),
 ('Scripture', 0.5502957701683044),
 ('Scriptures', 0.5411645770072937),
 ('NRSV', 0.5341106057167053),
 ('Leviticus_##:##-##', 0.5247005224227905)]

In [111]:
google_model.most_similar(["book"])

[('tome', 0.7485830783843994),
 ('books', 0.7379177808761597),
 ('memoir', 0.7302926778793335),
 ('paperback_edition', 0.6868364214897156),
 ('autobiography', 0.6741527318954468),
 ('memoirs', 0.6505153179168701),
 ('Book', 0.6479282975196838),
 ('paperback', 0.6471226811408997),
 ('novels', 0.6341459155082703),
 ('hardback', 0.6283079981803894)]

In [112]:
google_model.most_similar(["bank"])

[('banks', 0.7440759539604187),
 ('banking', 0.690161406993866),
 ('Bank', 0.6698698401451111),
 ('lender', 0.6342284679412842),
 ('banker', 0.6092953085899353),
 ('depositors', 0.6031531691551208),
 ('mortgage_lender', 0.5797975659370422),
 ('depositor', 0.5716427564620972),
 ('BofA', 0.5714625120162964),
 ('Citibank', 0.5589520335197449)]

In [113]:
google_model.most_similar(["water"])

[('potable_water', 0.6799106001853943),
 ('Water', 0.6706871390342712),
 ('sewage', 0.6619377732276917),
 ('groundwater', 0.6588346362113953),
 ('Floridan_aquifer', 0.6422534584999084),
 ('freshwater', 0.6307883262634277),
 ('potable', 0.6251927614212036),
 ('wastewater', 0.6212229132652283),
 ('brackish_groundwater', 0.6206730604171753),
 ('aquifer', 0.6143589615821838)]

In [114]:
v = google_model["girls"] - google_model["queen"] + google_model["kings"]
google_model.most_similar(v)

[('boys', 0.6931698322296143),
 ('girls', 0.6385126709938049),
 ('kings', 0.4957888424396515),
 ('men', 0.48680540919303894),
 ('teenagers', 0.4788475036621094),
 ('schoolboys', 0.45804113149642944),
 ('pee_wees', 0.44774994254112244),
 ('Mitey_Mite', 0.44012460112571716),
 ('kids', 0.4373849332332611),
 ('youngsters', 0.43566834926605225)]

In [168]:
v = google_model["tall"] - google_model["taller"] + google_model["good"]
google_model.most_similar(v)

[('good', 0.6434131860733032),
 ('great', 0.49164238572120667),
 ('bad', 0.4760521948337555),
 ('terrific', 0.46986129879951477),
 ('wonderful', 0.4452008605003357),
 ('nice', 0.4425136148929596),
 ('fantastic', 0.43418607115745544),
 ('decent', 0.4307934641838074),
 ('excellent', 0.41867733001708984),
 ('terrible', 0.4151829481124878)]

In [116]:
v = google_model["france"] - google_model["paris"] + google_model["london"]
google_model.most_similar(v)

[('london', 0.754153847694397),
 ('france', 0.7366582751274109),
 ('england', 0.600825309753418),
 ('europe', 0.5708170533180237),
 ('birmingham', 0.5392330884933472),
 ('european', 0.5275605916976929),
 ('newcastle', 0.5263600945472717),
 ('barcelona', 0.5107599496841431),
 ('africa', 0.510517418384552),
 ('spain', 0.5082812905311584)]

In [94]:
import numpy as np

# Convert to numpy 2d array (n_vocab x vector_size)
def to_embeddings_Matrix(model):  
    embedding_matrix = np.zeros((len(model.wv), model.vector_size))
    word2idx = {}
    for i in range(len(model.wv)):
        embedding_matrix[i] = model.wv[model.wv.index_to_key[i]] 
    return embedding_matrix


embeddings=to_embeddings_Matrix(gutenberg_w2v)
print(np.shape(embeddings))


(41465, 100)


In [95]:
# Put in the data folder, the embeddings and the metadata to load into the data visualization tool

import csv
with open('../data/embeddings.tsv', 'w', newline='') as f_output:
    tsv_output = csv.writer(f_output, delimiter='\t')
    for embedding in embeddings:
        tsv_output.writerow(embedding)
    
with open('../data/metadata.tsv', 'w', newline='') as f_output:
    tsv_output = csv.writer(f_output)
    for voc_rows in voc:
        tsv_output.writerow([voc_rows])
  

In [3]:
import glob
import os
import re

import numpy as np
import sklearn

SCRIPT_DIRECTORY = os.path.realpath(os.getcwd())

data_dir = os.path.join(SCRIPT_DIRECTORY, "../data/aclImdb")
train_dir = os.path.join(data_dir, "train")
test_dir = os.path.join(data_dir, "test")
pos_train_dir = os.path.join(train_dir, "pos")
neg_train_dir = os.path.join(train_dir, "neg")
pos_test_dir = os.path.join(test_dir, "pos")
neg_test_dir = os.path.join(test_dir, "neg")

# For memory limitations. These parameters fit in 8GB of RAM.
# If you have 16G of RAM you can experiment with the full dataset / W2V
MAX_NUM_SAMPLES = 5000
# Load first 1M word embeddings. This works because GoogleNews are roughly
# sorted from most frequent to least frequent.
# It may yield much worse results for other embeddings corpora
NUM_W2V_TO_LOAD = 1000000


SEED = 42

# Fix numpy random seed for reproducibility
np.random.seed(SEED)


def strip_punctuation(s):
    return re.sub(r"[^a-zA-Z\s]", " ", s)


def preprocess(s):
    return re.sub("\s+", " ", strip_punctuation(s).lower())


def tokenize(s):
    return s.split(" ")


def preproc_tok(s):
    return tokenize(preprocess(s))


# Preprocess and tokenize the reviews, it will come out as list of lists 
def token_proc(t_corpus):
    data=[]
    for i,ind in enumerate(t_corpus):
        proc_t_corpus=preproc_tok(train_corpus[i])
        data.append(proc_t_corpus)
    
    return data


def read_samples(folder, preprocess=lambda x: x):
    samples = glob.iglob(os.path.join(folder, "*.txt"))
    data = []

    for i, sample in enumerate(samples):
        if MAX_NUM_SAMPLES > 0 and i == MAX_NUM_SAMPLES:
            break
        with open(sample, "r") as fd:
            x = [preprocess(l) for l in fd][0]
            data.append(x)

    return data


def create_corpus(pos, neg):
    corpus = np.array(pos + neg)
    y = np.array([1 for _ in pos] + [0 for _ in neg])
    indices = np.arange(y.shape[0])
    np.random.shuffle(indices)

    return list(corpus[indices]), list(y[indices])


def extract_nbow(model,train_data,test_data):
    """Extract neural bag of words representations"""
    
    # The training dataset (sentences of the reviews) will be converted to vectors of 100 dimensions 
    X_train = np.zeros((np.size(train_data), 100))
    for row, rev in enumerate(train_data):
        words_included = 0

        # Tokenize current review
        rev_toks = preproc_tok(rev)
    
        for tok in rev_toks:
            if tok in model.wv:
                X_train[row] += model.wv[tok]
                words_included += 1
            
        # Get the mean value of each sentence in the embedding space
        X_train[row] = X_train[row]/words_included



    # The test dataset (sentences of the reviews) will be converted to vectors of 100 dimensions 
    X_test = np.zeros((np.size(test_data), 100)) 
    for row, rev in enumerate(test_data):
        words_included = 0
        
        # Tokenize current review
        rev_toks = preproc_tok(rev)
        for tok in rev_toks:
            # For each token check if it has a w2v representation
            # and if yes add it.
            if tok in model.wv:
                X_test[row] += model.wv[tok]
                words_included += 1
                
        # Get the mean value of each sentence in the embedding space
        X_test[row] = X_test[row]/words_included

    return X_train,X_test

    raise NotImplementedError("Implement nbow extractor")



In [14]:
# Take text and label data from the directories in raw format

train_data,train_labels = create_corpus(read_samples(pos_train_dir), read_samples(neg_train_dir))
test_data,test_labels = create_corpus(read_samples(pos_test_dir), read_samples(neg_test_dir))


# Tokenize each review of our training data to feed into the "sentences" argument of our model's traing phase
proc_tok_corpus=[]
proc_tok_rev=[]
for rev in train_data:
    proc_tok_rev=preproc_tok(rev)
    proc_tok_corpus.append(proc_tok_rev)


In [53]:
from gensim.models import Word2Vec
from gensim.models.callbacks import CallbackAny2Vec

train_w2v_model(proc_tok_corpus,
   "my_sentiment_w2v.model",
    5,
    100,
    1000,
    1,
)

INFO - 11:13:26: collecting all words and their counts
INFO - 11:13:26: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 11:13:27: collected 51037 word types from a corpus of 2430609 raw words and 10000 sentences
INFO - 11:13:27: Creating a fresh vocabulary
INFO - 11:13:27: Word2Vec lifecycle event {'msg': 'effective_min_count=1 retains 51037 unique words (100.00% of original 51037, drops 0)', 'datetime': '2023-04-08T11:13:27.877800', 'gensim': '4.3.1', 'python': '3.10.7 (main, Mar 10 2023, 10:47:39) [GCC 12.2.0]', 'platform': 'Linux-5.19.0-38-generic-x86_64-with-glibc2.36', 'event': 'prepare_vocab'}
INFO - 11:13:27: Word2Vec lifecycle event {'msg': 'effective_min_count=1 leaves 2430609 word corpus (100.00% of original 2430609, drops 0)', 'datetime': '2023-04-08T11:13:27.878725', 'gensim': '4.3.1', 'python': '3.10.7 (main, Mar 10 2023, 10:47:39) [GCC 12.2.0]', 'platform': 'Linux-5.19.0-38-generic-x86_64-with-glibc2.36', 'event': 'prepare_vocab'}
INFO - 11:13:28:

INFO - 11:13:50: EPOCH 8 - PROGRESS: at 65.54% examples, 1164708 words/s, in_qsize 14, out_qsize 1
INFO - 11:13:50: EPOCH 8: training on 2430609 raw words (1812989 effective words) took 1.5s, 1235732 effective words/s
INFO - 11:13:51: EPOCH 9 - PROGRESS: at 73.96% examples, 1325235 words/s, in_qsize 14, out_qsize 1
INFO - 11:13:51: EPOCH 9: training on 2430609 raw words (1813005 effective words) took 1.4s, 1324632 effective words/s
INFO - 11:13:52: EPOCH 10 - PROGRESS: at 61.52% examples, 1105513 words/s, in_qsize 16, out_qsize 1
INFO - 11:13:53: EPOCH 10: training on 2430609 raw words (1813259 effective words) took 1.7s, 1063335 effective words/s
INFO - 11:13:54: EPOCH 11 - PROGRESS: at 58.94% examples, 1062283 words/s, in_qsize 16, out_qsize 1
INFO - 11:13:55: EPOCH 11: training on 2430609 raw words (1812850 effective words) took 1.7s, 1086837 effective words/s
INFO - 11:13:56: EPOCH 12 - PROGRESS: at 60.77% examples, 1088975 words/s, in_qsize 15, out_qsize 0
INFO - 11:13:56: EPOCH 1

INFO - 11:14:44: EPOCH 45: training on 2430609 raw words (1812335 effective words) took 1.3s, 1431187 effective words/s
INFO - 11:14:45: EPOCH 46 - PROGRESS: at 79.11% examples, 1413921 words/s, in_qsize 15, out_qsize 0
INFO - 11:14:46: EPOCH 46: training on 2430609 raw words (1813085 effective words) took 1.3s, 1442794 effective words/s
INFO - 11:14:47: EPOCH 47 - PROGRESS: at 77.15% examples, 1383072 words/s, in_qsize 15, out_qsize 0
INFO - 11:14:47: EPOCH 47: training on 2430609 raw words (1814244 effective words) took 1.3s, 1397351 effective words/s
INFO - 11:14:48: EPOCH 48 - PROGRESS: at 80.76% examples, 1454689 words/s, in_qsize 16, out_qsize 1
INFO - 11:14:48: EPOCH 48: training on 2430609 raw words (1813029 effective words) took 1.2s, 1481545 effective words/s
INFO - 11:14:49: EPOCH 49 - PROGRESS: at 79.55% examples, 1430428 words/s, in_qsize 15, out_qsize 0
INFO - 11:14:49: EPOCH 49: training on 2430609 raw words (1813844 effective words) took 1.3s, 1438676 effective words/s


INFO - 11:15:32: EPOCH 83 - PROGRESS: at 77.58% examples, 1378402 words/s, in_qsize 16, out_qsize 1
INFO - 11:15:32: EPOCH 83: training on 2430609 raw words (1812804 effective words) took 1.3s, 1416440 effective words/s
INFO - 11:15:33: EPOCH 84 - PROGRESS: at 76.36% examples, 1373484 words/s, in_qsize 15, out_qsize 0
INFO - 11:15:34: EPOCH 84: training on 2430609 raw words (1812832 effective words) took 1.3s, 1399785 effective words/s
INFO - 11:15:35: EPOCH 85 - PROGRESS: at 77.53% examples, 1391306 words/s, in_qsize 14, out_qsize 1
INFO - 11:15:35: EPOCH 85: training on 2430609 raw words (1812409 effective words) took 1.3s, 1418782 effective words/s
INFO - 11:15:36: EPOCH 86 - PROGRESS: at 78.37% examples, 1405561 words/s, in_qsize 14, out_qsize 1
INFO - 11:15:36: EPOCH 86: training on 2430609 raw words (1813470 effective words) took 1.3s, 1433677 effective words/s
INFO - 11:15:37: EPOCH 87 - PROGRESS: at 82.02% examples, 1476705 words/s, in_qsize 15, out_qsize 0
INFO - 11:15:38: EPO

INFO - 11:16:22: EPOCH 120: training on 2430609 raw words (1813032 effective words) took 1.3s, 1449295 effective words/s
INFO - 11:16:23: EPOCH 121 - PROGRESS: at 78.67% examples, 1418469 words/s, in_qsize 15, out_qsize 0
INFO - 11:16:23: EPOCH 121: training on 2430609 raw words (1812780 effective words) took 1.3s, 1435770 effective words/s
INFO - 11:16:24: EPOCH 122 - PROGRESS: at 79.11% examples, 1413929 words/s, in_qsize 13, out_qsize 2
INFO - 11:16:24: EPOCH 122: training on 2430609 raw words (1812623 effective words) took 1.3s, 1430850 effective words/s
INFO - 11:16:25: EPOCH 123 - PROGRESS: at 77.58% examples, 1386569 words/s, in_qsize 14, out_qsize 1
INFO - 11:16:26: EPOCH 123: training on 2430609 raw words (1813173 effective words) took 1.3s, 1429112 effective words/s
INFO - 11:16:27: EPOCH 124 - PROGRESS: at 76.76% examples, 1384153 words/s, in_qsize 16, out_qsize 1
INFO - 11:16:27: EPOCH 124: training on 2430609 raw words (1813221 effective words) took 1.3s, 1406368 effective

INFO - 11:17:09: EPOCH 157: training on 2430609 raw words (1813048 effective words) took 1.3s, 1401775 effective words/s
INFO - 11:17:10: EPOCH 158 - PROGRESS: at 77.15% examples, 1383892 words/s, in_qsize 16, out_qsize 1
INFO - 11:17:10: EPOCH 158: training on 2430609 raw words (1813286 effective words) took 1.3s, 1406613 effective words/s
INFO - 11:17:11: EPOCH 159 - PROGRESS: at 78.37% examples, 1412660 words/s, in_qsize 15, out_qsize 0
INFO - 11:17:12: EPOCH 159: training on 2430609 raw words (1813215 effective words) took 1.3s, 1422727 effective words/s
INFO - 11:17:13: EPOCH 160 - PROGRESS: at 76.76% examples, 1375962 words/s, in_qsize 14, out_qsize 1
INFO - 11:17:13: EPOCH 160: training on 2430609 raw words (1812244 effective words) took 1.3s, 1402616 effective words/s
INFO - 11:17:14: EPOCH 161 - PROGRESS: at 75.61% examples, 1358082 words/s, in_qsize 14, out_qsize 1
INFO - 11:17:14: EPOCH 161: training on 2430609 raw words (1813218 effective words) took 1.3s, 1374163 effective

INFO - 11:17:57: EPOCH 194: training on 2430609 raw words (1813202 effective words) took 1.2s, 1463194 effective words/s
INFO - 11:17:58: EPOCH 195 - PROGRESS: at 77.15% examples, 1389385 words/s, in_qsize 14, out_qsize 1
INFO - 11:17:58: EPOCH 195: training on 2430609 raw words (1812671 effective words) took 1.3s, 1414175 effective words/s
INFO - 11:17:59: EPOCH 196 - PROGRESS: at 78.24% examples, 1409986 words/s, in_qsize 13, out_qsize 2
INFO - 11:17:59: EPOCH 196: training on 2430609 raw words (1812210 effective words) took 1.3s, 1424176 effective words/s
INFO - 11:18:00: EPOCH 197 - PROGRESS: at 77.94% examples, 1400371 words/s, in_qsize 14, out_qsize 1
INFO - 11:18:00: EPOCH 197: training on 2430609 raw words (1813332 effective words) took 1.3s, 1433501 effective words/s
INFO - 11:18:01: EPOCH 198 - PROGRESS: at 77.96% examples, 1402457 words/s, in_qsize 15, out_qsize 0
INFO - 11:18:02: EPOCH 198: training on 2430609 raw words (1813285 effective words) took 1.3s, 1415711 effective

INFO - 11:18:44: EPOCH 231: training on 2430609 raw words (1812223 effective words) took 1.2s, 1475183 effective words/s
INFO - 11:18:45: EPOCH 232 - PROGRESS: at 77.15% examples, 1390069 words/s, in_qsize 15, out_qsize 0
INFO - 11:18:45: EPOCH 232: training on 2430609 raw words (1813098 effective words) took 1.3s, 1416877 effective words/s
INFO - 11:18:46: EPOCH 233 - PROGRESS: at 76.76% examples, 1377984 words/s, in_qsize 15, out_qsize 0
INFO - 11:18:46: EPOCH 233: training on 2430609 raw words (1812495 effective words) took 1.3s, 1408854 effective words/s
INFO - 11:18:47: EPOCH 234 - PROGRESS: at 78.67% examples, 1409732 words/s, in_qsize 16, out_qsize 1
INFO - 11:18:47: EPOCH 234: training on 2430609 raw words (1813422 effective words) took 1.3s, 1430907 effective words/s
INFO - 11:18:49: EPOCH 235 - PROGRESS: at 74.35% examples, 1324794 words/s, in_qsize 14, out_qsize 1
INFO - 11:18:49: EPOCH 235: training on 2430609 raw words (1812870 effective words) took 1.3s, 1376614 effective

INFO - 11:19:31: EPOCH 268: training on 2430609 raw words (1813320 effective words) took 1.4s, 1269921 effective words/s
INFO - 11:19:32: EPOCH 269 - PROGRESS: at 64.78% examples, 1160556 words/s, in_qsize 14, out_qsize 1
INFO - 11:19:33: EPOCH 269: training on 2430609 raw words (1813287 effective words) took 1.6s, 1141156 effective words/s
INFO - 11:19:34: EPOCH 270 - PROGRESS: at 61.49% examples, 1095712 words/s, in_qsize 15, out_qsize 0
INFO - 11:19:34: EPOCH 270: training on 2430609 raw words (1813578 effective words) took 1.5s, 1197026 effective words/s
INFO - 11:19:35: EPOCH 271 - PROGRESS: at 64.29% examples, 1152529 words/s, in_qsize 15, out_qsize 0
INFO - 11:19:36: EPOCH 271: training on 2430609 raw words (1813479 effective words) took 1.6s, 1158220 effective words/s
INFO - 11:19:37: EPOCH 272 - PROGRESS: at 58.94% examples, 1056554 words/s, in_qsize 14, out_qsize 1
INFO - 11:19:38: EPOCH 272: training on 2430609 raw words (1813684 effective words) took 1.6s, 1128585 effective

INFO - 11:20:30: EPOCH 305: training on 2430609 raw words (1813734 effective words) took 1.6s, 1110990 effective words/s
INFO - 11:20:31: EPOCH 306 - PROGRESS: at 62.66% examples, 1124217 words/s, in_qsize 15, out_qsize 0
INFO - 11:20:31: EPOCH 306: training on 2430609 raw words (1813356 effective words) took 1.6s, 1106042 effective words/s
INFO - 11:20:32: EPOCH 307 - PROGRESS: at 48.55% examples, 866369 words/s, in_qsize 16, out_qsize 0
INFO - 11:20:33: EPOCH 307 - PROGRESS: at 96.09% examples, 865873 words/s, in_qsize 10, out_qsize 0
INFO - 11:20:34: EPOCH 307: training on 2430609 raw words (1813156 effective words) took 2.1s, 874942 effective words/s
INFO - 11:20:35: EPOCH 308 - PROGRESS: at 51.39% examples, 916633 words/s, in_qsize 15, out_qsize 0
INFO - 11:20:36: EPOCH 308: training on 2430609 raw words (1813222 effective words) took 2.0s, 912319 effective words/s
INFO - 11:20:37: EPOCH 309 - PROGRESS: at 50.58% examples, 886849 words/s, in_qsize 15, out_qsize 0
INFO - 11:20:38: 

INFO - 11:21:33: EPOCH 342 - PROGRESS: at 69.06% examples, 1232124 words/s, in_qsize 13, out_qsize 2
INFO - 11:21:34: EPOCH 342: training on 2430609 raw words (1812824 effective words) took 1.4s, 1285474 effective words/s
INFO - 11:21:35: EPOCH 343 - PROGRESS: at 68.64% examples, 1232263 words/s, in_qsize 13, out_qsize 2
INFO - 11:21:35: EPOCH 343: training on 2430609 raw words (1813276 effective words) took 1.4s, 1292985 effective words/s
INFO - 11:21:36: EPOCH 344 - PROGRESS: at 68.69% examples, 1230433 words/s, in_qsize 15, out_qsize 0
INFO - 11:21:37: EPOCH 344: training on 2430609 raw words (1812486 effective words) took 1.4s, 1278996 effective words/s
INFO - 11:21:38: EPOCH 345 - PROGRESS: at 76.76% examples, 1366725 words/s, in_qsize 16, out_qsize 2
INFO - 11:21:38: EPOCH 345: training on 2430609 raw words (1813376 effective words) took 1.3s, 1369803 effective words/s
INFO - 11:21:39: EPOCH 346 - PROGRESS: at 76.05% examples, 1364976 words/s, in_qsize 16, out_qsize 0
INFO - 11:2

INFO - 11:22:29: EPOCH 379 - PROGRESS: at 69.06% examples, 1238430 words/s, in_qsize 13, out_qsize 2
INFO - 11:22:30: EPOCH 379: training on 2430609 raw words (1812973 effective words) took 1.5s, 1205655 effective words/s
INFO - 11:22:31: EPOCH 380 - PROGRESS: at 67.43% examples, 1211833 words/s, in_qsize 14, out_qsize 1
INFO - 11:22:31: EPOCH 380: training on 2430609 raw words (1812872 effective words) took 1.4s, 1256947 effective words/s
INFO - 11:22:32: EPOCH 381 - PROGRESS: at 74.35% examples, 1335079 words/s, in_qsize 15, out_qsize 0
INFO - 11:22:33: EPOCH 381: training on 2430609 raw words (1813594 effective words) took 1.3s, 1365713 effective words/s
INFO - 11:22:34: EPOCH 382 - PROGRESS: at 58.10% examples, 1035486 words/s, in_qsize 14, out_qsize 1
INFO - 11:22:35: EPOCH 382: training on 2430609 raw words (1813589 effective words) took 1.8s, 987109 effective words/s
INFO - 11:22:36: EPOCH 383 - PROGRESS: at 58.94% examples, 1053783 words/s, in_qsize 15, out_qsize 0
INFO - 11:22

INFO - 11:23:32: EPOCH 414: training on 2430609 raw words (1813476 effective words) took 2.0s, 917401 effective words/s
INFO - 11:23:34: EPOCH 415 - PROGRESS: at 53.31% examples, 957754 words/s, in_qsize 14, out_qsize 1
INFO - 11:23:34: EPOCH 415: training on 2430609 raw words (1813829 effective words) took 1.7s, 1054194 effective words/s
INFO - 11:23:35: EPOCH 416 - PROGRESS: at 59.44% examples, 1054557 words/s, in_qsize 14, out_qsize 1
INFO - 11:23:36: EPOCH 416: training on 2430609 raw words (1813107 effective words) took 1.6s, 1117685 effective words/s
INFO - 11:23:37: EPOCH 417 - PROGRESS: at 65.13% examples, 1165611 words/s, in_qsize 14, out_qsize 1
INFO - 11:23:37: EPOCH 417: training on 2430609 raw words (1812968 effective words) took 1.5s, 1185647 effective words/s
INFO - 11:23:38: EPOCH 418 - PROGRESS: at 59.36% examples, 1066924 words/s, in_qsize 16, out_qsize 1
INFO - 11:23:39: EPOCH 418: training on 2430609 raw words (1813391 effective words) took 1.6s, 1138056 effective w

INFO - 11:24:30: EPOCH 451: training on 2430609 raw words (1813053 effective words) took 1.5s, 1183286 effective words/s
INFO - 11:24:31: EPOCH 452 - PROGRESS: at 65.98% examples, 1164798 words/s, in_qsize 13, out_qsize 2
INFO - 11:24:32: EPOCH 452: training on 2430609 raw words (1813732 effective words) took 1.5s, 1192640 effective words/s
INFO - 11:24:33: EPOCH 453 - PROGRESS: at 65.19% examples, 1167974 words/s, in_qsize 14, out_qsize 1
INFO - 11:24:34: EPOCH 453: training on 2430609 raw words (1813336 effective words) took 1.5s, 1201443 effective words/s
INFO - 11:24:35: EPOCH 454 - PROGRESS: at 63.49% examples, 1123736 words/s, in_qsize 15, out_qsize 0
INFO - 11:24:35: EPOCH 454: training on 2430609 raw words (1812755 effective words) took 1.6s, 1166003 effective words/s
INFO - 11:24:36: EPOCH 455 - PROGRESS: at 66.31% examples, 1189624 words/s, in_qsize 13, out_qsize 2
INFO - 11:24:37: EPOCH 455: training on 2430609 raw words (1813437 effective words) took 1.5s, 1182840 effective

INFO - 11:25:30: EPOCH 486 - PROGRESS: at 57.24% examples, 1025125 words/s, in_qsize 15, out_qsize 0
INFO - 11:25:30: EPOCH 486: training on 2430609 raw words (1813302 effective words) took 1.6s, 1100340 effective words/s
INFO - 11:25:31: EPOCH 487 - PROGRESS: at 65.13% examples, 1170373 words/s, in_qsize 16, out_qsize 0
INFO - 11:25:32: EPOCH 487: training on 2430609 raw words (1813571 effective words) took 1.5s, 1187092 effective words/s
INFO - 11:25:33: EPOCH 488 - PROGRESS: at 62.26% examples, 1119972 words/s, in_qsize 13, out_qsize 2
INFO - 11:25:34: EPOCH 488: training on 2430609 raw words (1813228 effective words) took 1.6s, 1167534 effective words/s
INFO - 11:25:35: EPOCH 489 - PROGRESS: at 66.31% examples, 1185165 words/s, in_qsize 13, out_qsize 2
INFO - 11:25:35: EPOCH 489: training on 2430609 raw words (1813313 effective words) took 1.5s, 1205964 effective words/s
INFO - 11:25:36: EPOCH 490 - PROGRESS: at 65.54% examples, 1167050 words/s, in_qsize 13, out_qsize 2
INFO - 11:2

INFO - 11:26:27: EPOCH 523 - PROGRESS: at 65.57% examples, 1175808 words/s, in_qsize 15, out_qsize 0
INFO - 11:26:27: EPOCH 523: training on 2430609 raw words (1813268 effective words) took 1.5s, 1174109 effective words/s
INFO - 11:26:28: EPOCH 524 - PROGRESS: at 62.68% examples, 1117416 words/s, in_qsize 15, out_qsize 0
INFO - 11:26:29: EPOCH 524: training on 2430609 raw words (1813704 effective words) took 1.6s, 1141753 effective words/s
INFO - 11:26:30: EPOCH 525 - PROGRESS: at 64.29% examples, 1147482 words/s, in_qsize 15, out_qsize 0
INFO - 11:26:30: EPOCH 525: training on 2430609 raw words (1812966 effective words) took 1.5s, 1185075 effective words/s
INFO - 11:26:31: EPOCH 526 - PROGRESS: at 65.98% examples, 1185034 words/s, in_qsize 15, out_qsize 0
INFO - 11:26:32: EPOCH 526: training on 2430609 raw words (1813093 effective words) took 1.5s, 1197076 effective words/s
INFO - 11:26:33: EPOCH 527 - PROGRESS: at 64.78% examples, 1150149 words/s, in_qsize 14, out_qsize 1
INFO - 11:2

INFO - 11:27:24: EPOCH 560 - PROGRESS: at 64.29% examples, 1155868 words/s, in_qsize 15, out_qsize 0
INFO - 11:27:24: EPOCH 560: training on 2430609 raw words (1813935 effective words) took 1.5s, 1180246 effective words/s
INFO - 11:27:25: EPOCH 561 - PROGRESS: at 64.29% examples, 1150127 words/s, in_qsize 15, out_qsize 0
INFO - 11:27:26: EPOCH 561: training on 2430609 raw words (1813361 effective words) took 1.5s, 1191182 effective words/s
INFO - 11:27:27: EPOCH 562 - PROGRESS: at 65.57% examples, 1177852 words/s, in_qsize 16, out_qsize 0
INFO - 11:27:27: EPOCH 562: training on 2430609 raw words (1813287 effective words) took 1.5s, 1198613 effective words/s
INFO - 11:27:28: EPOCH 563 - PROGRESS: at 66.67% examples, 1187405 words/s, in_qsize 15, out_qsize 0
INFO - 11:27:29: EPOCH 563: training on 2430609 raw words (1813378 effective words) took 1.5s, 1199767 effective words/s
INFO - 11:27:30: EPOCH 564 - PROGRESS: at 66.35% examples, 1186633 words/s, in_qsize 16, out_qsize 1
INFO - 11:2

INFO - 11:28:20: EPOCH 597 - PROGRESS: at 66.35% examples, 1182132 words/s, in_qsize 15, out_qsize 0
INFO - 11:28:21: EPOCH 597: training on 2430609 raw words (1813259 effective words) took 1.5s, 1209815 effective words/s
INFO - 11:28:22: EPOCH 598 - PROGRESS: at 65.98% examples, 1167640 words/s, in_qsize 15, out_qsize 0
INFO - 11:28:22: EPOCH 598: training on 2430609 raw words (1812994 effective words) took 1.5s, 1180698 effective words/s
INFO - 11:28:23: EPOCH 599 - PROGRESS: at 64.64% examples, 1150933 words/s, in_qsize 15, out_qsize 0
INFO - 11:28:24: EPOCH 599: training on 2430609 raw words (1812900 effective words) took 1.5s, 1180608 effective words/s
INFO - 11:28:25: EPOCH 600 - PROGRESS: at 65.87% examples, 1182586 words/s, in_qsize 14, out_qsize 1
INFO - 11:28:25: EPOCH 600: training on 2430609 raw words (1812742 effective words) took 1.5s, 1200642 effective words/s
INFO - 11:28:26: EPOCH 601 - PROGRESS: at 65.19% examples, 1165706 words/s, in_qsize 16, out_qsize 2
INFO - 11:2

INFO - 11:29:17: EPOCH 634 - PROGRESS: at 63.91% examples, 1128400 words/s, in_qsize 14, out_qsize 1
INFO - 11:29:17: EPOCH 634: training on 2430609 raw words (1813404 effective words) took 1.6s, 1136548 effective words/s
INFO - 11:29:18: EPOCH 635 - PROGRESS: at 60.16% examples, 1077539 words/s, in_qsize 14, out_qsize 1
INFO - 11:29:19: EPOCH 635: training on 2430609 raw words (1813093 effective words) took 1.6s, 1119204 effective words/s
INFO - 11:29:20: EPOCH 636 - PROGRESS: at 64.78% examples, 1157637 words/s, in_qsize 14, out_qsize 1
INFO - 11:29:21: EPOCH 636: training on 2430609 raw words (1813109 effective words) took 1.5s, 1182222 effective words/s
INFO - 11:29:22: EPOCH 637 - PROGRESS: at 63.49% examples, 1139688 words/s, in_qsize 13, out_qsize 2
INFO - 11:29:22: EPOCH 637: training on 2430609 raw words (1813570 effective words) took 1.6s, 1167330 effective words/s
INFO - 11:29:23: EPOCH 638 - PROGRESS: at 66.35% examples, 1182516 words/s, in_qsize 15, out_qsize 0
INFO - 11:2

INFO - 11:30:14: EPOCH 671 - PROGRESS: at 65.98% examples, 1170098 words/s, in_qsize 15, out_qsize 0
INFO - 11:30:15: EPOCH 671: training on 2430609 raw words (1813956 effective words) took 1.5s, 1171254 effective words/s
INFO - 11:30:16: EPOCH 672 - PROGRESS: at 64.29% examples, 1145399 words/s, in_qsize 15, out_qsize 0
INFO - 11:30:16: EPOCH 672: training on 2430609 raw words (1813027 effective words) took 1.5s, 1186169 effective words/s
INFO - 11:30:17: EPOCH 673 - PROGRESS: at 65.13% examples, 1166425 words/s, in_qsize 14, out_qsize 1
INFO - 11:30:18: EPOCH 673: training on 2430609 raw words (1812216 effective words) took 1.5s, 1197193 effective words/s
INFO - 11:30:19: EPOCH 674 - PROGRESS: at 65.54% examples, 1169406 words/s, in_qsize 14, out_qsize 1
INFO - 11:30:19: EPOCH 674: training on 2430609 raw words (1812622 effective words) took 1.5s, 1198357 effective words/s
INFO - 11:30:20: EPOCH 675 - PROGRESS: at 65.13% examples, 1161045 words/s, in_qsize 16, out_qsize 1
INFO - 11:3

INFO - 11:31:11: EPOCH 708 - PROGRESS: at 65.54% examples, 1175000 words/s, in_qsize 15, out_qsize 0
INFO - 11:31:11: EPOCH 708: training on 2430609 raw words (1812663 effective words) took 1.5s, 1203848 effective words/s
INFO - 11:31:12: EPOCH 709 - PROGRESS: at 65.19% examples, 1168412 words/s, in_qsize 14, out_qsize 1
INFO - 11:31:13: EPOCH 709: training on 2430609 raw words (1814086 effective words) took 1.5s, 1197630 effective words/s
INFO - 11:31:14: EPOCH 710 - PROGRESS: at 65.87% examples, 1181201 words/s, in_qsize 13, out_qsize 2
INFO - 11:31:15: EPOCH 710: training on 2430609 raw words (1812847 effective words) took 1.5s, 1190594 effective words/s
INFO - 11:31:16: EPOCH 711 - PROGRESS: at 65.13% examples, 1167824 words/s, in_qsize 14, out_qsize 1
INFO - 11:31:16: EPOCH 711: training on 2430609 raw words (1813533 effective words) took 1.5s, 1188892 effective words/s
INFO - 11:31:17: EPOCH 712 - PROGRESS: at 63.55% examples, 1128658 words/s, in_qsize 16, out_qsize 1
INFO - 11:3

INFO - 11:32:08: EPOCH 745 - PROGRESS: at 62.68% examples, 1116227 words/s, in_qsize 15, out_qsize 0
INFO - 11:32:09: EPOCH 745: training on 2430609 raw words (1813139 effective words) took 1.6s, 1154487 effective words/s
INFO - 11:32:10: EPOCH 746 - PROGRESS: at 64.78% examples, 1155383 words/s, in_qsize 15, out_qsize 0
INFO - 11:32:10: EPOCH 746: training on 2430609 raw words (1813460 effective words) took 1.5s, 1188887 effective words/s
INFO - 11:32:11: EPOCH 747 - PROGRESS: at 66.35% examples, 1180876 words/s, in_qsize 15, out_qsize 0
INFO - 11:32:12: EPOCH 747: training on 2430609 raw words (1813227 effective words) took 1.5s, 1209597 effective words/s
INFO - 11:32:13: EPOCH 748 - PROGRESS: at 63.55% examples, 1132029 words/s, in_qsize 16, out_qsize 1
INFO - 11:32:13: EPOCH 748: training on 2430609 raw words (1813242 effective words) took 1.6s, 1168531 effective words/s
INFO - 11:32:14: EPOCH 749 - PROGRESS: at 65.19% examples, 1170645 words/s, in_qsize 16, out_qsize 0
INFO - 11:3

INFO - 11:33:06: EPOCH 782 - PROGRESS: at 65.54% examples, 1174048 words/s, in_qsize 15, out_qsize 0
INFO - 11:33:06: EPOCH 782: training on 2430609 raw words (1812534 effective words) took 1.5s, 1185329 effective words/s
INFO - 11:33:07: EPOCH 783 - PROGRESS: at 65.91% examples, 1175141 words/s, in_qsize 13, out_qsize 2
INFO - 11:33:08: EPOCH 783: training on 2430609 raw words (1812981 effective words) took 1.5s, 1200401 effective words/s
INFO - 11:33:09: EPOCH 784 - PROGRESS: at 64.29% examples, 1153969 words/s, in_qsize 15, out_qsize 0
INFO - 11:33:09: EPOCH 784: training on 2430609 raw words (1813402 effective words) took 1.6s, 1167242 effective words/s
INFO - 11:33:10: EPOCH 785 - PROGRESS: at 62.75% examples, 1125615 words/s, in_qsize 13, out_qsize 2
INFO - 11:33:11: EPOCH 785: training on 2430609 raw words (1813328 effective words) took 1.5s, 1173230 effective words/s
INFO - 11:33:12: EPOCH 786 - PROGRESS: at 65.57% examples, 1173834 words/s, in_qsize 14, out_qsize 1
INFO - 11:3

INFO - 11:34:05: EPOCH 818: training on 2430609 raw words (1813077 effective words) took 3.9s, 470775 effective words/s
INFO - 11:34:06: EPOCH 819 - PROGRESS: at 25.35% examples, 438796 words/s, in_qsize 14, out_qsize 1
INFO - 11:34:07: EPOCH 819 - PROGRESS: at 50.06% examples, 448981 words/s, in_qsize 14, out_qsize 1
INFO - 11:34:08: EPOCH 819 - PROGRESS: at 77.94% examples, 467046 words/s, in_qsize 15, out_qsize 0
INFO - 11:34:08: EPOCH 819: training on 2430609 raw words (1812140 effective words) took 3.8s, 471737 effective words/s
INFO - 11:34:10: EPOCH 820 - PROGRESS: at 25.31% examples, 437707 words/s, in_qsize 15, out_qsize 0
INFO - 11:34:11: EPOCH 820 - PROGRESS: at 49.71% examples, 442488 words/s, in_qsize 15, out_qsize 0
INFO - 11:34:12: EPOCH 820 - PROGRESS: at 73.96% examples, 441237 words/s, in_qsize 15, out_qsize 0
INFO - 11:34:12: EPOCH 820: training on 2430609 raw words (1813673 effective words) took 4.0s, 455898 effective words/s
INFO - 11:34:13: EPOCH 821 - PROGRESS: a

INFO - 11:35:02: EPOCH 853: training on 2430609 raw words (1811984 effective words) took 1.4s, 1266615 effective words/s
INFO - 11:35:03: EPOCH 854 - PROGRESS: at 61.05% examples, 1096956 words/s, in_qsize 15, out_qsize 0
INFO - 11:35:04: EPOCH 854: training on 2430609 raw words (1813578 effective words) took 1.6s, 1144740 effective words/s
INFO - 11:35:05: EPOCH 855 - PROGRESS: at 66.31% examples, 1191523 words/s, in_qsize 13, out_qsize 2
INFO - 11:35:05: EPOCH 855: training on 2430609 raw words (1813285 effective words) took 1.5s, 1242552 effective words/s
INFO - 11:35:06: EPOCH 856 - PROGRESS: at 61.02% examples, 1089583 words/s, in_qsize 15, out_qsize 0
INFO - 11:35:07: EPOCH 856: training on 2430609 raw words (1813206 effective words) took 1.7s, 1038890 effective words/s
INFO - 11:35:08: EPOCH 857 - PROGRESS: at 50.10% examples, 891972 words/s, in_qsize 16, out_qsize 0
INFO - 11:35:09: EPOCH 857: training on 2430609 raw words (1813614 effective words) took 1.9s, 967906 effective w

INFO - 11:36:05: EPOCH 890: training on 2430609 raw words (1812621 effective words) took 1.3s, 1399395 effective words/s
INFO - 11:36:06: EPOCH 891 - PROGRESS: at 73.94% examples, 1325262 words/s, in_qsize 14, out_qsize 1
INFO - 11:36:07: EPOCH 891: training on 2430609 raw words (1813339 effective words) took 1.3s, 1343587 effective words/s
INFO - 11:36:08: EPOCH 892 - PROGRESS: at 72.82% examples, 1304106 words/s, in_qsize 16, out_qsize 1
INFO - 11:36:08: EPOCH 892: training on 2430609 raw words (1812157 effective words) took 1.4s, 1339787 effective words/s
INFO - 11:36:09: EPOCH 893 - PROGRESS: at 75.69% examples, 1345470 words/s, in_qsize 14, out_qsize 1
INFO - 11:36:09: EPOCH 893: training on 2430609 raw words (1813667 effective words) took 1.3s, 1368473 effective words/s
INFO - 11:36:10: EPOCH 894 - PROGRESS: at 73.94% examples, 1331629 words/s, in_qsize 15, out_qsize 0
INFO - 11:36:11: EPOCH 894: training on 2430609 raw words (1813281 effective words) took 1.3s, 1365387 effective

INFO - 11:37:00: EPOCH 927 - PROGRESS: at 61.52% examples, 1103193 words/s, in_qsize 16, out_qsize 0
INFO - 11:37:01: EPOCH 927: training on 2430609 raw words (1812877 effective words) took 1.6s, 1158176 effective words/s
INFO - 11:37:02: EPOCH 928 - PROGRESS: at 63.55% examples, 1136813 words/s, in_qsize 15, out_qsize 0
INFO - 11:37:03: EPOCH 928: training on 2430609 raw words (1813315 effective words) took 1.6s, 1129996 effective words/s
INFO - 11:37:04: EPOCH 929 - PROGRESS: at 67.87% examples, 1222156 words/s, in_qsize 13, out_qsize 2
INFO - 11:37:04: EPOCH 929: training on 2430609 raw words (1813024 effective words) took 1.4s, 1261809 effective words/s
INFO - 11:37:05: EPOCH 930 - PROGRESS: at 71.12% examples, 1277335 words/s, in_qsize 15, out_qsize 0
INFO - 11:37:06: EPOCH 930: training on 2430609 raw words (1813247 effective words) took 1.4s, 1284348 effective words/s
INFO - 11:37:07: EPOCH 931 - PROGRESS: at 71.56% examples, 1286674 words/s, in_qsize 14, out_qsize 1
INFO - 11:3

INFO - 11:37:59: EPOCH 964 - PROGRESS: at 63.13% examples, 1129499 words/s, in_qsize 15, out_qsize 0
INFO - 11:38:00: EPOCH 964: training on 2430609 raw words (1813826 effective words) took 1.6s, 1136807 effective words/s
INFO - 11:38:01: EPOCH 965 - PROGRESS: at 61.52% examples, 1092219 words/s, in_qsize 13, out_qsize 2
INFO - 11:38:01: EPOCH 965: training on 2430609 raw words (1812758 effective words) took 1.5s, 1211434 effective words/s
INFO - 11:38:02: EPOCH 966 - PROGRESS: at 74.80% examples, 1341514 words/s, in_qsize 16, out_qsize 1
INFO - 11:38:02: EPOCH 966: training on 2430609 raw words (1812468 effective words) took 1.3s, 1356752 effective words/s
INFO - 11:38:03: EPOCH 967 - PROGRESS: at 73.96% examples, 1316732 words/s, in_qsize 14, out_qsize 1
INFO - 11:38:04: EPOCH 967: training on 2430609 raw words (1813701 effective words) took 1.3s, 1347724 effective words/s
INFO - 11:38:05: EPOCH 968 - PROGRESS: at 73.94% examples, 1329881 words/s, in_qsize 14, out_qsize 1
INFO - 11:3

INFO - 11:38:53: Word2Vec lifecycle event {'fname_or_handle': 'my_sentiment_w2v.model', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2023-04-08T11:38:53.604361', 'gensim': '4.3.1', 'python': '3.10.7 (main, Mar 10 2023, 10:47:39) [GCC 12.2.0]', 'platform': 'Linux-5.19.0-38-generic-x86_64-with-glibc2.36', 'event': 'saving'}
INFO - 11:38:53: not storing attribute cum_table
INFO - 11:38:53: saved my_sentiment_w2v.model


<gensim.models.word2vec.Word2Vec at 0x7fa2463a61d0>

In [11]:
import numpy as np


my_sentiment_w2v = Word2Vec.load("my_sentiment_w2v.model")


# Convert to numpy 2d array (n_vocab x vector_size)
def to_embeddings_Matrix(model):  
    embedding_matrix = np.zeros((len(model.wv.index_to_key), model.vector_size))
    word2idx = {}
    for i in range(len(model.wv.index_to_key)):
        embedding_matrix[i] = model.wv[model.wv.index_to_key[i]] 
    return embedding_matrix


embeddings_my_sentiment=to_embeddings_Matrix(my_sentiment_w2v)
print(np.shape(embeddings_my_sentiment))

INFO - 20:17:09: loading Word2Vec object from my_sentiment_w2v.model
INFO - 20:17:09: loading wv recursively from my_sentiment_w2v.model.wv.* with mmap=None
INFO - 20:17:09: setting ignored attribute cum_table to None
INFO - 20:17:09: Word2Vec lifecycle event {'fname': 'my_sentiment_w2v.model', 'datetime': '2023-04-08T20:17:09.745884', 'gensim': '4.3.1', 'python': '3.10.7 (main, Mar 10 2023, 10:47:39) [GCC 12.2.0]', 'platform': 'Linux-5.19.0-38-generic-x86_64-with-glibc2.36', 'event': 'loaded'}


(51037, 100)


In [139]:
# Training and test dataset as neural bag of words for training our model

X_train,X_test=extract_nbow(my_sentiment_w2v,train_data,test_data)

In [140]:
# Training our model with the default Logistic Regression classifier for (maximum) 1000 iterations.
from sklearn.linear_model import LogisticRegression

clf=LogisticRegression(max_iter=1000).fit(X_train,train_labels)


In [141]:
from sklearn import metrics
y_pred = clf.predict(X_test)
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_pred,test_labels))



Accuracy: 0.8399


In [12]:
# Loading google's model
from gensim.models import KeyedVectors
google_sentiment_w2v = KeyedVectors.load_word2vec_format('/home/brewed/Desktop/GoogleNews-vectors-negative300.bin.gz', binary=True,
limit=1000000)




# Extracting the embeddings of google's model
embeddings_google_sentiment = np.zeros((len(google_sentiment_w2v.index_to_key), google_sentiment_w2v.vector_size))
word2idx = {}
for i in range(len(google_sentiment_w2v.index_to_key)):
    embeddings_google_sentiment[i] = google_sentiment_w2v[google_sentiment_w2v.index_to_key[i]] 

print(np.shape(embeddings_google_sentiment))

INFO - 20:17:33: loading projection weights from /home/brewed/Desktop/GoogleNews-vectors-negative300.bin.gz
INFO - 20:17:44: KeyedVectors lifecycle event {'msg': 'loaded (1000000, 300) matrix of type float32 from /home/brewed/Desktop/GoogleNews-vectors-negative300.bin.gz', 'binary': True, 'encoding': 'utf8', 'datetime': '2023-04-08T20:17:44.698840', 'gensim': '4.3.1', 'python': '3.10.7 (main, Mar 10 2023, 10:47:39) [GCC 12.2.0]', 'platform': 'Linux-5.19.0-38-generic-x86_64-with-glibc2.36', 'event': 'load_word2vec_format'}


(1000000, 300)


In [15]:
# Getting the neural bag of words for google's model

X_google_train = np.zeros((np.size(train_data), 300))
for row, rev in enumerate(train_data):
    words_included = 0
    rev_toks = preproc_tok(rev)
    
    for tok in rev_toks:
        if tok in google_sentiment_w2v:
            X_google_train[row] += google_sentiment_w2v[tok]
            words_included += 1

    # Get the mean value
    X_google_train[row] = X_google_train[row]/words_included




X_google_test = np.zeros((np.size(test_data), 300)) 
for row, rev in enumerate(test_data):
    words_included = 0
    # Tokenize current review
    rev_toks = preproc_tok(rev)
    for tok in rev_toks:
        # For each token check if it has a w2v representation
        # and if yes add it.
        if tok in google_sentiment_w2v:
            X_google_test[row] += google_sentiment_w2v[tok]
            words_included += 1
    # Get the mean value
    X_google_test[row] = X_google_test[row]/words_included


In [16]:
# Training google's model with the default Logistic Regression classifier for (maximum) 1000 iterations.

from sklearn.linear_model import LogisticRegression

clf_google=LogisticRegression(max_iter=1000).fit(X_google_train,train_labels)


In [17]:
from sklearn import metrics
y_pred_google = clf_google.predict(X_google_test)

# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_pred_google,test_labels))


Accuracy: 0.8345


In [166]:
# Example of our model's prediction
print(test_data[11],'\n\n',y_pred[11],': Positive' if y_pred[11]==1 else ': Negative')

Jeff Garlin's film is filled with heart and laughter. As in Curb Your Enthusiasm, his screen persona is hilarious; but in addition we get to see both warmth and a sense of emotional vulnerability that makes the story universal. While the film chronicles his character's dreams of love, performing success, and weight loss, it will appeal to anyone who dreams of a better life. The supporting cast brings the frustrations and joys of his life to the screen in funny and heartbreaking ways. The simple joys of food, friendship, and trying once again once life has disappointed us are all themes. The use of music is creative and adds to the many pleasures of this film. Any fan of Jeff Garlin's TV appearances must see it! 

 1 : Positive
