# Word2Vec Model

In [1]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [2]:
import gensim.downloader as api
wv = api.load('word2vec-google-news-300')

2023-04-04 13:11:08,017 : INFO : Creating C:\Users\AxelArcidiaco/gensim-data




2023-04-04 13:18:59,820 : INFO : word2vec-google-news-300 downloaded
2023-04-04 13:18:59,840 : INFO : loading projection weights from C:\Users\AxelArcidiaco/gensim-data\word2vec-google-news-300\word2vec-google-news-300.gz
2023-04-04 13:20:44,571 : INFO : KeyedVectors lifecycle event {'msg': 'loaded (3000000, 300) matrix of type float32 from C:\\Users\\AxelArcidiaco/gensim-data\\word2vec-google-news-300\\word2vec-google-news-300.gz', 'binary': True, 'encoding': 'utf8', 'datetime': '2023-04-04T13:20:44.570345', 'gensim': '4.3.1', 'python': '3.9.16 (main, Mar  8 2023, 10:39:24) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'load_word2vec_format'}


In [3]:
for index, word in enumerate(wv.index_to_key):
    if index == 10:
        break
    print(f"word #{index}/{len(wv.index_to_key)} is {word}")

word #0/3000000 is </s>
word #1/3000000 is in
word #2/3000000 is for
word #3/3000000 is that
word #4/3000000 is is
word #5/3000000 is on
word #6/3000000 is ##
word #7/3000000 is The
word #8/3000000 is with
word #9/3000000 is said


In [4]:
vec_king = wv['king']

In [5]:
try:
    vec_cameroon = wv['cameroon']
except KeyError:
    print("The word 'cameroon' does not appear in this model")

The word 'cameroon' does not appear in this model


In [6]:
pairs = [
    ('car', 'minivan'),   # a minivan is a kind of car
    ('car', 'bicycle'),   # still a wheeled vehicle
    ('car', 'airplane'),  # ok, no wheels, but still a vehicle
    ('car', 'cereal'),    # ... and so on
    ('car', 'communism'),
]
for w1, w2 in pairs:
    print('%r\t%r\t%.2f' % (w1, w2, wv.similarity(w1, w2)))

'car'	'minivan'	0.69
'car'	'bicycle'	0.54
'car'	'airplane'	0.42
'car'	'cereal'	0.14
'car'	'communism'	0.06


In [7]:
print(wv.most_similar(positive=['car', 'minivan'], topn=5))

[('SUV', 0.8532191514968872), ('vehicle', 0.8175783753395081), ('pickup_truck', 0.7763689160346985), ('Jeep', 0.7567334175109863), ('Ford_Explorer', 0.7565719485282898)]


In [8]:
print(wv.doesnt_match(['fire', 'water', 'land', 'sea', 'air', 'car']))

car


# Training Your Own Model

In [9]:
from gensim.test.utils import datapath
from gensim import utils

class MyCorpus:
    """An iterator that yields sentences (lists of str)."""

    def __iter__(self):
        corpus_path = datapath('lee_background.cor')
        for line in open(corpus_path):
            # assume there's one document per line, tokens separated by whitespace
            yield utils.simple_preprocess(line)

2023-04-04 13:21:30,781 : INFO : adding document #0 to Dictionary<0 unique tokens: []>
2023-04-04 13:21:30,783 : INFO : built Dictionary<12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...> from 9 documents (total 29 corpus positions)
2023-04-04 13:21:30,785 : INFO : Dictionary lifecycle event {'msg': "built Dictionary<12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...> from 9 documents (total 29 corpus positions)", 'datetime': '2023-04-04T13:21:30.785522', 'gensim': '4.3.1', 'python': '3.9.16 (main, Mar  8 2023, 10:39:24) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'created'}


In [10]:
import gensim.models

sentences = MyCorpus()
model = gensim.models.Word2Vec(sentences=sentences)

2023-04-04 13:21:31,814 : INFO : collecting all words and their counts
2023-04-04 13:21:31,820 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2023-04-04 13:21:32,011 : INFO : collected 6981 word types from a corpus of 58152 raw words and 300 sentences
2023-04-04 13:21:32,013 : INFO : Creating a fresh vocabulary
2023-04-04 13:21:32,028 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=5 retains 1750 unique words (25.07% of original 6981, drops 5231)', 'datetime': '2023-04-04T13:21:32.028637', 'gensim': '4.3.1', 'python': '3.9.16 (main, Mar  8 2023, 10:39:24) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'prepare_vocab'}
2023-04-04 13:21:32,116 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=5 leaves 49335 word corpus (84.84% of original 58152, drops 8817)', 'datetime': '2023-04-04T13:21:32.116075', 'gensim': '4.3.1', 'python': '3.9.16 (main, Mar  8 2023, 10:39:24) [MSC v.1916 64 bit (AMD64)]', 'pla

In [11]:
vec_king = model.wv['king']

In [12]:
for index, word in enumerate(wv.index_to_key):
    if index == 10:
        break
    print(f"word #{index}/{len(wv.index_to_key)} is {word}")

word #0/3000000 is </s>
word #1/3000000 is in
word #2/3000000 is for
word #3/3000000 is that
word #4/3000000 is is
word #5/3000000 is on
word #6/3000000 is ##
word #7/3000000 is The
word #8/3000000 is with
word #9/3000000 is said


# Storing and loading models

In [13]:
import tempfile

with tempfile.NamedTemporaryFile(prefix='gensim-model-', delete=False) as tmp:
    temporary_filepath = tmp.name
    model.save(temporary_filepath)
    #
    # The model is now safely stored in the filepath.
    # You can copy it to other machines, share it with others, etc.
    #
    # To load a saved model:
    #
    new_model = gensim.models.Word2Vec.load(temporary_filepath)

2023-04-04 13:21:40,515 : INFO : Word2Vec lifecycle event {'fname_or_handle': 'C:\\Users\\AxelArcidiaco\\AppData\\Local\\Temp\\gensim-model-8c727zao', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2023-04-04T13:21:40.514561', 'gensim': '4.3.1', 'python': '3.9.16 (main, Mar  8 2023, 10:39:24) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'saving'}
2023-04-04 13:21:40,517 : INFO : not storing attribute cum_table
2023-04-04 13:21:40,526 : INFO : saved C:\Users\AxelArcidiaco\AppData\Local\Temp\gensim-model-8c727zao
2023-04-04 13:21:40,528 : INFO : loading Word2Vec object from C:\Users\AxelArcidiaco\AppData\Local\Temp\gensim-model-8c727zao
2023-04-04 13:21:40,619 : INFO : loading wv recursively from C:\Users\AxelArcidiaco\AppData\Local\Temp\gensim-model-8c727zao.wv.* with mmap=None
2023-04-04 13:21:40,621 : INFO : setting ignored attribute cum_table to None
2023-04-04 13:21:40,661 : INFO : Word2Vec lifecycle event {'fname'

In [14]:
model = gensim.models.KeyedVectors.load_word2vec_format('./tmp/vectors.txt', binary=False)
# using gzipped/bz2 input works too, no need to unzip
model = gensim.models.KeyedVectors.load_word2vec_format('./tmp/vectors.bin.gz', binary=True)

2023-04-04 13:21:41,732 : INFO : loading projection weights from /tmp/vectors.txt


FileNotFoundError: [Errno 2] No such file or directory: '/tmp/vectors.txt'

# Training Parameters

In [15]:
# min_count
model = gensim.models.Word2Vec(sentences, min_count=10)

2023-04-04 13:39:01,487 : INFO : collecting all words and their counts
2023-04-04 13:39:01,491 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2023-04-04 13:39:01,719 : INFO : collected 6981 word types from a corpus of 58152 raw words and 300 sentences
2023-04-04 13:39:01,720 : INFO : Creating a fresh vocabulary
2023-04-04 13:39:01,734 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=10 retains 889 unique words (12.73% of original 6981, drops 6092)', 'datetime': '2023-04-04T13:39:01.734286', 'gensim': '4.3.1', 'python': '3.9.16 (main, Mar  8 2023, 10:39:24) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'prepare_vocab'}
2023-04-04 13:39:01,736 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=10 leaves 43776 word corpus (75.28% of original 58152, drops 14376)', 'datetime': '2023-04-04T13:39:01.736278', 'gensim': '4.3.1', 'python': '3.9.16 (main, Mar  8 2023, 10:39:24) [MSC v.1916 64 bit (AMD64)]', 'p

In [16]:
# vector_size
model = gensim.models.Word2Vec(sentences, vector_size=200)

2023-04-04 13:39:03,347 : INFO : collecting all words and their counts
2023-04-04 13:39:03,353 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2023-04-04 13:39:03,654 : INFO : collected 6981 word types from a corpus of 58152 raw words and 300 sentences
2023-04-04 13:39:03,655 : INFO : Creating a fresh vocabulary
2023-04-04 13:39:03,683 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=5 retains 1750 unique words (25.07% of original 6981, drops 5231)', 'datetime': '2023-04-04T13:39:03.683507', 'gensim': '4.3.1', 'python': '3.9.16 (main, Mar  8 2023, 10:39:24) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'prepare_vocab'}
2023-04-04 13:39:03,685 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=5 leaves 49335 word corpus (84.84% of original 58152, drops 8817)', 'datetime': '2023-04-04T13:39:03.685496', 'gensim': '4.3.1', 'python': '3.9.16 (main, Mar  8 2023, 10:39:24) [MSC v.1916 64 bit (AMD64)]', 'pla

In [17]:
# workers
# default value of workers=3
model = gensim.models.Word2Vec(sentences, workers=4)

2023-04-04 13:39:05,290 : INFO : collecting all words and their counts
2023-04-04 13:39:05,295 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2023-04-04 13:39:05,507 : INFO : collected 6981 word types from a corpus of 58152 raw words and 300 sentences
2023-04-04 13:39:05,508 : INFO : Creating a fresh vocabulary
2023-04-04 13:39:05,532 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=5 retains 1750 unique words (25.07% of original 6981, drops 5231)', 'datetime': '2023-04-04T13:39:05.532418', 'gensim': '4.3.1', 'python': '3.9.16 (main, Mar  8 2023, 10:39:24) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'prepare_vocab'}
2023-04-04 13:39:05,534 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=5 leaves 49335 word corpus (84.84% of original 58152, drops 8817)', 'datetime': '2023-04-04T13:39:05.534439', 'gensim': '4.3.1', 'python': '3.9.16 (main, Mar  8 2023, 10:39:24) [MSC v.1916 64 bit (AMD64)]', 'pla

# Evaluating

In [18]:
model.wv.evaluate_word_analogies(datapath('questions-words.txt'))

2023-04-04 13:39:08,606 : INFO : Evaluating word analogies for top 300000 words in the model on c:\Users\AxelArcidiaco\anaconda3\envs\SpacyEnv\lib\site-packages\gensim\test\test_data\questions-words.txt
2023-04-04 13:39:08,621 : INFO : capital-common-countries: 0.0% (0/6)
2023-04-04 13:39:08,662 : INFO : capital-world: 0.0% (0/2)
2023-04-04 13:39:08,695 : INFO : family: 0.0% (0/6)
2023-04-04 13:39:08,733 : INFO : gram3-comparative: 0.0% (0/20)
2023-04-04 13:39:08,750 : INFO : gram4-superlative: 0.0% (0/12)
2023-04-04 13:39:08,783 : INFO : gram5-present-participle: 0.0% (0/20)
2023-04-04 13:39:08,828 : INFO : gram6-nationality-adjective: 0.0% (0/30)
2023-04-04 13:39:08,863 : INFO : gram7-past-tense: 0.0% (0/20)
2023-04-04 13:39:08,897 : INFO : gram8-plural: 0.0% (0/30)
2023-04-04 13:39:08,905 : INFO : Quadruplets with out-of-vocabulary words: 99.3%
2023-04-04 13:39:08,908 : INFO : NB: analogies containing OOV words were skipped from evaluation! To change this behavior, use "dummy4unknow

(0.0,
 [{'section': 'capital-common-countries',
   'correct': [],
   'incorrect': [('CANBERRA', 'AUSTRALIA', 'KABUL', 'AFGHANISTAN'),
    ('CANBERRA', 'AUSTRALIA', 'PARIS', 'FRANCE'),
    ('KABUL', 'AFGHANISTAN', 'PARIS', 'FRANCE'),
    ('KABUL', 'AFGHANISTAN', 'CANBERRA', 'AUSTRALIA'),
    ('PARIS', 'FRANCE', 'CANBERRA', 'AUSTRALIA'),
    ('PARIS', 'FRANCE', 'KABUL', 'AFGHANISTAN')]},
  {'section': 'capital-world',
   'correct': [],
   'incorrect': [('CANBERRA', 'AUSTRALIA', 'KABUL', 'AFGHANISTAN'),
    ('KABUL', 'AFGHANISTAN', 'PARIS', 'FRANCE')]},
  {'section': 'currency', 'correct': [], 'incorrect': []},
  {'section': 'city-in-state', 'correct': [], 'incorrect': []},
  {'section': 'family',
   'correct': [],
   'incorrect': [('HE', 'SHE', 'HIS', 'HER'),
    ('HE', 'SHE', 'MAN', 'WOMAN'),
    ('HIS', 'HER', 'MAN', 'WOMAN'),
    ('HIS', 'HER', 'HE', 'SHE'),
    ('MAN', 'WOMAN', 'HE', 'SHE'),
    ('MAN', 'WOMAN', 'HIS', 'HER')]},
  {'section': 'gram1-adjective-to-adverb', 'correct': [

# Online training / Resuming training

In [19]:
model = gensim.models.Word2Vec.load(temporary_filepath)
more_sentences = [
    ['Advanced', 'users', 'can', 'load', 'a', 'model',
     'and', 'continue', 'training', 'it', 'with', 'more', 'sentences'],
]
model.build_vocab(more_sentences, update=True)
model.train(more_sentences, total_examples=model.corpus_count, epochs=model.epochs)

# cleaning up temporary file
import os
os.remove(temporary_filepath)

2023-04-04 13:39:10,617 : INFO : loading Word2Vec object from C:\Users\AxelArcidiaco\AppData\Local\Temp\gensim-model-8c727zao
2023-04-04 13:39:10,625 : INFO : loading wv recursively from C:\Users\AxelArcidiaco\AppData\Local\Temp\gensim-model-8c727zao.wv.* with mmap=None
2023-04-04 13:39:10,626 : INFO : setting ignored attribute cum_table to None
2023-04-04 13:39:10,667 : INFO : Word2Vec lifecycle event {'fname': 'C:\\Users\\AxelArcidiaco\\AppData\\Local\\Temp\\gensim-model-8c727zao', 'datetime': '2023-04-04T13:39:10.667449', 'gensim': '4.3.1', 'python': '3.9.16 (main, Mar  8 2023, 10:39:24) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'loaded'}
2023-04-04 13:39:10,669 : INFO : collecting all words and their counts
2023-04-04 13:39:10,670 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2023-04-04 13:39:10,672 : INFO : collected 13 word types from a corpus of 13 raw words and 1 sentences
2023-04-04 13:39:10,673 : INFO : Updati

# Training Loss Computation

In [20]:
# instantiating and training the Word2Vec model
model_with_loss = gensim.models.Word2Vec(
    sentences,
    min_count=1,
    compute_loss=True,
    hs=0,
    sg=1,
    seed=42,
)

# getting the training loss value
training_loss = model_with_loss.get_latest_training_loss()
print(training_loss)

2023-04-04 13:39:12,323 : INFO : collecting all words and their counts
2023-04-04 13:39:12,327 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2023-04-04 13:39:12,531 : INFO : collected 6981 word types from a corpus of 58152 raw words and 300 sentences
2023-04-04 13:39:12,534 : INFO : Creating a fresh vocabulary
2023-04-04 13:39:12,594 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=1 retains 6981 unique words (100.00% of original 6981, drops 0)', 'datetime': '2023-04-04T13:39:12.593203', 'gensim': '4.3.1', 'python': '3.9.16 (main, Mar  8 2023, 10:39:24) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'prepare_vocab'}
2023-04-04 13:39:12,595 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=1 leaves 58152 word corpus (100.00% of original 58152, drops 0)', 'datetime': '2023-04-04T13:39:12.595206', 'gensim': '4.3.1', 'python': '3.9.16 (main, Mar  8 2023, 10:39:24) [MSC v.1916 64 bit (AMD64)]', 'platfor

1360236.25


# Benchmarks

In [21]:
import io
import os

import gensim.models.word2vec
import gensim.downloader as api
import smart_open


def head(path, size):
    with smart_open.open(path) as fin:
        return io.StringIO(fin.read(size))


def generate_input_data():
    lee_path = datapath('lee_background.cor')
    ls = gensim.models.word2vec.LineSentence(lee_path)
    ls.name = '25kB'
    yield ls

    text8_path = api.load('text8').fn
    labels = ('1MB', '10MB', '50MB', '100MB')
    sizes = (1024 ** 2, 10 * 1024 ** 2, 50 * 1024 ** 2, 100 * 1024 ** 2)
    for l, s in zip(labels, sizes):
        ls = gensim.models.word2vec.LineSentence(head(text8_path, s))
        ls.name = l
        yield ls


input_data = list(generate_input_data())



2023-04-04 13:39:30,330 : INFO : text8 downloaded


In [22]:
# Temporarily reduce logging verbosity
logging.root.level = logging.ERROR

import time
import numpy as np
import pandas as pd

train_time_values = []
seed_val = 42
sg_values = [0, 1]
hs_values = [0, 1]

fast = True
if fast:
    input_data_subset = input_data[:3]
else:
    input_data_subset = input_data


for data in input_data_subset:
    for sg_val in sg_values:
        for hs_val in hs_values:
            for loss_flag in [True, False]:
                time_taken_list = []
                for i in range(3):
                    start_time = time.time()
                    w2v_model = gensim.models.Word2Vec(
                        data,
                        compute_loss=loss_flag,
                        sg=sg_val,
                        hs=hs_val,
                        seed=seed_val,
                    )
                    time_taken_list.append(time.time() - start_time)

                time_taken_list = np.array(time_taken_list)
                time_mean = np.mean(time_taken_list)
                time_std = np.std(time_taken_list)

                model_result = {
                    'train_data': data.name,
                    'compute_loss': loss_flag,
                    'sg': sg_val,
                    'hs': hs_val,
                    'train_time_mean': time_mean,
                    'train_time_std': time_std,
                }
                print("Word2vec model #%i: %s" % (len(train_time_values), model_result))
                train_time_values.append(model_result)

train_times_table = pd.DataFrame(train_time_values)
train_times_table = train_times_table.sort_values(
    by=['train_data', 'sg', 'hs', 'compute_loss'],
    ascending=[False, False, True, False],
)
print(train_times_table)

2023-04-04 13:39:48,600 : INFO : collecting all words and their counts
2023-04-04 13:39:48,603 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2023-04-04 13:39:48,635 : INFO : collected 10781 word types from a corpus of 59890 raw words and 300 sentences
2023-04-04 13:39:48,637 : INFO : Creating a fresh vocabulary
2023-04-04 13:39:48,664 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=5 retains 1762 unique words (16.34% of original 10781, drops 9019)', 'datetime': '2023-04-04T13:39:48.664459', 'gensim': '4.3.1', 'python': '3.9.16 (main, Mar  8 2023, 10:39:24) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'prepare_vocab'}
2023-04-04 13:39:48,666 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=5 leaves 46084 word corpus (76.95% of original 59890, drops 13806)', 'datetime': '2023-04-04T13:39:48.666461', 'gensim': '4.3.1', 'python': '3.9.16 (main, Mar  8 2023, 10:39:24) [MSC v.1916 64 bit (AMD64)]', '

Word2vec model #0: {'train_data': '25kB', 'compute_loss': True, 'sg': 0, 'hs': 0, 'train_time_mean': 0.6145718097686768, 'train_time_std': 0.022985500695540954}


2023-04-04 13:39:50,725 : INFO : EPOCH 0: training on 59890 raw words (32543 effective words) took 0.1s, 363343 effective words/s
2023-04-04 13:39:50,811 : INFO : EPOCH 1: training on 59890 raw words (32552 effective words) took 0.1s, 405975 effective words/s
2023-04-04 13:39:50,903 : INFO : EPOCH 2: training on 59890 raw words (32603 effective words) took 0.1s, 376672 effective words/s
2023-04-04 13:39:50,983 : INFO : EPOCH 3: training on 59890 raw words (32587 effective words) took 0.1s, 448391 effective words/s
2023-04-04 13:39:51,074 : INFO : EPOCH 4: training on 59890 raw words (32592 effective words) took 0.1s, 395887 effective words/s
2023-04-04 13:39:51,075 : INFO : Word2Vec lifecycle event {'msg': 'training on 299450 raw words (162877 effective words) took 0.4s, 366249 effective words/s', 'datetime': '2023-04-04T13:39:51.075702', 'gensim': '4.3.1', 'python': '3.9.16 (main, Mar  8 2023, 10:39:24) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 't

Word2vec model #1: {'train_data': '25kB', 'compute_loss': False, 'sg': 0, 'hs': 0, 'train_time_mean': 0.696085532506307, 'train_time_std': 0.0657968697615705}


2023-04-04 13:39:52,868 : INFO : built huffman tree with maximum node depth 13
2023-04-04 13:39:52,923 : INFO : estimated required memory for 1762 words and 100 dimensions: 3347800 bytes
2023-04-04 13:39:52,928 : INFO : resetting layer weights
2023-04-04 13:39:52,933 : INFO : Word2Vec lifecycle event {'update': False, 'trim_rule': 'None', 'datetime': '2023-04-04T13:39:52.933599', 'gensim': '4.3.1', 'python': '3.9.16 (main, Mar  8 2023, 10:39:24) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'build_vocab'}
2023-04-04 13:39:52,938 : INFO : Word2Vec lifecycle event {'msg': 'training model with 3 workers on 1762 vocabulary and 100 features, using sg=0 hs=1 sample=0.001 negative=5 window=5 shrink_windows=True', 'datetime': '2023-04-04T13:39:52.938144', 'gensim': '4.3.1', 'python': '3.9.16 (main, Mar  8 2023, 10:39:24) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'train'}
2023-04-04 13:39:53,169 : INFO : EPOCH 0: training o

Word2vec model #2: {'train_data': '25kB', 'compute_loss': True, 'sg': 0, 'hs': 1, 'train_time_mean': 1.5733048915863037, 'train_time_std': 0.17942882024326978}


2023-04-04 13:39:57,505 : INFO : built huffman tree with maximum node depth 13
2023-04-04 13:39:57,541 : INFO : estimated required memory for 1762 words and 100 dimensions: 3347800 bytes
2023-04-04 13:39:57,543 : INFO : resetting layer weights
2023-04-04 13:39:57,546 : INFO : Word2Vec lifecycle event {'update': False, 'trim_rule': 'None', 'datetime': '2023-04-04T13:39:57.546278', 'gensim': '4.3.1', 'python': '3.9.16 (main, Mar  8 2023, 10:39:24) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'build_vocab'}
2023-04-04 13:39:57,551 : INFO : Word2Vec lifecycle event {'msg': 'training model with 3 workers on 1762 vocabulary and 100 features, using sg=0 hs=1 sample=0.001 negative=5 window=5 shrink_windows=True', 'datetime': '2023-04-04T13:39:57.551857', 'gensim': '4.3.1', 'python': '3.9.16 (main, Mar  8 2023, 10:39:24) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'train'}
2023-04-04 13:39:57,710 : INFO : EPOCH 0: training o

Word2vec model #3: {'train_data': '25kB', 'compute_loss': False, 'sg': 0, 'hs': 1, 'train_time_mean': 1.0613186359405518, 'train_time_std': 0.015410054581855438}


2023-04-04 13:40:00,822 : INFO : EPOCH 0: training on 59890 raw words (32543 effective words) took 0.2s, 157100 effective words/s
2023-04-04 13:40:01,047 : INFO : EPOCH 1: training on 59890 raw words (32552 effective words) took 0.2s, 147642 effective words/s
2023-04-04 13:40:01,252 : INFO : EPOCH 2: training on 59890 raw words (32603 effective words) took 0.2s, 163898 effective words/s
2023-04-04 13:40:01,441 : INFO : EPOCH 3: training on 59890 raw words (32587 effective words) took 0.2s, 177131 effective words/s
2023-04-04 13:40:01,609 : INFO : EPOCH 4: training on 59890 raw words (32592 effective words) took 0.2s, 200952 effective words/s
2023-04-04 13:40:01,611 : INFO : Word2Vec lifecycle event {'msg': 'training on 299450 raw words (162877 effective words) took 1.0s, 162235 effective words/s', 'datetime': '2023-04-04T13:40:01.611302', 'gensim': '4.3.1', 'python': '3.9.16 (main, Mar  8 2023, 10:39:24) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 't

Word2vec model #4: {'train_data': '25kB', 'compute_loss': True, 'sg': 1, 'hs': 0, 'train_time_mean': 1.246769905090332, 'train_time_std': 0.055282476235718726}


2023-04-04 13:40:04,491 : INFO : EPOCH 0: training on 59890 raw words (32676 effective words) took 0.2s, 200983 effective words/s
2023-04-04 13:40:04,661 : INFO : EPOCH 1: training on 59890 raw words (32585 effective words) took 0.2s, 200539 effective words/s
2023-04-04 13:40:04,872 : INFO : EPOCH 2: training on 59890 raw words (32615 effective words) took 0.2s, 161800 effective words/s
2023-04-04 13:40:05,052 : INFO : EPOCH 3: training on 59890 raw words (32602 effective words) took 0.2s, 190077 effective words/s
2023-04-04 13:40:05,216 : INFO : EPOCH 4: training on 59890 raw words (32563 effective words) took 0.2s, 197954 effective words/s
2023-04-04 13:40:05,216 : INFO : Word2Vec lifecycle event {'msg': 'training on 299450 raw words (163041 effective words) took 0.9s, 181596 effective words/s', 'datetime': '2023-04-04T13:40:05.216956', 'gensim': '4.3.1', 'python': '3.9.16 (main, Mar  8 2023, 10:39:24) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 't

Word2vec model #5: {'train_data': '25kB', 'compute_loss': False, 'sg': 1, 'hs': 0, 'train_time_mean': 1.0432551701863606, 'train_time_std': 0.009498107327648228}


2023-04-04 13:40:07,547 : INFO : built huffman tree with maximum node depth 13
2023-04-04 13:40:07,577 : INFO : estimated required memory for 1762 words and 100 dimensions: 3347800 bytes
2023-04-04 13:40:07,577 : INFO : resetting layer weights
2023-04-04 13:40:07,577 : INFO : Word2Vec lifecycle event {'update': False, 'trim_rule': 'None', 'datetime': '2023-04-04T13:40:07.577169', 'gensim': '4.3.1', 'python': '3.9.16 (main, Mar  8 2023, 10:39:24) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'build_vocab'}
2023-04-04 13:40:07,587 : INFO : Word2Vec lifecycle event {'msg': 'training model with 3 workers on 1762 vocabulary and 100 features, using sg=1 hs=1 sample=0.001 negative=5 window=5 shrink_windows=True', 'datetime': '2023-04-04T13:40:07.587549', 'gensim': '4.3.1', 'python': '3.9.16 (main, Mar  8 2023, 10:39:24) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'train'}
2023-04-04 13:40:07,977 : INFO : EPOCH 0: training o

Word2vec model #6: {'train_data': '25kB', 'compute_loss': True, 'sg': 1, 'hs': 1, 'train_time_mean': 2.120964209238688, 'train_time_std': 0.1604011660980497}


2023-04-04 13:40:13,921 : INFO : built huffman tree with maximum node depth 13
2023-04-04 13:40:13,960 : INFO : estimated required memory for 1762 words and 100 dimensions: 3347800 bytes
2023-04-04 13:40:13,962 : INFO : resetting layer weights
2023-04-04 13:40:13,967 : INFO : Word2Vec lifecycle event {'update': False, 'trim_rule': 'None', 'datetime': '2023-04-04T13:40:13.967522', 'gensim': '4.3.1', 'python': '3.9.16 (main, Mar  8 2023, 10:39:24) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'build_vocab'}
2023-04-04 13:40:13,971 : INFO : Word2Vec lifecycle event {'msg': 'training model with 3 workers on 1762 vocabulary and 100 features, using sg=1 hs=1 sample=0.001 negative=5 window=5 shrink_windows=True', 'datetime': '2023-04-04T13:40:13.971328', 'gensim': '4.3.1', 'python': '3.9.16 (main, Mar  8 2023, 10:39:24) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'train'}
2023-04-04 13:40:14,342 : INFO : EPOCH 0: training o

Word2vec model #7: {'train_data': '25kB', 'compute_loss': False, 'sg': 1, 'hs': 1, 'train_time_mean': 2.0520317554473877, 'train_time_std': 0.16663456503251287}


2023-04-04 13:40:20,078 : INFO : estimated required memory for 4125 words and 100 dimensions: 5362500 bytes
2023-04-04 13:40:20,080 : INFO : resetting layer weights
2023-04-04 13:40:20,084 : INFO : Word2Vec lifecycle event {'update': False, 'trim_rule': 'None', 'datetime': '2023-04-04T13:40:20.084210', 'gensim': '4.3.1', 'python': '3.9.16 (main, Mar  8 2023, 10:39:24) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'build_vocab'}
2023-04-04 13:40:20,085 : INFO : Word2Vec lifecycle event {'msg': 'training model with 3 workers on 4125 vocabulary and 100 features, using sg=0 hs=0 sample=0.001 negative=5 window=5 shrink_windows=True', 'datetime': '2023-04-04T13:40:20.085146', 'gensim': '4.3.1', 'python': '3.9.16 (main, Mar  8 2023, 10:39:24) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'train'}
2023-04-04 13:40:20,277 : INFO : EPOCH 0: training on 175599 raw words (110284 effective words) took 0.2s, 679533 effective words/s

Word2vec model #8: {'train_data': '1MB', 'compute_loss': True, 'sg': 0, 'hs': 0, 'train_time_mean': 1.4942554632822673, 'train_time_std': 0.23456594456286886}


2023-04-04 13:40:24,554 : INFO : collected 17251 word types from a corpus of 175599 raw words and 18 sentences
2023-04-04 13:40:24,558 : INFO : Creating a fresh vocabulary
2023-04-04 13:40:24,653 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=5 retains 4125 unique words (23.91% of original 17251, drops 13126)', 'datetime': '2023-04-04T13:40:24.653755', 'gensim': '4.3.1', 'python': '3.9.16 (main, Mar  8 2023, 10:39:24) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'prepare_vocab'}
2023-04-04 13:40:24,655 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=5 leaves 154201 word corpus (87.81% of original 175599, drops 21398)', 'datetime': '2023-04-04T13:40:24.655762', 'gensim': '4.3.1', 'python': '3.9.16 (main, Mar  8 2023, 10:39:24) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'prepare_vocab'}
2023-04-04 13:40:24,762 : INFO : deleting the raw counts dictionary of 17251 items
2023-04-04 13:40:2

Word2vec model #9: {'train_data': '1MB', 'compute_loss': False, 'sg': 0, 'hs': 0, 'train_time_mean': 1.4536771774291992, 'train_time_std': 0.2802451038893119}


2023-04-04 13:40:28,875 : INFO : sample=0.001 downsamples 40 most-common words
2023-04-04 13:40:28,879 : INFO : Word2Vec lifecycle event {'msg': 'downsampling leaves estimated 110199.4281334271 word corpus (71.5%% of prior 154201)', 'datetime': '2023-04-04T13:40:28.878298', 'gensim': '4.3.1', 'python': '3.9.16 (main, Mar  8 2023, 10:39:24) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'prepare_vocab'}
2023-04-04 13:40:28,886 : INFO : constructing a huffman tree from 4125 words
2023-04-04 13:40:29,251 : INFO : built huffman tree with maximum node depth 15
2023-04-04 13:40:29,303 : INFO : estimated required memory for 4125 words and 100 dimensions: 7837500 bytes
2023-04-04 13:40:29,305 : INFO : resetting layer weights
2023-04-04 13:40:29,311 : INFO : Word2Vec lifecycle event {'update': False, 'trim_rule': 'None', 'datetime': '2023-04-04T13:40:29.311920', 'gensim': '4.3.1', 'python': '3.9.16 (main, Mar  8 2023, 10:39:24) [MSC v.1916 64 bit (AMD64)]', 'pla

Word2vec model #10: {'train_data': '1MB', 'compute_loss': True, 'sg': 0, 'hs': 1, 'train_time_mean': 2.5346561272939048, 'train_time_std': 0.15934209415004857}


2023-04-04 13:40:36,521 : INFO : sample=0.001 downsamples 40 most-common words
2023-04-04 13:40:36,524 : INFO : Word2Vec lifecycle event {'msg': 'downsampling leaves estimated 110199.4281334271 word corpus (71.5%% of prior 154201)', 'datetime': '2023-04-04T13:40:36.523587', 'gensim': '4.3.1', 'python': '3.9.16 (main, Mar  8 2023, 10:39:24) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'prepare_vocab'}
2023-04-04 13:40:36,532 : INFO : constructing a huffman tree from 4125 words
2023-04-04 13:40:36,820 : INFO : built huffman tree with maximum node depth 15
2023-04-04 13:40:36,909 : INFO : estimated required memory for 4125 words and 100 dimensions: 7837500 bytes
2023-04-04 13:40:36,910 : INFO : resetting layer weights
2023-04-04 13:40:36,917 : INFO : Word2Vec lifecycle event {'update': False, 'trim_rule': 'None', 'datetime': '2023-04-04T13:40:36.917254', 'gensim': '4.3.1', 'python': '3.9.16 (main, Mar  8 2023, 10:39:24) [MSC v.1916 64 bit (AMD64)]', 'pla

Word2vec model #11: {'train_data': '1MB', 'compute_loss': False, 'sg': 0, 'hs': 1, 'train_time_mean': 2.68886391321818, 'train_time_std': 0.014530134089780829}


2023-04-04 13:40:44,591 : INFO : sample=0.001 downsamples 40 most-common words
2023-04-04 13:40:44,593 : INFO : Word2Vec lifecycle event {'msg': 'downsampling leaves estimated 110199.4281334271 word corpus (71.5%% of prior 154201)', 'datetime': '2023-04-04T13:40:44.593617', 'gensim': '4.3.1', 'python': '3.9.16 (main, Mar  8 2023, 10:39:24) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'prepare_vocab'}
2023-04-04 13:40:44,675 : INFO : estimated required memory for 4125 words and 100 dimensions: 5362500 bytes
2023-04-04 13:40:44,676 : INFO : resetting layer weights
2023-04-04 13:40:44,685 : INFO : Word2Vec lifecycle event {'update': False, 'trim_rule': 'None', 'datetime': '2023-04-04T13:40:44.685849', 'gensim': '4.3.1', 'python': '3.9.16 (main, Mar  8 2023, 10:39:24) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'build_vocab'}
2023-04-04 13:40:44,686 : INFO : Word2Vec lifecycle event {'msg': 'training model with 3 worker

Word2vec model #12: {'train_data': '1MB', 'compute_loss': True, 'sg': 1, 'hs': 0, 'train_time_mean': 3.2219104766845703, 'train_time_std': 0.2010461983028164}


2023-04-04 13:40:54,213 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=5 leaves 154201 word corpus (87.81% of original 175599, drops 21398)', 'datetime': '2023-04-04T13:40:54.213362', 'gensim': '4.3.1', 'python': '3.9.16 (main, Mar  8 2023, 10:39:24) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'prepare_vocab'}
2023-04-04 13:40:54,292 : INFO : deleting the raw counts dictionary of 17251 items
2023-04-04 13:40:54,295 : INFO : sample=0.001 downsamples 40 most-common words
2023-04-04 13:40:54,296 : INFO : Word2Vec lifecycle event {'msg': 'downsampling leaves estimated 110199.4281334271 word corpus (71.5%% of prior 154201)', 'datetime': '2023-04-04T13:40:54.296925', 'gensim': '4.3.1', 'python': '3.9.16 (main, Mar  8 2023, 10:39:24) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'prepare_vocab'}
2023-04-04 13:40:54,385 : INFO : estimated required memory for 4125 words and 100 dimensions: 5362500 bytes
2023-0

Word2vec model #13: {'train_data': '1MB', 'compute_loss': False, 'sg': 1, 'hs': 0, 'train_time_mean': 3.4671924908955893, 'train_time_std': 0.2715605549303989}


2023-04-04 13:41:04,605 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=5 leaves 154201 word corpus (87.81% of original 175599, drops 21398)', 'datetime': '2023-04-04T13:41:04.605795', 'gensim': '4.3.1', 'python': '3.9.16 (main, Mar  8 2023, 10:39:24) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'prepare_vocab'}
2023-04-04 13:41:04,687 : INFO : deleting the raw counts dictionary of 17251 items
2023-04-04 13:41:04,690 : INFO : sample=0.001 downsamples 40 most-common words
2023-04-04 13:41:04,693 : INFO : Word2Vec lifecycle event {'msg': 'downsampling leaves estimated 110199.4281334271 word corpus (71.5%% of prior 154201)', 'datetime': '2023-04-04T13:41:04.693687', 'gensim': '4.3.1', 'python': '3.9.16 (main, Mar  8 2023, 10:39:24) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'prepare_vocab'}
2023-04-04 13:41:04,700 : INFO : constructing a huffman tree from 4125 words
2023-04-04 13:41:04,944 : INFO : buil

Word2vec model #14: {'train_data': '1MB', 'compute_loss': True, 'sg': 1, 'hs': 1, 'train_time_mean': 7.286693255106608, 'train_time_std': 0.06879217709999681}


2023-04-04 13:41:26,500 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=5 leaves 154201 word corpus (87.81% of original 175599, drops 21398)', 'datetime': '2023-04-04T13:41:26.500915', 'gensim': '4.3.1', 'python': '3.9.16 (main, Mar  8 2023, 10:39:24) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'prepare_vocab'}
2023-04-04 13:41:26,594 : INFO : deleting the raw counts dictionary of 17251 items
2023-04-04 13:41:26,596 : INFO : sample=0.001 downsamples 40 most-common words
2023-04-04 13:41:26,598 : INFO : Word2Vec lifecycle event {'msg': 'downsampling leaves estimated 110199.4281334271 word corpus (71.5%% of prior 154201)', 'datetime': '2023-04-04T13:41:26.598248', 'gensim': '4.3.1', 'python': '3.9.16 (main, Mar  8 2023, 10:39:24) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'prepare_vocab'}
2023-04-04 13:41:26,610 : INFO : constructing a huffman tree from 4125 words
2023-04-04 13:41:27,143 : INFO : buil

Word2vec model #15: {'train_data': '1MB', 'compute_loss': False, 'sg': 1, 'hs': 1, 'train_time_mean': 9.320667266845703, 'train_time_std': 0.43521996591898127}


2023-04-04 13:41:54,826 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2023-04-04 13:41:55,523 : INFO : collected 73167 word types from a corpus of 1788017 raw words and 179 sentences
2023-04-04 13:41:55,524 : INFO : Creating a fresh vocabulary
2023-04-04 13:41:55,737 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=5 retains 20167 unique words (27.56% of original 73167, drops 53000)', 'datetime': '2023-04-04T13:41:55.737829', 'gensim': '4.3.1', 'python': '3.9.16 (main, Mar  8 2023, 10:39:24) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'prepare_vocab'}
2023-04-04 13:41:55,738 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=5 leaves 1703716 word corpus (95.29% of original 1788017, drops 84301)', 'datetime': '2023-04-04T13:41:55.738855', 'gensim': '4.3.1', 'python': '3.9.16 (main, Mar  8 2023, 10:39:24) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'prepare_vocab

Word2vec model #16: {'train_data': '10MB', 'compute_loss': True, 'sg': 0, 'hs': 0, 'train_time_mean': 16.541062513987224, 'train_time_std': 0.22554318021683137}


2023-04-04 13:42:45,543 : INFO : collected 73167 word types from a corpus of 1788017 raw words and 179 sentences
2023-04-04 13:42:45,545 : INFO : Creating a fresh vocabulary
2023-04-04 13:42:45,852 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=5 retains 20167 unique words (27.56% of original 73167, drops 53000)', 'datetime': '2023-04-04T13:42:45.852965', 'gensim': '4.3.1', 'python': '3.9.16 (main, Mar  8 2023, 10:39:24) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'prepare_vocab'}
2023-04-04 13:42:45,854 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=5 leaves 1703716 word corpus (95.29% of original 1788017, drops 84301)', 'datetime': '2023-04-04T13:42:45.854970', 'gensim': '4.3.1', 'python': '3.9.16 (main, Mar  8 2023, 10:39:24) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'prepare_vocab'}
2023-04-04 13:42:46,207 : INFO : deleting the raw counts dictionary of 73167 items
2023-04-04 13

Word2vec model #17: {'train_data': '10MB', 'compute_loss': False, 'sg': 0, 'hs': 0, 'train_time_mean': 17.175948063532513, 'train_time_std': 0.69259191625874}


2023-04-04 13:43:37,079 : INFO : collected 73167 word types from a corpus of 1788017 raw words and 179 sentences
2023-04-04 13:43:37,081 : INFO : Creating a fresh vocabulary
2023-04-04 13:43:37,611 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=5 retains 20167 unique words (27.56% of original 73167, drops 53000)', 'datetime': '2023-04-04T13:43:37.611457', 'gensim': '4.3.1', 'python': '3.9.16 (main, Mar  8 2023, 10:39:24) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'prepare_vocab'}
2023-04-04 13:43:37,612 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=5 leaves 1703716 word corpus (95.29% of original 1788017, drops 84301)', 'datetime': '2023-04-04T13:43:37.612455', 'gensim': '4.3.1', 'python': '3.9.16 (main, Mar  8 2023, 10:39:24) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'prepare_vocab'}
2023-04-04 13:43:37,979 : INFO : deleting the raw counts dictionary of 73167 items
2023-04-04 13

Word2vec model #18: {'train_data': '10MB', 'compute_loss': True, 'sg': 0, 'hs': 1, 'train_time_mean': 33.42635194460551, 'train_time_std': 1.346471509499782}


2023-04-04 13:45:17,115 : INFO : collected 73167 word types from a corpus of 1788017 raw words and 179 sentences
2023-04-04 13:45:17,117 : INFO : Creating a fresh vocabulary
2023-04-04 13:45:17,396 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=5 retains 20167 unique words (27.56% of original 73167, drops 53000)', 'datetime': '2023-04-04T13:45:17.396226', 'gensim': '4.3.1', 'python': '3.9.16 (main, Mar  8 2023, 10:39:24) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'prepare_vocab'}
2023-04-04 13:45:17,397 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=5 leaves 1703716 word corpus (95.29% of original 1788017, drops 84301)', 'datetime': '2023-04-04T13:45:17.397255', 'gensim': '4.3.1', 'python': '3.9.16 (main, Mar  8 2023, 10:39:24) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'prepare_vocab'}
2023-04-04 13:45:17,684 : INFO : deleting the raw counts dictionary of 73167 items
2023-04-04 13

Word2vec model #19: {'train_data': '10MB', 'compute_loss': False, 'sg': 0, 'hs': 1, 'train_time_mean': 33.80125188827515, 'train_time_std': 3.7604229152441206}


2023-04-04 13:46:57,406 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2023-04-04 13:46:58,262 : INFO : collected 73167 word types from a corpus of 1788017 raw words and 179 sentences
2023-04-04 13:46:58,264 : INFO : Creating a fresh vocabulary
2023-04-04 13:46:58,466 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=5 retains 20167 unique words (27.56% of original 73167, drops 53000)', 'datetime': '2023-04-04T13:46:58.466568', 'gensim': '4.3.1', 'python': '3.9.16 (main, Mar  8 2023, 10:39:24) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'prepare_vocab'}
2023-04-04 13:46:58,467 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=5 leaves 1703716 word corpus (95.29% of original 1788017, drops 84301)', 'datetime': '2023-04-04T13:46:58.467591', 'gensim': '4.3.1', 'python': '3.9.16 (main, Mar  8 2023, 10:39:24) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'prepare_vocab

Word2vec model #20: {'train_data': '10MB', 'compute_loss': True, 'sg': 1, 'hs': 0, 'train_time_mean': 45.6685045560201, 'train_time_std': 0.4791627236874829}


2023-04-04 13:49:15,323 : INFO : collected 73167 word types from a corpus of 1788017 raw words and 179 sentences
2023-04-04 13:49:15,325 : INFO : Creating a fresh vocabulary
2023-04-04 13:49:15,502 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=5 retains 20167 unique words (27.56% of original 73167, drops 53000)', 'datetime': '2023-04-04T13:49:15.502439', 'gensim': '4.3.1', 'python': '3.9.16 (main, Mar  8 2023, 10:39:24) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'prepare_vocab'}
2023-04-04 13:49:15,505 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=5 leaves 1703716 word corpus (95.29% of original 1788017, drops 84301)', 'datetime': '2023-04-04T13:49:15.505920', 'gensim': '4.3.1', 'python': '3.9.16 (main, Mar  8 2023, 10:39:24) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'prepare_vocab'}
2023-04-04 13:49:15,699 : INFO : deleting the raw counts dictionary of 73167 items
2023-04-04 13

Word2vec model #21: {'train_data': '10MB', 'compute_loss': False, 'sg': 1, 'hs': 0, 'train_time_mean': 48.66086856524149, 'train_time_std': 1.2453250929376887}


2023-04-04 13:51:40,432 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2023-04-04 13:51:41,416 : INFO : collected 73167 word types from a corpus of 1788017 raw words and 179 sentences
2023-04-04 13:51:41,417 : INFO : Creating a fresh vocabulary
2023-04-04 13:51:41,672 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=5 retains 20167 unique words (27.56% of original 73167, drops 53000)', 'datetime': '2023-04-04T13:51:41.672628', 'gensim': '4.3.1', 'python': '3.9.16 (main, Mar  8 2023, 10:39:24) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'prepare_vocab'}
2023-04-04 13:51:41,675 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=5 leaves 1703716 word corpus (95.29% of original 1788017, drops 84301)', 'datetime': '2023-04-04T13:51:41.675172', 'gensim': '4.3.1', 'python': '3.9.16 (main, Mar  8 2023, 10:39:24) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'prepare_vocab

Word2vec model #22: {'train_data': '10MB', 'compute_loss': True, 'sg': 1, 'hs': 1, 'train_time_mean': 109.1268162727356, 'train_time_std': 6.534056513501884}


2023-04-04 13:57:09,361 : INFO : collected 73167 word types from a corpus of 1788017 raw words and 179 sentences
2023-04-04 13:57:09,365 : INFO : Creating a fresh vocabulary
2023-04-04 13:57:09,818 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=5 retains 20167 unique words (27.56% of original 73167, drops 53000)', 'datetime': '2023-04-04T13:57:09.817097', 'gensim': '4.3.1', 'python': '3.9.16 (main, Mar  8 2023, 10:39:24) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'prepare_vocab'}
2023-04-04 13:57:09,819 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=5 leaves 1703716 word corpus (95.29% of original 1788017, drops 84301)', 'datetime': '2023-04-04T13:57:09.819111', 'gensim': '4.3.1', 'python': '3.9.16 (main, Mar  8 2023, 10:39:24) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'prepare_vocab'}
2023-04-04 13:57:10,294 : INFO : deleting the raw counts dictionary of 73167 items
2023-04-04 13

Word2vec model #23: {'train_data': '10MB', 'compute_loss': False, 'sg': 1, 'hs': 1, 'train_time_mean': 121.25435598691304, 'train_time_std': 6.657785280488426}
   train_data  compute_loss  sg  hs  train_time_mean  train_time_std
4        25kB          True   1   0         1.246770        0.055282
5        25kB         False   1   0         1.043255        0.009498
6        25kB          True   1   1         2.120964        0.160401
7        25kB         False   1   1         2.052032        0.166635
0        25kB          True   0   0         0.614572        0.022986
1        25kB         False   0   0         0.696086        0.065797
2        25kB          True   0   1         1.573305        0.179429
3        25kB         False   0   1         1.061319        0.015410
12        1MB          True   1   0         3.221910        0.201046
13        1MB         False   1   0         3.467192        0.271561
14        1MB          True   1   1         7.286693        0.068792
15        1M

# Visualising Word Embeddings

In [28]:
from sklearn.decomposition import IncrementalPCA    # inital reduction
from sklearn.manifold import TSNE                   # final reduction
import numpy as np                                  # array handling


def reduce_dimensions(model):
    num_dimensions = 2  # final num dimensions (2D, 3D, etc)

    # extract the words & their vectors, as numpy arrays
    vectors = np.asarray(model.wv.vectors)
    labels = np.asarray(model.wv.index_to_key)  # fixed-width numpy strings

    # reduce using t-SNE
    tsne = TSNE(n_components=num_dimensions, random_state=0)
    vectors = tsne.fit_transform(vectors)

    x_vals = [v[0] for v in vectors]
    y_vals = [v[1] for v in vectors]
    return x_vals, y_vals, labels


x_vals, y_vals, labels = reduce_dimensions(model)

def plot_with_plotly(x_vals, y_vals, labels, plot_in_notebook=True):
    from plotly.offline import init_notebook_mode, iplot, plot
    import plotly.graph_objs as go

    trace = go.Scatter(x=x_vals, y=y_vals, mode='text', text=labels)
    data = [trace]

    if plot_in_notebook:
        init_notebook_mode(connected=True)
        iplot(data, filename='word-embedding-plot')
    else:
        plot(data, filename='word-embedding-plot.html')


def plot_with_matplotlib(x_vals, y_vals, labels):
    import matplotlib.pyplot as plt
    import random

    random.seed(0)

    plt.figure(figsize=(12, 12))
    plt.scatter(x_vals, y_vals)

    #
    # Label randomly subsampled 25 data points
    #
    indices = list(range(len(labels)))
    selected_indices = random.sample(indices, 25)
    for i in selected_indices:
        plt.annotate(labels[i], (x_vals[i], y_vals[i]))

try:
    get_ipython()
except Exception:
    plot_function = plot_with_matplotlib
else:
    plot_function = plot_with_plotly

plot_function(x_vals, y_vals, labels)