In [6]:
import copy
import gensim
import logging
import pyndri
import pyndri.compat
import sys

# Root logger reports INFO and above.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

# Build the record layout first, then attach it to a handler writing to STDERR.
formatter = logging.Formatter('%(name)s - %(levelname)s - %(message)s')
handler = logging.StreamHandler(sys.stderr)
handler.setFormatter(formatter)

# Replace whatever handlers the root logger currently has, so the STDERR
# handler above is the only one that emits records.
logger.handlers = [handler]

# Open the Indri index, then derive the token dictionary and the sentence
# iterable that is fed to gensim further down.
index = pyndri.Index('../index/')
logging.info('Loading vocabulary.')
dictionary = pyndri.extract_dictionary(index)
sentences = pyndri.compat.IndriSentences(index, dictionary)

root - INFO - Loading vocabulary.


In [7]:
logging.info('Initializing word2vec.')

# Untrained model; every epoch below starts from a copy of the latest snapshot.
word2vec_init = gensim.models.Word2Vec(
    # Architecture / objective.
    sg=True,  # Use skip-gram.
    hs=False,  # No hierarchical softmax.
    negative=10,  # Negative examples drawn per positive one.
    # Representation.
    size=300,  # Dimensionality of the embeddings.
    window=5,  # Context window size on each side.
    # Vocabulary and sampling.
    min_count=1,  # Lowest word frequency that is kept.
    sample=1e-3,  # Threshold for sub-sampling.
    # Training schedule.
    iter=1,  # Iterations over the corpus.
    workers=8,  # Worker count.
)

logging.info('Constructing word2vec vocabulary.')

# Scan the corpus once to collect the vocabulary; no trim rule is supplied.
word2vec_init.build_vocab(sentences, trim_rule=None)

# Snapshot list: position 0 holds the untrained model, and each epoch
# appends the model it produced.
models = [word2vec_init]

for epoch in range(1, 2):
    logging.info('Epoch %d', epoch)

    # Train a deep copy so the previous snapshot is left unmodified.
    model = copy.deepcopy(models[-1])
    model.train(sentences, total_examples=model.corpus_count, epochs=1)

    models.append(model)

logging.info('Trained models: %s', models)

root - INFO - Initializing word2vec.
root - INFO - Constructing word2vec vocabulary.
gensim.models.word2vec - INFO - collecting all words and their counts
gensim.models.word2vec - INFO - PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
gensim.models.word2vec - INFO - PROGRESS: at sentence #10000, processed 2607270 words, keeping 73866 word types
gensim.models.word2vec - INFO - PROGRESS: at sentence #20000, processed 5208413 words, keeping 101162 word types
gensim.models.word2vec - INFO - PROGRESS: at sentence #30000, processed 7779447 words, keeping 121185 word types
gensim.models.word2vec - INFO - PROGRESS: at sentence #40000, processed 10402346 words, keeping 138774 word types
gensim.models.word2vec - INFO - PROGRESS: at sentence #50000, processed 12981963 words, keeping 153153 word types
gensim.models.word2vec - INFO - PROGRESS: at sentence #60000, processed 15578280 words, keeping 165943 word types
gensim.models.word2vec - INFO - PROGRESS: at sentence #70000, proce

gensim.models.word2vec - INFO - PROGRESS: at 23.48% examples, 189529 words/s, in_qsize 15, out_qsize 0
gensim.models.word2vec - INFO - PROGRESS: at 23.96% examples, 189719 words/s, in_qsize 16, out_qsize 0
gensim.models.word2vec - INFO - PROGRESS: at 24.41% examples, 189750 words/s, in_qsize 16, out_qsize 0
gensim.models.word2vec - INFO - PROGRESS: at 24.86% examples, 189705 words/s, in_qsize 16, out_qsize 0
gensim.models.word2vec - INFO - PROGRESS: at 25.33% examples, 189958 words/s, in_qsize 15, out_qsize 0
gensim.models.word2vec - INFO - PROGRESS: at 25.81% examples, 189908 words/s, in_qsize 16, out_qsize 0
gensim.models.word2vec - INFO - PROGRESS: at 26.29% examples, 190172 words/s, in_qsize 16, out_qsize 0
gensim.models.word2vec - INFO - PROGRESS: at 26.78% examples, 190259 words/s, in_qsize 15, out_qsize 0
gensim.models.word2vec - INFO - PROGRESS: at 27.28% examples, 190457 words/s, in_qsize 15, out_qsize 0
gensim.models.word2vec - INFO - PROGRESS: at 27.79% examples, 190790 word

gensim.models.word2vec - INFO - PROGRESS: at 61.22% examples, 191112 words/s, in_qsize 15, out_qsize 0
gensim.models.word2vec - INFO - PROGRESS: at 61.78% examples, 191215 words/s, in_qsize 15, out_qsize 0
gensim.models.word2vec - INFO - PROGRESS: at 62.29% examples, 191285 words/s, in_qsize 15, out_qsize 0
gensim.models.word2vec - INFO - PROGRESS: at 62.76% examples, 191308 words/s, in_qsize 15, out_qsize 0
gensim.models.word2vec - INFO - PROGRESS: at 63.32% examples, 191494 words/s, in_qsize 16, out_qsize 0
gensim.models.word2vec - INFO - PROGRESS: at 63.74% examples, 191263 words/s, in_qsize 15, out_qsize 1
gensim.models.word2vec - INFO - PROGRESS: at 64.28% examples, 191435 words/s, in_qsize 16, out_qsize 0
gensim.models.word2vec - INFO - PROGRESS: at 64.78% examples, 191506 words/s, in_qsize 16, out_qsize 0
gensim.models.word2vec - INFO - PROGRESS: at 65.22% examples, 191429 words/s, in_qsize 15, out_qsize 0
gensim.models.word2vec - INFO - PROGRESS: at 65.74% examples, 191570 word

gensim.models.word2vec - INFO - PROGRESS: at 98.14% examples, 187842 words/s, in_qsize 15, out_qsize 0
gensim.models.word2vec - INFO - PROGRESS: at 98.60% examples, 187814 words/s, in_qsize 15, out_qsize 0
gensim.models.word2vec - INFO - PROGRESS: at 98.99% examples, 187713 words/s, in_qsize 15, out_qsize 0
gensim.models.word2vec - INFO - PROGRESS: at 99.40% examples, 187615 words/s, in_qsize 16, out_qsize 0
gensim.models.word2vec - INFO - PROGRESS: at 99.72% examples, 187354 words/s, in_qsize 13, out_qsize 0
gensim.models.word2vec - INFO - worker thread finished; awaiting finish of 7 more threads
gensim.models.word2vec - INFO - worker thread finished; awaiting finish of 6 more threads
gensim.models.word2vec - INFO - worker thread finished; awaiting finish of 5 more threads
gensim.models.word2vec - INFO - worker thread finished; awaiting finish of 4 more threads
gensim.models.word2vec - INFO - worker thread finished; awaiting finish of 3 more threads
gensim.models.word2vec - INFO - wor

In [8]:
# The last entry of `models` is the snapshot appended by the final training epoch.
final_model = models[-1]

In [9]:
# Persist the trained model under ../models/. Per the log output below, gensim
# also writes the large arrays (syn0, syn1neg) to sibling .npy files.
final_model.save("../models/Word2Vec")

gensim.utils - INFO - saving Word2Vec object under ../models/Word2Vec, separately None
gensim.utils - INFO - storing np array 'syn0' to ../models/Word2Vec.wv.syn0.npy
gensim.utils - INFO - not storing attribute syn0norm
gensim.utils - INFO - storing np array 'syn1neg' to ../models/Word2Vec.syn1neg.npy
gensim.utils - INFO - not storing attribute cum_table
gensim.utils - INFO - saved ../models/Word2Vec


In [None]:
# Reload the model from the same path `final_model.save(...)` wrote to
# ("../models/Word2Vec"), so this cell works on a fresh top-to-bottom run
# instead of depending on a file this notebook never creates.
m = gensim.models.Word2Vec.load('../models/Word2Vec')

In [None]:
# Display the embedding vector the reloaded model holds for the token 'provosts'.
m.wv['provosts']

In [None]:
# Display the id that the dictionary extracted from the index maps 'provosts' to.
dictionary.token2id['provosts']