In [1]:
%load_ext cython
import gensim.models
import os
from pathlib import Path

In [2]:
# Directory named 'data' that sits next to the current working directory
# (i.e. inside the parent of the folder the notebook is run from).
data_folder = str(Path(os.getcwd()).parent / 'data')

In [3]:
from gensim.test.utils import datapath
from gensim import utils

class MyCorpus:
    """An iterable that yields sentences (lists of str).

    Every call to ``__iter__`` re-opens ``{data_folder}/spacy_pre.txt``, so the
    same instance can be iterated more than once.
    """

    def __iter__(self):
        """Yield one token list per line of the corpus file.

        Each line is passed through ``utils.simple_preprocess`` before being
        yielded.
        """
        corpus_path = f'{data_folder}/spacy_pre.txt'
        # Context manager so the handle is released when the pass finishes
        # (or the generator is closed) instead of leaking one per iteration.
        with open(corpus_path) as corpus_file:
            for line in corpus_file:
                # assume there's one document per line, tokens separated by whitespace
                yield utils.simple_preprocess(line)

In [4]:
corpus_path = f'{data_folder}/spacy_pre.txt'
# Number of lines in the corpus file; MyCorpus yields one sentence per line,
# so this is the sentence count later handed to the model as corpus_count.
# The file is opened in a `with` block so the handle is closed afterwards.
with open(corpus_path) as corpus_file:
    corp_count = sum(1 for _line in corpus_file)

In [26]:
# Streaming view of the corpus file (see MyCorpus).
sentences = MyCorpus()

# No corpus is passed here: the vocabulary is built and the model is trained
# in later cells.
model = gensim.models.Word2Vec(
    vector_size=32,
    window=5,
    min_count=10,
    workers=12,
)

In [6]:
import json
# Precomputed dictionary stored next to the corpus; it is later passed to
# build_vocab_from_freq, so presumably it maps word -> count (verify against
# the notebook that wrote the file).
dictionary_path = f'{data_folder}/spacy_dictionary.json'
with open(dictionary_path) as file:
    dictionary = json.load(file)

In [28]:
# Build the vocabulary from the precomputed dictionary rather than scanning
# the corpus; corp_count is the number of lines counted in the corpus file.
model.build_vocab_from_freq(
    word_freq=dictionary,
    corpus_count=corp_count,
)

In [8]:
from gensim.models.callbacks import CallbackAny2Vec
from pprint import pprint
class MonitorCallback(CallbackAny2Vec):
    """Training callback that prints the loss and probe-word neighbours per epoch.

    After each epoch it prints the loss for that epoch and, for every word in
    ``test_words``, the three most similar entries reported by ``model.wv``.
    """

    def __init__(self, test_words):
        """Store the probe words and reset the epoch/loss bookkeeping.

        Args:
            test_words: iterable of words whose nearest neighbours are
                printed at the end of every epoch.
        """
        self._test_words = test_words
        self.epoch = 0
        # Loss value seen at the end of the previous epoch. Defined here so
        # the attribute always exists, even before the first epoch finishes.
        self.loss_previous_step = 0.0

    def on_epoch_end(self, model):
        """Print the loss for the epoch that just ended and the probe neighbours.

        Args:
            model: the model being trained; must provide
                ``get_latest_training_loss()`` and a ``wv`` attribute.
        """
        # The reported loss is treated as a running total, so from the second
        # epoch on the per-epoch value is the difference to the previous one.
        loss = model.get_latest_training_loss()
        if self.epoch == 0:
            epoch_loss = loss
        else:
            epoch_loss = loss - self.loss_previous_step
        print('Loss after epoch {}: {}'.format(self.epoch, epoch_loss))
        self.epoch += 1
        self.loss_previous_step = loss
        for word in self._test_words:  # show wv logic changes
            try:
                # Ask for exactly three neighbours instead of slicing the
                # default top-10 result.
                neighbours = model.wv.most_similar(word, topn=3)
            except KeyError:
                # A probe word that is not in the vocabulary must not abort
                # the training run; report it as None and carry on.
                neighbours = None
            pprint({word: neighbours})
        print('')

In [29]:
# Words whose nearest neighbours are printed after each epoch.
probe_words = ['internet', 'vax', 'botw']
monitor = MonitorCallback(probe_words)

# Single pass over the corpus with loss tracking enabled so the callback can
# report it; the call's return value is shown as the cell output.
model.train(
    sentences,
    total_examples=model.corpus_count,
    epochs=1,
    compute_loss=True,
    callbacks=[monitor],
)

Loss after epoch 0: 2991131.75
{'internet': [('website', 0.8468858003616333),
              ('site', 0.8424097895622253),
              ('user', 0.8165557980537415)]}
{'vax': [('statist', 0.939915120601654),
         ('sympathetic', 0.9387148022651672),
         ('globalist', 0.9371417760848999)]}
{'botw': [('battleborn', 0.9339108467102051),
          ('remaster', 0.9239599108695984),
          ('tlou', 0.9229303598403931)]}



(14368748, 15351048)

In [31]:
# Export the trained vectors in the text (non-binary) word2vec format.
model_path = '../saved_model/w2v_spacy_32.txt'
# Make sure the target folder exists first: a missing directory would make
# the save fail and the freshly trained vectors would not be written.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
model.wv.save_word2vec_format(model_path, binary=False)

In [30]:
# Sanity check: ten nearest neighbours of 'xbox' in the trained embedding.
model.wv.most_similar(positive=['xbox'], topn=10)

[('ps', 0.9757418036460876),
 ('gta', 0.9434359073638916),
 ('vr', 0.9424576759338379),
 ('console', 0.9387712478637695),
 ('beta', 0.935235321521759),
 ('playstation', 0.9293619394302368),
 ('nintendo', 0.9288991093635559),
 ('exclusive', 0.9270581603050232),
 ('wii', 0.9227364659309387),
 ('ds', 0.9206281304359436)]