# In [None]:
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
from random import shuffle
import spacy
import msgpack
from dss_selc.utils import PRJ_PATH

# spaCy pipeline used below for tokenisation, lemmatisation and stop-word flags.
nlp = spacy.load("en_core_web_lg")

# Read the raw corpus: one document per line (line endings are kept).
# NOTE(review): no explicit encoding is given, so the platform default is
# used -- confirm the file's encoding if this runs on several systems.
corpus_path = PRJ_PATH / "dss-selc-dump/glove/corpus.txt"
with open(corpus_path, "r") as handle:
    corpus = list(handle)
print(len(corpus), "docs loaded")


def dump_corpus(new_corpus):
    """Serialise ``new_corpus`` with msgpack, overwriting the previous dump.

    Args:
        new_corpus: The object to persist; callers in this file pass a list
            of token lists.
    """
    target = PRJ_PATH / "dss-selc-dump/glove/processed_corpus.msgpack"
    with open(target, "wb") as sink:
        sink.write(msgpack.packb(new_corpus))


# Lemmatise every document and keep only the content-bearing tokens.
new_corpus = []
n_ = len(corpus)
for indx, doc_ in enumerate(corpus, start=1):
    doc = nlp(doc_)
    print(f"\r[*] Processing: {indx:>07}/{n_:>07}, ", end="")
    # Skip punctuation and stop words. Whitespace tokens are skipped as well:
    # each raw line still ends with its newline, which spaCy would otherwise
    # surface as a token that is neither punctuation nor a stop word.
    new_doc = [
        str(token.lemma_).lower()
        for token in doc
        if not (token.is_punct or token.is_stop or token.is_space)
    ]
    print(f"added {len(new_doc):>03} tokens.", " " * 20, end="")
    new_corpus.append(new_doc)
    # Periodic checkpoint so an interruption does not lose everything.
    # Note that each checkpoint rewrites the whole corpus collected so far.
    if indx % 10 == 0:
        dump_corpus(new_corpus)


# Randomise document order, then persist the final processed corpus.
shuffle(new_corpus)

dump_corpus(new_corpus)



# Pretrained GloVe vectors in plain-text form. The file has no
# "<vocab_size> <dim>" header line, hence ``no_header=True``.
glove_txt_path = "../.models/glove/glove.840B.300d.txt"
glove_vectors = KeyedVectors.load_word2vec_format(
    fname=glove_txt_path,
    binary=False,
    no_header=True,
)

#! creating base model class. Vector size should match that of pretrained vectors
base_model = Word2Vec(
    vector_size=300,
    min_count=5,
    epochs=40,
    workers=12,
)
base_model.build_vocab(new_corpus)
# Remember the number of training documents now: the vocabulary update below
# scans a different iterable, so the count is captured before that call.
total_examples = base_model.corpus_count

#! add pretrained GloVe's vocabulary
# build_vocab expects an iterable of token lists. The GloVe keys are wrapped
# in an outer list so they form one "sentence" of words; passing the flat
# list of strings would make every key be iterated character by character.
# NOTE(review): each GloVe word occurs only once in this synthetic sentence,
# so with min_count=5 it may be pruned again, and the pretrained vectors
# themselves are not copied into the model here (gensim offers
# ``wv.intersect_word2vec_format`` for that) -- confirm the intent.
base_model.build_vocab([list(glove_vectors.key_to_index)], update=True)

from gensim.models.callbacks import CallbackAny2Vec


class callback(CallbackAny2Vec):
    """Print the loss attributed to each epoch once that epoch finishes.

    The value read from the model is treated as a running total, so the
    total seen at the previous epoch is subtracted to obtain the per-epoch
    figure.
    """

    def __init__(self):
        # Zero-based index of the epoch that is about to be reported.
        self.epoch = 0
        # Running total observed at the end of the previous epoch.
        self.loss_to_be_subed = 0

    def on_epoch_end(self, model):
        """Report this epoch's loss and advance the epoch counter."""
        running_total = model.get_latest_training_loss()
        previous_total, self.loss_to_be_subed = self.loss_to_be_subed, running_total
        epoch_loss = running_total - previous_total
        print(f"Loss after epoch {self.epoch:>02}: {epoch_loss:>010}")
        self.epoch += 1


#! Training the model
# Train on the processed corpus for the configured number of epochs, with
# loss tracking enabled so the callback can print a per-epoch figure.
loss_logger = callback()
base_model.train(
    new_corpus,
    total_examples=total_examples,
    epochs=base_model.epochs,
    compute_loss=True,
    callbacks=[loss_logger],
)
# Handle on the model's word vectors, used for the similarity query below.
base_model_wv = base_model.wv

from pathlib import Path

# Persist the trained model, creating the output directory if needed.
dr = PRJ_PATH / "dss-selc-dump/glove/model_01/"
dr.mkdir(parents=True, exist_ok=True)
base_model.save(str(dr / "base_model.model"))

# Sanity check: look up the 20 nearest neighbours of "solar".
similar_words = base_model_wv.most_similar("solar", topn=20)

# Print one "<word>: <similarity score>" line per neighbour.
for neighbour, similarity in similar_words:
    print(f"{neighbour}: {similarity}")