In [20]:
import os
from datetime import datetime
import gensim
from gensim.models.word2vec import LineSentence
import multiprocessing


def train_word2vec(dim, language):
    """Train a skip-gram Word2Vec model on a text corpus and save it to disk.

    The corpus is read from ``./corpora/wiki.{language}.text`` through
    ``LineSentence`` and the trained model is written to
    ``./models/word2vec_{dim}_{language}.model``.

    Nothing is trained when the model file already exists or when the corpus
    file is missing; in both cases a message is printed and the function
    returns immediately.

    Args:
        dim: Dimensionality of the word vectors.
        language: Language code used in the corpus and model file names
            (e.g. ``'he'``).

    Returns:
        None. The result is the model file written under ``./models/``.
    """
    inp = f'wiki.{language}.text'
    outp = f'word2vec_{dim}_{language}.model'
    corpus_path = f'./corpora/{inp}'
    model_path = f'./models/{outp}'

    # An existing model makes the corpus irrelevant, so check it first.
    if os.path.isfile(model_path):
        print('Model file already present, no need to train')
        return

    # Stop here instead of letting the corpus reader fail on a missing file
    # (this also covers a missing ./corpora/ directory).
    if not os.path.isfile(corpus_path):
        print('Corpus file not present, impossible to train')
        return

    # Create the output directory up front so a long training run cannot be
    # lost because the final save has nowhere to write.
    os.makedirs('./models/', exist_ok=True)

    print('Model file not present, starts training')
    print(f'Training started {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')

    # NOTE(review): `size=` is the gensim 3.x keyword; gensim 4 renamed it to
    # `vector_size` -- confirm against the installed gensim version.
    model = gensim.models.word2vec.Word2Vec(
        LineSentence(corpus_path),
        sg=1,  # 1=SkipGram, 0=CBOW
        size=dim,
        window=5,
        min_count=5,
        workers=multiprocessing.cpu_count(),
    )
    # trim unneeded model memory = use (much) less RAM
    model.init_sims(replace=True)

    print(f'Training ended {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')

    model.save(model_path)

In [21]:
# Train the 300-dimensional model for language code 'he', or reuse it if already saved.
train_word2vec(dim=300, language='he')

Model file not present, starts training
Training started 2022-09-30 18:57:07
Training ended 2022-09-30 21:18:22
