In [40]:
import multiprocessing
import os
import shutil
import subprocess
from datetime import datetime
from zipfile import *

import gensim
import wget
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

In [36]:
# Embedding algorithm to run; the cells below dispatch on this value.
# Allowed values: 'word2vec' or 'glove'.
model = 'word2vec'
# Language code of the corpus; it is interpolated into corpus and model file names.
language = 'he'
# Dimensionality of the trained word vectors.
dim = 50

In [37]:
# Make sure ./models and ./models/raws exist. makedirs creates the missing parents
# in one call and exist_ok=True keeps the cell idempotent on re-runs.
os.makedirs('./models/raws', exist_ok=True)

In [41]:
def train_word2vec(dim, language):
    """Train a skip-gram Word2Vec model on the Wikipedia corpus of `language`.

    The corpus is read from ``./corpora/wiki.{language}.text`` through gensim's
    ``LineSentence`` and the trained model is saved to
    ``./models/raws/word2vec_{dim}_{language}.model``. Nothing is trained when
    that model file already exists or when the corpus file is missing; in both
    cases a message is printed and the function returns.

    Args:
        dim: Dimensionality of the word vectors.
        language: Language code used in the corpus and model file names (e.g. 'he').

    Returns:
        None.
    """
    inp = f'wiki.{language}.text'
    outp = f'word2vec_{dim}_{language}.model'
    corpus_path = f'./corpora/{inp}'
    model_path = f'./models/raws/{outp}'

    # Path checks instead of os.listdir so a missing folder is reported, not raised.
    corpus_available = os.path.isfile(corpus_path)
    if not corpus_available:
        print('Corpus file not present, impossible to train')

    if os.path.exists(model_path):
        print('Model file already present, no need to train')
        return

    if not corpus_available:
        # Without this guard execution fell through to LineSentence and crashed
        # right after announcing that training was impossible.
        return

    print('Model file not present, starts training')
    print(f'Training started {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')

    # NOTE(review): `size=` is the gensim 3.x keyword (renamed `vector_size` in 4.x);
    # confirm the pinned gensim version before upgrading.
    w2v = gensim.models.word2vec.Word2Vec(
        LineSentence(corpus_path),
        sg=1,  # 1=SkipGram, 0=CBOW
        size=dim,
        window=5,
        min_count=5,
        workers=multiprocessing.cpu_count(),
    )
    # trim unneeded model memory = use (much) less RAM
    # NOTE(review): replace=True presumably keeps only the normalised vectors - confirm.
    w2v.init_sims(replace=True)

    print(f'Training ended {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')

    # Do not depend on another cell having created the output folder.
    os.makedirs('./models/raws', exist_ok=True)
    w2v.save(model_path)
        
def convert_word2vec_model_to_txt(dim, language):
    """Export a saved Word2Vec model to a plain-text vectors file, then delete the model.

    Loads ``./models/raws/word2vec_{dim}_{language}.model`` and writes one line per
    vocabulary word to ``./models/raws/word2vec_{dim}_{language}_vectors.txt`` in the
    form ``word v1 v2 ... vn``. Afterwards the ``.model`` file and its
    ``.wv.vectors.npy`` / ``.trainables.syn1neg.npy`` companions are removed when
    they exist.

    Args:
        dim: Dimensionality used in the model file name.
        language: Language code used in the model file name (e.g. 'he').

    Returns:
        None.
    """
    stem = f'./models/raws/word2vec_{dim}_{language}'
    model_path = f'{stem}.model'

    print(f'Loading started {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')
    m = Word2Vec.load(model_path)
    print(f'Loading ended {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')

    keyed_vectors = m.wv
    # Explicit UTF-8: the vocabulary is not ASCII (e.g. Hebrew) and the platform
    # default encoding may be unable to represent it.
    with open(f'{stem}_vectors.txt', 'w', encoding='utf-8') as f:
        # NOTE(review): `wv.vocab` is the gensim 3.x attribute - confirm the pinned version.
        for word in keyed_vectors.vocab:
            components = ' '.join(str(v) for v in keyed_vectors[word])
            f.write(word + ' ' + components + '\n')

    # The .npy companions may be absent, so only remove what is actually on disk.
    for suffix in ('', '.wv.vectors.npy', '.trainables.syn1neg.npy'):
        leftover = model_path + suffix
        if os.path.exists(leftover):
            os.remove(leftover)

In [42]:
# Word2Vec pipeline, executed only when it is the selected model: train (skipped when
# the saved model already exists), then export the vectors to plain text, which also
# deletes the binary model files.
if model == 'word2vec':
    train_word2vec(dim, language)
    convert_word2vec_model_to_txt(dim, language)

Model file already present, no need to train
Loading started 2022-10-04 19:41:09
Loading ended 2022-10-04 19:41:12


In [35]:
def train_glove(dim, language):
    """Train GloVe vectors on the Wikipedia corpus of `language` with the GloVe demo script.

    Steps:
      1. Download and unpack the GloVe sources into ``./models/GloVe`` if that
         folder does not exist yet.
      2. Temporarily patch ``demo.sh`` so that it reads
         ``./corpora/wiki.{language}.text`` and uses ``dim`` as vector size.
      3. Run ``make`` and the patched ``demo.sh`` inside ``./models/GloVe``.
      4. Restore the original ``demo.sh`` (also when step 2 or 3 fails).
      5. Move ``vectors.txt`` to ``./models/raws/glove_{dim}_{language}_vectors.txt``
         and delete ``vectors.bin`` if it was produced.

    Args:
        dim: Dimensionality of the word vectors.
        language: Language code used in the corpus and output file names (e.g. 'he').

    Returns:
        None.

    Raises:
        subprocess.CalledProcessError: if ``make`` or ``demo.sh`` exits with a
            non-zero status.
    """
    glove_dir = './models/GloVe'
    demo_script = f'{glove_dir}/demo.sh'
    demo_backup = f'{glove_dir}/demo_backup.sh'

    if not os.path.isdir(glove_dir):
        URL = 'https://github.com/stanfordnlp/GloVe/archive/refs/heads/master.zip'
        wget.download(URL, './models/master.zip')
        with ZipFile('./models/master.zip', 'r') as zipObj:
            zipObj.extractall('./models/')
        os.rename('./models/GloVe-master', glove_dir)
        os.remove('./models/master.zip')

    inp = f'wiki.{language}.text'

    shutil.copy(demo_script, demo_backup)
    try:
        with open(demo_script, 'r') as file:
            filedata = file.read()

        # Replace the target strings
        filedata = filedata.replace('CORPUS=text8', f'CORPUS=../../corpora/{inp}')
        filedata = filedata.replace('VECTOR_SIZE=50', f'VECTOR_SIZE={dim}')

        # Write the file out again
        with open(demo_script, 'w') as file:
            file.write(filedata)

        # note that this will recreate the vocab file and the cooccurrence matrix
        # even if they already exist...
        # check=True turns a failed build/training into an exception, so a stale
        # vectors.txt from an earlier run is never picked up by mistake.
        subprocess.run(['make'], cwd=glove_dir, check=True)
        # Run through bash rather than ./demo.sh: ZipFile.extractall may not restore
        # the executable bit of the script.
        subprocess.run(['bash', 'demo.sh'], cwd=glove_dir, check=True)
    finally:
        # Always put the pristine script back, even if patching or training failed;
        # otherwise the next run would back up the already-patched script.
        os.replace(demo_backup, demo_script)

    os.replace(f'{glove_dir}/vectors.txt', f'./models/raws/glove_{dim}_{language}_vectors.txt')
    binary_vectors = f'{glove_dir}/vectors.bin'
    if os.path.exists(binary_vectors):
        os.remove(binary_vectors)
    

In [34]:
# GloVe pipeline, executed only when it is the selected model.
if model == 'glove':
    train_glove(dim, language)

-1 / unknown