In [8]:
import os
from datetime import datetime
import gensim
from gensim.models.word2vec import LineSentence
import multiprocessing
import shutil

In [11]:
# Embedding backend to train: 'word2vec' or 'glove'.
model = 'glove'

# Language code of the corpus; the training functions read wiki.<language>.text.
language = 'he'

# Dimensionality of the trained word vectors.
dim = 300

In [12]:
def train_word2vec(dim, language):
    """Train a skip-gram Word2Vec model on the corpus of the given language.

    The corpus is read from ``./corpora/wiki.{language}.text`` through
    ``LineSentence`` and the trained model is written to
    ``./models/raws/word2vec_{dim}_{language}.model``.

    Nothing is trained when the model file already exists, or when the
    corpus file is missing (a message is printed in both cases).

    Args:
        dim: Dimensionality of the word vectors.
        language: Language code used in the corpus and model file names
            (e.g. ``'he'``).

    Returns:
        None.
    """
    inp = f'wiki.{language}.text'
    outp = f'word2vec_{dim}_{language}.model'
    corpus_path = f'./corpora/{inp}'
    models_dir = './models/raws'
    model_path = f'{models_dir}/{outp}'

    # An existing model makes the corpus irrelevant, so check it first.
    if os.path.isfile(model_path):
        print('Model file already present, no need to train')
        return

    # Stop here instead of falling through into training without a corpus.
    if not os.path.isfile(corpus_path):
        print('Corpus file not present, impossible to train')
        return

    print('Model file not present, starts training')
    print(f'Training started {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')

    # NOTE: `size=` and `init_sims(replace=True)` are the gensim 3.x spellings;
    # gensim 4 renamed `size` to `vector_size` and deprecated `init_sims`.
    w2v = gensim.models.word2vec.Word2Vec(
        LineSentence(corpus_path),
        sg=1,  # 1 = SkipGram, 0 = CBOW
        size=dim,
        window=5,
        min_count=5,
        workers=multiprocessing.cpu_count(),
    )
    # trim unneeded model memory = use (much) less RAM
    w2v.init_sims(replace=True)

    print(f'Training ended {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')

    # Make sure the output directory exists so a long training run is not
    # lost on save.
    os.makedirs(models_dir, exist_ok=True)
    w2v.save(model_path)

In [13]:
# Create ./models/raws (and ./models) if needed; safe to re-run.
os.makedirs('./models/raws', exist_ok=True)

In [17]:
# Train Word2Vec only when it is the backend selected in the config cell.
if model == 'word2vec':
    train_word2vec(dim=dim, language=language)

In [18]:
def _run_shell_streaming(command, cwd):
    """Run `command` through the shell in `cwd`, echo its output, raise on failure."""
    import subprocess  # local import: not part of the notebook's import cell

    with subprocess.Popen(command, shell=True, cwd=cwd,
                          stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                          universal_newlines=True, errors='replace') as proc:
        for line in proc.stdout:
            print(line, end='')
    # The context manager has waited for the process, so returncode is set.
    if proc.returncode != 0:
        raise subprocess.CalledProcessError(proc.returncode, command)


def train_glove(dim, language):
    """Train GloVe vectors by running a patched copy of GloVe's ``demo.sh``.

    ``./models/GloVe/demo.sh`` is temporarily rewritten so that it reads
    ``./corpora/wiki.{language}.text`` and uses ``dim`` as vector size, then
    ``make && ./demo.sh`` is run inside ``./models/GloVe``. The original
    ``demo.sh`` is always restored afterwards, even when the build or the
    training fails. On success ``vectors.txt`` is moved to
    ``./models/raws/glove_{dim}_{language}_vectors.txt`` and ``vectors.bin``
    is deleted.

    Args:
        dim: Dimensionality of the word vectors.
        language: Language code used in the corpus and output file names.

    Raises:
        subprocess.CalledProcessError: if ``make`` or ``demo.sh`` exits with a
            non-zero status.
        OSError: if ``demo.sh`` or the produced vector file is missing.
    """
    inp = f'wiki.{language}.text'
    glove_dir = './models/GloVe'
    script = f'{glove_dir}/demo.sh'
    backup = f'{glove_dir}/demo_backup.sh'

    shutil.copy(script, backup)
    try:
        with open(script, 'r') as file:
            filedata = file.read()

        # A demo.sh without these placeholders would silently train with the
        # wrong corpus/size while the output is still named after `dim`.
        for placeholder in ('CORPUS=text8', 'VECTOR_SIZE=50'):
            if placeholder not in filedata:
                print(f'Warning: "{placeholder}" not found in demo.sh, this setting is left unchanged')

        # Replace the target string
        filedata = filedata.replace('CORPUS=text8', f'CORPUS=../../corpora/{inp}')
        filedata = filedata.replace('VECTOR_SIZE=50', f'VECTOR_SIZE={dim}')

        # Write the file out again
        with open(script, 'w') as file:
            file.write(filedata)

        # note that this command will recreate the vocab file and the
        # coocurrence matrix even if they already exist...
        _run_shell_streaming('make && ./demo.sh', cwd=glove_dir)
    finally:
        # Always put the pristine script back, whatever happened above.
        os.replace(backup, script)

    os.makedirs('./models/raws', exist_ok=True)
    os.replace(f'{glove_dir}/vectors.txt', f'./models/raws/glove_{dim}_{language}_vectors.txt')

    # The binary vectors are not kept; tolerate a demo.sh that did not write them.
    binary_vectors = f'{glove_dir}/vectors.bin'
    if os.path.exists(binary_vectors):
        os.remove(binary_vectors)


In [None]:
# Train GloVe only when it is the backend selected in the config cell.
if model == 'glove':
    train_glove(dim=dim, language=language)

mkdir -p build

$ build/vocab_count -min-count 5 -verbose 2 < ../../corpora/wiki.he.text > vocab.txt
BUILDING VOCABULARY


Processed 0 tokens.[11G100000 tokens.[11G200000 tokens.[11G300000 tokens.[11G400000 tokens.[11G500000 tokens.[11G600000 tokens.[11G700000 tokens.[11G800000 tokens.[11G900000 tokens.[11G1000000 tokens.[11G1100000 tokens.[11G1200000 tokens.[11G1300000 tokens.[11G1400000 tokens.[11G1500000 tokens.[11G1600000 tokens.[11G1700000 tokens.[11G1800000 tokens.[11G1900000 tokens.[11G2000000 tokens.[11G2100000 tokens.[11G2200000 tokens.[11G2300000 tokens.[11G2400000 tokens.[11G2500000 tokens.[11G2600000 tokens.[11G2700000 tokens.[11G2800000 tokens.[11G2900000 tokens.[11G3000000 tokens.[11G3100000 tokens.[11G3200000 tokens.[11G3300000 tokens.[11G3400000 tokens.[11G3500000 tokens.[11G3600000 tokens.[11G3700000 tokens.[11G3800000 tokens.[11G3900000 tokens.[11G4000000 tokens.[11G4100000 tokens.[11G4200000 tokens.[11G4300000 tokens.[11G4400000 tokens.[11G4500000 tokens.[11G4600000 tokens.[11G4700000 tokens.[11G4800000 tokens.[11G4900000 tokens.[11G50000

[11G39700000 tokens.[11G39800000 tokens.[11G39900000 tokens.[11G40000000 tokens.[11G40100000 tokens.[11G40200000 tokens.[11G40300000 tokens.[11G40400000 tokens.[11G40500000 tokens.[11G40600000 tokens.[11G40700000 tokens.[11G40800000 tokens.[11G40900000 tokens.[11G41000000 tokens.[11G41100000 tokens.[11G41200000 tokens.[11G41300000 tokens.[11G41400000 tokens.[11G41500000 tokens.[11G41600000 tokens.[11G41700000 tokens.[11G41800000 tokens.[11G41900000 tokens.[11G42000000 tokens.[11G42100000 tokens.[11G42200000 tokens.[11G42300000 tokens.[11G42400000 tokens.[11G42500000 tokens.[11G42600000 tokens.[11G42700000 tokens.[11G42800000 tokens.[11G42900000 tokens.[11G43000000 tokens.[11G43100000 tokens.[11G43200000 tokens.[11G43300000 tokens.[11G43400000 tokens.[11G43500000 tokens.[11G43600000 tokens.[11G43700000 tokens.[11G43800000 tokens.[11G43900000 tokens.[11G44000000 tokens.[11G44100000 tokens.[11G44200000 tokens.[11G44300000 tokens.[11G44400000

[11G79000000 tokens.[11G79100000 tokens.[11G79200000 tokens.[11G79300000 tokens.[11G79400000 tokens.[11G79500000 tokens.[11G79600000 tokens.[11G79700000 tokens.[11G79800000 tokens.[11G79900000 tokens.[11G80000000 tokens.[11G80100000 tokens.[11G80200000 tokens.[11G80300000 tokens.[11G80400000 tokens.[11G80500000 tokens.[11G80600000 tokens.[11G80700000 tokens.[11G80800000 tokens.[11G80900000 tokens.[11G81000000 tokens.[11G81100000 tokens.[11G81200000 tokens.[11G81300000 tokens.[11G81400000 tokens.[11G81500000 tokens.[11G81600000 tokens.[11G81700000 tokens.[11G81800000 tokens.[11G81900000 tokens.[11G82000000 tokens.[11G82100000 tokens.[11G82200000 tokens.[11G82300000 tokens.[11G82400000 tokens.[11G82500000 tokens.[11G82600000 tokens.[11G82700000 tokens.[11G82800000 tokens.[11G82900000 tokens.[11G83000000 tokens.[11G83100000 tokens.[11G83200000 tokens.[11G83300000 tokens.[11G83400000 tokens.[11G83500000 tokens.[11G83600000 tokens.[11G83700000

[11G117300000 tokens.[11G117400000 tokens.[11G117500000 tokens.[11G117600000 tokens.[11G117700000 tokens.[11G117800000 tokens.[11G117900000 tokens.[11G118000000 tokens.[11G118100000 tokens.[11G118200000 tokens.[11G118300000 tokens.[11G118400000 tokens.[11G118500000 tokens.[11G118600000 tokens.[11G118700000 tokens.[11G118800000 tokens.[11G118900000 tokens.[11G119000000 tokens.[11G119100000 tokens.[11G119200000 tokens.[11G119300000 tokens.[11G119400000 tokens.[11G119500000 tokens.[11G119600000 tokens.[11G119700000 tokens.[11G119800000 tokens.[11G119900000 tokens.[11G120000000 tokens.[11G120100000 tokens.[11G120200000 tokens.[11G120300000 tokens.[11G120400000 tokens.[11G120500000 tokens.[11G120600000 tokens.[11G120700000 tokens.[11G120800000 tokens.[11G120900000 tokens.[11G121000000 tokens.[11G121100000 tokens.[11G121200000 tokens.[11G121300000 tokens.[11G121400000 tokens.[11G121500000 tokens.[11G121600000 tokens.[11G121700000 tokens.[11G12180

[11G154700000 tokens.[11G154800000 tokens.[11G154900000 tokens.[11G155000000 tokens.[11G155100000 tokens.[11G155200000 tokens.[11G155300000 tokens.[11G155400000 tokens.[11G155500000 tokens.[11G155600000 tokens.[11G155700000 tokens.[11G155800000 tokens.[11G155900000 tokens.[11G156000000 tokens.[11G156100000 tokens.[11G156200000 tokens.[11G156300000 tokens.[11G156400000 tokens.[11G156500000 tokens.[11G156600000 tokens.[11G156700000 tokens.[11G156800000 tokens.[11G156900000 tokens.[11G157000000 tokens.[11G157100000 tokens.[11G157200000 tokens.[11G157300000 tokens.[11G157400000 tokens.[11G157500000 tokens.[11G157600000 tokens.[11G157700000 tokens.[11G157800000 tokens.[11G157900000 tokens.[11G158000000 tokens.[11G158100000 tokens.[11G158200000 tokens.[11G158300000 tokens.[11G158400000 tokens.[11G158500000 tokens.[11G158600000 tokens.[11G158700000 tokens.[11G158800000 tokens.[11G158900000 tokens.[11G159000000 tokens.[11G159100000 tokens.[11G15920

Processing token: 0[19G100000[19G200000[19G300000[19G400000[19G500000[19G600000[19G700000[19G800000[19G900000[19G1000000[19G1100000[19G1200000[19G1300000[19G1400000[19G1500000[19G1600000[19G1700000[19G1800000[19G1900000[19G2000000[19G2100000[19G2200000[19G2300000[19G2400000[19G2500000[19G2600000[19G2700000[19G2800000[19G2900000[19G3000000[19G3100000[19G3200000[19G3300000[19G3400000[19G3500000[19G3600000[19G3700000[19G3800000[19G3900000[19G4000000[19G4100000[19G4200000[19G4300000[19G4400000[19G4500000[19G4600000[19G4700000[19G4800000[19G4900000[19G5000000[19G5100000[19G5200000[19G5300000[19G5400000[19G5500000[19G5600000[19G5700000[19G5800000[19G5900000[19G6000000[19G6100000[19G6200000[19G6300000[19G6400000[19G6500000[19G6600000[19G6700000[19G6800000[19G6900000[19G7000000[19G7100000[19G7200000[19G7300000[19G7400000[19G7500000[19G7600000[19G7700000[19G7800000[19G7900000[19G8000000[19G8100000[19G8200000[19G8