In [None]:
from time import time  # To time our operations
from collections import defaultdict  # For word frequency
from pathlib import Path

import logging  # Setting up the loggings to monitor gensim
# Route gensim's progress messages to the notebook output at INFO level,
# prefixed with the level name and a wall-clock timestamp.
logging.basicConfig(
    format="%(levelname)s - %(asctime)s: %(message)s",
    datefmt='%H:%M:%S',
    level=logging.INFO,
)

# Dataset

In [None]:
# Location of the tokenized-paragraphs text file, relative to the working directory.
data_fn = Path('../input/tokenized_paragraphs.txt')

In [None]:
# One paragraph per line, tokens separated by single spaces.
# - Decode explicitly as UTF-8: the corpus is Tibetan text and must not depend on
#   the platform's default locale encoding.
# - Skip blank lines (e.g. the one produced by a trailing newline) so that no
#   paragraph consisting of a single empty token enters the corpus.
tokenized_paras = [
    para.split(' ')
    for para in data_fn.read_text(encoding='utf-8').split('\n')
    if para
]

In [None]:
# Preview only the first tokens of the first paragraph; the full paragraph is
# long enough to flood the notebook output.
tokenized_paras[0][:20]

['ན་མོ་',
 'གུ་རུ་',
 'དེ་བ་',
 'ཌཱ་ཀི་',
 'ནཱི་',
 'ཡཻ',
 '།_',
 'དགོངས་པ',
 'འི་',
 'སྟོབས་',
 'དང་',
 'ཚུལ་ལྡན་',
 'ཆོ་ག',
 'འི་',
 'མཐུ',
 'ས',
 '།_།',
 'ཐོག་མེད་',
 'འཁྲུལ་',
 'པ',
 'འི་',
 'འཆིང་བ་',
 'སྐད་ཅིག་',
 'ལ',
 '།_།',
 'བྲལ་',
 'ན',
 'ས་',
 'མངོན་སུམ་',
 'ཡེ་ཤེས་',
 'སད་',
 'མཛད་པ',
 '།_།',
 'དཀྱིལ་འཁོར་',
 'དབང་ཕྱུག་',
 'དཔལ་ལྡན་',
 'བླ་མ',
 'ར་',
 'འདུད',
 '།_།',
 'རྡོ་རྗེ་',
 'ཐེག་པ',
 'འི་',
 'རྩ་བ་',
 'སྨིན་',
 'བྱེད་',
 'ཀྱི',
 '།_།',
 'ཚུལ་',
 'འདི་',
 'ཟབ་',
 'རྒྱ',
 'ས་',
 'ཉིད་',
 'ཕྱི',
 'ར་',
 'རྟོགས་དཀའ་',
 'ཡང་',
 '།_།',
 'དང་པོ',
 'འི་',
 'ལས་ཅན་',
 'ཕྱོགས་',
 'ཙམ་',
 'ངེས་',
 'རྙེད་',
 'ཕྱི',
 'ར',
 '།_།',
 'གོ་',
 'བདེ',
 'འི་',
 'ངག་',
 'གི',
 'ས་',
 'མདོར་བསྡུས་',
 'བརྗོད་པ',
 'ར་',
 'བྱ',
 '།_།',
 'དེ',
 'འང་',
 'རྡོ་རྗེ་',
 'ཐེག་པ',
 'འི་',
 'ལམ་',
 'གྱི་',
 'གནད་',
 'ཐམས་ཅད་',
 'ཚང་',
 'ཞིང་',
 'ཁྱད་པར་',
 'གསང་སྔགས་',
 'ཀྱི་',
 'རྒྱུད་',
 'ལུང་',
 'མན་ངག་',
 'རྣམས་',
 'ལ་',
 'ཐོས་བསམ་',
 'སྒོམ་པ་',
 'གང་',
 'བྱེད་',
 'ཀྱང་',
 'ངེས་པ',
 'ར་',
 'སྔོན

In [None]:
# Tally how often every token occurs across all paragraphs, then show the
# number of distinct tokens.
word_freq = defaultdict(int)
for token in (tok for paragraph in tokenized_paras for tok in paragraph):
    word_freq[token] += 1
len(word_freq)

58888

In [None]:
# The ten most frequent tokens, most frequent first.
[
    token
    for token, _count in sorted(
        word_freq.items(), key=lambda item: item[1], reverse=True
    )[:10]
]

['ས་', '།_', 'འི་', 'ར་', '།_།', 'ལ་', 'དང་', 'ན', 'ཀྱི་', 'དུ་']

# Training the Model

In [None]:
import multiprocessing

from gensim.models import Word2Vec

## Why I separate the training of the model into 3 steps:
I prefer to separate the training into 3 distinct steps for clarity and monitoring.
1. `Word2Vec()`: 
>In this first step, I set up the parameters of the model one-by-one. <br>I do not supply the parameter `sentences`, and therefore leave the model uninitialized, purposefully.
2. `.build_vocab()`: 
>Here it builds the vocabulary from a sequence of sentences and thus initializes the model. <br>With the logging output, I can follow the progress and, even more importantly, the effect of `min_count` and `sample` on the word corpus. I noticed that these two parameters, and in particular `sample`, have a great influence on the performance of a model. Displaying both makes it easier to manage their influence accurately.
3. `.train()`:
>Finally, trains the model.<br>
The loggings here are mainly useful for monitoring, making sure that no threads are executed instantaneously.

In [None]:
# Number of CPUs reported by the system; used to size the pool of training workers.
cores = multiprocessing.cpu_count()

## The parameters:

* `min_count` <font color='purple'>=</font> <font color='green'>int</font> - Ignores all words with total absolute frequency lower than this - (2, 100)


* `window` <font color='purple'>=</font> <font color='green'>int</font> - The maximum distance between the current and predicted word within a sentence. E.g. `window` words on the left and `window` words on the right of our target - (2, 10)


* `size` <font color='purple'>=</font> <font color='green'>int</font> - Dimensionality of the feature vectors. - (50, 300)


* `sample` <font color='purple'>=</font> <font color='green'>float</font> - The threshold for configuring which higher-frequency words are randomly downsampled. Highly influential.  - (0, 1e-5)


* `alpha` <font color='purple'>=</font> <font color='green'>float</font> - The initial learning rate - (0.01, 0.05)


* `min_alpha` <font color='purple'>=</font> <font color='green'>float</font> - Learning rate will linearly drop to `min_alpha` as training progresses. To set it: alpha - (min_alpha * epochs) ~ 0.00


* `negative` <font color='purple'>=</font> <font color='green'>int</font> - If > 0, negative sampling will be used; the int for negative specifies how many "noise words" should be drawn. If set to 0, no negative sampling is used. - (5, 20)


* `workers` <font color='purple'>=</font> <font color='green'>int</font> - Use these many worker threads to train the model (=faster training with multicore machines)

### Dimension of word embedding
The optimal dimensionality of word embeddings is mostly task-dependent: a smaller dimensionality works better for more syntactic tasks such as named entity recognition (Melamud et al., 2016) [3] or part-of-speech (POS) tagging (Plank et al., 2016) [4], while a larger dimensionality is more useful for more semantic tasks such as sentiment analysis (Ruder et al., 2016) [5].

- [3] -> http://arxiv.org/abs/1601.00893
- [4] -> Plank, B., Søgaard, A., & Goldberg, Y. (2016). Multilingual Part-of-Speech Tagging with Bidirectional Long Short-Term Memory Models and Auxiliary Loss. In Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics. 
- [5] -> http://arxiv.org/abs/1609.02745

In [None]:
# Create the model with its hyper-parameters only; no corpus is passed here, so the
# vocabulary is built and the model is trained in the explicit steps that follow.
w2v_model = Word2Vec(min_count=20,
                     window=5,
                     size=150,
                     sample=6e-5,
                     alpha=0.03,
                     min_alpha=0.0007,
                     negative=20,
                     # Leave one core free for the rest of the system, but never
                     # go below one worker (a single-core machine would otherwise get 0).
                     workers=max(1, cores - 1))

## Building the Vocabulary Table:
Word2Vec requires us to build the vocabulary table (simply digesting all the words and filtering out the unique words, and doing some basic counts on them):

In [None]:
# Build the vocabulary from the corpus and report how long it took.
t = time()
w2v_model.build_vocab(tokenized_paras, progress_per=10000)
elapsed_mins = round((time() - t) / 60, 2)

print('Time to build vocab: {} mins'.format(elapsed_mins))

Time to build vocab: 0.11 mins


## Training of the model:
_Parameters of the training:_
* `total_examples` <font color='purple'>=</font> <font color='green'>int</font> - Count of sentences;
* `epochs` <font color='purple'>=</font> <font color='green'>int</font> - Number of iterations (epochs) over the corpus - [10, 20, 30]

In [None]:
# Train for 30 passes over the corpus and report how long it took.
t = time()
w2v_model.train(
    tokenized_paras,
    total_examples=w2v_model.corpus_count,
    epochs=30,
    report_delay=1,
)
elapsed_mins = round((time() - t) / 60, 2)

print('Time to train the model: {} mins'.format(elapsed_mins))

KeyboardInterrupt: 

In [None]:
# Look up the embedding vector the model holds for one token.
w2v_model.wv['ཞུགས']

array([ 0.3729301 , -0.8534347 , -1.0309534 ,  0.8542432 ,  1.2413125 ,
        0.1571531 , -0.10291172,  0.58237725, -2.644014  , -1.8357233 ,
        0.5710879 ,  0.83698714,  0.6882576 ,  0.8948287 , -0.02084561,
        2.117036  ,  1.7047074 ,  2.2853692 ,  3.6181774 , -0.58283055,
       -0.26574504,  0.07433303,  2.093018  , -0.48373508, -0.90534794,
        0.8568248 , -2.666746  ,  0.05026199,  1.6255131 , -3.249759  ,
        0.03756811, -0.74018884, -1.1021843 ,  0.8330723 ,  0.24384524,
       -0.436396  ,  0.82031775,  1.571845  ,  0.66659415, -1.6746459 ,
        0.3454264 , -0.09310634, -1.4394299 , -0.40920028,  1.1556567 ,
        0.38380772,  0.78428346,  1.3377736 , -0.9033879 ,  0.21484745,
        0.5911416 , -1.1893706 ,  0.5863942 , -0.39954457, -1.1904216 ,
        2.515153  ,  0.483994  , -0.21034713, -1.2771081 ,  0.49367416,
        0.7438269 , -1.1765704 ,  1.0646936 , -0.5540529 , -0.72172844,
        0.08064974,  1.1871132 , -2.1466894 , -1.2714894 , -2.09

# Save the word2vec

In [None]:
# Export the vectors in plain-text word2vec format together with the vocabulary counts.
# The file names match what the following cells load ('word2vec_org') and
# inspect ('vocabulary'), so the notebook runs top to bottom on a fresh kernel.
w2v_model.wv.save_word2vec_format("word2vec_org",
                                  fvocab="vocabulary",
                                  binary=False)

In [None]:
# List the working directory to check which files are present.
!ls

__notebook_source__.ipynb  vocabulary  word2vec_org


In [None]:
from gensim.models import KeyedVectors

In [None]:
# Read the vectors back from the text-format file 'word2vec_org'.
wv_from_text = KeyedVectors.load_word2vec_format('word2vec_org', binary=False)

In [None]:
# Same token as looked up on the in-memory model, for a visual comparison of the reloaded vector.
wv_from_text['ཞུགས']

array([ 0.3729301 , -0.8534347 , -1.0309534 ,  0.8542432 ,  1.2413125 ,
        0.1571531 , -0.10291172,  0.58237725, -2.644014  , -1.8357233 ,
        0.5710879 ,  0.83698714,  0.6882576 ,  0.8948287 , -0.02084561,
        2.117036  ,  1.7047074 ,  2.2853692 ,  3.6181774 , -0.58283055,
       -0.26574504,  0.07433303,  2.093018  , -0.48373508, -0.90534794,
        0.8568248 , -2.666746  ,  0.05026199,  1.6255131 , -3.249759  ,
        0.03756811, -0.74018884, -1.1021843 ,  0.8330723 ,  0.24384524,
       -0.436396  ,  0.82031775,  1.571845  ,  0.66659415, -1.6746459 ,
        0.3454264 , -0.09310634, -1.4394299 , -0.40920028,  1.1556567 ,
        0.38380772,  0.78428346,  1.3377736 , -0.9033879 ,  0.21484745,
        0.5911416 , -1.1893706 ,  0.5863942 , -0.39954457, -1.1904216 ,
        2.515153  ,  0.483994  , -0.21034713, -1.2771081 ,  0.49367416,
        0.7438269 , -1.1765704 ,  1.0646936 , -0.5540529 , -0.72172844,
        0.08064974,  1.1871132 , -2.1466894 , -1.2714894 , -2.09

In [None]:
# Show the first lines of the 'vocabulary' file (one "token count" pair per line).
!head vocabulary

ས་ 687133
།_ 533771
འི་ 506420
ར་ 462038
།_། 341664
ལ་ 208912
དང་ 197993
ན 144106
ཀྱི་ 140586
དུ་ 137221


In [None]:
# Record the gensim version this notebook was run with.
# NOTE(review): the `size=` argument passed to Word2Vec above is the gensim 3.x
# spelling — confirm the parameter name before running on a newer release.
import gensim
gensim.__version__

'3.8.0'