## Word2Vec
- Continuous bag of words
- skip gram

In [1]:
import torch
from torch.autograd import Variable
import numpy as np
import torch.functional as F
import torch.nn.functional as F

In [2]:
# Toy training corpus: each entry is one Thai sentence whose words are already
# separated by single spaces.
corpus = [
    'ฉัน กิน ข้าว มัน ไก่',
    'ข้าว มัน ไก่ อร่อย ดี นะ',
    'ฉัน ชอบ ผัด ไทย',
    'ไก่ ย่าง วิเชียร์บุรี',
    'วันนี้ จะ กิน ข้าว หมูทอด มัน อร่อย มาก',
    'หิว ข้าว จัง',
]

### Tokenize the corpus & create a list of unique words
- word2idx
- idx2word

In [3]:
def tokenize_corpus(corpus):
    """Split each sentence of `corpus` on whitespace.

    Returns a list with one entry per sentence, each entry being the list of
    tokens produced by `str.split()` with no arguments.
    """
    token_lists = []
    for sentence in corpus:
        token_lists.append(sentence.split())
    return token_lists

# Tokenize every sentence once; the result is a list of token lists.
tokenized_corpus = tokenize_corpus(corpus)

In [4]:
# Collect the unique tokens in first-seen order. dict.fromkeys de-duplicates
# while keeping insertion order, so there is no linear `in` scan of a list
# for every token (the list-based version is quadratic in vocabulary size).
vocabulary = list(dict.fromkeys(
    token for sentence in tokenized_corpus for token in sentence
))

# Lookup tables in both directions: word -> index and index -> word.
word2idx = {w: idx for idx, w in enumerate(vocabulary)}
idx2word = dict(enumerate(vocabulary))

vocabulary_size = len(vocabulary)
print(vocabulary_size)

19


In [5]:
# Display the word -> index lookup table.
word2idx

{'ฉัน': 0,
 'กิน': 1,
 'ข้าว': 2,
 'มัน': 3,
 'ไก่': 4,
 'อร่อย': 5,
 'ดี': 6,
 'นะ': 7,
 'ชอบ': 8,
 'ผัด': 9,
 'ไทย': 10,
 'ย่าง': 11,
 'วิเชียร์บุรี': 12,
 'วันนี้': 13,
 'จะ': 14,
 'หมูทอด': 15,
 'มาก': 16,
 'หิว': 17,
 'จัง': 18}

### Generate (center word, context word) pairs

In [6]:
# Build every (center, context) index pair: for each position in a sentence,
# pair its word with every other word at most `window_size` positions away.
window_size = 3
idx_pairs = []
for sentence in tokenized_corpus:
    indices = [word2idx[word] for word in sentence]
    sentence_len = len(indices)
    for center_word_pos, center_idx in enumerate(indices):
        # Clip the window to the sentence bounds up front instead of
        # rejecting out-of-range offsets one by one.
        first = max(0, center_word_pos - window_size)
        last = min(sentence_len, center_word_pos + window_size + 1)
        for context_word_pos in range(first, last):
            if context_word_pos != center_word_pos:  # a word is not its own context
                idx_pairs.append((center_idx, indices[context_word_pos]))

idx_pairs = np.array(idx_pairs)  # each row is one (center, context) pair
print('shape of pairs center word & contex word :', idx_pairs.shape)

shape of pairs center word & contex word : (102, 2)


#### example of idx_pairs

In [7]:
# Peek at the first five (center, context) index pairs.
idx_pairs[0:5]

array([[0, 1],
       [0, 2],
       [0, 3],
       [1, 0],
       [1, 2]])

#### example of word pairs

In [8]:
# Translate the same five pairs back to words: np.vectorize applies
# idx2word.get to every element of the slice.
np.vectorize(idx2word.get)(idx_pairs[0:5])

array([['ฉัน', 'กิน'],
       ['ฉัน', 'ข้าว'],
       ['ฉัน', 'มัน'],
       ['กิน', 'ฉัน'],
       ['กิน', 'ข้าว']], dtype='<U4')

### Skip-gram objective
Predict context given center word
> $$\Pr(context \mid center\;;\;\theta)$$
Example :
- ฉัน กิน ข้าว มัน ไก่
- ฉัน กิน ... มัน ไก่

Objective function:
> $$\max_{\theta}\prod_{center}\prod_{context}\Pr(context \mid center\;;\;\theta)$$
> 1. Take the negative log: this turns the maximization into a minimization and is numerically better behaved than a product of many small probabilities
> $$\min_{\theta}-\log\prod_{center}\prod_{context}\Pr(context \mid center\;;\;\theta)$$
> 2. Replace the product with a sum, using
> $$\log(a \cdot b) = \log a + \log b$$
> 3. Divide by the number of training examples (T) to get the average loss
> $$loss = -\frac{1}{T}\sum_{center}\sum_{context}\log\Pr(context \mid center\;;\;\theta)$$

### Define Pr
> $$\Pr(context \mid center) = \frac{\exp(u_{context}^{T}v_{center})}{\sum_{w\in vocab}\exp(u_{w}^{T}v_{center})}$$
> - **Pr** is softmax function
> - **u** is vector of context
> - **v** is vector of center

### Neural Network Architecture
![alt text](http://mccormickml.com/assets/word2vec/skip_gram_net_arch.png "Network Architecture")


### Let's start implementing

In [9]:
def get_input_layer(word_idx):
    """Return the one-hot float32 vector that encodes `word_idx`.

    The vector has `vocabulary_size` entries (a module-level name), all zero
    except for a 1.0 at position `word_idx`.
    """
    one_hot = torch.zeros(vocabulary_size, dtype=torch.float32)
    one_hot[word_idx] = 1.0
    return one_hot

embedding_dims = 10
# W1 projects a one-hot center word onto the hidden layer (its columns are the
# center-word vectors); W2 turns a hidden vector into one score per vocabulary
# word (its rows are the context-word vectors).
W1 = torch.randn(embedding_dims, vocabulary_size, requires_grad=True)
W2 = torch.randn(vocabulary_size, embedding_dims, requires_grad=True)
num_epochs = 5001
learning_rate = 0.001

# Plain SGD with one update per (center, context) pair.
for epo in range(num_epochs):
    loss_val = 0.0
    for data, target in idx_pairs:
        x = get_input_layer(data)
        y_true = torch.tensor([int(target)], dtype=torch.long)

        z1 = torch.matmul(W1, x)   # hidden layer: embedding of the center word
        z2 = torch.matmul(W2, z1)  # one unnormalized score per vocabulary word

        log_softmax = F.log_softmax(z2, dim=0)

        # nll_loss expects a (batch, classes) input, hence the reshape to (1, V).
        loss = F.nll_loss(log_softmax.view(1, -1), y_true)
        # .item() extracts the Python float; indexing a 0-dim tensor with [0]
        # is an error on current PyTorch releases.
        loss_val += loss.item()
        loss.backward()

        # Update the weights outside of autograd, then clear the gradients so
        # they do not accumulate into the next step.
        with torch.no_grad():
            W1 -= learning_rate * W1.grad
            W2 -= learning_rate * W2.grad
            W1.grad.zero_()
            W2.grad.zero_()
    if epo % 500 == 0:
        print(f'Loss at epo {epo}: {loss_val/len(idx_pairs)}')

Loss at epo 0: 5.578896999359131




Loss at epo 500: 2.050281286239624
Loss at epo 1000: 1.83573317527771
Loss at epo 1500: 1.7598172426223755
Loss at epo 2000: 1.728287935256958
Loss at epo 2500: 1.7139993906021118
Loss at epo 3000: 1.7066351175308228
Loss at epo 3500: 1.7023707628250122
Loss at epo 4000: 1.699673056602478
Loss at epo 4500: 1.697850227355957
Loss at epo 5000: 1.6965572834014893


In [10]:
# Report the shapes of the two weight matrices.
print('W1 dimension: ', list(W1.shape))
print('W2 dimension: ', list(W2.shape))

W1 dimension:  [10, 19]
W2 dimension:  [19, 10]


#### using W1

In [11]:
# Column of W1 for 'กิน': multiplying W1 by the word's one-hot input selects
# exactly this column, so it is the word's center (input) vector.
vec1 = W1[:,word2idx['กิน']]
print(vec1)

tensor([ 1.5298,  0.6374,  0.3966,  0.5490, -0.6637,  1.3452, -0.6883,
         0.0556, -0.9485, -2.1520])


#### using W2

In [12]:
# Row of W2 for 'กิน': its dot product with the hidden vector is the score of
# 'กิน' as a context word, so it is the word's context (output) vector.
vec2 = W2[word2idx['กิน'],:]
print(vec2)

tensor([-0.4906, -0.2285,  2.5428,  0.4268,  0.3793,  1.0411, -0.1335,
        -1.0654, -0.4700,  1.5466])


#### using avg of W1 and W2

In [13]:
# Element-wise mean of the W1 column and the W2 row for the same word.
print((vec1+vec2)/2)

tensor([ 0.5196,  0.2045,  1.4697,  0.4879, -0.1422,  1.1931, -0.4109,
        -0.5049, -0.7093, -0.3027])


### Continuous bag of words using lib: gensim

In [14]:
# Display the tokenized sentences (a list of token lists).
tokenized_corpus

[['ฉัน', 'กิน', 'ข้าว', 'มัน', 'ไก่'],
 ['ข้าว', 'มัน', 'ไก่', 'อร่อย', 'ดี', 'นะ'],
 ['ฉัน', 'ชอบ', 'ผัด', 'ไทย'],
 ['ไก่', 'ย่าง', 'วิเชียร์บุรี'],
 ['วันนี้', 'จะ', 'กิน', 'ข้าว', 'หมูทอด', 'มัน', 'อร่อย', 'มาก'],
 ['หิว', 'ข้าว', 'จัง']]

In [15]:
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec

# NOTE(review): `path` is computed but never used below; the model is saved to
# "word2vec.model" relative to the working directory instead — confirm which
# location is intended.
path = get_tmpfile("word2vec.model")

# Train a gensim Word2Vec model on the tokenized toy corpus.
# NOTE(review): the `size` keyword is named `vector_size` in gensim 4.x —
# confirm against the installed gensim version.
# NOTE(review): no training-algorithm argument is passed, so this relies on
# gensim's default (presumably CBOW, matching the section heading) — confirm.
model = Word2Vec(tokenized_corpus, size=10, window=5, min_count=1, workers=4)
model.save("word2vec.model")

In [16]:
# List the words known to the trained model.
# NOTE(review): `wv.vocab` is the gensim 3.x API; gensim 4.x exposes
# `wv.key_to_index` instead — confirm against the installed gensim version.
print(list(model.wv.vocab))

['ฉัน', 'กิน', 'ข้าว', 'มัน', 'ไก่', 'อร่อย', 'ดี', 'นะ', 'ชอบ', 'ผัด', 'ไทย', 'ย่าง', 'วิเชียร์บุรี', 'วันนี้', 'จะ', 'หมูทอด', 'มาก', 'หิว', 'จัง']


In [17]:
# Look up the trained vector for a single word.
model.wv['กิน']

array([ 0.01344029,  0.02174879,  0.03583036, -0.00045144, -0.04118928,
        0.02446591, -0.01483102,  0.03524507, -0.01538611, -0.01963457],
      dtype=float32)