In [23]:
import random
import numpy as np
from sklearn.preprocessing import OneHotEncoder

In [14]:
sent = "you say good by and I say hello"
words = sent.split(" ")
words

['you', 'say', 'good', 'by', 'and', 'I', 'say', 'hello']

# Word2Vec
추론기반의 방법

$MatMul$ $\rightarrow$ $Softmax$ $\rightarrow$ $Cross$ $Entropy$ $Error$ $\rightarrow$ $Loss$  
($W_o$$_u$$_t$)

## 2.One Hot Vector 형태의 입력값을 Win과 곱

In [18]:
input1 = np.array([[1, 0, 0, 0, 0, 0, 0]]) # You
input2 = np.array([[0, 0, 1, 0, 0, 0, 0]])

In [24]:
random.seed(42)

# (input x dimension size) = choose the dimension size yourself.
W_in = np.random.randn(7, 3)
W_in

array([[ 0.45390396, -0.46723849,  1.19417119],
       [ 0.24201282,  0.09550312,  1.3573311 ],
       [-1.55918386, -0.76461944,  0.42788491],
       [-1.92830391, -1.22788364,  1.344552  ],
       [ 1.10358013,  0.18851949,  0.13372865],
       [-0.73367621, -0.96367713,  0.64497318],
       [-0.71071827,  0.21414537, -0.07345437]])

In [26]:
h_1 = np.matmul(input1, W_in)   # hidden layer
h_2 = np.matmul(input2, W_in)   # hidden layer

print(h_1)
print(h_2)

[[ 0.45390396 -0.46723849  1.19417119]]
[[-1.55918386 -0.76461944  0.42788491]]


In [21]:
print((h_1+h_2)/2)

[[-1.22291247  0.00921342 -1.0041676 ]]


In [30]:
# with rounded numbers
W_in_rounded = np.round(W_in, 2)
h_1_rounded = np.matmul(input1, W_in_rounded)
h_2_rounded = np.matmul(input2, W_in_rounded)

print(h_1_rounded)
print(h_2_rounded)

[[ 0.45 -0.47  1.19]]
[[-1.56 -0.76  0.43]]


## 3.은닉 상태(Hidden state)의 값을 W_out과 곱해서 score를 추출

In [31]:
# 초기 weight 는 며느리도 모름. Random하게 -_-)..
W_out = np.random.randn(3, 7)
np.round(W_out, 2)

array([[ 0.56,  0.65,  0.75,  0.64,  2.45,  1.41,  0.2 ],
       [-0.07,  0.8 ,  0.2 , -1.07, -0.27,  1.47,  0.88],
       [-1.28,  2.  ,  1.19, -0.93, -0.7 , -0.58,  1.66]])

## 4.Softmax(score) 
Score에 softmax를 취해서 각 단어가 나올 확률을 계산.

$y_k$ = $\frac{exp(a_k)}{\sum_{i=1}^{n}exp(a_i)}$

The sum of all softmax values is 1. $Softmax$ reflects the property of probability.

In [47]:
h = h_1 + h_2
score = np.matmul(h, W_out)
max = np.max(score)
idx = np.where(score == max)

print(f'final hidden layer: {h}')
print(f'score: {np.round(score, 3)}')
print(f'score max: {np.round(max,3)} at pos {idx}')

final hidden layer: [[-1.10527991 -1.23185794  1.6220561 ]]
score: [[-2.611  1.549  0.854 -0.901 -3.499 -4.31   1.389]]
score max: 1.549 at pos (array([0]), array([1]))


In [55]:
def softmax(x):
    exp_x = np.exp(x)           # get exponents of x for base e.
    sum_exp_x = np.sum(exp_x)   # merge exponent values into a sum
    y = exp_x / sum_exp_x       # for each exponent value x, divide it by the sum of all exponents.
    return y

softmax(score)

array([[0.00634187, 0.4060328 , 0.20275529, 0.0350664 , 0.00260916,
        0.00115947, 0.34603501]])

In [57]:
pred = softmax(score)
print(np.round(pred, 4))

pred_max = np.max(pred)
max_idx = np.where(pred == pred_max)
print(max_idx)

[[0.0063 0.406  0.2028 0.0351 0.0026 0.0012 0.346 ]]
(array([0]), array([1]))


## 5.정답과 Cross Entropy Loss 계산
$-\frac{1}{N}\sum_{i=1}^{N}\sum_{j=1}^{N}t_i,_jlog(p_i,_j)$

In [58]:
# Cross Entropy Loss

def cross_entropy_error(y, t):
    '''
    y : prediction
    t : target
    '''
    delta = 1e-7    # log의 내부가 0이 되는 것을 방지

    # y.shape[0]으로 나눠주는 이유는 배치 사이즈 반영
    return -np.sum(t * np.log(y + delta)) / y.shape[0]

In [59]:
cross_entropy_error(pred, [[0, 1, 0, 0, 0, 0, 0]])

0.9013210951205277

## 6.5에서 계산한 loss를 가지고 back-propagation 과정을 통해 weight 업데이트
- Softmax의 backpropagation values $(P_i - y_i)$
    - ```dw_out(Delta for W_out) = np.outer(Hidden Layers, ds)```


In [64]:
# Loss를 통해 weight를 업데이트
# Softmax의 미분값
# ds = Pi - yi 
answer = [0, 1, 0, 0, 0, 0, 0]
ds = np.round(pred - answer, 4)
print(ds)

[[ 0.0063 -0.594   0.2028  0.0351  0.0026  0.0012  0.346 ]]


In [65]:
# ds (Delta for W_out) 계산
dW_out = np.outer(h, ds)   # 소프트맥스에서 발생한 delta 값에 대해 역전파를 취하면 
print(np.round(dW_out, 4))

[[-0.007   0.6565 -0.2242 -0.0388 -0.0029 -0.0013 -0.3824]
 [-0.0078  0.7317 -0.2498 -0.0432 -0.0032 -0.0015 -0.4262]
 [ 0.0102 -0.9635  0.329   0.0569  0.0042  0.0019  0.5612]]


In [66]:
# Hidden layer 에 대한 역전파 값을 계산
da = np.dot(ds, W_out.T)
print(np.round(da, 4))

[[-0.1289 -0.1621 -0.4134]]


In [67]:
dw_1 = np.round(np.outer(np.array([[1, 0, 0, 0, 0, 0, 0]]), (da/2)), 4)
print(dw_1)

[[-0.0644 -0.081  -0.2067]
 [-0.     -0.     -0.    ]
 [-0.     -0.     -0.    ]
 [-0.     -0.     -0.    ]
 [-0.     -0.     -0.    ]
 [-0.     -0.     -0.    ]
 [-0.     -0.     -0.    ]]


In [68]:
dw_2 = np.round(np.outer(np.array([[0, 0, 1, 0, 0, 0, 0]]), (da/2)), 4)
print(dw_2)

[[-0.     -0.     -0.    ]
 [-0.     -0.     -0.    ]
 [-0.0644 -0.081  -0.2067]
 [-0.     -0.     -0.    ]
 [-0.     -0.     -0.    ]
 [-0.     -0.     -0.    ]
 [-0.     -0.     -0.    ]]


Weight 값은 10회, 100회 등 반복해주면서 손질.   
- $W\ in\ new = W\ in - learning\ rate * dw\ in$

In [69]:
learning_rate = 1
W_in_new = W_in - learning_rate * dw_1
W_in_new = W_in_new - learning_rate * dw_2
print(np.round(W_in_new, 4))

[[ 0.5183 -0.3862  1.4009]
 [ 0.242   0.0955  1.3573]
 [-1.4948 -0.6836  0.6346]
 [-1.9283 -1.2279  1.3446]
 [ 1.1036  0.1885  0.1337]
 [-0.7337 -0.9637  0.645 ]
 [-0.7107  0.2141 -0.0735]]


In [70]:
print(np.round(W_in, 4))

[[ 0.4539 -0.4672  1.1942]
 [ 0.242   0.0955  1.3573]
 [-1.5592 -0.7646  0.4279]
 [-1.9283 -1.2279  1.3446]
 [ 1.1036  0.1885  0.1337]
 [-0.7337 -0.9637  0.645 ]
 [-0.7107  0.2141 -0.0735]]


# Skip-gram

## 2.One Hot Vector 형태의 입력값을 $W_i$$_n$과 곱

In [71]:
# 입력값은 원-핫 벡터 형태 
input = np.array([[0, 1, 0, 0, 0, 0, 0]]) # say

output1 = np.array([[1, 0, 0, 0, 0, 0, 0]]) # you
output2 = np.array([[0, 0, 1, 0, 0, 0, 0]])

In [72]:
# (입력 x 차원의 크기) - 차원의 크기는 사용자가 선정
## 초기의 Weight 는 랜덤하게 결정
W_in = np.random.randn(7, 3)

In [73]:
# 은닉층 값
h = np.matmul(input, W_in) 

In [74]:
print(h)

[[-0.15246184  0.40483246 -1.5847999 ]]


## 3.Hidden state x W_out = score
Hidden state 값을 W_out과 곱해 score 추출


In [75]:
W_out = np.random.randn(3, 7)
score = np.matmul(h, W_out)
print(np.round(score, 4))

[[-1.4894 -0.9158 -1.7142  0.0935 -0.9794 -0.7962 -5.1275]]


## 4.softmax(score)
score에 softmax를 취해서 각 단어가 나올 확률로 변환을 해줌

In [76]:
pred = softmax(score)
print(np.round(pred, 4))

[[0.0824 0.1463 0.0658 0.4013 0.1372 0.1648 0.0022]]


## 5.정답과 Cross Entropy Loss 계산
정답 업데이트는 한 번에

## 6.5에서 계산한 Loss를 가지고 Backpropagation 과정을 통해 weight 업데이트
- 두 개의 answer에 오차를 더함

In [77]:
ds1 = np.round(pred - output1, 4)
ds2 = np.round(pred - output2, 4)
ds = ds1 + ds2
print(ds)

[[-0.8352  0.2926 -0.8684  0.8026  0.2744  0.3296  0.0044]]


In [79]:
 dw_out = np.round(np.outer(h, ds), 4)
 print(np.round(dw_out, 4))

[[ 1.2730e-01 -4.4600e-02  1.3240e-01 -1.2240e-01 -4.1800e-02 -5.0300e-02
  -7.0000e-04]
 [-3.3810e-01  1.1850e-01 -3.5160e-01  3.2490e-01  1.1110e-01  1.3340e-01
   1.8000e-03]
 [ 1.3236e+00 -4.6370e-01  1.3762e+00 -1.2720e+00 -4.3490e-01 -5.2240e-01
  -7.0000e-03]]


In [80]:
da = np.dot(ds, W_out.T)
print(np.round(da, 4))

[[-0.6844 -1.1482 -1.4806]]


In [81]:
dw_in = np.outer(np.array([[0, 1, 0, 0, 0, 0, 0]]), da)
print(np.round(dw_in, 4))

[[-0.     -0.     -0.    ]
 [-0.6844 -1.1482 -1.4806]
 [-0.     -0.     -0.    ]
 [-0.     -0.     -0.    ]
 [-0.     -0.     -0.    ]
 [-0.     -0.     -0.    ]
 [-0.     -0.     -0.    ]]


```W_in```값을 update 

In [82]:
learning_rate = 1
W_in_new = W_in - learning_rate * dw_in
print(np.round(W_in_new, 4))

[[-0.0605 -0.4595  0.4204]
 [ 0.5319  1.553  -0.1042]
 [ 0.1175 -1.2151 -1.4647]
 [-0.6205 -0.5375  0.212 ]
 [-1.1247  0.4217 -0.052 ]
 [-0.6861 -0.077   1.7683]
 [-1.2858  0.7717  0.0048]]


In [83]:
print(np.round(W_in, 4))

[[-0.0605 -0.4595  0.4204]
 [-0.1525  0.4048 -1.5848]
 [ 0.1175 -1.2151 -1.4647]
 [-0.6205 -0.5375  0.212 ]
 [-1.1247  0.4217 -0.052 ]
 [-0.6861 -0.077   1.7683]
 [-1.2858  0.7717  0.0048]]


In [84]:
learning_rate = 1
W_out_new = W_out - learning_rate * dw_out
print(np.round(W_out_new, 4))

[[-0.6686 -0.6569 -0.8771 -0.9833 -2.767   0.2844  0.7096]
 [ 0.0464 -1.3836  0.9125 -0.2994 -0.7465 -1.2845 -0.3013]
 [-0.4062  0.7859 -0.0796  1.3259  1.1608  0.7083  3.0978]]


In [85]:
print(np.round(W_out, 4))

[[-0.5413 -0.7015 -0.7447 -1.1057 -2.8088  0.2341  0.7089]
 [-0.2917 -1.2651  0.5609  0.0255 -0.6354 -1.1511 -0.2995]
 [ 0.9174  0.3222  1.2966  0.0539  0.7259  0.1859  3.0908]]


skip-gram이 풀려는 문제가 CBOW보다 훨씬 어려움. 

# 참고문헌
- T Academy Recommendation2
https://www.youtube.com/watch?v=3jfHP0Rq1Gg
https://www.kaggle.com/chocozzz/t-academy-recommendation2/code