# word2vec 속도 개선

## Word2vec 개선 1
Embedding 계층 구현

In [1]:
import numpy as np

W = np.arange(21).reshape(7,3)
print(W)
print()
print(W[2])
print()
print(W[5])

[[ 0  1  2]
 [ 3  4  5]
 [ 6  7  8]
 [ 9 10 11]
 [12 13 14]
 [15 16 17]
 [18 19 20]]

[6 7 8]

[15 16 17]


In [2]:
idx = np.array([1,0,3,0])
W[idx]

array([[ 3,  4,  5],
       [ 0,  1,  2],
       [ 9, 10, 11],
       [ 0,  1,  2]])

In [3]:
class Embedding:
    def __init__(self, W):
        self.params = [W]
        self.grads = [np.zeros_like(W)]
        self.idx = None
    
    def forward(self, idx):
        W, = self.params
        self.idx = idx
        out = W[idx]
        return out
    
    def backward(self, dout):
        dW, = self.grads
        dW[...] = 0
        dW[self.idx] = dout # 잘못된 구현
        return None

In [30]:
class Embedding:
    def __init__(self, W):
        self.params = [W]
        self.grads = [np.zeros_like(W)]
        self.idx = None
    
    def forward(self, idx):
        W, = self.params
        self.idx = idx
        out = W[idx]
        return out
    
    def backward(self, dout):
        dW, = self.grads
        dW[...] = 0
        
        # for i, word_id in enumerate(self.idx):
        #     dW[word_id] += dout[i]
        np.add.at(dW, self.idx, dout)

        return None

## word2vec 개선 2
다중 분류에서 이진 분류로 (구현)

In [28]:
class EmbeddingDot:
    def __init__(self, W):
        self.embed = Embedding(W) ## W_out
        self.params = self.embed.params
        self.grads = self.embed.grads
        self.cache = None

    def forward(self, h, idx):
        target_W = self.embed.forward(idx)
        out = np.sum(target_W * h, axis=1) ## (배치, 히든) * (배치, 히든) -> (배치, )

        self.cache = (h, target_W)
        return out
    
    def backward(self, dout):
        h, target_W = self.cache
        dout = dout.reshape(dout.shape[0], 1)

        dtarget_W = dout * h ## 내적 연산의 미분
        self.embed.backward(dtarget_W)
        dh = dout * target_W ## 내적 연산의 미분
        return dh

네거티브 샘플링

In [12]:
import numpy as np

print(np.random.choice(10))
print(np.random.choice(10))

words = ['you', 'say', 'goodbye', 'I', 'hello', '.']
print()
print(np.random.choice(words))
print()

print(np.random.choice(words, size=5))
print()

print(np.random.choice(words, size=5, replace=False))
print()

p = [0.5, 0.1, 0.05, 0.2, 0.05, 0.1]
print(np.random.choice(words, p=p))

0
7

goodbye

['hello' 'I' '.' 'you' 'hello']

['I' 'hello' 'you' 'goodbye' 'say']

.


In [13]:
p = [0.7, 0.29, 0.01]
new_p = np.power(p, 0.75)
new_p /= np.sum(new_p)
print(new_p)

[0.64196878 0.33150408 0.02652714]


네거티브 샘플링 구현

In [21]:
import collections

class UnigramSampler:
    def __init__(self, corpus, power, sample_size):
        self.sample_size = sample_size
        self.vocab_size = None
        self.word_p = None

        counts = collections.Counter()
        for word_id in corpus:
            counts[word_id] += 1

        vocab_size = len(counts)
        self.vocab_size = vocab_size

        self.word_p = np.zeros(vocab_size)
        for i in range(vocab_size):
            self.word_p[i] = counts[i]

        self.word_p = np.power(self.word_p, power)
        self.word_p /= np.sum(self.word_p)

    def get_negative_sample(self, target, GPU=False):
        batch_size = target.shape[0]

        if not GPU:
            negative_sample = np.zeros((batch_size, self.sample_size), dtype=np.int32)

            for i in range(batch_size):
                p = self.word_p.copy()
                target_idx = target[i]
                p[target_idx] = 0
                p /= p.sum()
                negative_sample[i, :] = np.random.choice(self.vocab_size, size=self.sample_size, replace=False, p=p)
        else:
            # GPU(cupy）로 계산할 때는 속도를 우선한다.
            # 부정적 예에 타깃이 포함될 수 있다.
            negative_sample = np.random.choice(self.vocab_size, size=(batch_size, self.sample_size),
                                               replace=True, p=self.word_p)

        return negative_sample

In [22]:
from common.layers import SigmoidWithLoss

class NegativeSamplingLoss:
    def __init__(self, W, corpus, power=0.75, sample_size=5):
        self.sample_size = sample_size
        self.sampler = UnigramSampler(corpus, power, sample_size)
        self.loss_layers = [SigmoidWithLoss() for _ in range(sample_size + 1)]
        self.embed_dot_layers = [EmbeddingDot(W) for _ in range(sample_size + 1)]

        self.params, self.grads = [], []
        for layer in self.embed_dot_layers:
            self.params += layer.params
            self.grads += layer.grads

    def forward(self, h, target):
        batch_size = target.shape[0]
        negative_sample = self.sampler.get_negative_sample(target)

        score = self.embed_dot_layers[0].forward(h, target)
        correct_label = np.ones(batch_size, dtype=np.int32)
        loss = self.loss_layers[0].forward(score, correct_label)

        negative_label = np.zeros(batch_size, dtype=np.int32)
        for i in range(self.sample_size):
            negative_target = negative_sample[:, i]
            score = self.embed_dot_layers[1+i].forward(h, negative_target)
            loss += self.loss_layers[i+1].forward(score, negative_label)

        return loss
    
    def backward(self, dout=1):
        dh = 0
        for l0, l1 in zip(self.loss_layers, self.embed_dot_layers):
            dscore = l0.backward(dout)
            dh += l1.backward(dscore) ## repeat 노드 역전파. 170p. 도식 참고.
        
        return dh

## 개선판 word2vec 학습

CBOW 모델 구현

In [23]:
class CBOW:
    def __init__(self, vocab_size, hidden_size, window_size, corpus):
        V, H = vocab_size, hidden_size

        W_in = 0.01 * np.random.randn(V, H).astype('f')
        W_out = 0.01 * np.random.randn(H, V).astype('f')

        self.in_layers = []
        for i in range(2 * window_size):
            layer = Embedding(W_in)
            self.in_layers.append(layer)
        self.ns_loss = NegativeSamplingLoss(W_out, corpus, power=0.75, sample_size=5)

        layers = self.in_layers + [self.ns_loss]
        self.params, self.grads = [], []
        for layer in layers:
            self.params += layer.params
            self.grads += layer.grads

        self.word_vecs = W_in

    def forward(self, contexts, target):
        h = 0
        for i, layer in enumerate(self.in_layers):
            h += layer.forward(contexts[:,i]) ## contexts: (batch, window * 2)
        h *= 1 / len(self.in_layers) ## self.in_layers = window * 2
        loss = self.ns_loss.forward(h, target)
        return loss
    
    def backward(self, dout=1):
        dout = self.ns_loss.backward(dout)
        dout *= 1 / len(self.in_layers)
        for layer in self.in_layers:
            layer.backward(dout)
        return None

CBOW 모델 학습 코드

In [None]:
## 맥북이라 코랩 환경에서 실시
## cupy error: https://velog.io/@a01152a/word2vec-%EC%86%8D%EB%8F%84-%EA%B0%9C%EC%84%A02
## cupy error: https://madrabbit7.tistory.com/56
import pickle
from common import config
from common.trainer import Trainer
from common.util import create_contexts_target
from common.optimizer import Adam
from dataset import ptb

config.GPU = False
print(config.GPU)
window_size = 5
hidden_size = 100
batch_size = 100
max_epoch = 10

corpus, word_to_id, id_to_word = ptb.load_data('train')
vocab_size = len(word_to_id)

contexts, target = create_contexts_target(corpus, window_size)

model = CBOW(vocab_size, hidden_size, window_size, corpus)
optimizer = Adam()
trainer = Trainer(model, optimizer)

trainer.fit(contexts, target, max_epoch, batch_size)
trainer.plot()

word_vecs = model.word_vecs
params = {}
params['words_vecs'] = word_vecs.astype(np.float16)
params['word_to_id'] = word_to_id
params['id_to_word'] = id_to_word
pkl_file = 'cbow_params.pkl'
with open(pkl_file, 'wb') as f:
    pickle.dump(params, f, -1)

CBOW 모델 평가

In [34]:
from common.util import most_similar
import pickle

pkl_file = 'cbow_params.pkl'

with open(pkl_file, 'rb') as f:
    params = pickle.load(f)
    word_vecs = params['word_vecs']
    word_to_id = params['word_to_id']
    id_to_word = params['id_to_word']

querys = ['you', 'year', 'car', 'toyota']
for query in querys:
    most_similar(query, word_to_id, id_to_word, word_vecs, top=5)


[query] you
 we: 0.71240234375
 i: 0.7099609375
 your: 0.63916015625
 they: 0.60205078125
 anybody: 0.58056640625

[query] year
 month: 0.85009765625
 week: 0.763671875
 spring: 0.7529296875
 summer: 0.75
 decade: 0.685546875

[query] car
 auto: 0.6015625
 luxury: 0.60009765625
 merkur: 0.591796875
 truck: 0.58935546875
 window: 0.54345703125

[query] toyota
 honda: 0.64501953125
 nissan: 0.63720703125
 seita: 0.609375
 engines: 0.59521484375
 motor: 0.5947265625


In [37]:
from common.util import analogy

## 결과 희한함. 레포에 있던 것과 비교하기
analogy('king', 'man', 'queen', word_to_id, id_to_word, word_vecs)
analogy('taek', 'took', 'go', word_to_id, id_to_word, word_vecs)
analogy('car', 'cars', 'child', word_to_id, id_to_word, word_vecs)
analogy('good', 'better', 'bad', word_to_id, id_to_word, word_vecs)


[analogy] king:man = queen:?
 a.m: 5.984375
 woman: 5.35546875
 naczelnik: 4.95703125
 lady: 4.55078125
 lucky: 4.47265625
taek(을)를 찾을 수 없습니다.

[analogy] car:cars = child:?
 a.m: 6.4375
 children: 5.49609375
 rape: 5.35546875
 adults: 5.2734375
 daffynition: 4.99609375

[analogy] good:better = bad:?
 rather: 5.921875
 more: 5.57421875
 less: 5.40625
 greater: 4.59765625
 worse: 4.01953125


기존 래포에 있던 것

In [38]:
from common.util import most_similar
import pickle

pkl_file = 'cbow_params_origin.pkl'

with open(pkl_file, 'rb') as f:
    params = pickle.load(f)
    word_vecs = params['word_vecs']
    word_to_id = params['word_to_id']
    id_to_word = params['id_to_word']

querys = ['you', 'year', 'car', 'toyota']
for query in querys:
    most_similar(query, word_to_id, id_to_word, word_vecs, top=5)


[query] you
 we: 0.6103515625
 someone: 0.59130859375
 i: 0.55419921875
 something: 0.48974609375
 anyone: 0.47314453125

[query] year
 month: 0.71875
 week: 0.65234375
 spring: 0.62744140625
 summer: 0.6259765625
 decade: 0.603515625

[query] car
 luxury: 0.497314453125
 arabia: 0.47802734375
 auto: 0.47119140625
 disk-drive: 0.450927734375
 travel: 0.4091796875

[query] toyota
 ford: 0.55078125
 instrumentation: 0.509765625
 mazda: 0.49365234375
 bethlehem: 0.47509765625
 nissan: 0.474853515625


In [39]:
from common.util import analogy

## 결과 희한함. 레포에 있던 것과 비교하기
analogy('king', 'man', 'queen', word_to_id, id_to_word, word_vecs)
analogy('taek', 'took', 'go', word_to_id, id_to_word, word_vecs)
analogy('car', 'cars', 'child', word_to_id, id_to_word, word_vecs)
analogy('good', 'better', 'bad', word_to_id, id_to_word, word_vecs)


[analogy] king:man = queen:?
 woman: 5.16015625
 veto: 4.9296875
 ounce: 4.69140625
 earthquake: 4.6328125
 successor: 4.609375
taek(을)를 찾을 수 없습니다.

[analogy] car:cars = child:?
 children: 5.21875
 average: 4.7265625
 yield: 4.20703125
 cattle: 4.1875
 priced: 4.1796875

[analogy] good:better = bad:?
 more: 6.6484375
 less: 6.0625
 rather: 5.21875
 slower: 4.734375
 greater: 4.671875
