In [3]:
import numpy as np
# Extract individual rows from a matrix.
W = np.arange(7 * 3).reshape(7, 3)  # 7x3 matrix holding 0..20 in row-major order
print(W)
# Show row 2 and then row 5, one per line.
print(W[2], W[5], sep='\n')

[[ 0  1  2]
 [ 3  4  5]
 [ 6  7  8]
 [ 9 10 11]
 [12 13 14]
 [15 16 17]
 [18 19 20]]
[6 7 8]
[15 16 17]


In [4]:
# Extract several rows in one indexing operation (row IDs may repeat).
idx = np.array((1, 0, 3, 0))
W[idx, :]

array([[ 3,  4,  5],
       [ 0,  1,  2],
       [ 9, 10, 11],
       [ 0,  1,  2]])

In [5]:
# Embedding 계층의 forward() 메서드와 backward() 메서드 구현
class Embedding:
    """Embedding layer: selects rows of a weight matrix by index.

    Attributes:
        params: one-element list holding the weight matrix ``W``.
        grads: one-element list holding the gradient buffer (same shape as ``W``).
        idx: the indices passed to the most recent ``forward`` call.
    """

    def __init__(self, W):
        self.params = [W]
        self.grads = [np.zeros_like(W)]
        self.idx = None

    def forward(self, idx):
        """Return the rows ``W[idx]`` and remember ``idx`` for the backward pass."""
        W, = self.params
        self.idx = idx
        return W[idx]

    def backward(self, dout):
        """Accumulate ``dout`` into the gradient rows selected during ``forward``.

        The gradient buffer is zeroed in place (so references held by other
        objects stay valid) and then ``dout[i]`` is added to row ``idx[i]``.

        Returns:
            None: indices have no gradient to propagate.
        """
        dW, = self.grads
        dW[...] = 0  # keep the same array object; only reset its contents

        # Unbuffered scatter-add: rows that appear several times in idx receive
        # the sum of all their contributions, without a Python-level loop.
        np.add.at(dW, self.idx, dout)

        return None

In [6]:
# Embedding 계층과 dot 연산의 처리를 합친 Embedding_dot 계층
class EmbeddingDot:
    """Embedding lookup fused with a row-wise dot product.

    For every sample, the weight row chosen by ``idx`` is dotted with the
    matching row of ``h``.
    """

    def __init__(self, W):
        embed = Embedding(W)
        self.embed = embed
        # Share the embedding layer's parameter and gradient lists.
        self.params = embed.params
        self.grads = embed.grads
        # (h, selected weight rows) saved by forward() for use in backward().
        self.cache = None

    def forward(self, h, idx):
        """Return the per-sample dot product between ``h`` and ``W[idx]``."""
        selected = self.embed.forward(idx)
        self.cache = (h, selected)
        # Element-wise product summed along axis 1 == one dot product per row.
        return (selected * h).sum(axis=1)

    def backward(self, dout):
        """Propagate ``dout`` to the embedding weights and return the gradient w.r.t. ``h``."""
        h, selected = self.cache
        # Turn dout into a column so it broadcasts across each row.
        dout_col = dout.reshape(len(dout), 1)
        self.embed.backward(dout_col * h)
        return dout_col * selected
    
        

In [7]:
# Sampling examples that lead up to negative sampling from a probability distribution.
# A single random integer from range(10).
print(np.random.choice(10))
words = ['you', 'say', 'goodbye', 'i', 'hello', '.']
# A single random word.
print(np.random.choice(words))
# Five random words (duplicates allowed).
print(np.random.choice(words, 5))
# Five random words without duplicates.
print(np.random.choice(words, 5, False))
# Sample one word according to an explicit probability distribution.
p = [0.5, 0.1, 0.05, 0.2, 0.05, 0.1]
np.random.choice(words, p=p)

4
say
['hello' 'goodbye' 'hello' 'you' 'hello']
['hello' 'you' 'i' 'goodbye' '.']


'i'

In [24]:
import collections
import sys
GPU = False
class UnigramSampler:
    """Draws negative samples from a smoothed unigram distribution.

    Word probabilities are the corpus counts raised to ``power`` and then
    normalised. Word IDs are looked up as ``0 .. vocab_size - 1`` where
    ``vocab_size`` is the number of distinct IDs found in ``corpus``.
    """

    def __init__(self, corpus, power, sample_size):
        self.sample_size = sample_size

        # Occurrence count of each word ID in the corpus.
        counts = collections.Counter(corpus)
        self.vocab_size = len(counts)

        frequencies = np.array(
            [counts[word_id] for word_id in range(self.vocab_size)],
            dtype=np.float64,
        )
        smoothed = np.power(frequencies, power)
        self.word_p = smoothed / np.sum(smoothed)

    def get_negative_sample(self, target):
        """Return an array of shape (len(target), sample_size) with negative word IDs.

        When the module-level ``GPU`` flag is false, each row is drawn without
        replacement and never contains that row's target ID. When it is true,
        all rows are drawn in one call with replacement, and a row may contain
        its own target.
        """
        batch_size = target.shape[0]

        if GPU:
            # Speed-first path (intended for cupy): one vectorised draw.
            # Negative samples may include the target here.
            return np.random.choice(self.vocab_size,
                                    size=(batch_size, self.sample_size),
                                    replace=True, p=self.word_p)

        negative_sample = np.zeros((batch_size, self.sample_size), dtype=np.int32)
        for row, target_idx in enumerate(target):
            # Remove the target from the distribution, then renormalise.
            p = np.array(self.word_p)
            p[target_idx] = 0
            p = p / p.sum()
            negative_sample[row, :] = np.random.choice(
                self.vocab_size, size=self.sample_size, replace=False, p=p)

        return negative_sample

In [25]:
# Small corpus of word IDs used to try out the sampler.
corpus = np.array((0, 1, 2, 3, 4, 1, 2, 3))
power, sample_size = 0.75, 2

sampler = UnigramSampler(corpus=corpus, power=power, sample_size=sample_size)
# Three target word IDs -> one row of negative samples per target.
target = np.array((1, 3, 0))
negative_sample = sampler.get_negative_sample(target)
negative_sample


array([[2, 4],
       [2, 1],
       [2, 1]], dtype=int32)

In [30]:
# 네거티브 샘플링 구현
class NegativeSamplingLoss:
    """Loss layer combining one positive example with ``sample_size`` negative examples.

    Layer index 0 of ``loss_layers`` / ``embed_dot_layers`` handles the
    positive (target) example; indices 1.. handle the negative samples.
    """

    def __init__(self, W, corpus, power=0.75, sample_size=5):
        self.sample_size = sample_size
        self.sampler = UnigramSampler(corpus, power, sample_size)

        layer_count = sample_size + 1  # one positive + sample_size negatives
        self.loss_layers = [SigmoidWithLoss() for _ in range(layer_count)]
        self.embed_dot_layers = [EmbeddingDot(W) for _ in range(layer_count)]

        # Flatten every EmbeddingDot layer's parameters and gradients.
        self.params = [p for layer in self.embed_dot_layers for p in layer.params]
        self.grads = [g for layer in self.embed_dot_layers for g in layer.grads]

    def forward(self, h, target):
        """Return the summed loss of the positive example and all negative samples."""
        batch_size = target.shape[0]
        negative_sample = self.sampler.get_negative_sample(target)

        # Positive example: label 1.
        positive_label = np.ones(batch_size, dtype=np.int32)
        positive_score = self.embed_dot_layers[0].forward(h, target)
        loss = self.loss_layers[0].forward(positive_score, positive_label)

        # Negative examples: label 0, one column of negative_sample per layer.
        negative_label = np.zeros(batch_size, dtype=np.int32)
        for column in range(self.sample_size):
            embed_dot = self.embed_dot_layers[column + 1]
            loss_layer = self.loss_layers[column + 1]
            negative_score = embed_dot.forward(h, negative_sample[:, column])
            loss += loss_layer.forward(negative_score, negative_label)

        return loss

    def backward(self, dout=1):
        """Back-propagate through every loss/EmbeddingDot pair and sum the gradients w.r.t. ``h``."""
        dh = 0
        for loss_layer, embed_dot in zip(self.loss_layers, self.embed_dot_layers):
            dh += embed_dot.backward(loss_layer.backward(dout))

        return dh

In [32]:
# Embedding 계층과 negative sample 기법을 적용한 CBOW 모델
class CBOW:
    """CBOW model built from Embedding layers and a negative-sampling loss."""

    def __init__(self, vocab_size, hidden_size, window_size, corpus):
        weight_shape = (vocab_size, hidden_size)

        # Weight initialisation (input weights are drawn first, then output weights).
        W_in = 0.01 * np.random.randn(*weight_shape).astype('f')
        W_out = 0.01 * np.random.randn(*weight_shape).astype('f')

        # One Embedding layer per context position, all sharing W_in.
        self.in_layers = [Embedding(W_in) for _ in range(2 * window_size)]
        self.ns_loss = NegativeSamplingLoss(W_out, corpus, power=0.75, sample_size=5)

        # Collect every layer's parameters and gradients into flat lists.
        self.params, self.grads = [], []
        for layer in [*self.in_layers, self.ns_loss]:
            self.params.extend(layer.params)
            self.grads.extend(layer.grads)

        # Expose the input weights as the distributed word representations.
        self.word_vecs = W_in

    def forward(self, contexts, target):
        """Average the context embeddings and return the negative-sampling loss."""
        h = sum(layer.forward(contexts[:, position])
                for position, layer in enumerate(self.in_layers))
        h *= 1 / len(self.in_layers)
        return self.ns_loss.forward(h, target)

    def backward(self, dout=1):
        """Back-propagate through the loss, then through every input Embedding layer."""
        dh = self.ns_loss.backward(dout)
        # Gradient of the averaging step in forward().
        dh *= 1 / len(self.in_layers)
        for layer in self.in_layers:
            layer.backward(dh)
        return None

class Trainer:
    """Mini-batch training loop that drives a model with an optimizer.

    The model is used through ``forward(x, t)`` (returns a loss),
    ``backward()``, ``params`` and ``grads``; the optimizer through
    ``update(params, grads)``.

    NOTE(review): ``remove_duplicate`` and ``clip_grads`` (used by ``fit``) and
    ``plt`` (used by ``plot``) are not defined in the notebook cells visible
    here; they must be provided by an earlier cell or import.
    """

    def __init__(self, model, optimizer):
        self.model = model
        self.optimizer = optimizer
        self.loss_list = []        # average loss recorded at each evaluation point
        self.eval_interval = None  # set by fit(); used for the plot's x-axis label
        self.current_epoch = 0     # epochs completed over all fit() calls

    def fit(self, x, t, max_epoch=10, batch_size=32, max_grad=None, eval_interval=20):
        """Train the model on ``x`` / ``t`` for ``max_epoch`` epochs.

        :param x: input samples, indexable by an integer array along axis 0
        :param t: targets aligned with ``x``
        :param max_epoch: number of passes over the data
        :param batch_size: samples per update; the trailing
            ``len(x) % batch_size`` samples of each shuffled epoch are skipped
        :param max_grad: if not None, passed to ``clip_grads`` before each update
        :param eval_interval: report/record the average loss every this many
            iterations; None disables reporting
        """
        # Local import: the visible notebook cells never import `time` at
        # module level, so relying on a global would raise NameError.
        import time

        data_size = len(x)
        max_iters = data_size // batch_size
        self.eval_interval = eval_interval
        model, optimizer = self.model, self.optimizer
        total_loss = 0
        loss_count = 0

        start_time = time.time()
        for epoch in range(max_epoch):
            # Shuffle the samples (uses the notebook's `np` alias; the bare
            # name `numpy` is not bound at module level in the visible cells).
            idx = np.random.permutation(np.arange(data_size))
            x = x[idx]
            t = t[idx]

            for iters in range(max_iters):
                batch_x = x[iters*batch_size:(iters+1)*batch_size]
                batch_t = t[iters*batch_size:(iters+1)*batch_size]

                # Compute gradients and update the parameters.
                loss = model.forward(batch_x, batch_t)
                model.backward()
                # Merge shared weights into a single entry before updating.
                params, grads = remove_duplicate(model.params, model.grads)
                if max_grad is not None:
                    clip_grads(grads, max_grad)
                optimizer.update(params, grads)
                total_loss += loss
                loss_count += 1

                # Evaluation: report the average loss since the last report.
                if (eval_interval is not None) and (iters % eval_interval) == 0:
                    avg_loss = total_loss / loss_count
                    elapsed_time = time.time() - start_time
                    print('| 에폭 %d |  반복 %d / %d | 시간 %d[s] | 손실 %.2f'
                          % (self.current_epoch + 1, iters + 1, max_iters, elapsed_time, avg_loss))
                    self.loss_list.append(float(avg_loss))
                    total_loss, loss_count = 0, 0

            self.current_epoch += 1

    def plot(self, ylim=None):
        """Plot the recorded average losses; ``ylim`` optionally fixes the y-axis range."""
        x = np.arange(len(self.loss_list))
        if ylim is not None:
            plt.ylim(*ylim)
        plt.plot(x, self.loss_list, label='train')
        plt.xlabel('반복 (x' + str(self.eval_interval) + ')')
        plt.ylabel('손실')
        plt.show()

class Adam:
    '''
    Adam optimizer (http://arxiv.org/abs/1412.6980v8).

    Keeps one first-moment (m) and one second-moment (v) buffer per parameter
    and updates the parameters in place.
    '''
    def __init__(self, lr=0.001, beta1=0.9, beta2=0.999):
        self.lr = lr
        self.beta1 = beta1
        self.beta2 = beta2
        self.iter = 0
        self.m = None
        self.v = None

    def update(self, params, grads):
        '''Apply one Adam step to ``params`` using ``grads`` (both are lists).'''
        # Lazily create the moment buffers on the first call.
        if self.m is None:
            self.m = [np.zeros_like(param) for param in params]
            self.v = [np.zeros_like(param) for param in params]

        self.iter += 1
        # Step size with the bias correction for both moments folded in.
        second_moment_correction = np.sqrt(1.0 - self.beta2**self.iter)
        first_moment_correction = 1.0 - self.beta1**self.iter
        lr_t = self.lr * second_moment_correction / first_moment_correction

        for i, _ in enumerate(params):
            grad = grads[i]
            # Exponential moving averages of the gradient and its square.
            self.m[i] += (1 - self.beta1) * (grad - self.m[i])
            self.v[i] += (1 - self.beta2) * (grad**2 - self.v[i])

            params[i] -= lr_t * self.m[i] / (np.sqrt(self.v[i]) + 1e-7)


In [33]:
# Train the CBOW model that uses Embedding layers and negative sampling.
# Rebinding GPU to True makes UnigramSampler.get_negative_sample take its
# single-call branch (sampling with replacement), in which a row of negative
# samples may contain its own target word.
# NOTE(review): `np` is bound to NumPy in the cells visible here, not CuPy --
# confirm that enabling this flag is intended.
GPU = True
import pickle

def create_contexts_target(corpus, window_size=1):
    '''Build the context/target pairs for a corpus.

    :param corpus: corpus as a sequence of word IDs
    :param window_size: window size (with 1, the context is the one word on
        each side of the target)
    :return: tuple ``(contexts, target)`` of NumPy arrays; row ``k`` of
        ``contexts`` holds the words surrounding ``target[k]``
    '''
    target = corpus[window_size:-window_size]

    # Relative positions of the context words; 0 (the target itself) is skipped.
    offsets = [shift for shift in range(-window_size, window_size + 1) if shift != 0]
    contexts = [
        [corpus[center + shift] for shift in offsets]
        for center in range(window_size, len(corpus) - window_size)
    ]

    return np.array(contexts), np.array(target)


def to_cpu(x):
    """Return ``x`` as a NumPy array on the host.

    A NumPy array is returned unchanged (same object). Anything else is
    converted with ``cupy.asnumpy`` when CuPy can be imported, and with
    ``numpy.asarray`` otherwise.
    """
    import numpy
    if isinstance(x, numpy.ndarray):
        return x
    # NumPy itself has no `asnumpy`; that function belongs to CuPy. Fall back
    # to a plain NumPy conversion when CuPy is not installed.
    try:
        import cupy
    except ImportError:
        return numpy.asarray(x)
    return cupy.asnumpy(x)


def to_gpu(x):
    """Return ``x`` as a CuPy array; an object whose type is exactly ``cupy.ndarray`` is returned as is."""
    import cupy
    already_on_device = type(x) == cupy.ndarray
    return x if already_on_device else cupy.asarray(x)

In [37]:
import torch
from torch import nn
# Demo of back-propagating through an nn.Embedding that has max_norm set.
# n embeddings of dimension d; W has shape (m, d).
n, d, m = 3, 5, 7
# NOTE(review): max_norm is given as the bool True, which acts as the number 1;
# consider writing 1.0 explicitly.
embedding = nn.Embedding(n, d, max_norm=True)
W = torch.randn((m,d), requires_grad=True)
idx = torch.tensor([1, 2])
# Per the PyTorch nn.Embedding documentation, a forward pass with max_norm set
# modifies embedding.weight in place, so the weight is cloned BEFORE calling
# embedding(idx) to keep this expression differentiable. Do not reorder the
# next two statements.
a = embedding.weight.clone() @W.t()
b = embedding(idx) @ W.t()
# a is (n, m) and b is (len(idx), m); broadcasting yields (len(idx), n, m).
out = (a.unsqueeze(0) + b.unsqueeze(1))
# Reduce to a scalar so backward() can be called without arguments.
loss = out.sigmoid().prod()
loss.backward()

In [39]:
from torch import LongTensor, norm
from torch.nn import Embedding
# Embedding table with 10 entries of dimension 3.
embedding = nn.Embedding(num_embeddings=10, embedding_dim=3)
# Two sentences of four word IDs each; the lookup result has shape (2, 4, 3).
sentences = LongTensor([[1, 2, 4, 5], [4, 3, 2, 9]])
embedding(sentences)
# tensor([[[ 1.0141, -2.1953,  0.3030],
#          [-0.8147,  2.2799,  0.7509],
#          [-0.6491, -0.2387,  0.8809],
#          [ 0.9722,  0.6464, -0.5913]],

#         [[-0.6491, -0.2387,  0.8809],
#          [-0.3678, -1.7773, -1.3405],
#          [-0.8147,  2.2799,  0.7509],
#          [ 0.6034, -0.2334, -0.4138]]], grad_fn=<EmbeddingBackward0>)

tensor([[[ 1.0141, -2.1953,  0.3030],
         [-0.8147,  2.2799,  0.7509],
         [-0.6491, -0.2387,  0.8809],
         [ 0.9722,  0.6464, -0.5913]],

        [[-0.6491, -0.2387,  0.8809],
         [-0.3678, -1.7773, -1.3405],
         [-0.8147,  2.2799,  0.7509],
         [ 0.6034, -0.2334, -0.4138]]], grad_fn=<EmbeddingBackward0>)

In [45]:
# Two sentences of four word IDs each.
sentences = LongTensor([[1,2,4,5], [4,3,2,9]])
# Use the qualified nn.Embedding: the bare name `Embedding` is also the name of
# this notebook's NumPy layer class, so which one is bound depends on the order
# in which cells were executed.
embedding = nn.Embedding(num_embeddings=10, embedding_dim=100, max_norm=1)
# Print the norm of every looked-up word vector.
for sentence in embedding(sentences):
    for word in sentence:
        print(norm(word))
        
# tensor(1.0000, grad_fn=<CopyBackwards>)
# tensor(1.0000, grad_fn=<CopyBackwards>)
# tensor(1.0000, grad_fn=<CopyBackwards>)
# tensor(1.0000, grad_fn=<CopyBackwards>)
# tensor(1.0000, grad_fn=<CopyBackwards>)
# tensor(1.0000, grad_fn=<CopyBackwards>)
# tensor(1.0000, grad_fn=<CopyBackwards>)
# tensor(1.0000, grad_fn=<CopyBackwards>)

tensor(1.0000, grad_fn=<CopyBackwards>)
tensor(1.0000, grad_fn=<CopyBackwards>)
tensor(1.0000, grad_fn=<CopyBackwards>)
tensor(1.0000, grad_fn=<CopyBackwards>)
tensor(1.0000, grad_fn=<CopyBackwards>)
tensor(1.0000, grad_fn=<CopyBackwards>)
tensor(1.0000, grad_fn=<CopyBackwards>)
tensor(1.0000, grad_fn=<CopyBackwards>)
