In [2]:
import torch
import torch.nn as nn
import torch.optim as optim

In [24]:
def make_batch(corpus=None, vocab=None):
    """Build next-word-prediction training pairs from space-separated sentences.

    For every sentence, all words except the last become the model input and
    the last word becomes the prediction target (a causal language-model setup).

    Args:
        corpus: iterable of sentence strings. When omitted, the notebook-level
            `sentences` is used.
        vocab: mapping from word to integer index. When omitted, the
            notebook-level `word_dict` is used.

    Returns:
        A tuple `(input_batch, target_batch)`. `input_batch` holds one list of
        word indices per sentence (every word but the last); `target_batch`
        holds the index of each sentence's last word.

    Raises:
        KeyError: if a word is not present in the vocabulary.
        IndexError: if a sentence contains no words.
    """
    # Fall back to the notebook globals so the zero-argument call keeps working.
    if corpus is None:
        corpus = sentences
    if vocab is None:
        vocab = word_dict

    input_batch = []   # indices of every word before the last one
    target_batch = []  # index of the last word

    for sen in corpus:
        words = sen.split()  # whitespace tokenizer
        # Named `context_ids` rather than `input` to avoid shadowing the builtin.
        context_ids = [vocab[w] for w in words[:-1]]  # words 1..n-1 are the input
        target_id = vocab[words[-1]]                  # word n is the target

        input_batch.append(context_ids)
        target_batch.append(target_id)

    return input_batch, target_batch
# input_batch: [[6, 2], [6, 0], [6, 4]]
# target_batch: [5, 1, 3]

In [44]:
# Model
class NNLM(nn.Module):
    """Feed-forward neural language model over a fixed-size word context.

    The forward pass computes ``b + W x + U tanh(d + H x)``, where ``x`` is the
    concatenation of the embeddings of the context words.

    Args:
        vocab_size: number of rows in the embedding table and number of output
            classes. Defaults to the notebook-level `n_class`.
        embed_dim: size of each embedding vector. Defaults to the
            notebook-level `m`.
        context_size: number of context words per example. Defaults to the
            notebook-level `n_step`.
        hidden_size: width of the tanh hidden layer. Defaults to the
            notebook-level `n_hidden`.
    """

    def __init__(self, vocab_size=None, embed_dim=None, context_size=None, hidden_size=None):
        super().__init__()  # initialise nn.Module before registering submodules

        # Fall back to the notebook globals so a bare `NNLM()` keeps working.
        vocab_size = n_class if vocab_size is None else vocab_size
        embed_dim = m if embed_dim is None else embed_dim
        context_size = n_step if context_size is None else context_size
        hidden_size = n_hidden if hidden_size is None else hidden_size

        # Width of one flattened context. Stored on the instance so forward()
        # uses the dimensions the layers were built with, even if the notebook
        # globals are rebound afterwards.
        self.flat_dim = context_size * embed_dim

        self.C = nn.Embedding(vocab_size, embed_dim)                 # word index -> embedding vector
        self.H = nn.Linear(self.flat_dim, hidden_size, bias=False)   # context -> hidden
        self.d = nn.Parameter(torch.ones(hidden_size))               # hidden bias, initialised to ones
        self.U = nn.Linear(hidden_size, vocab_size, bias=False)      # hidden -> output scores
        self.W = nn.Linear(self.flat_dim, vocab_size, bias=False)    # direct context -> output scores
        self.b = nn.Parameter(torch.ones(vocab_size))                # output bias, initialised to ones

    def forward(self, X):
        """Return one row of per-word scores for each context in the batch.

        Args:
            X: integer tensor of word indices, one row per example and one
                column per context word, e.g. ``tensor([[6, 2], [6, 0], [6, 4]])``.

        Returns:
            Tensor of shape [batch_size, vocab_size].
        """
        embedded = self.C(X)                         # [batch_size, context_size, embed_dim]
        flat = embedded.view(-1, self.flat_dim)      # [batch_size, context_size * embed_dim]
        hidden = torch.tanh(self.d + self.H(flat))   # [batch_size, hidden_size]
        output = self.b + self.W(flat) + self.U(hidden)  # [batch_size, vocab_size]
        return output

In [45]:
# Forward pass over the whole batch: one row of n_class scores per sentence.
# NOTE(review): read top to bottom, `model` and `input_batch` are only created in
# later cells (`model = NNLM()` and the make_batch() cell), so this cell fails on a
# fresh "Restart & Run All" -- it should be moved below them.
model(input_batch)

tensor([[ 0.3247, -1.0386,  1.7355,  0.7373,  2.5494,  2.0638,  0.6364],
        [ 0.5098, -1.2549,  2.0247,  0.6980,  2.6067,  1.8777,  0.6331],
        [-0.3549,  0.3776,  0.4636,  0.6714,  1.6076,  1.6843,  0.4200]],
       grad_fn=<AddBackward0>)

In [46]:
n_step = 2 # number of context words fed to the model (n-1 in the paper)
n_hidden = 2 # size of the hidden layer (h in the paper)
m = 2 # size of each embedding vector (m in the paper)

# Toy corpus. make_batch() uses every word but the last as input and the last word
# as the target, so each sentence is expected to have n_step + 1 words.
sentences = ["i like dog", "i love coffee", "i hate milk"]

In [48]:
# Flatten the corpus into a single list of whitespace-separated tokens (duplicates kept).
word_list = [token for sen in sentences for token in sen.split()]

['i', 'like', 'dog', 'i', 'love', 'coffee', 'i', 'hate', 'milk']

In [4]:
# Deduplicate the tokens to obtain the vocabulary.
# NOTE(review): a set has no defined order, so the index each word receives in the
# following cells can differ between kernel sessions; the concrete indices shown in
# this notebook's comments and outputs come from one particular run.
word_list = list(set(word_list))

['love', 'coffee', 'like', 'milk', 'hate', 'dog', 'i']

In [5]:
# word -> index lookup: each vocabulary word maps to its position in word_list.
word_dict = dict(zip(word_list, range(len(word_list))))

{'love': 0, 'coffee': 1, 'like': 2, 'milk': 3, 'hate': 4, 'dog': 5, 'i': 6}

In [6]:
# index -> word lookup, the inverse of word_dict; used to decode predictions.
number_dict = dict(enumerate(word_list))

{0: 'love', 1: 'coffee', 2: 'like', 3: 'milk', 4: 'hate', 5: 'dog', 6: 'i'}

In [10]:
n_class = len(word_dict)  # vocabulary size: rows of the embedding table and width of the model output

In [60]:
# Loss: cross-entropy between the model's scores and the target word index.
criterion = nn.CrossEntropyLoss()

# Build the network and give its parameters to Adam.
# Adam's first argument is the parameters to optimise; parameters() walks every
# submodule of the model and returns all of them.
model = NNLM()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [61]:
# make_batch() returns two plain Python lists, e.g. (indices depend on the vocabulary order)
#   input_batch:  [[6, 2], [6, 0], [6, 4]]
#   target_batch: [5, 1, 3]
# Both are converted to LongTensor (64-bit signed integers).
input_batch, target_batch = map(torch.LongTensor, make_batch())

In [62]:
# Training
for epoch in range(5000):
    # backward() accumulates gradients, so clear them at the start of every step
    optimizer.zero_grad()

    # output: [batch_size, n_class], target_batch: [batch_size]
    output = model(input_batch)
    loss = criterion(output, target_batch)

    loss.backward()
    optimizer.step()  # perform a single optimisation step (parameter update)

    # report the loss computed before this update, once every 1000 epochs
    if not (epoch + 1) % 1000:
        print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))

Epoch: 1000 cost = 0.046850
Epoch: 2000 cost = 0.009255
Epoch: 3000 cost = 0.003518
Epoch: 4000 cost = 0.001652
Epoch: 5000 cost = 0.000854


In [63]:
# Predict
# Index of the highest-scoring word for each sentence, kept as a [batch_size, 1] column.
predict = torch.max(model(input_batch).data, dim=1, keepdim=True).indices
predict

tensor([[5],
        [1],
        [3]])

In [64]:
# Test
# Show each context next to the word the model predicts for it.
# The context is "every word but the last", mirroring make_batch(), instead of a
# hard-coded 2. squeeze(1) removes only the keepdim column of `predict`, so a
# single-sentence batch still yields an iterable 1-D tensor.
print([sen.split()[:-1] for sen in sentences], '->', [number_dict[n.item()] for n in predict.squeeze(1)])

[['i', 'like'], ['i', 'love'], ['i', 'hate']] -> ['dog', 'coffee', 'milk']
