In [1]:
%pylab inline

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


Populating the interactive namespace from numpy and matplotlib


# LSTM

nn.LSTM
 - input_size: x의 features 개수
 - hidden_size:  hidden state의 크기
 - num_layers: default=1 이며 stacking LSTM 사용시 늘려준다
 - bias: default=True 이며, False일 경우 bias weights b_ih, b_hh 사용 안함
 - batch_first: default=False이며, True이면 (batch, seq, feature) 순으로 input, output 제공됨
 - dropout: 마지막 layer를 제외한 각 LSTM layer의 output에 dropout이 적용됨. default=0 (확률)
 - bidirectional: default=False

Input Arguments
 - input: (seq_len, batch, input_size)
 - h_0: (n_layers * n_directions, batch, hidden_size)
 - c_0: (n_layers * n_directions, batch, hidden_size)

## Inference one step at a time

In [81]:
# Demo dimensions: 32 sequences per batch, 7 input features, 5 hidden units.
torch.manual_seed(1)

batch_size = 32
input_size = 7
hidden_size = 5

# Build the LSTM first, then draw the starting states: h_0 from a uniform
# distribution and c_0 from a normal one, each shaped (1, batch, hidden).
lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size)
h_0 = torch.rand(1, batch_size, hidden_size)
c_0 = torch.randn(1, batch_size, hidden_size)

# Nine time steps, each stored as its own (batch, input_size) tensor.
inputs = [
    torch.randn(batch_size, input_size)
    for _ in range(9)
]

# Feed the sequence one time step at a time, threading the returned
# (h, c) state into the next call; each step is given a leading seq_len=1 axis.
for step_input in inputs:
    output, (h_0, c_0) = lstm(step_input.unsqueeze(dim=0), (h_0, c_0))
    print(output.shape, output.mean())

torch.Size([1, 32, 5]) tensor(0.0698, grad_fn=<MeanBackward0>)
torch.Size([1, 32, 5]) tensor(0.0867, grad_fn=<MeanBackward0>)
torch.Size([1, 32, 5]) tensor(0.0916, grad_fn=<MeanBackward0>)
torch.Size([1, 32, 5]) tensor(0.0923, grad_fn=<MeanBackward0>)
torch.Size([1, 32, 5]) tensor(0.0804, grad_fn=<MeanBackward0>)
torch.Size([1, 32, 5]) tensor(0.0788, grad_fn=<MeanBackward0>)
torch.Size([1, 32, 5]) tensor(0.0980, grad_fn=<MeanBackward0>)
torch.Size([1, 32, 5]) tensor(0.1026, grad_fn=<MeanBackward0>)
torch.Size([1, 32, 5]) tensor(0.0984, grad_fn=<MeanBackward0>)


## Inference on the entire sequence at once

In [83]:
# Stack the per-step (batch, input_size) tensors along a new leading axis to
# get a single (seq_len, batch, input_size) input.
inputs2 = torch.stack(inputs, dim=0)

# NOTE(review): (h_0, c_0) are whatever the step-by-step loop left behind, so
# this call continues from that state rather than from the original initial
# state, and it overwrites h_0 / c_0 again.
output, (h_0, c_0) = lstm(inputs2, (h_0, c_0))

# `output` holds one entry per time step; the returned states hold only the last.
print('output:', output.shape)
print('h_0:', h_0.shape)
print('c_0:', c_0.shape)

output: torch.Size([9, 32, 5])
h_0: torch.Size([1, 32, 5])
c_0: torch.Size([1, 32, 5])


In [370]:
# Show the POS-tag sequences (one list of tags per training sentence).
# NOTE(review): in the visible notebook y_train is only assigned in a later
# cell, so this cell depends on out-of-order execution and would fail on a
# fresh top-to-bottom run.
y_train

[['NNG', 'JKS', 'NP+JKG', 'NNG', 'JKO', 'VV', 'EP', 'EC'],
 ['MAG', 'VV', 'ETM', 'NNG', 'JX', 'NNG', 'NNG', 'NNP', 'NNG'],
 ['NNG', 'JKS', 'NP+JKG', 'NNG', 'NNP', 'NNG', 'NNG', 'JKO', 'VV', 'EP', 'EC']]

In [330]:
# Mecab tagger from KoNLPy; its pos() results are indexed below as
# (token, tag) pairs to build the training tokens and labels.
from konlpy.tag import Mecab
mecab = Mecab()

def prepare_sequence(seq, to_ix):
    """Map each item of ``seq`` to its integer id and return a 1-D LongTensor.

    Args:
        seq: iterable of hashable items (e.g. tokens or tags).
        to_ix: mapping from item to integer index.

    Returns:
        ``torch.long`` tensor with one index per item, in input order.

    Raises:
        KeyError: if an item is missing from ``to_ix``.
    """
    indices = []
    for item in seq:
        indices.append(to_ix[item])
    return torch.tensor(indices, dtype=torch.long)


# Toy corpus: three short Korean sentences.
training_data = [
    '강아지가 내 사과를 먹었다',
    '어제 먹은 사과는 완전 핵존맛',
    '강아지가 내 핵존맛 저녁을 먹었다',
]

x_train = []     # per-sentence token lists
y_train = []     # per-sentence tag lists, aligned with x_train
word_to_ix = {}  # token -> id, numbered in order of first appearance
tag_to_ix = {}   # tag -> id, numbered in order of first appearance
for text in training_data:
    tagged = mecab.pos(text)
    tokens = [pair[0] for pair in tagged]
    labels = [pair[1] for pair in tagged]
    x_train.append(tokens)
    y_train.append(labels)

    # setdefault only inserts unseen keys, so each new entry receives the
    # current dictionary size as its id.
    for token in tokens:
        word_to_ix.setdefault(token, len(word_to_ix))
    for label in labels:
        tag_to_ix.setdefault(label, len(tag_to_ix))

# Reverse lookup: id -> tag.
ix_to_tag = dict((index, tag) for tag, index in tag_to_ix.items())

# The embedding width is set equal to the vocabulary size.
EMBEDDING_DIM = len(word_to_ix)
# NOTE(review): the model-construction cells visible in this notebook pass
# hidden_dim=25 directly and do not reference HIDDEN_DIM.
HIDDEN_DIM = 15

In [368]:
class LSTMTagger(nn.Module):
    """Sequence tagger: embedding -> single-layer LSTM -> linear -> log-softmax.

    The LSTM state lives on the instance as ``self.hidden`` and is carried from
    one ``forward`` call to the next; callers wanting a fresh start assign
    ``model.hidden = model.init_hidden()`` beforehand.

    NOTE(review): ``forward`` also runs an attention-style loop through a GRU
    (``self.decoder``), but its result is only kept in ``self.hidden2``; the
    returned tag scores are computed from the LSTM outputs alone.
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        """Create the sub-modules and the initial (zero) LSTM state.

        Args:
            embedding_dim: width of each word embedding (LSTM input size).
            hidden_dim: width of the LSTM and GRU hidden states.
            vocab_size: number of rows in the embedding table.
            tagset_size: number of output tags.
        """
        super().__init__()
        self.hidden_dim = hidden_dim

        # Word index -> dense vector.
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # Encoder LSTM over the embeddings, followed by the GRU used by the
        # attention loop in forward().
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=False)
        self.decoder = nn.GRU(hidden_dim, hidden_dim)

        # Projection from hidden-state space to tag space.
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

        self.hidden = self.init_hidden()
        self.hidden2 = None

    def init_hidden(self):
        """Return a zeroed ``(h, c)`` pair, each of shape ``(1, 1, hidden_dim)``.

        The axes are (num_layers, minibatch_size, hidden_dim).
        """
        state_shape = (1, 1, self.hidden_dim)
        return (torch.zeros(*state_shape), torch.zeros(*state_shape))

    def forward(self, sentence):
        """Score every tag for every position of ``sentence``.

        Args:
            sentence: tensor of word indices; its first axis is walked as the
                time axis (callers in this notebook pass a 1-D LongTensor).

        Returns:
            Log-probabilities over tags, one row per time step.
        """
        embeds = self.word_embeddings(sentence)

        # Encoder: one LSTM call per time step, updating self.hidden each time.
        step_outputs = []
        for word_vector in embeds:
            lstm_out, self.hidden = self.lstm(word_vector.view(1, 1, -1), self.hidden)
            step_outputs.append(lstm_out)
        encode_output = torch.cat(step_outputs)

        # Attention-style loop seeded with the last encoder output: weight the
        # encoder outputs by their dot product with the current decoder
        # output, add the weighted sum back in, and run one GRU step.
        decode_output = lstm_out
        self.hidden2 = lstm_out
        for _ in range(len(step_outputs)):
            similarity = encode_output @ decode_output.view(-1)
            weights = torch.softmax(similarity.squeeze(-1), dim=0)
            context = (encode_output * weights.view(-1, 1, 1)).sum(0)
            decode_output = context.unsqueeze(0) + decode_output
            decode_output, self.hidden2 = self.decoder(decode_output, self.hidden2)

        # Tag scores come from the encoder outputs (batch axis of size 1 dropped).
        tag_space = self.hidden2tag(encode_output.squeeze(1))
        return F.log_softmax(tag_space, dim=1)

# Smoke test: build a tagger and push the first training sentence through it
# with gradient tracking disabled for the whole block.
with torch.no_grad():
    model = LSTMTagger(
        EMBEDDING_DIM,
        hidden_dim=25,
        vocab_size=len(word_to_ix),
        tagset_size=len(tag_to_ix),
    )
    inputs = prepare_sequence(x_train[0], word_to_ix)
    tag_scores = model(inputs)

In [372]:
# Train the tagger on the toy corpus and compare predictions with the labels.
torch.manual_seed(1)

model = LSTMTagger(EMBEDDING_DIM, hidden_dim=25, vocab_size=len(word_to_ix), tagset_size=len(tag_to_ix))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

# Predictions before training. Row i of tag_scores holds the log-probabilities
# of every tag for word i, so the predicted tag is the argmax along the last
# axis. (Applying softmax over dim=0 first would normalise across words and
# can change which tag wins.) No gradients are needed here.
with torch.no_grad():
    model.hidden = model.init_hidden()
    inputs = prepare_sequence(x_train[0], word_to_ix)
    tag_scores = model(inputs)
    print('before training:', tag_scores.argmax(dim=-1))

for epoch in range(300):  # far more epochs than usual; this is toy data
    for sentence, tags in zip(x_train, y_train):
        # Step 1. PyTorch accumulates gradients, so clear them before each
        # training instance.
        model.zero_grad()

        # Reset the LSTM state so this sentence starts from zeros and is
        # detached from the previous instance's history.
        model.hidden = model.init_hidden()

        # Step 2. Turn tokens and tags into tensors of indices.
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = prepare_sequence(tags, tag_to_ix)

        # Step 3. Forward pass.
        tag_scores = model(sentence_in)

        # Step 4. Loss, gradients, parameter update.
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()

# Predictions after training, for every training sentence.
with torch.no_grad():
    for i in range(len(x_train)):
        # Start each sentence from a zero state, exactly as during training;
        # otherwise the state left by the previous forward pass leaks in.
        model.hidden = model.init_hidden()
        inputs = prepare_sequence(x_train[i], word_to_ix)
        tag_scores = model(inputs)

        # The predicted tag for each word is the highest-scoring column of its
        # row; it should match the index sequence printed as y_true.
        print('after training:', tag_scores.argmax(dim=-1))
        print('y_true        :', [tag_to_ix[tag] for tag in y_train[i]])
        print()
    

before training: tensor([4, 5, 8, 1, 6, 0, 0, 7])
after training: tensor([0, 1, 2, 0, 3, 4, 5, 6])
y_true        : [0, 1, 2, 0, 3, 4, 5, 6]

after training: tensor([ 7,  4,  8,  0,  9,  0,  0, 10,  0])
y_true        : [7, 4, 8, 0, 9, 0, 0, 10, 0]

after training: tensor([ 0,  1,  2,  0, 10,  0,  0,  3,  4,  5,  6])
y_true        : [0, 1, 2, 0, 10, 0, 0, 3, 4, 5, 6]

