In [1]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

<torch._C.Generator at 0x7f63dd526630>

# Sequence Models and LSTM 

시퀀스 모델은 NLP의 중심이다. 이러한 모델들은 인풋들 간의 시간(step) 의존성을 가진다. 이러한 시퀀스 모델의 고전적인 예로는 POS 태깅을 위한 Hidden Markov Model, 혹은 Conditional Random Field이다.

## LSTM's in Pytorch 

예제를 시작하기 전, 몇 가지를 상기하자. Pytorch의 LSTM은 3D 텐서의 인풋을 기대한다. 텐서의 각 축의 의미는 중요하다.<br>
1. 첫 번째 차원은 시퀀스 time-step
2. 두 번째 차원은 미니-배치에서의 인덱스
3. 세 번째 차원은 토큰 인덱스 in vocab


일단 우리는 미니배치를 고려하지 않기 때문에, 2번째 차원은 항상 1로 간주하자. 만약 우리가 "The cow jumped"라는 문장을 사용하고 싶다면 우리의 인풋은 이렇게 생겼을 것이다.  $$ 
\begin{bmatrix}
\overbrace{q_\text{The}}^\text{row vector} \\
q_\text{cow} \\
q_\text{jumped}
\end{bmatrix}
$$ 


In [4]:
autograd.Variable(torch.randn(1,1,3)).size()

torch.Size([1, 1, 3])

In [58]:
inputs = [ autograd.Variable(torch.randn((1,3))) for _ in range(5) ] 

In [59]:
inputs

[Variable containing:
  0.9610  0.3508 -0.1519
 [torch.FloatTensor of size 1x3], Variable containing:
  0.5372 -1.2869  1.6373
 [torch.FloatTensor of size 1x3], Variable containing:
  1.4175 -0.4246 -0.6304
 [torch.FloatTensor of size 1x3], Variable containing:
  0.0919 -0.2338  1.3037
 [torch.FloatTensor of size 1x3], Variable containing:
 -1.0965 -0.0207 -1.4612
 [torch.FloatTensor of size 1x3]]

In [3]:
inputs[0].size()

torch.Size([1, 3])

In [6]:
inputs[0].view(1,1,-1).size()

torch.Size([1, 1, 3])

In [57]:
lstm = nn.LSTM(3, 3) # Input dim is 3, output dim is 3
inputs = [ autograd.Variable(torch.randn((1,3))) for _ in range(5) ] # make a sequence of length 5 (5,1,3) 의 텐서??

# 히든 스테이트 초기화 뭐지?!! 
# (h0, c0) hidden state, cell state 
hidden = (autograd.Variable(torch.randn(1,1,3)), autograd.Variable(torch.randn((1,1,3))))
for i in inputs:
    # 요렇게 스텝바이스텝 넘겨주는 방법도 있음
    # Step through the sequence one element at a time.
    # after each step, hidden contains the hidden state.
    out, hidden = lstm(i.view(1,1,-1), hidden)

# 다른 방법으로, 전체 시퀀스를 한방에 넘길 수도 있다
# out은 시퀀스 동안의 모든 hidden states이다.
# hidden은 가장 최근 즉, out[-1]과 같음
# out을 통해 모든 히든스테이트에 접근할 수 있고
# hidden을 이용해서 추후, backprop을 할 수 있을 것.

inputs = torch.cat(inputs).view(len(inputs), 1, -1) # Add the extra 2nd dimension (5,3) -> (5,1,3)
hidden = (autograd.Variable(torch.randn(1,1,3)), autograd.Variable(torch.randn((1,1,3)))) # clean out hidden state
out, hidden = lstm(inputs, hidden)
print(out.size()[0])
print(hidden)

5
(Variable containing:
(0 ,.,.) = 
  0.6223 -0.0418 -0.1876
[torch.FloatTensor of size 1x1x3]
, Variable containing:
(0 ,.,.) = 
  1.1719 -0.3549 -0.3968
[torch.FloatTensor of size 1x1x3]
)


## Example: An LSTM for POS tagging 

이번에 우리는 POS 태깅을 위해 LSTM을 사용할 것이다. Viterbi나 Forward-Backward 등 그 어떤 것도 사용하지 않을 것이다. 하지만 독자들은 추후, Viterbi를 어떻게 사용할 수 있을지 생각해보아라.

The model is as follows: let our input sentence be $w_1, \dots, w_M$, where $w_i \in V$, our vocab. Also, let $T$ be our tag set, and $y_i$ the tag of word $w_i$. Denote our prediction of the tag of word $w_i$ by $\hat{y}_i$.
This is a structure prediction, model, where our output is a sequence $\hat{y}_1, \dots, \hat{y}_M$, where $\hat{y}_i \in T$.
To do the prediction, pass an LSTM over the sentence. Denote the hidden state at timestep $i$ as $h_i$. Also, assign each tag a unique index (like how we had word_to_ix in the word embeddings section). Then our prediction rule for $\hat{y}_i$ is $$ \hat{y}_i = \text{argmax}_j \  (\log \text{Softmax}(Ah_i + b))_j $$ That is, take the log softmax of the affine map of the hidden state, and the predicted tag is the tag that has the maximum value in this vector. Note this implies immediately that the dimensionality of the target space of $A$ is $|T|$.

### 1. 데이터 준비 

In [2]:
def prepare_sequence(seq, to_ix):
    idxs = list(map(lambda w: to_ix[w], seq))
    tensor = torch.LongTensor(idxs)
    return autograd.Variable(tensor)

In [7]:
training_data = [
    ("The dog ate the apple".split(), ["DET", "NN", "V", "DET", "NN"]),
    ("Everybody read that book".split(), ["NN", "V", "DET", "NN"])
]
word_to_ix = {}
for sent, tags in training_data:
    for word in sent:
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)
print(word_to_ix)
tag_to_ix = {"DET": 0, "NN": 1, "V": 2}

# These will usually be more like 32 or 64 dimensional.
# We will keep them small, so we can see how the weights change as we train.
EMBEDDING_DIM = 6
HIDDEN_DIM = 6

{'the': 3, 'dog': 1, 'ate': 2, 'apple': 4, 'that': 7, 'read': 6, 'The': 0, 'book': 8, 'Everybody': 5}


### 2. 모델링

In [13]:
class LSTMTagger(nn.Module):
    
    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim
        
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        
        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        # 만약 배치로 할거면, batch_first = True  옵션 줄 것.
        # bidirectional = True 로 두면 bi-RNN 
        # num_layers 로 쌓을 수도
        
        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        self.hidden = self.init_hidden()
        
    def init_hidden(self):
        # 초기 state 필요!
        # Before we've done anything, we dont have any hidden state.
        # Refer to the Pytorch documentation to see exactly why they have this dimensionality.
        # The axes semantics are (num_layers, minibatch_size, hidden_dim)
        return (autograd.Variable(torch.zeros(1, 1, self.hidden_dim)), # 히든 스테이트
                autograd.Variable(torch.zeros(1, 1, self.hidden_dim)))  # 셀 스테이트 
        
    def forward(self, sentence):
        embeds = self.word_embeddings(sentence) # 5x6
        lstm_out, self.hidden = self.lstm(embeds.view(len(sentence), 1, -1), self.hidden) # 5x1x6
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1)) # 5x6 => 5x3
        tag_scores = F.log_softmax(tag_space)
        return tag_scores

### 3. 트레이닝 

In [24]:
model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix))
loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

In [15]:
training_data[0]

(['The', 'dog', 'ate', 'the', 'apple'], ['DET', 'NN', 'V', 'DET', 'NN'])

In [32]:
prepare_sequence(train_data[0][0],word_to_ix)[0]

Variable containing:
 0
[torch.LongTensor of size 1]

In [25]:
for epoch in range(100): # again, normally you would NOT do 300 epochs, it is toy data
    for sentence, tags in training_data:
        # Step 1. Remember that Pytorch accumulates gradients.  We need to clear them out
        # before each instance
        model.zero_grad()
        
        # 히든 스테이트 초기화
        model.hidden = model.init_hidden()
    
        # Step 2. Get our inputs ready for the network, that is, turn them into Variables
        # of word indices.
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = prepare_sequence(tags, tag_to_ix)
    
        # Step 3. Run our forward pass.
        tag_scores = model(sentence_in)
    
        # Step 4. Compute the loss, gradients, and update the parameters by calling
        # optimizer.step()
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()

### Exercise: Augmenting the LSTM pos tagger with character-level features 

In the example above, each word had an embedding, which served as the inputs to our sequence model. Let's augment the word embeddings with a representation derived from the characters of the word. We expect that this should help significantly, since character-level information like affixes have a large bearing on part-of-speech. For example, words with the affix -ly are almost always tagged as adverbs in English.

Do do this, let $c_w$ be the character-level representation of word $w$. Let $x_w$ be the word embedding as before. Then the input to our sequence model is the concatenation of $x_w$ and $c_w$. So if $x_w$ has dimension 5, and $c_w$ dimension 3, then our LSTM should accept an input of dimension 8.

1. LSTM 모델이 2개 필요함. 하나는 POS tag scores를 계산하는 모델, 다른 하나는 각 단어의 character-level representation을 산출하는 모델
2. character 모델을 실행하기 위해, characters를 임베딩해야 한다. character embeddings은 character LSTM의 인풋으로 들어갈 것

### 1. 데이터 준비

nltk의 brown 코퍼스에서  NUM_OF_DATA만큼의 문장을 가져온다. [[단어,단어,....],[POS,POS,...]]

In [3]:
import nltk
import random

In [4]:
flatten = lambda l: [item for sublist in l for item in sublist]

In [5]:
data = nltk.corpus.brown.tagged_words(tagset='universal')

In [None]:
NUM_OF_DATA=100

In [6]:
data = data[:NUM_OF_DATA]

In [7]:
train_data = []
words=[]
tags=[]
vocab = []
pos=[]
i = 0
for w,p in data:
    i+=1
    words.append(w)
    tags.append(p)
    
    if w not in vocab:
        vocab.append(w)
    
    if p not in pos:
        pos.append(p)
    
    if p == '.' or i==NUM_OF_DATA:
        train_data.append((words,tags))
        words=[]
        tags=[]

In [8]:
word_to_ix = {v:i for i,v in enumerate(vocab)}
tag_to_ix = {v:i for i,v in enumerate(pos)}

캐릭터 레벨의 dictionary도 준비한다

In [49]:
char_vocab = list("""$%'`()+,-./0123456789:;=?ABCDEFGHIJKLMNOPQRSTUVWXYZ\\
^_abcdefghijklmnopqrstuvwxyz{|}""")
char_to_ix = {v:i for i,v in enumerate(char_vocab)}
ix_to_char = {v:k for k,v in char_to_ix.items()}

In [126]:
def word2input(word):
    chars = list(word)
    chars_vec = list(map(lambda x:char_to_ix[x],chars))
    return autograd.Variable(torch.LongTensor(chars_vec))

### 2. 모델링 : CHARLSTM  in LSTMTagger 

각 단어를 character representation하여 CHAR_EMBEDDING 차원으로 만들고, 이를 기존의 word representation인 EMBEDDING_DIM 차원과 concat해서 Prediction하고 이를 기반으로 최적화한다

In [10]:
CHAR_EMBEDDING = 10
CHAR_HIDDEN = 10

In [127]:
class CHARLSTM(nn.Module):
    
    def __init__(self,char_size,embedding_dim,hidden_dim):
        super(CHARLSTM,self).__init__()
        self.embedding = nn.Embedding(char_size,embedding_dim)
        self.hidden_dim = hidden_dim
        self.lstm = nn.LSTM(embedding_dim,self.hidden_dim)
        self.hidden = self.init_hidden()
        
    def init_hidden(self):
        return (autograd.Variable(torch.zeros(1, 1, self.hidden_dim)), # 히든 스테이트
                autograd.Variable(torch.zeros(1, 1, self.hidden_dim))) 
    
    def forward(self,word):
        embeds = self.embedding(word)
        output, self.hidden = self.lstm(embeds.view(len(word),1,-1), self.hidden)
        
        return self.hidden[0].view(1,self.hidden_dim)

In [138]:
test = random.choice(train_data)[0]
test = random.choice(test)
input_ = word2input(test)
print(test)
input_

said


Variable containing:
 72
 54
 62
 57
[torch.LongTensor of size 4]

In [129]:
EMBEDDING_DIM = 15
HIDDEN_DIM = 15

In [130]:
class LSTMTagger(nn.Module):
    
    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size,char_size,char_embed_dim,char_hidden_dim):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim
        
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.char_model = CHARLSTM(char_size,char_embed_dim,char_hidden_dim)
        self.char_model.init_hidden()
        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        # 만약 배치로 할거면, batch_first = True  옵션 줄 것.
        # bidirectional = True 로 두면 bi-RNN 
        # num_layers 로 쌓을 수도
        
        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim+char_hidden_dim, tagset_size)
        self.hidden = self.init_hidden()
        
    def init_hidden(self):
        # 초기 state 필요!
        # Before we've done anything, we dont have any hidden state.
        # Refer to the Pytorch documentation to see exactly why they have this dimensionality.
        # The axes semantics are (num_layers, minibatch_size, hidden_dim)
        return (autograd.Variable(torch.zeros(1, 1, self.hidden_dim)), # 히든 스테이트
                autograd.Variable(torch.zeros(1, 1, self.hidden_dim)))  # 셀 스테이트 
        
    def forward(self, sentence,chars_embed):
        embeds = self.word_embeddings(sentence) # n x 15
        lstm_out, self.hidden = self.lstm(embeds.view(len(sentence), 1, -1), self.hidden) # n x 1 x 15
        lstm_out = lstm_out.view(len(sentence), -1)
        
        lstm_cat_out=[]
        
        
        # 각 단어의 캐릭터 레벨의 representation을 가져와서 concat하고 최종 score를 계산한다!!
        for i in range(len(sentence)):
            self.char_model.init_hidden()
            lstm_cat_out.append(torch.cat([lstm_out[i].view(1,self.hidden_dim),self.char_model(chars_embed[i])],1))
        input_ = torch.cat(lstm_cat_out)
        tag_space = self.hidden2tag(input_) # 
        tag_scores = F.log_softmax(tag_space)
        return tag_scores

### 3. 트레이닝, 시간이 넘 오래걸림.. 잘못짠거 아님?ㅠ

In [136]:
model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix),len(char_vocab),CHAR_EMBEDDING,CHAR_HIDDEN)
loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

In [140]:
%%time
losses=[]
for epoch in range(10): # again, normally you would NOT do 300 epochs, it is toy data
    
    if epoch%10==0: 
        
        print(epoch)
    
    for sentence, tags in train_data:
        # Step 1. Remember that Pytorch accumulates gradients.  We need to clear them out
        # before each instance
        model.zero_grad()
        # 히든 스테이트 초기화
        model.hidden = model.init_hidden()

        sentence_in = prepare_sequence(sentence,word_to_ix)
        words_in = [word2input(w) for w in sentence]
        targets = prepare_sequence(tags, tag_to_ix)
    
        # Step 3. Run our forward pass.
       
        tag_scores = model(sentence_in,words_in)
    
        # Step 4. Compute the loss, gradients, and update the parameters by calling
        # optimizer.step()
        loss = loss_function(tag_scores, targets)
        losses.append(loss)
        loss.backward(retain_variables=True)
        optimizer.step()

0


In [199]:
print(losses[0],losses[-1])

Variable containing:
 0.5504
[torch.FloatTensor of size 1]
 Variable containing:
 0.3544
[torch.FloatTensor of size 1]



#### Variables.backward(retrain_variables=True)

retain_variables (bool) – If True, buffers necessary for computing gradients won’t be freed after use. It is only necessary to specify True if you want to differentiate some subgraph multiple times (in some cases it will be much more efficient to use autograd.backward).

요약하자면 모델 안에 서브모델을 여러번 미분하고싶을 때 True

지금 여기에도 LSTM tagger 안에 CharLSTM이 있기 땜에 일케 하는게 맞는거 같은데 과연 ..?

### 4. 여튼 테스트 

In [193]:
ix_to_tag = {v:k for k,v in tag_to_ix.items()}  # 인덱스를 다시 태그로 돌리는 딕

### 디코딩 

In [197]:
test = random.choice(train_data)
print(' '.join(test[0])+'\n')

sentence_in = prepare_sequence(test[0],word_to_ix)
words_in = [word2input(w) for w in test[0]]
tag_scores = model(sentence_in,words_in)
v,i = torch.max(tag_scores,1)
for t in range(i.size()[0]):
    print(test[1][t], ' : ', ix_to_tag[i.data.numpy()[t][0]])

The Fulton County Grand Jury said Friday an investigation of Atlanta's recent primary election produced ``

DET  :  DET
NOUN  :  NOUN
NOUN  :  NOUN
ADJ  :  ADP
NOUN  :  NOUN
VERB  :  VERB
NOUN  :  NOUN
DET  :  NOUN
NOUN  :  NOUN
ADP  :  ADP
NOUN  :  NOUN
ADJ  :  NOUN
NOUN  :  NOUN
NOUN  :  NOUN
VERB  :  VERB
.  :  .


오... 되긴하는구먼! 과적합인가? 음음