In [135]:
from collections import Counter

In [1]:
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import json
import pickle
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence

torch.manual_seed(1)

<torch._C.Generator at 0x7efea40e0558>

# 데이터 

In [132]:
train = json.load(open('../../dataset/NER_16000_dev.json'))

training_data=[]

for sent in train:
    word=[]
    tag=[]
    for w,p,t in sent:
        word.append(w)
        tag.append(t)
    training_data.append((word,tag))

In [133]:
def prepare_sequence(seq, to_ix):
    idxs = list(map(lambda w: to_ix[w], seq))
    tensor = torch.LongTensor(idxs)
    return Variable(tensor)

In [134]:
PAD = "<PAD>"

### 시퀀스 길이 분포 파악 

In [136]:
Length = [len(t) for t,l in training_data]
distribution = Counter(Length)

In [137]:
bucket_config = [(5,5),(10,10),(20,20),(30,30)]

### 버킷에 나눠 담으면서 동시에 <패딩까지> 나중에는 동적으로 패딩하기 

In [138]:
bucket = [[],[],[],[]]

In [139]:
for tr,label in training_data:
    length = len(tr)
    
    for i in range(len(bucket_config)):
        if bucket_config[i][0] >= length:
            
            while len(tr) < bucket_config[i][0]:
                tr.append(PAD)
                label.append("O")
            bucket[i].append((tr,label))
            break

In [140]:
for b in bucket:
    print(len(b))

893
820
717
294


In [141]:
bucket[0][:10]

[(['네', '<PAD>', '<PAD>', '<PAD>', '<PAD>'], ['O', 'O', 'O', 'O', 'O']),
 (['네', '!', '<PAD>', '<PAD>', '<PAD>'], ['O', 'O', 'O', 'O', 'O']),
 (['1002', '434', '286953', '우리은행', '박종화'], ['O', 'O', 'O', 'B-ORG', 'B-PER']),
 (['네', 'ㅡ', '<PAD>', '<PAD>', '<PAD>'], ['O', 'O', 'O', 'O', 'O']),
 (['네', '<PAD>', '<PAD>', '<PAD>', '<PAD>'], ['O', 'O', 'O', 'O', 'O']),
 (['네', '<PAD>', '<PAD>', '<PAD>', '<PAD>'], ['O', 'O', 'O', 'O', 'O']),
 (['알', '겟', '소', '<PAD>', '<PAD>'], ['O', 'O', 'O', 'O', 'O']),
 (['회원', '가입', '은', '했', '고'], ['O', 'O', 'O', 'O', 'O']),
 (['PN', '<PAD>', '<PAD>', '<PAD>', '<PAD>'], ['O', 'O', 'O', 'O', 'O']),
 (['안녕', '하', '세요', '고객', '님'], ['O', 'O', 'O', 'B-PER', 'O'])]

### word2index, tag2index 딕 준비

In [149]:
NER_LIST = ['B-PER','I-PER', 'B-LOC', 'I-LOC', 'B-ORG', 'I-ORG','B-DATE', 'I-DATE','B-TIME','I-TIME','B-MISC','I-MISC','O']

word_to_ix = {}
for sentence, tags in training_data:
    for word in sentence:
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)

ix_to_word = {v:k for k,v in word_to_ix.items()}

tag_to_ix={}
i=0
for tag in NER_LIST:           
    tag_to_ix[tag] = i
    i+=1

ix_to_tag = {v:k for k,v in tag_to_ix.items()}

### Sanity Check

일단 가장 쉬운 길이 10개짜리로 고정해 놓고 배치<br>
로스 계산 시에도 패딩까지 계산한다... (나중에 실제 길이 알려줘서 그것만 loss 계산하는 법 고민)

In [146]:
import random

#bucket_id = random.choice(range(len(bucket_config)))
bucket_id = 1

In [147]:
train_x=[]
train_y=[]
for tr,label in bucket[bucket_id]:
    temp = prepare_sequence(tr, word_to_ix)
    temp = temp.view(1,-1)
    train_x.append(temp)
    
    temp2 = prepare_sequence(label,tag_to_ix)
    temp2 = temp2.view(1,-1)
    train_y.append(temp2)

In [184]:
INPUT_SIZE = bucket_config[bucket_id][0]
EMBEDDING_DIM = 50
HIDDEN_DIM = 50
BATCH_SIZE=10
nb_epochs = 10
num_layers = 2

In [151]:
inputs = torch.cat(train_x[:BATCH_SIZE])

In [152]:
inputs.size()

torch.Size([10, 10])

In [89]:
class RNN(nn.Module):
    def __init__(self,hidden_size, num_layers, num_classes,vocab_size,embedding_dim):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

        
    def forward(self, x):
        # Set initial states 
        h0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)) 
        c0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size))
        
        embeds = self.word_embeddings(x)
        # Forward propagate RNN
        out, _ = self.lstm(embeds, (h0, c0))  
        
        tag_space = self.fc(out.contiguous().view(x.size(0)*x.size(1),-1)) # input_length,batch_size,hidden_dim -> input_length*batch_size,hidden_dim
        tag_scores = F.log_softmax(tag_space)
        
        return tag_scores
       

In [188]:
model = RNN(HIDDEN_DIM, num_layers,len(tag_to_ix),len(word_to_ix),EMBEDDING_DIM)
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [189]:
losses=[]
for epoch in range(nb_epochs):
    
    for offset in range((len(train_x)//BATCH_SIZE)):
        inputs = torch.cat(train_x[BATCH_SIZE*offset:BATCH_SIZE*(offset+1)])
        targets = torch.cat(train_y[BATCH_SIZE*offset:BATCH_SIZE*(offset+1)])
        
        tag_scores = model(inputs)
        loss = loss_function(tag_scores, targets.view(BATCH_SIZE*INPUT_SIZE))
        losses.append(loss)
        loss.backward()
        optimizer.step()
    
    print(epoch)

0
1
2
3
4
5
6
7
8
9


In [190]:
print(losses[0],losses[-1])

Variable containing:
 2.6698
[torch.FloatTensor of size 1]
 Variable containing:
 0.3381
[torch.FloatTensor of size 1]

