In [1]:
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import json
import pickle
import random
from collections import Counter
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence

# Seed every RNG this notebook draws from: Python's `random` drives bucket
# selection and batch shuffling, torch drives parameter initialisation.
# torch.manual_seed stays last so the cell still displays the Generator object.
random.seed(1)
torch.manual_seed(1)

<torch._C.Generator at 0x7f007412e630>

# 데이터 

In [2]:
# Load the NER corpus. Each token is unpacked as a 3-item record; only the first
# item (the word) and the last item (the tag) are kept, the middle one is unused.
# The context manager closes the file handle, and the explicit UTF-8 encoding
# (the JSON default) avoids depending on the platform's locale for Korean text.
with open('../../dataset/NER_16000_train.json', encoding='utf-8') as corpus_file:
    train = json.load(corpus_file)

# One (words, tags) pair per sentence. Both lists are fresh, mutable and parallel
# because the bucketing cell later pads them in place.
training_data = []
for sent in train:
    sent_words = [w for w, _, _ in sent]
    sent_tags = [t for _, _, t in sent]
    training_data.append((sent_words, sent_tags))

In [3]:
# Drop sentences that contain no tokens (an empty word list is falsy).
training_data = list(filter(lambda pair: pair[0], training_data))

In [4]:
# Number of sentences left after the empty ones were filtered out.
len(training_data)

11196

In [5]:
def prepare_sequence(seq, to_ix):
    """Encode a token sequence as a 1-D ``LongTensor`` wrapped in a ``Variable``.

    Args:
        seq: iterable of hashable tokens (words or tags).
        to_ix: mapping from token to integer index; a token that is missing
            from it raises ``KeyError``.

    Returns:
        ``Variable`` holding one index per token, in input order.
    """
    indices = [to_ix[token] for token in seq]
    return Variable(torch.LongTensor(indices))

In [6]:
# Filler token appended to sentences so that every sentence stored in a bucket
# reaches that bucket's length.
PAD = "<PAD>"

### 시퀀스 길이 분포 파악 

In [7]:
# Token count of every sentence, and how many sentences share each count.
Length = list(map(lambda pair: len(pair[0]), training_data))
distribution = Counter(Length)

In [8]:
# One entry per bucket, in ascending order. The visible cells only ever read the
# first element of each pair (the padded sentence length of that bucket); the
# second element is never accessed.
bucket_config = [(5,5),(10,10),(20,20),(30,30)]

### 버킷에 나눠 담으면서 동시에 패딩까지 처리 (나중에는 동적으로 패딩하기)

In [9]:
# One empty list per configured bucket. Deriving the count from bucket_config
# keeps the two in sync when buckets are added or removed.
bucket = [[] for _ in bucket_config]

In [10]:
# Assign every sentence to the first bucket that is long enough and pad it to
# that bucket's length. The padding mutates the lists inside training_data in
# place; sentences longer than the largest bucket are left out of every bucket.
for tr, label in training_data:
    n_tokens = len(tr)

    slot = next(
        (k for k, cfg in enumerate(bucket_config) if cfg[0] >= n_tokens),
        None,
    )
    if slot is None:
        continue

    missing = bucket_config[slot][0] - n_tokens
    tr.extend([PAD] * missing)        # filler words ...
    label.extend(["O"] * missing)     # ... tagged as "O"
    bucket[slot].append((tr, label))

In [11]:
# Report how many sentences landed in each bucket, one count per line.
for size in map(len, bucket):
    print(size)

3184
2824
2568
998


In [12]:
def getBatch(bucket,bucket_id,batch_size):
    """Draw a random mini-batch from one bucket.

    The chosen bucket is shuffled in place and its first ``batch_size``
    sentences are used (fewer when the bucket holds fewer sentences).

    Args:
        bucket: list of buckets; each bucket is a list of ``(tokens, tags)``
            pairs. All sentences of one bucket must have the same length,
            otherwise the rows cannot be concatenated.
        bucket_id: index of the bucket to sample from.
        batch_size: maximum number of sentences in the batch.

    Returns:
        inputs: 2-D word-index tensor, one row per sentence, padding included.
        r_targets: 1-D tag-index tensor with padded positions removed; the
            sentences are concatenated in batch order.
        lengths: list with the number of non-padding tokens of each sentence.

    Raises:
        ValueError: if the selected bucket is empty.
    """
    random.shuffle(bucket[bucket_id])
    batch = bucket[bucket_id][:batch_size]
    if not batch:
        # torch.cat on an empty list fails with an opaque message; be explicit.
        raise ValueError("bucket {} is empty".format(bucket_id))

    rows = []
    kept_targets = []
    lengths = []
    for tokens, tags in batch:
        rows.append(prepare_sequence(tokens, word_to_ix).view(1, -1))

        # Count real tokens using the shared PAD constant instead of a second
        # hard-coded copy of the literal.
        n_real = sum(1 for token in tokens if token != PAD)
        lengths.append(n_real)

        # Keep only the leading n_real tags so that padded positions never
        # contribute to the loss.
        kept_targets.append(prepare_sequence(tags, tag_to_ix)[:n_real])

    inputs = torch.cat(rows)
    r_targets = torch.cat(kept_targets)
    return inputs, r_targets, lengths

### word2index, tag2index 딕 준비

In [13]:
# Tag inventory; a tag's position in this list is its class index.
NER_LIST = ['B-PER','I-PER', 'B-LOC', 'I-LOC', 'B-ORG', 'I-ORG','B-DATE', 'I-DATE','B-TIME','I-TIME','B-MISC','I-MISC','O']

# Give every distinct token the next free index, in order of first appearance.
word_to_ix = {}
for sentence, _ in training_data:
    for token in sentence:
        word_to_ix.setdefault(token, len(word_to_ix))

# Reverse lookup: index -> token.
ix_to_word = dict((index, token) for token, index in word_to_ix.items())

# Tag <-> index lookups derived from the order of NER_LIST.
tag_to_ix = dict(zip(NER_LIST, range(len(NER_LIST))))
ix_to_tag = dict((index, name) for name, index in tag_to_ix.items())

### Sanity Check

일단 가장 쉬운 길이 10짜리 버킷(`bucket_id = 1`)으로 고정해 놓고 배치를 만든다.<br>
처음에는 로스 계산 시 패딩까지 포함했지만, 지금은 `getBatch`와 `RNN.forward`가 실제 길이(`lengths`)를 받아 `<PAD>` 위치를 잘라낸 뒤 로스를 계산한다.

In [14]:
# NOTE(review): `random` is already imported in the first cell; this repeat is redundant.
import random

# The sanity check is pinned to bucket 1; the commented line is the
# random-selection alternative.
#bucket_id = random.choice(range(len(bucket_config)))
bucket_id = 1

In [15]:
# Encode every sentence of the selected bucket as a (1, sentence_length) row of
# word indices (train_x) and of tag indices (train_y).
train_x = [
    prepare_sequence(tokens, word_to_ix).view(1, -1)
    for tokens, _ in bucket[bucket_id]
]
train_y = [
    prepare_sequence(tags, tag_to_ix).view(1, -1)
    for _, tags in bucket[bucket_id]
]

In [16]:
# Padded sentence length of the selected bucket.
# NOTE(review): not read again by any visible cell.
INPUT_SIZE = bucket_config[bucket_id][0]
# Passed to RNN as embedding_dim.
EMBEDDING_DIM = 100
# Passed to RNN as hidden_size.
HIDDEN_DIM = 100
# Number of sentences requested from getBatch per step.
BATCH_SIZE= 64
# Passed to RNN as num_layers.
NUM_LAYERS = 3

In [17]:
class RNN(nn.Module):
    """LSTM sequence tagger that scores only the non-padding steps of a batch.

    Args:
        hidden_size: size of the LSTM hidden state.
        num_layers: number of stacked LSTM layers.
        num_classes: number of output tags.
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each word embedding.
    """

    def __init__(self,hidden_size, num_layers, num_classes,vocab_size,embedding_dim):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x,length):
        """Return log-probabilities over tags for every kept time step.

        Args:
            x: ``(batch, seq_len)`` tensor of word indices.
            length: sequence of ints, one per batch row, giving how many
                leading time steps of that row to keep.

        Returns:
            ``(sum(length), num_classes)`` tensor of log-softmax scores; the
            rows follow batch order, time steps in order within each sentence.
        """
        embeds = self.word_embeddings(x)

        # No explicit (h0, c0): nn.LSTM starts from an all-zero state when none
        # is given, and creates it on the same device/dtype as its input. The
        # previous hand-built CPU tensors broke as soon as the model was moved.
        out, _ = self.lstm(embeds)

        # Drop the padded tail of every sentence, then stack what is left:
        # (batch, seq_len, hidden) -> (sum(length), hidden).
        kept_steps = [out[row][:length[row]] for row in range(len(length))]
        outwithoutpad = torch.cat(kept_steps)

        tag_space = self.fc(outwithoutpad)  # (sum(length), num_classes)
        # Normalise over the class axis explicitly; the implicit-dim form is deprecated.
        tag_scores = F.log_softmax(tag_space, dim=1)

        return tag_scores
       

In [18]:
# Single tagger used only for the sanity check below.
model = RNN(HIDDEN_DIM, NUM_LAYERS,len(tag_to_ix),len(word_to_ix),EMBEDDING_DIM)
# NOTE(review): RNN.forward already returns log-softmax scores, and
# CrossEntropyLoss applies log-softmax again internally. Log-softmax is
# idempotent, so the loss value is unaffected, but one of the two is redundant.
loss_function = nn.CrossEntropyLoss()
# NOTE(review): this optimizer is not used by any later visible cell; training
# goes through the per-bucket optimizers of BUCKETRNN.
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [19]:
# Draw one batch from bucket 1: x = padded word indices, y = tag indices with
# padding removed, l = number of real tokens per sentence.
x,y,l=getBatch(bucket,1,BATCH_SIZE)

In [20]:
# Forward pass of the untrained model; one row of tag scores per non-padding token.
o = model(x,l)

In [21]:
# Display the score matrix (rows: non-padding tokens, columns: tags).
o

Variable containing:
-2.6222 -2.5880 -2.5342  ...  -2.5593 -2.4500 -2.5203
-2.6299 -2.5856 -2.5293  ...  -2.5702 -2.4533 -2.5006
-2.6340 -2.5838 -2.5256  ...  -2.5759 -2.4569 -2.4942
          ...             ⋱             ...          
-2.6481 -2.5844 -2.5272  ...  -2.5717 -2.4617 -2.4884
-2.6496 -2.5825 -2.5236  ...  -2.5747 -2.4598 -2.4925
-2.6513 -2.5819 -2.5238  ...  -2.5748 -2.4597 -2.4939
[torch.FloatTensor of size 499x13]

### 버킷이랑 같이 쓰는 모델 

In [22]:
class BUCKETRNN(nn.Module):
    """One independent RNN tagger and Adam optimizer per length bucket.

    Attributes:
        models: dict mapping bucket index -> RNN for that bucket.
        optims: dict mapping bucket index -> Adam optimizer of that RNN.
        bucket_config: the bucket configuration passed to the constructor.

    Args:
        bucket_config: sequence with one entry per bucket; only its length is
            used here.
        hidden_size, num_layers, num_classes, vocab_size, embedding_dim:
            forwarded unchanged to every RNN.
    """

    def __init__(self,bucket_config,hidden_size, num_layers, num_classes,vocab_size,embedding_dim):
        # nn.Module must be initialised before sub-modules can be registered;
        # without it parameters()/state_dict() fail on this object.
        super(BUCKETRNN, self).__init__()
        self.models={}
        self.optims={}
        self.bucket_config=bucket_config
        for i in range(len(self.bucket_config)):
            self.models[i] = RNN(hidden_size, num_layers, num_classes,vocab_size,embedding_dim)
            # nn.Module does not look inside plain dicts, so register each
            # tagger explicitly. `models` stays a dict for existing callers.
            self.add_module('bucket_%d' % i, self.models[i])
            self.optims[i] = optim.Adam(self.models[i].parameters(), lr=0.001)

    def select_bucket(self):
        """Return a uniformly random bucket index for this instance's config."""
        # Use the instance's own configuration, not the notebook-level global.
        bucket_id = random.choice(range(len(self.bucket_config)))

        return bucket_id
        
            

In [23]:
# Build one tagger and one optimizer per configured bucket.
bucket_model = BUCKETRNN(bucket_config,HIDDEN_DIM, NUM_LAYERS,len(tag_to_ix),len(word_to_ix),EMBEDDING_DIM)
# Rebinds loss_function to a fresh instance of the same criterion used in the sanity check.
loss_function =  nn.CrossEntropyLoss()

In [24]:
# Collects the loss of every training step (appended to by the training loop).
losses=[]

In [25]:
# Each iteration is a single optimisation step on one randomly chosen bucket.
# The counter is named `epoch`, but it does not sweep the whole dataset.
for epoch in range(5000):

    bucket_id = bucket_model.select_bucket()
    inputs, targets,lengths = getBatch(bucket,bucket_id,BATCH_SIZE)

    bucket_model.models[bucket_id].zero_grad()

    outputs = bucket_model.models[bucket_id](inputs,lengths)

    loss = loss_function(outputs,targets)
    loss.backward()
    bucket_model.optims[bucket_id].step()

    # Record a plain float: appending the loss Variable itself would keep every
    # step's autograd graph reachable for the whole run. ndarray.item() works
    # whether the loss comes back with shape (1,) or as a 0-dim scalar.
    loss_value = loss.data.cpu().numpy().item()
    losses.append(loss_value)

    if epoch % 100==0:
        print("[{epoch}] loss : {loss} , bucket : {bucket_id}".format(epoch=epoch,loss=loss_value,bucket_id=bucket_id))

[0] loss : 2.5338616371154785 , bucket : 0
[100] loss : 1.021700382232666 , bucket : 3
[200] loss : 0.46864187717437744 , bucket : 0
[300] loss : 0.5949462652206421 , bucket : 1
[400] loss : 0.9982641339302063 , bucket : 2
[500] loss : 0.8244224786758423 , bucket : 2
[600] loss : 0.6691949367523193 , bucket : 3
[700] loss : 0.5334180593490601 , bucket : 1
[800] loss : 0.3589295446872711 , bucket : 0
[900] loss : 0.6886817216873169 , bucket : 3
[1000] loss : 0.41534432768821716 , bucket : 2
[1100] loss : 0.49127447605133057 , bucket : 3
[1200] loss : 0.3872307240962982 , bucket : 2
[1300] loss : 0.47533732652664185 , bucket : 2
[1400] loss : 0.4393002986907959 , bucket : 3
[1500] loss : 0.3600185215473175 , bucket : 3
[1600] loss : 0.4524793326854706 , bucket : 2
[1700] loss : 0.09196716547012329 , bucket : 0
[1800] loss : 0.13422846794128418 , bucket : 0
[1900] loss : 0.3615540564060211 , bucket : 2
[2000] loss : 0.13525305688381195 , bucket : 0
[2100] loss : 0.3106137812137604 , bucke

### 테스트 

In [26]:
# Tag one random training sentence with the model of its bucket and print the
# gold tag next to the predicted tag for every position.
test = random.choice(training_data)
input_ = test[0]
tag = test[1]
print(' '.join(input_)+'\n')

# Choose the model the same way the bucketing cell assigned sentences: the first
# bucket whose length fits. Sentences longer than every bucket fall back to the
# largest bucket's model instead of silently reusing a stale bucket_id.
length = len(input_)
bucket_id = len(bucket_config) - 1
for i in range(len(bucket_config)):
    if bucket_config[i][0] >= length:
        bucket_id = i
        break

sentence_in = prepare_sequence(input_,word_to_ix)
sentence_in=sentence_in.view(1,-1)

scores = bucket_model.models[bucket_id](sentence_in,[len(input_)])
best_score, best_tag = torch.max(scores,1)
# Flatten so the lookup works whether torch.max keeps the reduced dimension
# (shape (N, 1)) or drops it (shape (N,)).
predicted = best_tag.data.numpy().reshape(-1)
for t in range(len(predicted)):
    print(tag[t], ' : ', ix_to_tag[int(predicted[t])])

혹시 강동구 보건소 도 한 번 물어봐 주 세요 <PAD>

O  :  O
B-LOC  :  B-LOC
I-LOC  :  I-LOC
O  :  O
O  :  O
O  :  O
O  :  O
O  :  O
O  :  O
O  :  O


In [27]:
# Pickles the whole bucket_model object rather than a state_dict, so the class
# definitions used here must be importable again when the file is loaded.
torch.save(bucket_model,'NER_model.pkl')

  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


In [28]:
# NOTE(review): torch.load unpickles the file, and unpickling can execute
# arbitrary code. Only load files written by this notebook. Newer PyTorch
# releases may also refuse full-object pickles unless weights_only=False is
# passed; confirm against the installed version.
restore = torch.load('NER_model.pkl')

In [45]:
# Repeat the spot check with the model restored from disk: tag one random
# training sentence and print the gold tag next to the predicted tag.
test = random.choice(training_data)
input_ = test[0]
tag = test[1]
print(' '.join(input_)+'\n')

# Choose the model the same way the bucketing cell assigned sentences: the first
# bucket whose length fits. Sentences longer than every bucket fall back to the
# largest bucket's model instead of silently reusing a stale bucket_id.
length = len(input_)
bucket_id = len(bucket_config) - 1
for i in range(len(bucket_config)):
    if bucket_config[i][0] >= length:
        bucket_id = i
        break

sentence_in = prepare_sequence(input_,word_to_ix)
sentence_in=sentence_in.view(1,-1)

scores = restore.models[bucket_id](sentence_in,[len(input_)])
best_score, best_tag = torch.max(scores,1)
# Flatten so the lookup works whether torch.max keeps the reduced dimension
# (shape (N, 1)) or drops it (shape (N,)).
predicted = best_tag.data.numpy().reshape(-1)
for t in range(len(predicted)):
    print(tag[t], ' : ', ix_to_tag[int(predicted[t])])

영화 예매 좀 해 줘

O  :  O
O  :  O
O  :  O
O  :  O
O  :  O
