In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import nltk
import random
import numpy as np
from collections import Counter
from sklearn_crfsuite import metrics

In [2]:
flatten = lambda l :[item for sublist in l for item in sublist]
random.seed(1234)

In [3]:
USE_CUDA = torch.cuda.is_available()
gpus = [0]
torch.cuda.set_device(gpus[0])

FloatTensor = torch.cuda.FloatTensor if USE_CUDA else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if USE_CUDA else torch.LongTensor
ByteTensor = torch.cuda.ByteTensor if USE_CUDA else torch.ByteTensor

In [4]:
def getBatch(batch_size, train_data):
    random.shuffle(train_data)
    sindex = 0
    eindex = batch_size
    
    while eindex < len(train_data):
        batch = train_data[sindex:eindex]
        temp = eindex
        eindex = eindex + batch_size
        sindex = temp
        yield batch
        
    if eindex >= len(train_data):
        batch = train_data[sindex:]
        yield batch

In [5]:
def prepare_sequence(seq, word2index):
    idxs = list(map(lambda w: word2index[w] 
                    if word2index.get(w) is not None else word2index['<UNK>'],seq))
    return Variable(LongTensor(idxs))

def prepare_word(word, word2index):
    return Variable(LongTensor([word2index[word]]) 
                    if word2index.get(word) is not None else LongTensor([word2index['<UNK>']]))
    
def prepare_tag(tag, tag2index):
    return Variable(LongTensor([tag2index[tag]]))

## Data load and Preprocessing

In [6]:
corpus = nltk.corpus.conll2002.iob_sents() #문장의 단어 리스트[단어, chunk, tag]
print(len(corpus))
print(corpus)

35651
[[('Sao', 'NC', 'B-LOC'), ('Paulo', 'VMI', 'I-LOC'), ('(', 'Fpa', 'O'), ('Brasil', 'NC', 'B-LOC'), (')', 'Fpt', 'O'), (',', 'Fc', 'O'), ('23', 'Z', 'O'), ('may', 'NC', 'O'), ('(', 'Fpa', 'O'), ('EFECOM', 'NP', 'B-ORG'), (')', 'Fpt', 'O'), ('.', 'Fp', 'O')], [('-', 'Fg', 'O')], ...]


In [8]:
data = []
for cor in corpus:
    sent, _, tag = list(zip(*cor)) 
    # _: 불러올 뿐 변수에 저장 x
    #zip(*)으로 묶으면 >> [단어, chunk, tag]가 단어들끼리, chunck끼리, tag끼리의 리스트로 형성된다.
    data.append([sent, tag])

In [30]:
print(list(zip(*corpus[0]))) # [(단어),(chunk),(tag)]

[('Sao', 'Paulo', '(', 'Brasil', ')', ',', '23', 'may', '(', 'EFECOM', ')', '.'), ('NC', 'VMI', 'Fpa', 'NC', 'Fpt', 'Fc', 'Z', 'NC', 'Fpa', 'NP', 'Fpt', 'Fp'), ('B-LOC', 'I-LOC', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O')]


In [28]:
print(len(data))
print(data[0]) 

35651
[('Sao', 'Paulo', '(', 'Brasil', ')', ',', '23', 'may', '(', 'EFECOM', ')', '.'), ('B-LOC', 'I-LOC', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O')]


## Build Vocab

In [10]:
sents, tags = list(zip(*data))
vocab = list(set(flatten(sents)))  # set > 중복 단어 제거 (단어 리스트)
tagset = list(set(flatten(tags)))  # 중복 태그 제거 (태그 리스트)

In [31]:
print(sents[0]) 
print(tags[0])
print(vocab[0])
print(tagset[0])

('Sao', 'Paulo', '(', 'Brasil', ')', ',', '23', 'may', '(', 'EFECOM', ')', '.')
('B-LOC', 'I-LOC', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O')
BCH
B-MISC


In [32]:
word2index = {'<UNK>':0, '<DUMMY>':1} # Dummy: 문장의 처음과 끝 표시

In [33]:
for vo in vocab:
    if word2index.get(vo) is None:
        word2index[vo] = len(word2index)  # 단어:번호
index2word = {v:k for k, v in word2index.items()}  #번호:단어 로 변형

tag2index={}
for tag in tagset:
    if tag2index.get(tag) is None: 
        tag2index[tag] = len(tag2index)  # 태깅:번호
index2tag = {v:k for k,v in tag2index.items()} #번호:태깅으로 변형

### Prepare data

In [13]:
WINDOW_SIZE = 2
windows=[]

In [14]:
for sample in data:
    dummy = ['<DUMMY>']*WINDOW_SIZE
    window = list(nltk.ngrams(dummy+list(sample[0])+dummy, WINDOW_SIZE*2+1))
    windows.extend([[list(window[i]), sample[1][i]] for i in range(len(sample[0]))])

In [15]:
print(windows[0])
print(len(windows))

[['<DUMMY>', '<DUMMY>', 'Sao', 'Paulo', '('], 'B-LOC']
678377


In [16]:
random.shuffle(windows)
train_data = windows[:int(len(windows)*0.9)]
test_data = windows[int(len(windows)*0.9):]

## Modeling

In [17]:
class WindowClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_size, window_size,hidden_size, output_size):
        super(WindowClassifier, self).__init__()
        
        self.embed = nn.Embedding(vocab_size, embedding_size) #embedding layer
        self.h_layer1 = nn.Linear(embedding_size*(window_size*2+1), hidden_size) #first hidden layer
        self.h_layer2 = nn.Linear(hidden_size, hidden_size) #second hidden layer
        self.o_layer = nn.Linear(hidden_size, output_size) # predict layer
        self.relu = nn.ReLU() # activation = relu  
        self.softmax = nn.LogSoftmax(dim=1)
        self.dropout = nn.Dropout(0.3)  
        # dropout : 두 레이어 사이의 dropout은 학습 단계에서 신호를 일정 비율로 누락시켜 과적합을 방지한다.
        # 0.3 (30%)를 0으로 누락시켜 학습을 너무 많이 하지 못하도록 억제한다.
        
    def forward(self, inputs, is_training=False):
        embeds = self.embed(inputs) # B X W X D
        concated = embeds.view(-1, embeds.size(1)*embeds.size(2)) # B X (W*D)
        h0 = self.relu(self.h_layer1(concated))
        if is_training:
            h0 =  self.dropout(h0)
        h1 = self.relu(self.h_layer2(h0))
        if is_training:
            h1 = self.dropout(h1)
        out = self.softmax(self.o_layer(h1))
        return out

In [18]:
BATCH_SIZE = 128
EMBEDDING_SIZE = 50 # x (WINDOW_SIZE*2+1) = 250
HIDDEN_SIZE = 300
EPOCH = 3
LEARNING_RATE = 0.001

## Train

In [19]:
model = WindowClassifier(len(word2index), EMBEDDING_SIZE, WINDOW_SIZE, HIDDEN_SIZE, len(tag2index))
if USE_CUDA:
    model = model.cuda()
    
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [20]:
for epoch in range(EPOCH):
    losses = []
    for i,batch in enumerate(getBatch(BATCH_SIZE, train_data)):
        x,y=list(zip(*batch))
        inputs = torch.cat([prepare_sequence(sent, word2index).view(1, -1) for sent in x])
        targets = torch.cat([prepare_tag(tag, tag2index) for tag in y])
        model.zero_grad()
        preds = model(inputs, is_training=True)
        loss = loss_function(preds, targets)
        losses.append(loss.data.tolist()[0])
        loss.backward()
        optimizer.step()

        if i % 1000 == 0:
            print("[%d/%d] mean_loss : %0.2f" %(epoch, EPOCH, np.mean(losses)))
            losses = []

[0/3] mean_loss : 2.22
[0/3] mean_loss : 0.47
[0/3] mean_loss : 0.36
[0/3] mean_loss : 0.31
[0/3] mean_loss : 0.28
[1/3] mean_loss : 0.25
[1/3] mean_loss : 0.22
[1/3] mean_loss : 0.21
[1/3] mean_loss : 0.19
[1/3] mean_loss : 0.19
[2/3] mean_loss : 0.13
[2/3] mean_loss : 0.15
[2/3] mean_loss : 0.15
[2/3] mean_loss : 0.14
[2/3] mean_loss : 0.14


## Test

In [21]:
for_f1_score = []
accuracy = 0
for test in test_data:
    x, y = test[0], test[1]
    input_ = prepare_sequence(x, word2index).view(1, -1)

    i = model(input_).max(1)[1]
    pred = index2tag[i.data.tolist()[0]]
    for_f1_score.append([pred, y])
    if pred == y:
        accuracy += 1

print(accuracy/len(test_data) * 100)

95.83861552522185


## Print Confusion matrix

In [22]:
y_pred, y_test = list(zip(*for_f1_score))

In [23]:
sorted_labels = sorted(  #중복제거, 정렬
    list(set(y_test) - {'O'}),
    key=lambda name: (name[1:], name[0])
)

In [24]:
sorted_labels

['B-LOC', 'I-LOC', 'B-MISC', 'I-MISC', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER']

In [25]:
y_pred = [[y] for y in y_pred] # this is because sklearn_crfsuite.metrics function flatten inputs
y_test = [[y] for y in y_test]

In [26]:
print(metrics.flat_classification_report(
    y_test, y_pred, labels = sorted_labels, digits=3
))

             precision    recall  f1-score   support

      B-LOC      0.778     0.677     0.724      1125
      I-LOC      0.678     0.470     0.556       287
     B-MISC      0.681     0.444     0.538       801
     I-MISC      0.657     0.428     0.518       631
      B-ORG      0.805     0.690     0.743      1311
      I-ORG      0.766     0.729     0.747       959
      B-PER      0.808     0.748     0.777      1309
      I-PER      0.846     0.827     0.836       937

avg / total      0.770     0.663     0.710      7360

