In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import nltk
import random
import numpy as np
from collections import Counter
def flatten(l):
    """Concatenate the items of every sub-iterable of ``l`` into one flat list (one level deep)."""
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

In [2]:
# Whether PyTorch reports CUDA as available; the tensor helpers and the model setup
# read this flag to decide whether to move tensors / the model with .cuda().
USE_CUDA = torch.cuda.is_available()

In [15]:
def getBatch(batch_size,train_data):
    """Shuffle ``train_data`` in place and yield it in consecutive mini-batches.

    Args:
        batch_size: maximum number of examples per batch; must be positive.
        train_data: mutable sequence (list) of examples. It is shuffled in
            place, so the caller's list order changes on every call.

    Yields:
        Slices of ``train_data`` of length ``batch_size``; the final batch may
        be shorter. An empty ``train_data`` yields nothing (rather than one
        empty batch, which callers cannot unpack).

    Raises:
        ValueError: if ``batch_size`` is not positive. Because this is a
            generator, the error surfaces when iteration starts.
    """
    if batch_size <= 0:
        # A zero step would otherwise never advance through the data.
        raise ValueError("batch_size must be a positive integer")

    random.shuffle(train_data)
    for sindex in range(0, len(train_data), batch_size):
        yield train_data[sindex:sindex + batch_size]

In [16]:
def make_word_vector(sents, word2index):
    """Convert a sequence of tokens into a 1-D LongTensor of vocabulary indices.

    Tokens that are not keys of ``word2index`` are mapped to the index stored
    under "<UNK>". The result is moved to the GPU when USE_CUDA is set.
    """
    idxs = []
    for w in sents:
        if w in word2index:
            idxs.append(word2index[w])
        else:
            idxs.append(word2index["<UNK>"])

    tensor = Variable(torch.LongTensor(idxs))
    if USE_CUDA:
        tensor = tensor.cuda()
    return tensor

def make_tag_vector(tag,tag2index):
    """Wrap the index of ``tag`` in a one-element LongTensor (on the GPU when USE_CUDA is set).

    Raises KeyError if ``tag`` is not a key of ``tag2index``.
    """
    index = tag2index[tag]
    tensor = Variable(torch.LongTensor([index]))
    return tensor.cuda() if USE_CUDA else tensor

### Data loading & preprocessing

In [3]:
# Load the CoNLL-2002 corpus through NLTK as IOB-annotated sentences; each token
# is unpacked into three fields by the cell that builds `pdata`.
# NOTE(review): presumably needs the corpus installed locally (nltk.download('conll2002')) — confirm on a fresh environment.
corpus = nltk.corpus.conll2002.iob_sents()

In [4]:
# Reduce every sentence to a [words, tags] pair; the middle field of each
# token triple is discarded.
pdata = []
for cor in corpus:
    sent, _, tag = zip(*cor)
    pdata.append([sent, tag])

In [6]:
# Sanity check: number of sentences, then the first [words, tags] pair.
print(len(pdata))
print(pdata[0])

35651
[('Sao', 'Paulo', '(', 'Brasil', ')', ',', '23', 'may', '(', 'EFECOM', ')', '.'), ('B-LOC', 'I-LOC', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O')]


In [7]:
# Train on every sentence. For a quick experiment, use the commented line
# instead to work with a random subset of 1000 sentences.
samples = pdata
#samples = random.sample(pdata,1000) # sampling for practice

In [8]:
# Split the [words, tags] pairs into two parallel tuples of sentences and tag sequences.
sents, tags = list(zip(*samples))

# sorted() rather than list(): iteration order of a set of strings can differ
# between interpreter runs (string hash randomization), which would hand the
# same word/tag a different index after every kernel restart and make the
# index mappings (and any saved model) non-reproducible.
vocab = sorted(set(flatten(sents)))
tagset = sorted(set(flatten(tags)))

In [9]:
# Word <-> id lookup tables. Ids 0 and 1 are reserved for the unknown-word
# token and the window-padding token; every other word gets the next free id.
word2index = {'<UNK>': 0, '<DUMMY>': 1}
for vo in vocab:
    # setdefault leaves already-present keys (including the reserved ones) untouched
    word2index.setdefault(vo, len(word2index))
index2word = dict((idx, word) for word, idx in word2index.items())

# Tag <-> id lookup tables, numbered in the order tags appear in `tagset`.
tag2index = {}
for tag in tagset:
    tag2index.setdefault(tag, len(tag2index))
index2tag = dict((idx, label) for label, idx in tag2index.items())

In [10]:
# Number of context words on each side of the centre word; a full window
# therefore holds WINDOW_SIZE*2+1 words.
WINDOW_SIZE=2
# Accumulator for [window_words, centre_tag] examples, filled by the loop over `samples`.
windows=[]

In [12]:
# Build one example per token: the token's (WINDOW_SIZE*2+1)-word window,
# padded with '<DUMMY>' at the sentence edges, paired with the token's tag.
#
# `windows` is (re)created here so that re-running this cell rebuilds the list
# instead of appending a second copy of every example.
dummy = ['<DUMMY>'] * WINDOW_SIZE  # same padding for every sentence, so built once
windows = []
for sample in samples:
    # Padding by WINDOW_SIZE on both sides yields exactly one n-gram per token.
    window = nltk.ngrams(dummy + list(sample[0]) + dummy, WINDOW_SIZE * 2 + 1)
    windows.extend([list(words), label] for words, label in zip(window, sample[1]))

In [44]:
# Peek at one example: [list of window words, tag of the centre word].
# NOTE(review): this cell's execution count (44) is out of order, so the recorded
# output presumably shows `windows` after the shuffle cell — confirm with Restart & Run All.
windows[0]

[['strijdig', 'gevonden', 'met', 'de', 'wet'], 'O']

In [13]:
# Total number of window examples (one per token).
len(windows)

678377

In [14]:
# Shuffle the window examples in place, then keep the first 90% for training
# and hold out the remaining 10% for testing.
# NOTE(review): the split is made per window, so windows taken from one sentence
# can end up in both sets; no random seed is set, so the split differs per run.
random.shuffle(windows)

split_at = int(len(windows) * 0.9)
train_data = windows[:split_at]
test_data = windows[split_at:]

### Modeling 

In [17]:
class WindowClassifier(nn.Module):
    """Feed-forward classifier over a fixed-size window of word indices.

    The words of a window are embedded, the embeddings are concatenated, and
    the result passes through two ReLU hidden layers and a log-softmax output
    layer.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_size: dimensionality of each word embedding.
        window_size: context words on each side of the centre word; the input
            is expected to hold ``window_size*2+1`` indices per example.
        hidden_size: width of both hidden layers.
        output_size: number of output classes.
        dropout_p: dropout probability used when ``is_training`` is true
            (defaults to the previously hard-coded 0.7).
    """

    def __init__(self,vocab_size,embedding_size,window_size,hidden_size,output_size,dropout_p=0.7):

        super(WindowClassifier, self).__init__()

        self.embed = nn.Embedding(vocab_size, embedding_size)
        self.h_layer1 = nn.Linear(embedding_size * (window_size * 2 + 1), hidden_size)
        self.h_layer2 = nn.Linear(hidden_size, hidden_size)
        self.o_layer = nn.Linear(hidden_size, output_size)
        self.relu = nn.ReLU()
        # The output layer produces a (batch, output_size) matrix, so normalise
        # over dim 1 explicitly; relying on the implicit dim is deprecated and
        # emits a warning on every forward pass.
        self.softmax = nn.LogSoftmax(dim=1)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, inputs,is_training=False):
        """Return per-class log-probabilities for a batch of windows.

        Args:
            inputs: LongTensor of word indices, one row per window (B x W).
            is_training: when true, dropout is applied after each hidden layer.

        Returns:
            Tensor of shape (B, output_size) holding log-probabilities.
        """
        embeds = self.embed(inputs)  # B x W x D
        concated = embeds.view(-1, embeds.size(1) * embeds.size(2))  # B x (W*D)

        h0 = self.relu(self.h_layer1(concated))
        if is_training:
            h0 = self.dropout(h0)

        h1 = self.relu(self.h_layer2(h0))
        if is_training:
            h1 = self.dropout(h1)

        return self.softmax(self.o_layer(h1))

In [18]:
# Training hyperparameters.
# Number of window examples per mini-batch.
BATCH_SIZE=128
EMBEDDING_SIZE=50 # x (WINDOW_SIZE*2+1) = 250
# Width of both hidden layers of the classifier.
HIDDEN_SIZE=300
# Number of full passes over the training data (the outer training loop runs STEP times).
STEP=5
# Learning rate handed to the Adam optimizer.
LEARNING_RATE = 0.001

### Training 

In [21]:
# Instantiate the classifier (moved to the GPU when available), its loss and its optimizer.
model = WindowClassifier(len(word2index), EMBEDDING_SIZE, WINDOW_SIZE, HIDDEN_SIZE, len(tag2index))
if USE_CUDA:
    model = model.cuda()

# WindowClassifier.forward already ends in LogSoftmax, i.e. it returns
# log-probabilities. NLLLoss is the loss defined on log-probabilities;
# CrossEntropyLoss expects raw logits and would apply log-softmax a second time.
loss_function = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [22]:
# Train for STEP passes over the (re-shuffled) training data, printing the
# mean loss accumulated since the previous report every 1000 batches.
for step in range(STEP):
    losses = []
    for i, batch in enumerate(getBatch(BATCH_SIZE, train_data)):
        x, y = list(zip(*batch))
        # Stack the per-example index vectors into a B x W matrix and the tags into a B vector.
        inputs = torch.cat([make_word_vector(sent, word2index).view(1, -1) for sent in x])
        targets = torch.cat([make_tag_vector(tag, tag2index) for tag in y])

        model.zero_grad()
        preds = model(inputs, is_training=True)
        loss = loss_function(preds, targets)
        # The loss converts to a 1-element array on older PyTorch and a 0-d
        # array on newer releases; ravel()[0] reads the scalar from either,
        # whereas plain [0] raises on a 0-d array. .cpu() is a no-op for CPU
        # tensors, so no USE_CUDA branch is needed.
        losses.append(float(loss.data.cpu().numpy().ravel()[0]))
        loss.backward()
        optimizer.step()

        if i % 1000 == 0:
            print("[%d/%d] : %0.2f" % (step, STEP, np.mean(losses)))
            losses = []

[0/5] : 2.41
[0/5] : 0.53
[0/5] : 0.46
[0/5] : 0.40
[0/5] : 0.36
[1/5] : 0.24
[1/5] : 0.29
[1/5] : 0.28
[1/5] : 0.27
[1/5] : 0.25
[2/5] : 0.21
[2/5] : 0.22
[2/5] : 0.22
[2/5] : 0.21
[2/5] : 0.20
[3/5] : 0.18
[3/5] : 0.18
[3/5] : 0.17
[3/5] : 0.17
[3/5] : 0.17
[4/5] : 0.25
[4/5] : 0.15
[4/5] : 0.15
[4/5] : 0.15
[4/5] : 0.15


### Test 

In [43]:
# Token-level accuracy on the held-out windows: classify each window on its
# own (dropout off, since is_training defaults to False) and count the
# predictions that match the gold tag.
accuracy = 0
for test in test_data:
    x, y = test
    input_ = make_word_vector(x, word2index).view(1, -1)

    pred = model(input_)
    v, i = torch.max(pred, 1)  # i holds the index of the highest-scoring tag
    # .cpu() is a no-op for tensors already on the CPU, so one expression serves both devices
    pred = index2tag[i.data.cpu().tolist()[0]]
    accuracy += int(pred == y)

# Report the accuracy as a percentage.
print(accuracy / len(test_data) * 100)

95.74574722132138


### TODO 

* Evaluate with scikit-learn: report the F1 score and print the confusion matrix.