In [1]:
import tensorflow as tf
import numpy as np
import re
import os
from sklearn.metrics import classification_report
import time

  from ._conv import register_converters as _register_converters


In [2]:
# Load the tagged corpus, keeping only the non-empty lines.
with open('entities-bm.txt','r') as fopen:
    texts = [row for row in fopen.read().split('\n') if row]
# Last expression of the cell: shows how many lines were kept.
len(texts)

12194

In [3]:
# Tag and character vocabularies (symbol -> integer id). Id 0 is taken by
# the 'PAD' entry in both of them.
tag2idx, char2idx = {'PAD': 0}, {'PAD': 0}
# Next id to hand out in each vocabulary (advanced by read_file).
tag_idx = char_idx = 1

In [4]:
# The dataset is too small to hold out a separate split, so the very same
# lines are used for training and for evaluation. Every "test"/"valid"
# figure computed from test_texts is therefore measured on training data.
train_texts = test_texts = texts

In [5]:
def process_word(word, lower=True):
    """Normalise a single token before it is encoded character by character.

    Args:
        word: Raw token text.
        lower: When true (the default) the token is lower-cased. When false,
            only fully upper-case tokens are changed, to title case.

    Returns:
        The token with every character other than ASCII letters, ASCII
        digits, ``-`` and space removed, or the literal ``'NUM'`` when what
        remains consists solely of digits. The result can be an empty string.
    """
    if lower:
        word = word.lower()
    elif word.isupper():
        word = word.title()
    # Raw string: in a plain literal '\-' is an invalid escape sequence, which
    # Python reports with a warning. The pattern itself is unchanged.
    word = re.sub(r'[^A-Za-z0-9\- ]+', '', word)
    if word.isdigit():
        word = 'NUM'
    return word

def read_file(f):
    """Parse tagged lines into words, tags and their integer encodings.

    Every non-blank line that does not start with ``-DOCSTART-`` is split on
    single spaces: the first field is the word and the last field is the tag
    (``'O'`` when the line has a single field). Words are cleaned with
    ``process_word`` and skipped when nothing is left after cleaning.

    Side effects: characters and tags not seen before are added to the
    module-level ``char2idx`` / ``tag2idx`` dicts, and the module-level
    ``char_idx`` / ``tag_idx`` counters are advanced for each addition.

    Args:
        f: Iterable of text lines.

    Returns:
        A tuple ``(words, tags, X, Y)``: the cleaned words, their tag
        strings, one list of character ids per word, and a numpy array of
        tag ids.
    """
    global tag_idx, char_idx
    kept_words, kept_tags, char_id_seqs, tag_ids = [], [], [], []
    for raw_line in f:
        stripped = raw_line.strip()
        # Skip blank lines and document separators.
        if not stripped or stripped.startswith("-DOCSTART-"):
            continue

        fields = stripped.split(' ')
        label = fields[-1] if len(fields) > 1 else 'O'
        token = process_word(fields[0])
        if not token:
            continue

        # Register unseen characters while encoding the token.
        encoded = []
        for ch in token:
            if ch not in char2idx:
                char2idx[ch] = char_idx
                char_idx += 1
            encoded.append(char2idx[ch])

        # Register the tag the first time it is seen.
        if label not in tag2idx:
            tag2idx[label] = tag_idx
            tag_idx += 1

        kept_words.append(token)
        kept_tags.append(label)
        char_id_seqs.append(encoded)
        tag_ids.append(tag2idx[label])

    return kept_words, kept_tags, char_id_seqs, np.array(tag_ids)

In [6]:
# Encode both splits. read_file also grows the shared char2idx / tag2idx
# vocabularies as a side effect, so the cells that read len(char2idx) or
# len(tag2idx) must run after this one.
train_words, train_tags, train_X, train_Y = read_file(train_texts)
test_words, test_tags, test_X, test_Y = read_file(test_texts)

In [7]:
class Model:
    """Stacked bidirectional LSTM classifier over id sequences (TF1 graph mode).

    Attributes created by the constructor:
        X: int32 placeholder of shape [None, None] with the input ids.
        Y: float32 placeholder of shape [None, dimension_output] with the
            target class distribution for each row.
        logits: class scores computed from the last time step of the top layer.
        cost: mean softmax cross-entropy between ``logits`` and ``Y``.
        optimizer: Adam op that minimises ``cost``.
        accuracy: mean agreement between arg-max of ``logits`` and of ``Y``.
    """

    def __init__(self, size_layer, num_layers, embedded_size,
                 dict_size, dimension_output, learning_rate):
        """Build the whole graph in the current default graph.

        Args:
            size_layer: Requested width of each bidirectional layer; every
                direction gets ``size_layer // 2`` LSTM units.
            num_layers: Number of stacked bidirectional layers.
            embedded_size: Width of the embedding vectors.
            dict_size: Number of rows in the embedding table.
            dimension_output: Number of output classes.
            learning_rate: Learning rate passed to Adam.
        """

        def cells(size, reuse=False):
            # One LSTM cell with orthogonally initialised weights.
            return tf.nn.rnn_cell.LSTMCell(
                size, initializer=tf.orthogonal_initializer(), reuse=reuse)

        self.X = tf.placeholder(tf.int32, [None, None])
        self.Y = tf.placeholder(tf.float32, [None, dimension_output])

        encoder_embeddings = tf.Variable(
            tf.random_uniform([dict_size, embedded_size], -1, 1))
        encoder_embedded = tf.nn.embedding_lookup(encoder_embeddings, self.X)

        units_per_direction = size_layer // 2
        for n in range(num_layers):
            # Each layer consumes the concatenated output of the previous one.
            (out_fw, out_bw), _ = tf.nn.bidirectional_dynamic_rnn(
                cell_fw=cells(units_per_direction),
                cell_bw=cells(units_per_direction),
                inputs=encoder_embedded,
                dtype=tf.float32,
                scope='bidirectional_rnn_%d' % (n))
            encoder_embedded = tf.concat((out_fw, out_bw), 2)

        # The concatenated output is 2 * (size_layer // 2) wide, i.e.
        # size_layer - 1 when size_layer is odd. Size the projection from the
        # real width so the matmul below is shape-compatible in both cases
        # (for even size_layer this equals size_layer, as before).
        rnn_output_width = 2 * units_per_direction
        W = tf.get_variable('w', shape=(rnn_output_width, dimension_output),
                            initializer=tf.orthogonal_initializer())
        b = tf.get_variable('b', shape=(dimension_output),
                            initializer=tf.zeros_initializer())

        # Classify from the final time step only.
        # NOTE(review): this relies on inputs being left-padded (as str_idx
        # does) so that the last step holds real data -- confirm for new callers.
        self.logits = tf.matmul(encoder_embedded[:, -1], W) + b
        self.cost = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=self.Y))
        self.optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(self.cost)
        correct_pred = tf.equal(tf.argmax(self.logits, 1), tf.argmax(self.Y, 1))
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [8]:
# One-hot encode the integer tag ids: row k of the identity matrix is the
# indicator vector of class k, so indexing it with the id array yields one
# row per sample.
train_onehot = np.eye(len(tag2idx))[train_Y]
test_onehot = np.eye(len(tag2idx))[test_Y]

In [9]:
# Network shape.
num_layers = 2
size_layer = 128
embedded_size = 128
# One output unit per entry of the tag vocabulary (the 'PAD' entry included).
dimension_output = len(tag2idx)

# Optimisation settings.
batch_size = 32
learning_rate = 1e-3

# Reverse lookup: tag id -> tag name, used to decode predictions.
idx2tag = dict(zip(tag2idx.values(), tag2idx.keys()))

In [15]:
def str_idx(corpus, dic, UNK=3):
    """Encode sequences as a left-padded 2-D array of ids.

    Every sequence in ``corpus`` is mapped item by item through ``dic`` and
    right-aligned in its row; the unused leading positions stay 0.

    Args:
        corpus: Sized collection of sequences, e.g. a list of words that are
            encoded character by character.
        dic: Mapping from item to integer id.
        UNK: Id written for items that are missing from ``dic``.
            NOTE(review): the character vocabulary built in this notebook has
            no dedicated unknown entry, so the default 3 appears to coincide
            with the id of a real character -- confirm this is intended.

    Returns:
        A numpy float array of shape ``(len(corpus), longest sequence)``, or
        of shape ``(0, 0)`` when ``corpus`` is empty.
    """
    if len(corpus) == 0:
        # max() over an empty corpus would raise ValueError.
        return np.zeros((0, 0))

    maxlen = max(len(seq) for seq in corpus)
    X = np.zeros((len(corpus), maxlen))
    for row, seq in enumerate(corpus):
        # Fill from the right so that padding ends up on the left.
        for offset, item in enumerate(seq[::-1]):
            try:
                X[row, -1 - offset] = dic[item]
            except KeyError:
                # Only a missing key means "unknown"; other errors propagate.
                X[row, -1 - offset] = UNK
    return X

In [16]:
# Discard any previously built default graph, then open an interactive
# session and build the model in it.
tf.reset_default_graph()
sess = tf.InteractiveSession()
# len(char2idx) is passed as dict_size, i.e. the number of embedding rows.
model = Model(size_layer,num_layers,embedded_size,len(char2idx),dimension_output,learning_rate)
# Variables must be initialised before the first training step.
sess.run(tf.global_variables_initializer())

In [17]:
# Early-stopping state: training ends once the evaluation accuracy has failed
# to improve for EARLY_STOPPING consecutive epochs.
EARLY_STOPPING = 5
CURRENT_CHECKPOINT = 0  # epochs since the last improvement
CURRENT_ACC = 0         # best evaluation accuracy seen so far
EPOCH = 0

while True:
    lasttime = time.time()
    if CURRENT_CHECKPOINT == EARLY_STOPPING:
        print('break epoch:%d\n'%(EPOCH))
        break

    # Only whole batches are processed; a trailing partial batch is skipped.
    train_steps = len(train_X) // batch_size
    test_steps = len(test_X) // batch_size
    train_acc = train_loss = test_acc = test_loss = 0

    # Training pass: running the optimizer op updates the weights.
    for start in range(0, train_steps * batch_size, batch_size):
        stop = start + batch_size
        batch_x = str_idx(train_words[start:stop], char2idx)
        feed = {model.X: batch_x, model.Y: train_onehot[start:stop]}
        acc, loss, _ = sess.run([model.accuracy, model.cost, model.optimizer],
                                feed_dict=feed)
        train_acc += acc
        train_loss += loss

    # Evaluation pass: same fetches minus the optimizer, so no weight update.
    for start in range(0, test_steps * batch_size, batch_size):
        stop = start + batch_size
        batch_x = str_idx(test_words[start:stop], char2idx)
        feed = {model.X: batch_x, model.Y: test_onehot[start:stop]}
        acc, loss = sess.run([model.accuracy, model.cost], feed_dict=feed)
        test_acc += acc
        test_loss += loss

    # Turn the per-batch sums into per-batch means.
    train_loss, train_acc = train_loss / train_steps, train_acc / train_steps
    test_loss, test_acc = test_loss / test_steps, test_acc / test_steps

    if test_acc > CURRENT_ACC:
        print('epoch: %d, pass acc: %f, current acc: %f'%(EPOCH,CURRENT_ACC, test_acc))
        CURRENT_ACC = test_acc
        CURRENT_CHECKPOINT = 0
    else:
        CURRENT_CHECKPOINT += 1

    print('time taken:', time.time()-lasttime)
    print('epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\n'
          % (EPOCH, train_loss, train_acc, test_loss, test_acc))
    EPOCH += 1

epoch: 0, pass acc: 0.000000, current acc: 0.764556
time taken: 9.056696891784668
epoch: 0, training loss: 0.973193, training acc: 0.759046, valid loss: 0.868884, valid acc: 0.764556

epoch: 1, pass acc: 0.764556, current acc: 0.776563
time taken: 8.885531902313232
epoch: 1, training loss: 0.835342, training acc: 0.772039, valid loss: 0.763444, valid acc: 0.776563

epoch: 2, pass acc: 0.776563, current acc: 0.814803
time taken: 8.91784930229187
epoch: 2, training loss: 0.731407, training acc: 0.798026, valid loss: 0.657007, valid acc: 0.814803

epoch: 3, pass acc: 0.814803, current acc: 0.832319
time taken: 8.92201042175293
epoch: 3, training loss: 0.647545, training acc: 0.818339, valid loss: 0.585254, valid acc: 0.832319

epoch: 4, pass acc: 0.832319, current acc: 0.844737
time taken: 8.985709190368652
epoch: 4, training loss: 0.585942, training acc: 0.832895, valid loss: 0.534852, valid acc: 0.844737

epoch: 5, pass acc: 0.844737, current acc: 0.859704
time taken: 8.957971811294556


In [21]:
def get_entity(string):
    """Print the predicted entity tag for every whitespace-separated token.

    Each token is cleaned with ``process_word``, encoded with ``str_idx`` and
    scored by the global ``model`` through the global ``sess``. The original
    (uncleaned) token is printed next to its highest-scoring tag.

    Args:
        string: Text to tag.

    Returns:
        None. Output goes to stdout, one ``token tag`` pair per line. Nothing
        is printed when ``string`` contains no tokens.
    """
    tokens = string.split()
    if not tokens:
        # Nothing to tag; an empty batch cannot be encoded or fed to the model.
        return
    batch_x = str_idx([process_word(token) for token in tokens], char2idx)
    Y_pred = sess.run(model.logits, feed_dict={model.X: batch_x})
    # One row of logits per token; pick the best class of every row at once.
    for token, tag_id in zip(tokens, np.argmax(Y_pred, axis=1)):
        print(token, idx2tag[tag_id])

In [24]:
# Tag a sample paragraph; each token is printed with its predicted label.
get_entity('KUALA LUMPUR: Sempena sambutan Aidilfitri minggu depan, Perdana Menteri Tun Dr Mahathir Mohamad dan Menteri Pengangkutan Anthony Loke Siew Fook menitipkan pesanan khas kepada orang ramai yang mahu pulang ke kampung halaman masing-masing. Dalam video pendek terbitan Jabatan Keselamatan Jalan Raya (JKJR) itu, Dr Mahathir menasihati mereka supaya berhenti berehat dan tidur sebentar  sekiranya mengantuk ketika memandu.')

KUALA LOC
LUMPUR: LOC
Sempena O
sambutan O
Aidilfitri EVENT
minggu O
depan, O
Perdana PRN
Menteri PRN
Tun PRN
Dr PRN
Mahathir PRN
Mohamad PRN
dan O
Menteri PRN
Pengangkutan NORP
Anthony PRN
Loke PRN
Siew PRN
Fook O
menitipkan O
pesanan O
khas NORP
kepada O
orang O
ramai O
yang O
mahu O
pulang O
ke O
kampung LOC
halaman O
masing-masing. O
Dalam O
video O
pendek O
terbitan O
Jabatan NORP
Keselamatan O
Jalan LOC
Raya O
(JKJR) O
itu, O
Dr PRN
Mahathir PRN
menasihati O
mereka O
supaya O
berhenti O
berehat O
dan O
tidur O
sebentar O
sekiranya O
mengantuk O
ketika O
memandu. O
