In [1]:
import tensorflow as tf
import numpy as np
import re
import os
from sklearn.metrics import classification_report

  from ._conv import register_converters as _register_converters


In [2]:
with open('entities-bm.txt','r') as fopen:
    texts= list(filter(None, fopen.read().split('\n')))
len(texts)

12194

In [3]:
train_texts = texts
# dataset is too small
test_texts = texts

In [4]:
word2idx = {'PAD': 0,'NUM':1,'UNK':2}
tag2idx = {'PAD': 0}
char2idx = {'PAD': 0}
word_idx = 3
tag_idx = 1
char_idx = 1

In [5]:
def process_word(word, lower=True):
    if lower:
        word = word.lower()
    else:
        if word.isupper():
            word = word.title()
    word = re.sub('[^A-Za-z0-9\- ]+', '', word)
    if word.isdigit():
        word = 'NUM'
    return word

def read_file(f):
    global word_idx, tag_idx, char_idx
    words, tags, X, Y = [], [], [], []
    for line in f:
        line = line.strip()
        if (len(line) == 0 or line.startswith("-DOCSTART-")):
            continue
        else:
            ls = line.split(' ')
            if len(ls) > 1:
                word, tag = ls[0],ls[-1]
            else:
                word = ls[0]
                tag = 'O'
            for c in word:
                if c not in char2idx:
                    char2idx[c] = char_idx
                    char_idx += 1
            word = process_word(word)
            if len(word) < 1:
                continue
            words += [word]
            tags += [tag]
            if word not in word2idx:
                word2idx[word] = word_idx
                word_idx += 1
            X.append(word2idx[word])
            if tag not in tag2idx:
                tag2idx[tag] = tag_idx
                tag_idx += 1
            Y.append(tag2idx[tag])
                        
    return words, tags, X, Y

In [6]:
train_words, train_tags, train_X, train_Y = read_file(train_texts)
test_words, test_tags, test_X, test_Y = read_file(test_texts)

In [7]:
idx2tag={idx: tag for tag, idx in tag2idx.items()}
idx2word={idx: tag for tag, idx in word2idx.items()}
batch_size = 16
dim_word = 128
dim_char = 32
dropout = 0.8
learning_rate = 1e-2
hidden_size_char = 64
hidden_size_word = 128
num_layers = 2
seq_len = 20
display_step = 200
epoch = 20

In [8]:
class Model:
    def __init__(self, dim_word, dim_char, dropout, learning_rate,
                 hidden_size_char, hidden_size_word, num_layers):
        
        def cells(size, reuse=False):
            return tf.nn.rnn_cell.LSTMCell(size,initializer=tf.orthogonal_initializer(),reuse=reuse)
        
        def clip_grads(loss):
            variables = tf.trainable_variables()
            grads = tf.gradients(loss, variables)
            clipped_grads, _ = tf.clip_by_global_norm(grads, 5.0)
            return zip(clipped_grads, variables)
        
        self.word_ids = tf.placeholder(tf.int32, shape=[None, None])
        self.char_ids = tf.placeholder(tf.int32, shape=[None, None, None])
        self.labels = tf.placeholder(tf.int32, shape=[None, None])
        
        self.word_embeddings = tf.Variable(tf.truncated_normal([len(word2idx), dim_word],
                                                      stddev=1.0 / np.sqrt(dim_word)))
        self.char_embeddings = tf.Variable(tf.truncated_normal([len(char2idx), dim_char],
                                                      stddev=1.0 / np.sqrt(dim_char)))
        word_embedded = tf.nn.embedding_lookup(self.word_embeddings, self.word_ids)
        char_embedded = tf.nn.embedding_lookup(self.char_embeddings, self.char_ids)
        s = tf.shape(char_embedded)
        char_embedded = tf.reshape(char_embedded, shape=[s[0]*s[1], s[-2], dim_char])
        for n in range(num_layers):
            (out_fw, out_bw), (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                cell_fw = cells(hidden_size_char),
                cell_bw = cells(hidden_size_char),
                inputs = char_embedded,
                dtype = tf.float32,
                scope = 'bidirectional_rnn_char_%d'%(n))
            char_embedded = tf.concat((out_fw, out_bw), 2)
        output = tf.reshape(char_embedded[:,-1], shape=[s[0], s[1], 2*hidden_size_char])
        word_embedded = tf.concat([word_embedded, output], axis=-1)
        word_embedded = tf.nn.dropout(word_embedded, dropout)
        
        for n in range(num_layers):
            (out_fw, out_bw), (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                cell_fw = cells(hidden_size_word),
                cell_bw = cells(hidden_size_word),
                inputs = word_embedded,
                dtype=tf.float32,
                scope = 'bidirectional_rnn_word_%d'%(n))
            word_embedded = tf.concat((out_fw, out_bw), 2)
        word_embedded = tf.nn.dropout(word_embedded, dropout)
        
        W = tf.get_variable('w',shape=(2*hidden_size_word, len(idx2tag)),
                            initializer=tf.orthogonal_initializer())
        b = tf.get_variable('b',shape=(len(idx2tag)),initializer=tf.zeros_initializer())
        
        nsteps = tf.shape(word_embedded)[1]
        output = tf.reshape(word_embedded, [-1, 2*hidden_size_word])
        pred = tf.matmul(output, W) + b
        self.logits = tf.reshape(pred, [-1, nsteps, len(idx2tag)])
        
        log_likelihood, trans_params = tf.contrib.crf.crf_log_likelihood(
        self.logits, self.labels, tf.count_nonzero(self.word_ids, 1))
        
        self.cost = tf.reduce_mean(-log_likelihood)
        self.global_step = tf.Variable(0, trainable=False)
        
        self.crf_decode = tf.contrib.crf.crf_decode(self.logits, 
                                                    trans_params, 
                                                    tf.count_nonzero(self.word_ids, 1))[0]
        
        self.optimizer = tf.train.AdamOptimizer(learning_rate).apply_gradients(clip_grads(self.cost), 
                                                                                    global_step=self.global_step)

In [9]:
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(dim_word,dim_char,dropout,learning_rate,hidden_size_char,hidden_size_word,num_layers)
sess.run(tf.global_variables_initializer())

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [10]:
def iter_seq(x):
    return np.array([x[i: i+seq_len] for i in range(0, len(x)-seq_len, 1)])

def to_train_seq(*args):
    return [iter_seq(x) for x in args]

def generate_char_seq(batch):
    x = [[len(idx2word[i]) for i in k] for k in batch]
    maxlen = max([j for i in x for j in i])
    temp = np.zeros((batch.shape[0],batch.shape[1],maxlen),dtype=np.int32)
    for i in range(batch.shape[0]):
        for k in range(batch.shape[1]):
            for no, c in enumerate(idx2word[batch[i,k]]):
                temp[i,k,-1-no] = char2idx[c]
    return temp

In [11]:
train_X_seq, train_Y_seq = to_train_seq(train_X, train_Y)
test_X_seq, test_Y_seq = to_train_seq(test_X, test_Y)

In [12]:
np.unique(train_Y_seq.ravel(),return_counts=True)

(array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11]),
 array([  8063, 185166,  23771,   2600,   6080,   2120,   2120,  10280,
          1280,   1860,     20]))

In [13]:
np.unique(test_Y_seq.ravel(),return_counts=True)

(array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11]),
 array([  8063, 185166,  23771,   2600,   6080,   2120,   2120,  10280,
          1280,   1860,     20]))

In [14]:
idx2tag

{0: 'PAD',
 1: 'LOC',
 2: 'O',
 3: 'PRN',
 4: 'ORG',
 5: 'TIME',
 6: 'ART',
 7: 'EVENT',
 8: 'NORP',
 9: 'FAC',
 10: 'LAW',
 11: 'DOC'}

In [15]:
for i in range(epoch):
    total_cost = 0
    for k in range(0,(train_X_seq.shape[0] // batch_size)*batch_size,batch_size):
        batch_x = train_X_seq[k:k+batch_size]
        batch_y = train_Y_seq[k:k+batch_size]
        batch_x_char = generate_char_seq(batch_x)
        step, loss, _ = sess.run([model.global_step, model.cost, model.optimizer],
                                 feed_dict={model.word_ids:batch_x,
                                           model.char_ids:batch_x_char,
                                           model.labels:batch_y})
        if step % display_step == 0 or step == 1:
            print('epoch %d, step %d, loss %f'%(i+1,step,loss))
        total_cost += loss
    total_cost /= (train_X_seq.shape[0] // batch_size)
    print('epoch %d, avg loss %f'%(i+1,total_cost))

epoch 1, step 1, loss 48.084633
epoch 1, step 200, loss 3.000995
epoch 1, step 400, loss 2.912233
epoch 1, step 600, loss 9.430214
epoch 1, avg loss 12.790535
epoch 2, step 800, loss 7.287593
epoch 2, step 1000, loss 13.377748
epoch 2, step 1200, loss 11.709394
epoch 2, step 1400, loss 9.363575
epoch 2, avg loss 7.502459
epoch 3, step 1600, loss 0.055793
epoch 3, step 1800, loss 2.792339
epoch 3, step 2000, loss 1.423341
epoch 3, step 2200, loss 5.258823
epoch 3, avg loss 5.829440
epoch 4, step 2400, loss 0.035322
epoch 4, step 2600, loss 2.375077
epoch 4, step 2800, loss 0.474343
epoch 4, step 3000, loss 12.236855
epoch 4, avg loss 4.864292
epoch 5, step 3200, loss 3.332653
epoch 5, step 3400, loss 12.662679
epoch 5, step 3600, loss 13.143872
epoch 5, step 3800, loss 0.916353
epoch 5, avg loss 4.172150
epoch 6, step 4000, loss 0.072409
epoch 6, step 4200, loss 0.621689
epoch 6, step 4400, loss 1.656671
epoch 6, avg loss 3.422261
epoch 7, step 4600, loss 0.562943
epoch 7, step 4800, lo

In [16]:
label_Y, predicted_Y = [], []
for k in range(0,(test_X_seq.shape[0] // batch_size)*batch_size,batch_size):
    batch_x = test_X_seq[k:k+batch_size]
    batch_x_char = generate_char_seq(batch_x)
    batch_y = test_Y_seq[k:k+batch_size]
    Y_pred = sess.run(model.crf_decode,
                  feed_dict={model.word_ids:batch_x,
                            model.char_ids:batch_x_char})
    predicted_Y.append(Y_pred)
    label_Y.append(batch_y)

In [17]:
print(classification_report(np.vstack(label_Y).ravel(), np.vstack(predicted_Y).ravel(), target_names=tag2idx.keys()))

             precision    recall  f1-score   support

      EVENT       0.98      0.96      0.97      8063
        ORG       0.99      1.00      0.99    185006
        FAC       0.98      0.96      0.97     23771
        DOC       0.95      0.99      0.97      2600
        LAW       0.97      0.86      0.91      6080
       NORP       0.96      0.96      0.96      2120
        ART       0.98      0.92      0.95      2120
        PAD       0.98      0.92      0.95     10280
       TIME       0.97      0.88      0.93      1280
        PRN       0.98      0.96      0.97      1860
        LOC       1.00      0.65      0.79        20

avg / total       0.98      0.98      0.98    243200



  .format(len(labels), len(target_names))


In [18]:
test_string = 'Keikhlasan merupakan faktor utama yang perlu ada pada setiap pemimpin dan ahli UMNO sekiranya mahu melihat parti itu pulih kembali selain mendapat sokongan majoriti rakyat negara ini, kata Ahli Majlis Tertinggi (MT) UMNO Datuk Seri Idris Jusoh.'

In [19]:
test_X = []
for w in test_string.split():
    w = process_word(w)
    try:
        test_X.append(word2idx[w])
    except:
        test_X.append(2)

In [20]:
batch_x_char = generate_char_seq(np.array([test_X]))

In [21]:
Y_pred = sess.run(model.crf_decode,feed_dict={model.word_ids:np.array([test_X]),
                                              model.char_ids:batch_x_char})

In [22]:
for no, i in enumerate(test_string.split()):
    print(i,idx2tag[Y_pred[0,no]])

Keikhlasan O
merupakan O
faktor O
utama O
yang O
perlu O
ada O
pada O
setiap O
pemimpin O
dan O
ahli O
UMNO NORP
sekiranya O
mahu O
melihat O
parti O
itu O
pulih O
kembali O
selain O
mendapat O
sokongan O
majoriti O
rakyat O
negara O
ini, O
kata O
Ahli O
Majlis O
Tertinggi O
(MT) O
UMNO PRN
Datuk PRN
Seri PRN
Idris PRN
Jusoh. PRN


In [26]:
def get_entity(string):
    test_X = []
    for w in string.split():
        w = process_word(w)
        try:
            test_X.append(word2idx[w])
        except:
            test_X.append(2)
    batch_x_char = generate_char_seq(np.array([test_X]))
    Y_pred = sess.run(model.crf_decode,feed_dict={model.word_ids:np.array([test_X]),
                                              model.char_ids:batch_x_char})
    for no, i in enumerate(string.split()):
        print(i,idx2tag[Y_pred[0,no]])

In [27]:
get_entity('KUALA LUMPUR: Sempena sambutan Aidilfitri minggu depan, Perdana Menteri Tun Dr Mahathir Mohamad dan Menteri Pengangkutan Anthony Loke Siew Fook menitipkan pesanan khas kepada orang ramai yang mahu pulang ke kampung halaman masing-masing. Dalam video pendek terbitan Jabatan Keselamatan Jalan Raya (JKJR) itu, Dr Mahathir menasihati mereka supaya berhenti berehat dan tidur sebentar  sekiranya mengantuk ketika memandu.')

KUALA LOC
LUMPUR: LOC
Sempena O
sambutan O
Aidilfitri TIME
minggu TIME
depan, TIME
Perdana PRN
Menteri PRN
Tun PRN
Dr PRN
Mahathir PRN
Mohamad PRN
dan O
Menteri PRN
Pengangkutan PRN
Anthony PRN
Loke PRN
Siew PRN
Fook PRN
menitipkan PRN
pesanan PRN
khas PRN
kepada O
orang O
ramai O
yang O
mahu O
pulang O
ke O
kampung LOC
halaman O
masing-masing. O
Dalam O
video O
pendek O
terbitan O
Jabatan O
Keselamatan O
Jalan O
Raya ART
(JKJR) O
itu, O
Dr PRN
Mahathir PRN
menasihati O
mereka O
supaya O
berhenti O
berehat O
dan O
tidur O
sebentar O
sekiranya O
mengantuk O
ketika O
memandu. O
