# Install Libraries
tensorflow version 1.13.1

keras 2.2.4

keras-contrib

pythainlp (thai2fit)


In [None]:
!pip install git+https://www.github.com/keras-team/keras-contrib.git

In [None]:
!pip install tensorflow==1.13.1
import tensorflow

In [None]:
!pip install keras==2.2.4
import keras

In [None]:
!pip install pythainlp

# Import Libraries

In [None]:
# Save / Load File
import pickle

# Plot Graph
import matplotlib.pyplot as plt

# Sklearn Report
import sklearn
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelBinarizer
from itertools import chain

# Load Vectors
from gensim.models import KeyedVectors

# Utility
import numpy as np

# Model Utility
from sklearn.model_selection import train_test_split
import pandas as pd

# Keras Model
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model, Input
from tensorflow.keras.utils import to_categorical
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Conv1D
from keras.layers import Bidirectional, concatenate, SpatialDropout1D, GlobalMaxPooling1D
from keras_contrib.layers import CRF
from keras.callbacks import ModelCheckpoint , EarlyStopping

# Load LST20 Data and Download Thai2fit Word Vectors

In [None]:
#LST20 data
# NOTE: pickle.load can execute arbitrary code contained in the file, so only
# point these paths at data files you trust.
def _load_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)

# Training split: sentences and their NER tag sequences.
train_sent = _load_pickle('train_sent_lst20.data')
train_ner = _load_pickle('train_ner_lst20.data')
# Evaluation split: sentences and their NER tag sequences.
eval_sent = _load_pickle('eval_sent_lst20.data')
eval_ner = _load_pickle('eval_ner_lst20.data')

In [None]:
# Fetch the word-vector model through pythainlp (thai2fit, per the notebook
# header) and keep its raw vector matrix, which the Embedding layer later
# receives as its weights.
from pythainlp import word_vector
thai2fit_model = word_vector.get_model()
thai2fit_weight = thai2fit_model.vectors

In [None]:
# Known tag inventory: the outside tag "O" followed by the B_, I_ and E_
# variant of every entity type, in that order.
_entity_types = ["BRN", "DES", "DTM", "LOC", "MEA", "NUM", "ORG", "PER", "TRM", "TTL"]
ner_tags = ["O"] + [
    prefix + "_" + entity
    for prefix in ("B", "I", "E")
    for entity in _entity_types
]

In [None]:
# Collect the tag sequence of every training sentence that contains at least
# one tag missing from the known `ner_tags` inventory.
dump = [
    tag_seq
    for tag_seq in train_ner
    if any(tag not in ner_tags for tag in tag_seq)
]

In [None]:
# Remove every training sentence whose tag sequence was flagged in `dump`,
# keeping train_sent and train_ner aligned index by index.
#
# This is a single filtering pass instead of list.index()/pop() per flagged
# entry, so the cell is linear-time and safe to re-run: a second run finds
# nothing left to remove instead of raising ValueError.
if len(train_sent) != len(train_ner):
    raise ValueError(
        "train_sent and train_ner are misaligned: %d sentences vs %d tag sequences"
        % (len(train_sent), len(train_ner))
    )

flagged = {tuple(ners) for ners in dump}
keep_idx = [idx for idx, ners in enumerate(train_ner) if tuple(ners) not in flagged]

# Slice assignment mutates the existing list objects in place, as pop() did.
train_sent[:] = [train_sent[idx] for idx in keep_idx]
train_ner[:] = [train_ner[idx] for idx in keep_idx]

In [None]:
#limit 300 words
# Clip over-long training sentences, and their tag sequences, to the first 300 items.
for idx in range(len(train_sent)):
    if len(train_sent[idx]) <= 300:
        continue
    train_sent[idx] = train_sent[idx][:300]
    train_ner[idx] = train_ner[idx][:300]

In [None]:
#prepare dictionary

# Flatten the training corpus into one list of words and one list of tags.
word_list = [word for sent in train_sent for word in sent]
ner_list = [ner for ners in train_ner for ner in ners]

# Map every word of the thai2fit vocabulary to its vector. Insertion order
# follows thai2fit_model.index2word.
thai2dict = {word: thai2fit_model[word] for word in thai2fit_model.index2word}

word_list.append("pad")
word_list.append("unknown") #Special Token for Unknown words ("UNK")
ner_list.append("pad")

all_words = sorted(set(word_list))
all_ner = sorted(set(ner_list))
# Kept for backward compatibility; it is NOT used to build thai2dict_to_ix below.
all_thai2dict = sorted(set(thai2dict))

word_to_ix = {word: ix for ix, word in enumerate(all_words)} #convert word to index
ner_to_ix = {ner: ix for ix, ner in enumerate(all_ner)} #convert ner to index
# Enumerates thai2dict in insertion order (index2word order), not sorted order.
thai2dict_to_ix = {word: ix for ix, word in enumerate(thai2dict)} #convert thai2fit to index

ix_to_word = {ix: word for word, ix in word_to_ix.items()} #convert index to word
ix_to_ner = {ix: ner for ner, ix in ner_to_ix.items()} #convert index to ner
ix_to_thai2dict = {ix: word for word, ix in thai2dict_to_ix.items()} #convert index to thai2fit

# "pad" and "unknown" were only appended to word_list above, yet the encoding
# cells look both up in thai2dict_to_ix. Fail here with a clear message if the
# thai2fit vocabulary lacks them, rather than with a bare KeyError later on.
missing_special = [tok for tok in ("pad", "unknown") if tok not in thai2dict_to_ix]
if missing_special:
    raise KeyError(
        "thai2fit vocabulary has no entry for special token(s): %s"
        % ", ".join(missing_special)
    )

n_word = len(word_to_ix)
n_tag = len(ner_to_ix)
n_thai2dict = len(thai2dict_to_ix)
print(n_word)
print(n_tag)
print(n_thai2dict)
print(ner_to_ix)

In [None]:
# Persist the tag -> index mapping to disk as a pickle file.
with open('nerdict.pickle', 'wb') as out_file:
    pickle.dump(ner_to_ix, out_file)

# Model Building

In [None]:
# Sequence length that every sentence is padded / truncated to.
max_len = 300
#max_len_char = 30

# Training schedule.
train_batch_size = 32
train_epochs = 50

# Main word-level BiLSTM.
main_lstm_unit = 256 ## Bidirectional 256 + 256 = 512
lstm_recurrent_dropout = 0.5

# Character-level settings.
char_embedding_dim = 32
character_LSTM_unit = 32

In [None]:
def prepare_sequence_word(input_text):
    """Encode a sequence of words as thai2fit vocabulary indices.

    Reads the module-level `thai2dict` and `thai2dict_to_ix` mappings.
    Words that are not in `thai2dict` are encoded with the index of the
    "unknown" entry.

    Returns a list with one index per item of `input_text`.
    """
    return [
        thai2dict_to_ix[token] if token in thai2dict else thai2dict_to_ix["unknown"] #Use UNK tag for unknown word
        for token in input_text
    ]

def prepare_sequence_target(input_label):
    """Encode a sequence of NER tags as label indices.

    Reads the module-level `ner_to_ix` mapping. Tags that are missing from
    it are encoded with the index of the "O" tag.

    Returns a list with one index per item of `input_label`.
    """
    return [
        ner_to_ix[tag] if tag in ner_to_ix else ner_to_ix["O"]
        for tag in input_label
    ]

In [None]:
# Word Training
# Encode every training sentence as thai2fit indices, then pad / truncate at
# the end to max_len using the index of the "pad" entry.
X_word_tr = pad_sequences(
    sequences=[prepare_sequence_word(sentence) for sentence in train_sent],
    maxlen=max_len,
    value=thai2dict_to_ix["pad"],
    padding='post',
    truncating='post',
)

# Sequence Label Training
# Encode the tag sequences the same way, then one-hot encode each padded row.
# The result stays a Python list with one array per sentence.
y_tr = [
    to_categorical(tag_row, num_classes=n_tag)
    for tag_row in pad_sequences(
        sequences=[prepare_sequence_target(tags) for tags in train_ner],
        maxlen=max_len,
        value=ner_to_ix["pad"],
        padding='post',
        truncating='post',
    )
]

In [None]:
X_word_ev = [prepare_sequence_word(s) for s in eval_sent]
X_word_ev = pad_sequences(maxlen=max_len, sequences=X_word_ev, value=thai2dict_to_ix["pad"], padding='post', truncating='post')

y_ev = [prepare_sequence_target(s) for s in eval_ner]
y_ev = pad_sequences(maxlen=max_len, sequences=y_ev, value=ner_to_ix["pad"], padding='post', truncating='post')
y_ev = [to_categorical(i, num_classes=n_tag) for i in y_ev]

In [None]:
# Word Input
word_in = Input(shape=(max_len,), name='word_input_')

# Word Embedding Using Thai2Fit
# The embedding width is read from the weight matrix instead of being
# hard-coded, so the layer always matches the vectors that were loaded.
embedding_dim = thai2fit_weight.shape[1]
word_embeddings = Embedding(
    input_dim=n_thai2dict,
    output_dim=embedding_dim,
    weights=[thai2fit_weight],
    input_length=max_len,
    mask_zero=False,
    name='word_embedding',
    trainable=False,  # the loaded vectors are not updated during training
)(word_in)

all_word_embeddings = SpatialDropout1D(0.3)(word_embeddings)

# BiLSTM
main_lstm = Bidirectional(
    LSTM(
        units=main_lstm_unit,
        return_sequences=True,  # one output per time step, needed for tagging
        recurrent_dropout=lstm_recurrent_dropout,
    )
)(all_word_embeddings)
main_lstm = TimeDistributed(Dense(50, activation="relu"))(main_lstm)

# CRF
crf = CRF(n_tag)  # CRF layer
out = crf(main_lstm)  # output

# Model
model = Model(word_in, out)

# The CRF layer supplies its own loss and accuracy metric.
model.compile(optimizer="adam", loss=crf.loss_function, metrics=[crf.accuracy])

model.summary()

# Training

In [None]:
# File that receives the checkpoint with the best monitored value.
filepath="bilstm_crf_best_weight.h5"

# Stop once validation accuracy has not improved for 5 consecutive epochs.
early_stopping = EarlyStopping(
    monitor='val_crf_viterbi_accuracy',
    mode='max',
    patience=5,
    min_delta=0,
    verbose=0,
)

# Save the model only when validation accuracy improves.
checkpoint = ModelCheckpoint(
    filepath,
    monitor='val_crf_viterbi_accuracy',
    mode='max',
    save_best_only=True,
    verbose=1,
)

callbacks_list = [early_stopping, checkpoint]

In [None]:
# Train on the encoded training set, holding out the last 5% for validation.
# NOTE(review): epochs is the literal 15 here although the configuration cell
# defines train_epochs = 50 — confirm which value is intended.
history = model.fit(
    X_word_tr,
    np.array(y_tr),
    batch_size=train_batch_size,
    epochs=15,
    validation_split=0.05,
    callbacks=callbacks_list,
    verbose=1,
)

# Visualize loss and accuracy

In [None]:
# Per-epoch training vs. validation accuracy from the Keras History object.
hist = pd.DataFrame(history.history)

plt.style.use("ggplot")
plt.figure(figsize=(12,12))
# Label both curves so the figure can be read without the surrounding code.
plt.plot(hist["crf_viterbi_accuracy"], label="train")
plt.plot(hist["val_crf_viterbi_accuracy"], label="validation")
plt.title("CRF Viterbi accuracy")
plt.xlabel("epoch")
plt.ylabel("accuracy")
plt.legend()
plt.savefig('bilstm_crf_accuracy.png')
plt.show()

In [None]:
# Per-epoch training vs. validation loss from the Keras History object.
hist = pd.DataFrame(history.history)

plt.style.use("ggplot")
plt.figure(figsize=(12,12))
# Label both curves so the figure can be read without the surrounding code.
plt.plot(hist["loss"], label="train")
plt.plot(hist["val_loss"], label="validation")
plt.title("CRF loss")
plt.xlabel("epoch")
plt.ylabel("loss")
plt.legend()
plt.savefig('bilstm_crf_loss.png')
plt.show()

# Prediction and Evaluation

In [None]:
# Run the trained model over the encoded evaluation sentences.
pred_model = model.predict(x=X_word_ev, verbose=1)

In [None]:
# Turn the model outputs and the one-hot gold labels back into tag strings,
# one list of tags per evaluation sentence.
#
# The previous bare `except:` only printed the index of a failing sentence,
# which hid the real error and could leave y_pred and y_true misaligned.
# Errors now propagate, and the only anticipated mismatch is checked up front.
if len(pred_model) != len(y_ev):
    raise ValueError(
        "prediction/label count mismatch: %d predictions vs %d label sequences"
        % (len(pred_model), len(y_ev))
    )

y_pred = []
y_true = []

# Local names are chosen so the model's output tensor `out` is not overwritten.
for sent_scores, sent_onehot in zip(pred_model, y_ev):
    pred_ids = np.argmax(sent_scores, axis=-1)
    true_ids = np.argmax(sent_onehot, axis=-1)
    # Padded positions are kept; in the gold labels they decode to "pad".
    y_pred.append([ix_to_ner[ix] for ix in pred_ids])
    y_true.append([ix_to_ner[ix] for ix in true_ids])

In [None]:
def ner_classification_report(y_true, y_pred, exclude=("O", "pad")):
    """Build a per-tag classification report for sequence-labelling output.

    Args:
        y_true: iterable of gold tag sequences (one sequence per sentence).
        y_pred: iterable of predicted tag sequences, aligned with `y_true`.
        exclude: tags left out of the report. Defaults to the outside tag
            "O" and the padding tag "pad".

    Returns:
        The text report produced by `classification_report`, with 4 digits.

    The tags to report are selected by name. The previous `tagset[:-2]`
    slice assumed "O" and "pad" were the last two sorted classes, and dropped
    real entity tags whenever one of them was absent from `y_true`.
    """
    lb = LabelBinarizer()
    # Flatten the sentences into one long tag list. The binarizer is fitted
    # on the gold tags only.
    y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true)))
    y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))

    excluded = set(exclude)
    tagset = [cls for cls in sorted(set(lb.classes_)) if cls not in excluded]
    print(tagset)
    # Index of each class in the binarized label matrices.
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}

    return classification_report(
        y_true_combined,
        y_pred_combined,
        labels = [class_indices[cls] for cls in tagset],
        target_names = tagset,
        digits=4
    )

In [None]:
# Build the per-tag report for the evaluation set and print it.
report_text = ner_classification_report(y_true, y_pred)
print(report_text)