# Install Library

In [None]:
!pip install transformers

# Import Libraries

In [None]:
import pandas as pd
import pickle
import numpy as np

import sklearn
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelBinarizer
from itertools import chain

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model, Model
from tensorflow.keras.utils import to_categorical
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Conv1D, Input
from keras.layers import Bidirectional, concatenate, SpatialDropout1D, GlobalMaxPooling1D
from keras.callbacks import ModelCheckpoint, EarlyStopping

# Prepare Data for BERT

In [None]:
# LST20 data: sentences and their NER tag sequences for the train and eval splits.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load these files when they come from a trusted source.
def _read_pickle(path):
    """Return the object stored in the pickle file at *path*."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


train_sent = _read_pickle('train_sent_lst20.data')
train_ner = _read_pickle('train_ner_lst20.data')
eval_sent = _read_pickle('eval_sent_lst20.data')
eval_ner = _read_pickle('eval_ner_lst20.data')

In [None]:
# NER tag inventory: the outside tag "O" followed by the B_ (begin), I_ (inside)
# and E_ (end) variant of every entity type, in exactly that order.
_entity_types = ("BRN", "DES", "DTM", "LOC", "MEA", "NUM", "ORG", "PER", "TRM", "TTL")
ner_tags = ["O"] + [
    f"{prefix}_{entity}"
    for prefix in ("B", "I", "E")
    for entity in _entity_types
]

In [None]:
# Remove every training sentence whose tag sequence contains a label that is
# not in ner_tags, keeping train_sent and train_ner index-aligned.
# A single pass with a set replaces the previous list.index()/pop() loop,
# which rescanned the whole corpus for each rejected sentence (quadratic).
# Assumes tags are hashable (e.g. str) and both lists have the same length.
valid_tags = set(ner_tags)
is_clean = [all(tag in valid_tags for tag in tags) for tags in train_ner]

# Rejected tag sequences, kept under the original name for inspection.
dump = [tags for tags, ok in zip(train_ner, is_clean) if not ok]

# limit 300 words
max_words = 300

# Slice-assignment updates the existing list objects in place, exactly as the
# original pop()-based code did, so any other reference to them stays valid.
train_sent[:] = [sent[:max_words] for sent, ok in zip(train_sent, is_clean) if ok]
train_ner[:] = [tags[:max_words] for tags, ok in zip(train_ner, is_clean) if ok]

In [None]:
# One space-joined string per training sentence (the form fed to the tokenizer).
train_toks = [' '.join(words) for words in train_sent]

In [None]:
import numpy as np

# Fixed sequence length used for tokenisation, label padding and model inputs.
seq_len = 300
num_samples = len(train_toks)

# Token ids and attention masks are integers; allocating them as int32 matches
# the dtype declared by the model's Input layers and halves the memory of the
# default float64 buffers.
Xids = np.zeros((num_samples, seq_len), dtype='int32')
Xmask = np.zeros((num_samples, seq_len), dtype='int32')

In [None]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("Geotrend/bert-base-en-th-cased")

# Encode every training sentence to exactly seq_len positions (truncated or
# padded) and store the ids / attention mask in the pre-allocated arrays.
# NOTE(review): special tokens are added and the tokenizer may split one word
# into several pieces, so positions here presumably do not line up one-to-one
# with the per-word labels in train_ner — confirm the intended alignment.
encode_options = dict(max_length=seq_len, truncation=True, padding='max_length',
                      add_special_tokens=True, return_tensors='tf')
for row, sentence in enumerate(train_toks):
    encoded = tokenizer.encode_plus(sentence, **encode_options)
    Xids[row, :] = encoded['input_ids']
    Xmask[row, :] = encoded['attention_mask']

In [None]:
# Label vocabulary: alphabetically sorted tags with the padding label last.
ner_tags_sorted = sorted(ner_tags) + ['pad']
# Forward (tag -> index) and reverse (index -> tag) lookup tables.
ner_to_ix = {tag: index for index, tag in enumerate(ner_tags_sorted)}
ix_to_ner = {index: tag for tag, index in ner_to_ix.items()}

In [None]:
def prepare_sequence_target(input_label):
    """Convert a sequence of NER tags to their integer indices.

    Args:
        input_label: iterable of tag strings for one sentence.

    Returns:
        A list with one index per tag, looked up in the module-level
        ``ner_to_ix`` mapping. Tags missing from the mapping are encoded
        as the index of the outside tag ``"O"``.
    """
    # Resolve the fallback once instead of on every unknown tag.
    outside_ix = ner_to_ix["O"]
    # dict.get performs a single lookup per tag (no separate `in keys()` test).
    return [ner_to_ix.get(tag, outside_ix) for tag in input_label]

In [None]:
def one_hot(labels, n_label):
    """One-hot encode a 1-D sequence of integer class indices.

    Args:
        labels: 1-D sequence (list or array) of integer class indices.
        n_label: number of classes, i.e. the width of each encoded row.

    Returns:
        A float64 array of shape ``(len(labels), n_label)`` in which row ``i``
        is all zeros except for a 1 at column ``labels[i]``.

    Raises:
        IndexError: if an index falls outside ``[-n_label, n_label)``.
    """
    # Explicit integer dtype keeps the empty-input case valid for indexing.
    indices = np.asarray(labels, dtype=np.intp)
    out = np.zeros((len(indices), n_label))
    # Vectorised scatter: one assignment instead of a per-row temporary array
    # (the old temporary also shadowed this function's own name).
    out[np.arange(len(indices)), indices] = 1
    return out

In [None]:
# Integer-encode every training tag sequence ...
y_tr = [prepare_sequence_target(tag_seq) for tag_seq in train_ner]
# ... then pad / truncate at the end to seq_len, filling with the 'pad' index.
labels = pad_sequences(sequences=y_tr, maxlen=seq_len, padding='post',
                       truncating='post', value=ner_to_ix["pad"])

In [None]:
# One-hot label tensor of shape (sentences, seq_len, classes). It is allocated
# directly as uint8 because it only ever holds 0/1; the default float64 buffer
# would be eight times larger before being cast down later.
labels_arr = np.zeros((len(train_ner), seq_len, len(ner_tags_sorted)), dtype='uint8')

In [None]:
# Class count comes from the buffer itself rather than a hard-coded 32, so it
# always agrees with how labels_arr was allocated.
n_classes = labels_arr.shape[-1]
for i, item in enumerate(labels):
    labels_arr[i, :, :] = one_hot(item, n_classes)

# copy=False avoids duplicating the (large) tensor when it is already uint8.
labels_arr = labels_arr.astype('uint8', copy=False)

In [None]:
# Index -> tag table over the sorted tags plus the trailing 'pad' label, and
# its inverse. These are the same mappings that were built earlier in the notebook.
ix_to_ner = dict(enumerate(sorted(ner_tags) + ['pad']))
ner_to_ix = {tag: index for index, tag in ix_to_ner.items()}

In [None]:
# Rebuilds the label vocabulary and both lookup tables (repeats the earlier
# cell, so the values are unchanged when the notebook is run top to bottom).
ner_tags_sorted = [*sorted(ner_tags), 'pad']
ner_to_ix = {tag: position for position, tag in enumerate(ner_tags_sorted)}
ix_to_ner = {position: tag for tag, position in ner_to_ix.items()}

In [None]:
# Build a tf.data dataset whose elements are (ids, mask, one-hot labels)
# triples, sliced along the first axis of the three arrays.
dataset = tf.data.Dataset.from_tensor_slices((Xids, Xmask, labels_arr))
# Notebook display only: the result is not assigned, so `dataset` is unchanged.
dataset.take(1)

In [None]:
def map_func(input_ids, masks, labels):
    """Repack one dataset element as ``(features, labels)``.

    The features dict is keyed by ``'input_ids'`` and ``'attention_mask'``;
    ``labels`` is passed through untouched.
    """
    features = {'input_ids': input_ids, 'attention_mask': masks}
    return features, labels

In [None]:
# Convert each (ids, mask, labels) triple into the ({'input_ids', 'attention_mask'}, labels)
# pair returned by map_func.
dataset = dataset.map(map_func)

In [None]:
batch_size = 64

# Shuffle once and keep that order on every pass. With the default
# reshuffle_each_iteration=True the order changes each epoch, so a later
# take()/skip() split would move samples between the training and validation
# subsets from one epoch to the next (validation leakage).
# If per-epoch shuffling of the training data is wanted, shuffle the training
# subset again after splitting.
dataset = dataset.shuffle(10000, reshuffle_each_iteration=False).batch(batch_size, drop_remainder=True)

# Notebook display only: the result is not assigned.
dataset.take(1)

In [None]:
# Fraction of the batches used for training.
split = 0.95
# Number of training batches: batches in the dataset times the fraction, truncated.
size = int(num_samples / batch_size * split)

In [None]:
# The first `size` batches form the training set; every batch after them is
# used for validation.
# NOTE(review): `dataset` was shuffled upstream; if that shuffle reorders on
# every iteration, the membership of these two subsets presumably changes
# between epochs — confirm.
train_ds = dataset.take(size)
val_ds = dataset.skip(size)

# Only the two subsets are needed from here on; remove the combined name.
del dataset

# Model building

In [None]:
from transformers import TFAutoModel

# Load the TensorFlow model for the same checkpoint name that the tokenizer uses.
bert = TFAutoModel.from_pretrained('Geotrend/bert-base-en-th-cased')

# Print the layer / parameter summary of the loaded model.
bert.summary()

In [None]:
# Units per LSTM direction; the bidirectional wrapper runs two of them.
main_lstm_unit = 256 ## Bidirectional 256 + 256 = 512
# Dropout rate applied to the LSTM's recurrent connections.
# NOTE(review): Keras documents that a non-zero recurrent_dropout prevents use
# of the cuDNN LSTM kernel — confirm the slower training is acceptable.
lstm_recurrent_dropout = 0.5

In [None]:
# Two int32 inputs of length seq_len. Their names match the keys of the
# feature dict produced by map_func, so dataset elements are routed by name.
input_ids = Input(shape=(seq_len,), name='input_ids', dtype='int32')
masks = Input(shape=(seq_len,), name='attention_mask', dtype='int32')

# First output of the BERT main layer (presumably the per-token hidden states,
# (batch, seq_len, hidden) — TODO confirm for the transformers version in use).
embeddings = bert.bert(input_ids, attention_mask=masks)[0]

word_embeddings = SpatialDropout1D(0.3)(embeddings)

# BiLSTM
main_lstm = Bidirectional(LSTM(units=main_lstm_unit, return_sequences=True,
                               recurrent_dropout=lstm_recurrent_dropout))(word_embeddings)
main_lstm = TimeDistributed(Dense(50, activation="relu"))(main_lstm)

# One softmax unit per label in ner_tags_sorted (all tags plus 'pad'). The
# width is derived from the vocabulary instead of a hard-coded 32, so it
# cannot drift from the one-hot label depth.
out = Dense(len(ner_tags_sorted), activation='softmax', name='output')(main_lstm)

In [None]:
model = Model(inputs=[input_ids, masks], outputs=out)

# Freeze the pretrained BERT encoder so only the BiLSTM / dense head is trained.
# The layer is addressed directly (it is the one called when the graph was
# built) rather than through model.layers[2], which silently targets a
# different layer if the graph layout ever changes.
bert.bert.trainable = False
model.summary()

# Training

In [None]:
# NOTE(review): the `decay` argument is not accepted by every Keras optimizer
# implementation — confirm it is supported by the installed TensorFlow version.
model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.001, decay=1e-6), loss='categorical_crossentropy', metrics=['accuracy'])

# Both callbacks watch the validation loss. The training call supplies
# validation_data, and monitoring the training loss would stop training and
# pick "best" weights by how well the model fits the training set.
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0, patience=5, verbose=0, mode='min')

filepath = "bert_bilstm_best_weight.h5"

# Writes the model to `filepath` whenever the monitored value improves.
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

callbacks_list = [early_stopping, checkpoint]

In [None]:
# Train for at most 15 epochs on train_ds, evaluating on val_ds after each
# epoch; callbacks_list supplies early stopping and checkpointing.
history = model.fit(train_ds, epochs=15, verbose=1,callbacks=callbacks_list, validation_data = val_ds)

# Prediction and Evaluation

In [None]:
# Gold labels for the evaluation split: index-encode, pad / truncate at the end
# to seq_len with the 'pad' index, then one-hot encode each sentence.
eval_indices = [prepare_sequence_target(tag_seq) for tag_seq in eval_ner]
eval_padded = pad_sequences(sequences=eval_indices, maxlen=seq_len, padding='post',
                            truncating='post', value=ner_to_ix["pad"])
n_eval_classes = len(ix_to_ner)
y_ev = [to_categorical(row, num_classes=n_eval_classes) for row in eval_padded]

In [None]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('Geotrend/bert-base-en-th-cased')

def prep_data(text, max_length=300):
    """Tokenize one sentence into the feature dict expected by the model.

    Args:
        text: the sentence, either as a single string or as a sequence of
            word strings.
        max_length: number of positions to pad / truncate to. It must equal
            the sequence length the model was built with (300 by default).

    Returns:
        A dict with int32 tensors ``'input_ids'`` and ``'attention_mask'``.
    """
    # Training sentences are space-joined before tokenisation; do the same for
    # word lists so inference sees identically preprocessed input.
    # NOTE(review): a raw list of strings is presumably treated by the
    # tokenizer as already-tokenised input — confirm.
    if not isinstance(text, str):
        text = ' '.join(text)
    tokens = tokenizer.encode_plus(text, max_length=max_length, truncation=True,
                                   padding='max_length', add_special_tokens=True,
                                   return_token_type_ids=False, return_tensors='tf')
    # Ids and masks are integers; int32 matches the model's Input dtype
    # (they were previously cast to float64).
    return {
        'input_ids': tf.cast(tokens['input_ids'], tf.int32),
        'attention_mask': tf.cast(tokens['attention_mask'], tf.int32),
    }

In [None]:
# Run the model on every evaluation sentence, one at a time; squeeze() drops
# the size-1 axes from each prediction.
pred = [model.predict(prep_data(sentence)).squeeze() for sentence in eval_sent]

In [None]:

# Convert the per-position class scores (predictions) and one-hot rows (gold)
# back to tag strings, one list per evaluation sentence.
y_pred = []
y_true = []

# Iterate predictions and gold labels in lockstep instead of indexing by
# position; argmax over the last axis decodes a whole sentence at once.
for scores, gold in zip(pred, y_ev):
    y_pred.append([ix_to_ner[ix] for ix in scores.argmax(axis=-1)])
    y_true.append([ix_to_ner[ix] for ix in np.argmax(gold, axis=-1)])

In [None]:
def ner_classification_report(y_true, y_pred, excluded=("O", "pad")):
    """Build a per-tag classification report over flattened tag sequences.

    Args:
        y_true: iterable of gold tag sequences (one sequence per sentence).
        y_pred: iterable of predicted tag sequences, parallel to ``y_true``.
        excluded: tags left out of the report. The default drops the outside
            tag and the padding label.

    Returns:
        The text report produced by ``classification_report`` with four
        decimal digits, restricted to the non-excluded tags.

    Side effects:
        Prints the list of reported tags.
    """
    lb = LabelBinarizer()
    # Flatten the sentences into one long token sequence and binarize it; the
    # binarizer's classes are learned from the gold labels only.
    y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true)))
    y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))

    # Exclude tags by name. The previous `tagset[:-2]` relied on "O" and "pad"
    # being the last two sorted classes, and would drop real entity tags
    # whenever one of them was absent from the gold labels.
    tagset = [cls for cls in sorted(set(lb.classes_)) if cls not in excluded]
    print(tagset)
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}

    return classification_report(
        y_true_combined,
        y_pred_combined,
        labels=[class_indices[cls] for cls in tagset],
        target_names=tagset,
        digits=4,
    )

In [None]:
# Print the per-tag report for the evaluation split.
print(ner_classification_report(y_true,y_pred))
