In [40]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, Model, load_model
from tensorflow.keras.layers import Input, Conv1D , LSTM, GRU, Dense, Embedding, Bidirectional, GaussianDropout, TimeDistributed, SpatialDropout1D, MaxPooling2D
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow import keras

from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import re
from ast import literal_eval

import tensorflow as tf


In [41]:
data_train = pd.read_excel('https://github.com/toanphanminh/ner/blob/main/ner_dataset_train.xlsx?raw=true')
data_train = data_train.dropna()
data_train.head()

Unnamed: 0,Intent,Parameters,Question
0,B-gpe,"[[0, 7, ""B-gpe""]]",Israeli police investigating a money-launderin...
1,B-perI-perB-orgI-orgB-tim,"[[0, 7, ""B-per""], [8, 14, ""I-per""], [32, 42, ""...",Senator McCain could secure the Republican Par...
2,B-gpeB-tim,"[[0, 8, ""B-gpe""], [58, 64, ""B-tim""]]",American consumers fear prices will go even hi...
3,B-gpeB-gpeB-tim,"[[4, 10, ""B-gpe""], [35, 42, ""B-gpe""], [134, 13...",The Syrian newspaper is calling on Israeli off...
4,B-geoB-geoI-geoB-perB-gpeB-orgB-tim,"[[29, 32, ""B-geo""], [111, 120, ""B-geo""], [33, ...",Despite her absence from the New Zealand event...


In [42]:
data_test = pd.read_excel('https://github.com/toanphanminh/ner/blob/main/ner_dataset_test.xlsx?raw=true')
data_test = data_train.dropna()
data_test.head()

Unnamed: 0,Intent,Parameters,Question
0,B-gpe,"[[0, 7, ""B-gpe""]]",Israeli police investigating a money-launderin...
1,B-perI-perB-orgI-orgB-tim,"[[0, 7, ""B-per""], [8, 14, ""I-per""], [32, 42, ""...",Senator McCain could secure the Republican Par...
2,B-gpeB-tim,"[[0, 8, ""B-gpe""], [58, 64, ""B-tim""]]",American consumers fear prices will go even hi...
3,B-gpeB-gpeB-tim,"[[4, 10, ""B-gpe""], [35, 42, ""B-gpe""], [134, 13...",The Syrian newspaper is calling on Israeli off...
4,B-geoB-geoI-geoB-perB-gpeB-orgB-tim,"[[29, 32, ""B-geo""], [111, 120, ""B-geo""], [33, ...",Despite her absence from the New Zealand event...


**Preprocess label data from raw data**



In [43]:
def preprocess_data(data, out_file):
  intents = data['Intent'].to_list()
  parameters = data['Parameters'].to_list()
  questions = data['Question'].to_list()
  def sort_by_firt_index(obj):
    return int(obj[0])
  preprocess_questions = []
  embedded_questions = []
  a=1
  for param, question in zip(parameters, questions):
    try:
      lists = re.findall("\[[^\]]*\]", param[1:-1])

      lists = [x[1:-1].replace("\"","").split(',') for x in lists]

      for lst in lists:
        lst[0] = int(lst[0])
        lst[1] = int(lst[1])
        lst[2] = lst[2].strip()
      lists.sort(key = sort_by_firt_index)

      begin_idx = 0 ;
      keep_sub_string_idx = []
      for i in lists :
        keep_sub_string_idx.append([begin_idx, i[0]])
        begin_idx = i[1]
      #last substring
      keep_sub_string_idx.append([begin_idx, len(question)])

      lists.append([-1,-1,""])
      preprocess_question = ""

      for sub_str_idx, entity in zip(keep_sub_string_idx, lists):
        preprocess_question += (question[sub_str_idx[0]:sub_str_idx[1]] + entity[2])
      preprocess_questions.append(preprocess_question)
      if len(preprocess_question) == 0 :
        preprocess_questions = question
    except:
      print("HAHA")
  outData = {'Intent' : data['Intent'],
             'Parameters': data['Parameters'],
             'Question':data['Question'],
             'Preprocess_Question': preprocess_questions
             }
  df = pd.DataFrame(outData, columns=['Intent', 'Parameters', 'Question', 'Preprocess_Question'])
  df.to_excel(out_file, index=False, header=True)

  print(out_file,"Done")

preprocess_data(data_train, 'preprocess_train.xlsx')
preprocess_data(data_test, 'preprocess_test.xlsx')




preprocess_train.xlsx Done
preprocess_test.xlsx Done


In [44]:
data = pd.read_excel('preprocess_train.xlsx')
data=data.dropna()
data.head()

Unnamed: 0,Intent,Parameters,Question,Preprocess_Question
0,B-gpe,"[[0, 7, ""B-gpe""]]",Israeli police investigating a money-launderin...,B-gpe police investigating a money-laundering ...
1,B-perI-perB-orgI-orgB-tim,"[[0, 7, ""B-per""], [8, 14, ""I-per""], [32, 42, ""...",Senator McCain could secure the Republican Par...,B-per I-per could secure the B-org I-org nomin...
2,B-gpeB-tim,"[[0, 8, ""B-gpe""], [58, 64, ""B-tim""]]",American consumers fear prices will go even hi...,B-gpe consumers fear prices will go even highe...
3,B-gpeB-gpeB-tim,"[[4, 10, ""B-gpe""], [35, 42, ""B-gpe""], [134, 13...",The Syrian newspaper is calling on Israeli off...,The B-gpe newspaper is calling on B-gpe offici...
4,B-geoB-geoI-geoB-perB-gpeB-orgB-tim,"[[29, 32, ""B-geo""], [111, 120, ""B-geo""], [33, ...",Despite her absence from the New Zealand event...,Despite her absence from the B-geo I-geo event...


**Prepare Dataset (X : Data, Y: target)**


In [45]:
intents = data['Intent']
parameters = data['Parameters']
questions = data['Question']
target_questions = data['Preprocess_Question']

#Build entities list deom parameters list
entity_key_list =[]
for index , pr in enumerate(parameters):
  pr = literal_eval(pr)
  if len(pr)> 0 :
    for p in pr :
      if p[2] not in entity_key_list :
        entity_key_list.append(p[2])
# entities_list = {}
# for en_key in entity_key_list:
#   entities_list[en_key]= []

#Build input data & taget data from question, target questions
X = []
Y= []

for index, qs in enumerate(questions):
  X.append(str(questions[index]))
  tg_qs = str(target_questions[index]).split()
  tg_qs_list = []
  for item in tg_qs :
    if item not in entity_key_list:
      tg_qs_list.append('0')
    else :
      tg_qs_list.append(item)
  Y.append(" ".join(tg_qs_list))

#data augmentation
duplication = True
if duplication :
  filter_class = ['B-eve', 'I-art', 'I-eve', 'I-gpe', 'I-nat', 'Tag']

  input_lack_data =[]
  target_lack_data = []
  Y_list = Y
  for idx,tar_class in enumerate(Y_list):
    for f_class in filter_class :
      if f_class in tar_class.split() :
        input_lack_data.append(X[idx])
        target_lack_data.append(Y[idx])
        break
  for i in range(len(target_lack_data)):
    for j in range(50):
      X.append(input_lack_data[i])
      Y.append(target_lack_data[i])

print("Sample:")
print("X[0]: ", X[0])
print("Y[0]: ", Y[0])

Sample:
X[0]:  Israeli police investigating a money-laundering scheme have arrested 22 employees of the country 's largest bank 
Y[0]:  B-gpe 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


**Tokenize + Padding + convert to tensor**

In [49]:
tokenized_data = Tokenizer(oov_token='OOV', filters = '', split= ' ')
tokenized_data.fit_on_texts(X)
tokenized_data_text = tokenized_data.texts_to_sequences(X)
vec_data = pad_sequences(tokenized_data_text, padding = 'post')

tokenized_target = Tokenizer( filters = '', split= ' ', lower = False)
tokenized_target.fit_on_texts(Y)
tokenized_target_text = tokenized_target.texts_to_sequences(Y)
vec_target = pad_sequences(tokenized_target_text, padding = 'post')

pickle.dump(tokenized_data, open('tokenized_data', 'wb'))
pickle.dump(tokenized_target, open('tokenized_target', 'wb'))

data_vocab_size = len(tokenized_data.word_index)
target_vocab_size = len(tokenized_target.word_index) + 1
src_max_len = vec_data.shape[1]
dest_max_len = vec_target.shape[1]

tf_X = tf.convert_to_tensor(vec_data, dtype = tf.int32)
tf_Y = tf.stack([tf.keras.utils.to_categorical(i, num_classes= target_vocab_size) for i in vec_target])

print("data shape", tf_X.shape)
print("target shape", tf_Y.shape)

data shape (54809, 103)
target shape (54809, 103, 19)


In [51]:
!wget http://nlp.stanford.edu/data/glove.42B.300d.zip
!unzip glove*.zip

--2024-07-19 13:48:09--  http://nlp.stanford.edu/data/glove.42B.300d.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.42B.300d.zip [following]
--2024-07-19 13:48:09--  https://nlp.stanford.edu/data/glove.42B.300d.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.42B.300d.zip [following]
--2024-07-19 13:48:09--  https://downloads.cs.stanford.edu/nlp/data/glove.42B.300d.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1877800501 (1.7G) [application/zip]


**Using Pre-trained Word Embedding**

In [53]:
glove_file = "/content/glove.42B.300d.txt"
import tqdm

EMBEDDING_DIM = 40
def construct_embedding_matrix(embedding_file, word_index):
  embedding_dict = {}
  with open(glove_file, 'r', encoding = "utf8") as f :
    for line in f :
      values = line.split()
      word = values[0]
      if word in word_index.keys() :
        vector = np.asarray(values[1:], 'float32')
        embedding_dict[word] = vector
  num_words = len(word_index)+1
  embedding_maxtrix = np.zeros((num_words, EMBEDDING_DIM))

  for word, i in tqdm.tqdm(word_index.items()):
    if i < num_words :
      vect = embedding_dict.get(word, [])
      if len(vect)>0 :
        embedding_maxtrix[i] = vect[:EMBEDDING_DIM]
  return embedding_maxtrix

embedding_matrix = construct_embedding_matrix(glove_file, tokenized_data.word_index)

100%|██████████| 26403/26403 [00:00<00:00, 168431.01it/s]


**Build NER model with Word Embedding Glove(40 Dim) + BI-LTSM + CNN1D**

In [55]:
input_dim = len(tokenized_data.word_index) + 1
output_dim = 40
input_length = vec_data.shape[1]
dropout_thres = 0.2
n_tags = len(tokenized_target.word_index) + 1

input_layer = Input(shape = tf_X.shape[1])
embedding_layer = Embedding(input_dim=input_dim, output_dim=output_dim, input_length=input_length, embeddings_initializer = keras.initializers.Constant(embedding_matrix),trainable = False)(input_layer)

bi_ltsm_layer = Bidirectional(LSTM(units = output_dim, dropout = dropout_thres, return_sequences= True), merge_mode='concat')(embedding_layer)
cnn_layer = Conv1D(40,3,activation='relu', padding = 'same')(embedding_layer)
cnn_layer = GaussianDropout(dropout_thres)(cnn_layer)
cnn_layer = Conv1D(80,5,activation ='relu',padding = 'same')(cnn_layer)

concat_layer = tf.keras.layers.Concatenate()([bi_ltsm_layer, cnn_layer])

dense_layer = Sequential()
dense_layer.add(Dense(100, activation= 'relu'))
dense_layer.add(Dense(85,activation = 'relu'))
dense_layer.add(Dense(50, activation = 'relu'))
dense_layer.add(Dense(n_tags, activation = 'softmax'))

classifier = TimeDistributed(dense_layer)(concat_layer)

model = tf.keras.Model(inputs = input_layer, outputs = classifier)
adam = Adam(learning_rate=0.001)
model.compile(optimizer = adam, loss = 'categorical_crossentropy', metrics = ['accuracy'])
model.summary()



Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_2 (InputLayer)        [(None, 103)]                0         []                            
                                                                                                  
 embedding_1 (Embedding)     (None, 103, 40)              1056160   ['input_2[0][0]']             
                                                                                                  
 conv1d_2 (Conv1D)           (None, 103, 40)              4840      ['embedding_1[0][0]']         
                                                                                                  
 gaussian_dropout_1 (Gaussi  (None, 103, 40)              0         ['conv1d_2[0][0]']            
 anDropout)                                                                                   

**Training**

In [56]:
callback_model = tf.keras.callbacks.ModelCheckpoint('model_bi_ltsm_cnn.h5', monitor = 'val_loss')
callback_stop = tf.keras.callbacks.EarlyStopping(monitor = 'loss', patience= 2)

history = model.fit(
    x=tf_X,
    y= tf_Y,
    batch_size = 128,
    epochs = 50,
    validation_split = 0.1,
    callbacks = [callback_model, callback_stop]
)

Epoch 1/50

  saving_api.save_model(


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [57]:
#save full model
model.save("model.hdf5")

**Test Model**

In [58]:
data_test = pd.read_excel('preprocess_test.xlsx')
data_test=data_test.dropna()
data_test.head()

Unnamed: 0,Intent,Parameters,Question,Preprocess_Question
0,B-gpe,"[[0, 7, ""B-gpe""]]",Israeli police investigating a money-launderin...,B-gpe police investigating a money-laundering ...
1,B-perI-perB-orgI-orgB-tim,"[[0, 7, ""B-per""], [8, 14, ""I-per""], [32, 42, ""...",Senator McCain could secure the Republican Par...,B-per I-per could secure the B-org I-org nomin...
2,B-gpeB-tim,"[[0, 8, ""B-gpe""], [58, 64, ""B-tim""]]",American consumers fear prices will go even hi...,B-gpe consumers fear prices will go even highe...
3,B-gpeB-gpeB-tim,"[[4, 10, ""B-gpe""], [35, 42, ""B-gpe""], [134, 13...",The Syrian newspaper is calling on Israeli off...,The B-gpe newspaper is calling on B-gpe offici...
4,B-geoB-geoI-geoB-perB-gpeB-orgB-tim,"[[29, 32, ""B-geo""], [111, 120, ""B-geo""], [33, ...",Despite her absence from the New Zealand event...,Despite her absence from the B-geo I-geo event...


**Evaluation on Test Dataset**

In [62]:
intents_test = data_test['Intent']
parameters_test = data_test['Parameters']
questions_test = data_test['Question']
target_questions_test = data_test['Preprocess_Question']

data_input_test = []
target_input_test = []
entities_list = list(tokenized_target.word_index.keys())
entities_list.remove('0')

for index, qs in enumerate(questions_test):
  data_input_test.append(str(questions_test[index]))
  tg_qs = str(target_questions_test[index]).split()
  tg_qs_list = []
  for item in tg_qs :
    if item not in entities_list:
      tg_qs_list.append('0')
    else :
      tg_qs_list.append(item)
  target_input_test.append(" ".join(tg_qs_list))

tokenized_data_test_text = tokenized_data.texts_to_sequences(data_input_test)
vec_data_test = pad_sequences(tokenized_data_test_text, padding = 'post', maxlen = 103)

tokenized_target_test_text = tokenized_target.texts_to_sequences(target_questions_test)
vec_target_test = pad_sequences(tokenized_target_test_text, padding = 'post', maxlen = 103)

input_test_target = vec_data_test
input_test_data = tf.convert_to_tensor(vec_data_test, dtype = tf.int32)

#model.load_ưeights("model_bi_ltsm_cnn.h5")
input_test_pred = model.predict(input_test_data)
input_test_pred = input_test_pred.argmax(axis= 2)

test_ground_true = input_test_target.reshape(-1,1)
test_pred = input_test_pred.reshape(-1,1)

target_names= ['O-OVV'] + list(tokenized_target.word_index.keys())

report = classification_report(test_ground_true, test_pred, target_names=target_names, zero_division = 0)
print("class: " ,tokenized_target.word_index)
print("\nClassificasion report:\n", report)




ValueError: Number of classes, 26404, does not match size of target_names, 19. Try specifying the labels parameter

In [64]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report

# Assuming `data_test` and tokenizers (`tokenized_data`, `tokenized_target`) are already defined and loaded

intents_test = data_test['Intent']
parameters_test = data_test['Parameters']
questions_test = data_test['Question']
target_questions_test = data_test['Preprocess_Question']

data_input_test = []
target_input_test = []
entities_list = list(tokenized_target.word_index.keys())
entities_list.remove('0')

for index, qs in enumerate(questions_test):
    data_input_test.append(str(questions_test[index]))
    tg_qs = str(target_questions_test[index]).split()
    tg_qs_list = []
    for item in tg_qs:
        if item not in entities_list:
            tg_qs_list.append('0')
        else:
            tg_qs_list.append(item)
    target_input_test.append(" ".join(tg_qs_list))

tokenized_data_test_text = tokenized_data.texts_to_sequences(data_input_test)
vec_data_test = pad_sequences(tokenized_data_test_text, padding='post', maxlen=103)

# Correct tokenization of target questions
tokenized_target_test_text = tokenized_target.texts_to_sequences(target_input_test)
vec_target_test = pad_sequences(tokenized_target_test_text, padding='post', maxlen=103)

input_test_target = vec_target_test
input_test_data = tf.convert_to_tensor(vec_data_test, dtype=tf.int32)

# Load model weights
# model.load_weights("model_bi_ltsm_cnn.h5")

input_test_pred = model.predict(input_test_data)
input_test_pred = input_test_pred.argmax(axis=2)

test_ground_true = input_test_target.reshape(-1)
test_pred = input_test_pred.reshape(-1)

# Debugging: Check unique classes in test_ground_true and test_pred
print(f"Unique classes in ground truth: {np.unique(test_ground_true)}")
print(f"Unique classes in predictions: {np.unique(test_pred)}")

# Create target names based on tokenized target
target_names = ['O-OVV'] + [key for key in tokenized_target.word_index.keys() if key != '0']

# Debugging: Ensure target names match the number of unique classes
print(f"Number of target names: {len(target_names)}")
print(f"Number of unique classes in ground truth: {len(np.unique(test_ground_true))}")
print(f"Number of unique classes in predictions: {len(np.unique(test_pred))}")

# Generating report only for the classes present in predictions
labels = list(range(len(target_names)))
report = classification_report(test_ground_true, test_pred, labels=labels, target_names=target_names, zero_division=0)
print("Class: ", tokenized_target.word_index)
print("\nClassification report:\n", report)


Unique classes in ground truth: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18]
Unique classes in predictions: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18]
Number of target names: 18
Number of unique classes in ground truth: 19
Number of unique classes in predictions: 19
Class:  {'0': 1, 'B-geo': 2, 'B-tim': 3, 'B-gpe': 4, 'B-org': 5, 'B-per': 6, 'I-per': 7, 'I-org': 8, 'B-eve': 9, 'I-tim': 10, 'I-art': 11, 'I-geo': 12, 'I-eve': 13, 'B-art': 14, 'I-gpe': 15, 'B-nat': 16, 'I-nat': 17, 'Tag': 18}

Classification report:
               precision    recall  f1-score   support

       O-OVV       1.00      1.00      1.00   2480490
       B-geo       0.98      0.99      0.98    548464
       B-tim       0.85      0.87      0.86     27725
       B-gpe       0.91      0.91      0.91     15193
       B-org       0.91      0.91      0.91     11286
       B-per       0.77      0.62      0.69     14900
       I-per       0.82      0.84      0.83     12540
       I-org     

In [None]:
tokenized_data = pickle.load(open('tokenized_data', 'rb'))
tokenized_target = pickle.load(open('tokenized_target', 'rb'))
model = tf.keras.models.load_model('model.hdf5')

def get_entities(input_sen, input_with_params):
  token_sen = input_sen.split()
  token_param = input_with_params.split()
  params_list =[]

  for idx, param in enumerate(token_param):
    if param !='0' :
      param_item =[]
      param_item.append(token_sen[idx])
      param_item.append(param)
      params_list.append(param_item)
  return params_list

while(1):
  print("Input your sentences: ")
  input_text = input()
  if input_text == 'exit':
    break
  tokenized_input_text = tokenized_data.texts_to_sequences([input_text])
  vec_input_text = pad_sequences(tokenized_input_text, padding = 'post', maxlen = 103)

  vec_out = model.predict(vec_input_text)
  vec_out_result = vec_out.argmax(axis = 2)
  result = tokenized_target.sequences_to_texts(vec_out_result)
  result = result[0]
  print(result)