In [43]:
# Mount Google Drive at /content/drive (requires the google.colab runtime).
# Later cells save and reload the trained model under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# reference
# https://confusedcoders.com/data-science/deep-learning/how-to-build-deep-neural-network-for-custom-ner-with-keras
# https://towardsdatascience.com/named-entity-recognition-ner-using-keras-bidirectional-lstm-28cd3f301f54
# https://www.aitimejournal.com/@akshay.chavan/complete-tutorial-on-named-entity-recognition-ner-using-python-and-keras

In [None]:
!pip install tensorflow tensorflow_text pandas numpy tqdm

Collecting tensorflow_text
  Downloading tensorflow_text-2.9.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.6 MB)
[K     |████████████████████████████████| 4.6 MB 28.7 MB/s 
Collecting tf-estimator-nightly==2.8.0.dev2021122109
  Downloading tf_estimator_nightly-2.8.0.dev2021122109-py2.py3-none-any.whl (462 kB)
[K     |████████████████████████████████| 462 kB 62.7 MB/s 
Collecting tensorflow
  Downloading tensorflow-2.9.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (511.7 MB)
[K     |████████████████████████████████| 511.7 MB 4.5 kB/s 
[?25hCollecting flatbuffers>=1.12
  Downloading flatbuffers-1.12-py2.py3-none-any.whl (15 kB)
INFO: pip is looking at multiple versions of tensorflow-text to determine which version is compatible with other requirements. This could take a while.
Collecting tensorflow_text
  Downloading tensorflow_text-2.8.2-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (4.9 MB)
[K     |████████████████████████████████| 4.9 MB

In [44]:
import ast

import numpy as np
import pandas as pd
import sklearn.model_selection
import tensorflow as tf
import tensorflow_text as tf_text
# import tensorflow_addons as tfa
from tqdm import tqdm

In [45]:
# MODEL PARAMS
# Vocabulary cap: passed to TextVectorization(max_tokens=...) and used as the
# input dimension of the Embedding layer.
vocab_size = 25000
# Fixed number of token positions per example; vectorized text and tag lists
# are padded/truncated to this length.
sequence_length = 256

Load Data and Prepare

In [46]:
from typing import List, Optional

def tag_normalize(x: List[int], length: Optional[int] = None) -> List[int]:
  """Pad or truncate a tag list to a fixed length.

  Args:
    x: Per-token tag ids for one sentence.
    length: Target length. Defaults to the notebook-level `sequence_length`.

  Returns:
    A new list with exactly `length` items: the first `length` entries of `x`
    when it is too long, otherwise `x` right-padded with 0.
  """
  if length is None:
    length = sequence_length
  # Truncate with the target length itself. The previous hard-coded 256 would
  # silently disagree with `sequence_length` as soon as that value changed.
  trimmed = x[:length]
  return trimmed + [0] * (length - len(trimmed))

In [None]:
# Download the dataset CSV from Google Drive by file id; the recorded log
# shows it being written to /content/ner_location.csv.
!gdown 1piwUH_CpsYyqwpLWQKo5UTnvZk8NlYMG

Downloading...
From: https://drive.google.com/uc?id=1piwUH_CpsYyqwpLWQKo5UTnvZk8NlYMG
To: /content/ner_location.csv
100% 6.54M/6.54M [00:00<00:00, 22.5MB/s]


In [47]:
# `word` and `tag` were stored in the CSV as Python list literals, so they have
# to be parsed back into lists. ast.literal_eval only accepts literals, whereas
# eval() would execute any expression found in the downloaded file.
df = pd.read_csv("/content/ner_location.csv")
df['word'] = df['word'].apply(ast.literal_eval)
df['tag'] = df['tag'].apply(ast.literal_eval)
# Space-joined sentence text, fed to the text vectorizer.
df["sentence"] = df['word'].apply(" ".join)
# Tags padded/truncated to the fixed sequence length.
df["tag_normal"] = df['tag'].apply(tag_normalize)
df

Unnamed: 0,word,tag,sentence,tag_normal
0,"[thousands, of, demonstrators, have, marched, ...","[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, ...",thousands of demonstrators have marched throug...,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, ..."
1,"[helicopter, gunships, saturday, pounded, mili...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...",helicopter gunships saturday pounded militant ...,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
2,"[un, relief, coordinator, jan, egeland, said, ...","[1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...",un relief coordinator jan egeland said sunday ...,"[1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ..."
3,"[mr, egeland, said, the, latest, figures, show...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",mr egeland said the latest figures show millio...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,"[he, said, last, week, s, tsunami, and, the, m...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",he said last week s tsunami and the massive un...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...
24404,"[in, an, opinion, piece, in, the, washington, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",in an opinion piece in the washington post fri...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
24405,"[president, bush, last, week, confirmed, he, s...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",president bush last week confirmed he secretly...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
24406,"[iran, s, elite, security, forces, are, warnin...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",iran s elite security forces are warning oppos...,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
24407,"[opposition, activists, have, called, for, pro...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",opposition activists have called for protests ...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [48]:
# Hold out 10% of the rows for validation; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    df[["word", "sentence"]],
    df["tag"],
    random_state=2022,
    test_size=0.1,
)

In [49]:
# Sanity check on tag_normal: report every row whose padded tag list is not
# exactly sequence_length items long.
for row, padded_tags in enumerate(df["tag_normal"]):
  if len(padded_tags) == sequence_length:
    continue
  print(f"Found wrong length on {row}")

In [50]:
# Concatenate every sentence into a single space-separated corpus string.
CORPUS = " ".join(df["sentence"].tolist())

In [51]:
def _flatten(nested):
  """Concatenate a list of lists into one flat list, preserving order."""
  return [item for inner in nested for item in inner]

# Preprocessed train: one long word stream and its aligned tag stream.
word_combine_train = _flatten(X_train["word"].to_list())
tag_combine_train = _flatten(y_train.to_list())
# Every word needs exactly one tag, otherwise the sliding windows cut from
# these streams would pair words with the wrong labels.
assert len(word_combine_train) == len(tag_combine_train)
print("Train Combined:")
print(len(word_combine_train))
print(len(tag_combine_train))


# Preprocessed test: same flattening for the held-out split.
word_combine_test = _flatten(X_test["word"].to_list())
tag_combine_test = _flatten(y_test.to_list())
assert len(word_combine_test) == len(tag_combine_test)
print(len(word_combine_test))
print(len(tag_combine_test))

Train Combined:
474555
474555
52591
52591


Create Text Vectorizer Layer (Preprocessing Layer)   
This converts raw strings into sequences of integer token ids.

In [102]:
# Reset Keras' global state before (re)building the layers below.
tf.keras.backend.clear_session()

In [103]:
# Preprocessing layer: turns each raw string into `sequence_length` integer
# token ids drawn from a vocabulary of at most `vocab_size` entries.
vectorize_layer = tf.keras.layers.TextVectorization(
    output_sequence_length=sequence_length,
    output_mode='int',
    max_tokens=vocab_size,
)

In [104]:
# Build the vocabulary from the corpus string.
# NOTE(review): CORPUS is joined from every row of df, so held-out test
# sentences also contribute to the vocabulary — confirm this is intended.
vectorize_layer.adapt([CORPUS])





In [105]:
# Inspect row 1: the raw sentence, its token ids, and the shape of its padded
# tag list.
example = df.iloc[1]['sentence']
print(f"Example: {example}")
print(vectorize_layer(example))
print(np.shape(df.iloc[1]['tag_normal']))

Example: helicopter gunships saturday pounded militant hideouts in the orakzai tribal region where many taliban militants are believed to have fled to avoid an earlier military offensive in nearby south waziristan
tf.Tensor(
[1067 4236   98 3554  254 3774    3    2 2779  556  119  149  313  408
  105   33  741    5   16  786    5 1538   21  140   44  565    3 1218
   75  869    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0

Modelling  
1. Text Vectorizer
2. Embedding
3. BiLSTM
4. LSTM
5. TimeDistributed Dense (2 classes, applied at each of the `sequence_length` positions)

In [106]:
def create_model(train: bool):
  """Build the token-level tagger together with its optimizer and loss.

  Args:
    train: Passed as the `training` argument of every Dropout and
      BatchNormalization call, so the mode is fixed when the model is built.
      Use True for the model being optimised and False for a copy used for
      validation or inference.

  Returns:
    A `(model, optimizer, loss_func)` tuple: the Keras model mapping raw
    strings to per-position probabilities over 2 classes, a fresh Adam
    optimizer, and a sparse categorical cross-entropy loss.
  """
  # One raw string per example. `(1,)` is the tuple form (the former `(1)` was
  # just the int 1), and the name avoids shadowing the builtin `input`.
  text_input = tf.keras.layers.Input(shape=(1,), dtype=tf.string)
  x = vectorize_layer(text_input)
  x = tf.keras.layers.Embedding(vocab_size, 256)(x)
  x = tf.keras.layers.Dropout(0.5)(x, training=train)
  x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(256, return_sequences=True), merge_mode = 'concat')(x)
  x = tf.keras.layers.BatchNormalization()(x, training=train)
  x = tf.keras.layers.Dropout(0.3)(x, training=train)
  x = tf.keras.layers.LSTM(256, return_sequences=True)(x)
  x = tf.keras.layers.BatchNormalization()(x, training=train)
  x = tf.keras.layers.Dropout(0.3)(x, training=train)
  # Per-position softmax over the two tag classes.
  output = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(2, activation="softmax", kernel_initializer='he_normal'))(x)

  model = tf.keras.Model(text_input, output)
  optimizer = tf.keras.optimizers.Adam()
  loss_func = tf.keras.losses.SparseCategoricalCrossentropy()
  return model, optimizer, loss_func

# Training-mode model plus the optimizer and loss used by the training loop.
model, optimizer, loss_func = create_model(True)

In [107]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization (TextVec  (None, 256)              0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, 256, 256)          6400000   
                                                                 
 dropout (Dropout)           (None, 256, 256)          0         
                                                                 
 bidirectional (Bidirectiona  (None, 256, 512)         1050624   
 l)                                                              
                                                                 
 batch_normalization (BatchN  (None, 256, 512)         2048  

In [108]:
# # Model Save and Loading Check
# model.save("/content/drive/Shareddrives/CapBatu/ML Divsion/Named Entity Recognition Location/model/test")
# model = tf.keras.models.load_model("/content/drive/Shareddrives/CapBatu/ML Divsion/Named Entity Recognition Location/model/test")
# model.predict(["test"])

Train

In [109]:
batch_size = 64 # Number of sliding windows fed to the model per gradient update
epochs = 3 # Number of train + validation passes run by the training loop

In [110]:
def _num_windows(num_word: int) -> int:
  """Number of stride-1 window start positions used in one pass.

  Same formula as before: the remainder of `num_word / sequence_length` plus
  one full window is left off the end, so every window that is cut is exactly
  `sequence_length` words long.
  """
  return max(0, num_word - (num_word % sequence_length + sequence_length))


def _window_batches(words, tags, starts):
  """Yield `(x_feed, y_feed)` tensors for batches of sliding windows.

  Args:
    words: Flat list of words.
    tags: Flat list of tag ids aligned with `words`.
    starts: Iterable of window start indices (e.g. a tqdm-wrapped range).

  Yields:
    A string tensor holding one space-joined window per row and a tensor with
    the matching tags, `batch_size` rows at a time. A final smaller batch is
    yielded when the number of windows is not a multiple of `batch_size`, so
    no window is silently dropped.
  """
  x_batch, y_batch = [], []
  for start_idx in starts:
    end_idx = start_idx + sequence_length
    x_batch.append(" ".join(words[start_idx:end_idx]))
    y_batch.append(tags[start_idx:end_idx])
    if len(x_batch) == batch_size:
      yield tf.convert_to_tensor(x_batch), tf.convert_to_tensor(y_batch)
      x_batch, y_batch = [], []
  if x_batch:
    yield tf.convert_to_tensor(x_batch), tf.convert_to_tensor(y_batch)


def _token_accuracy(y_true, y_prob):
  """Fraction of positions whose arg-max class equals the true tag."""
  pred = tf.cast(tf.math.argmax(y_prob, axis=-1), tf.int32)
  hits = tf.equal(tf.cast(y_true, tf.int32), pred)
  return tf.reduce_mean(tf.cast(hits, tf.float32))


def train(model: tf.keras.Model):
  """Run one pass of gradient updates over the training word stream.

  Reads the notebook-level `word_combine_train`, `tag_combine_train`,
  `optimizer` and `loss_func`.

  Args:
    model: The model to optimise; its weights are updated in place.

  Returns:
    The same `model` instance.
  """
  print("Model Training")
  # Sliding window
  p_bar = tqdm(range(_num_windows(len(word_combine_train))))
  for x_feed, y_feed in _window_batches(word_combine_train, tag_combine_train, p_bar):
    with tf.GradientTape() as tape:
      # Forward pass (only what the gradient needs is recorded on the tape)
      y_pred = model(x_feed)
      loss = loss_func(y_feed, y_pred)

    # Backprop
    grad = tape.gradient(loss, model.trainable_weights)
    optimizer.apply_gradients(zip(grad, model.trainable_weights))

    # Plain floats so the bar shows numbers instead of tf.Tensor reprs.
    p_bar.set_postfix({
        "loss": float(loss),
        "acc": float(_token_accuracy(y_feed, y_pred)),
    })

  return model

def test(model: tf.keras.Model):
  """Evaluate `model` on the held-out word stream.

  Reads the notebook-level `word_combine_test`, `tag_combine_test` and
  `loss_func`, and shows the running mean loss and accuracy over the batches
  seen so far on the progress bar. Nothing is returned.
  """
  print("Model Validation")
  # Sliding window
  p_bar = tqdm(range(_num_windows(len(word_combine_test))))
  # Running sums give the mean over batches without re-reducing a growing
  # list of tensors after every batch.
  loss_sum = 0.0
  acc_sum = 0.0
  n_batches = 0
  for x_feed, y_feed in _window_batches(word_combine_test, tag_combine_test, p_bar):
    # Forward pass
    y_pred = model(x_feed)

    loss_sum += float(loss_func(y_feed, y_pred))
    acc_sum += float(_token_accuracy(y_feed, y_pred))
    n_batches += 1

    p_bar.set_postfix({
        "loss": loss_sum / n_batches,
        "acc": acc_sum / n_batches,
    })

In [111]:
# Training Loop
# The dropout / batch-norm mode is fixed when a model is built, so validation
# uses a second model built with train=False. Build it once and refresh its
# weights after every epoch instead of constructing a new model per epoch.
new_model, _, _ = create_model(False)
for i in range(epochs):
  print(f"Epochs: {i}")
  # Train
  model = train(model)

  # Validation
  new_model.set_weights(model.get_weights())
  test(new_model)


Epochs: 0
Model Training


100%|██████████| 474112/474112 [14:44<00:00, 536.06it/s, loss=tf.Tensor(0.13223135, shape=(), dtype=float32), acc=tf.Tensor(0.94750977, shape=(), dtype=float32)]


Model Validation


100%|██████████| 52224/52224 [00:44<00:00, 1177.46it/s, loss=tf.Tensor(0.16476706, shape=(), dtype=float32), acc=tf.Tensor(0.9317308, shape=(), dtype=float32)]


Epochs: 1
Model Training


100%|██████████| 474112/474112 [14:36<00:00, 540.71it/s, loss=tf.Tensor(0.111857176, shape=(), dtype=float32), acc=tf.Tensor(0.95806885, shape=(), dtype=float32)]


Model Validation


100%|██████████| 52224/52224 [00:43<00:00, 1193.54it/s, loss=tf.Tensor(0.1550283, shape=(), dtype=float32), acc=tf.Tensor(0.93654984, shape=(), dtype=float32)]


Epochs: 2
Model Training


100%|██████████| 474112/474112 [14:35<00:00, 541.80it/s, loss=tf.Tensor(0.075076036, shape=(), dtype=float32), acc=tf.Tensor(0.9694824, shape=(), dtype=float32)]


Model Validation


100%|██████████| 52224/52224 [00:43<00:00, 1192.19it/s, loss=tf.Tensor(0.1547095, shape=(), dtype=float32), acc=tf.Tensor(0.9362086, shape=(), dtype=float32)]


In [112]:
# Transfer weight: build a copy with train=False and load the trained weights
# into it.
new_model = create_model(False)[0]
new_model.set_weights(model.get_weights())

In [113]:
# Sample query used to try out the trained tagger.
text = "hotel in bali"

In [114]:
# Model output for the single query: one row of 2 class scores per position.
prediction = new_model.predict([text])[0]
# Most likely class id at each position.
pred_converted = tf.math.argmax(prediction, axis=-1)
def get_idx(prediction):
  """Return, in ascending order, the indices of `prediction` whose entry equals 1."""
  return [pos for pos in range(len(prediction)) if prediction[pos] == 1]

# Positions the model tagged as class 1.
index_one = get_idx(pred_converted)

text_vectorizator = new_model.get_layer("text_vectorization")
dictionary = text_vectorizator.get_vocabulary()
vectorized = text_vectorizator([text])[0]
# Token ids found at the tagged positions.
attention = [vectorized[i] for i in index_one]
for i in attention:
  # Tagged padding positions map to a blank vocabulary entry; skip those so
  # the output is not flooded with empty lines.
  if len(dictionary[i].strip()) != 0:
    print(dictionary[i])





bali


















































































































































































































































In [115]:
# Export the model built with train=False to the mounted shared drive.
new_model.save("/content/drive/Shareddrives/CapBatu/ML Divsion/Named Entity Recognition Location/model/v3/saved_model")





INFO:tensorflow:Assets written to: /content/drive/Shareddrives/CapBatu/ML Divsion/Named Entity Recognition Location/model/v3/saved_model/assets


INFO:tensorflow:Assets written to: /content/drive/Shareddrives/CapBatu/ML Divsion/Named Entity Recognition Location/model/v3/saved_model/assets


In [116]:
# May not use the model because of availability of the dataset

Testing Save Model

In [117]:
import pandas as pd
import tensorflow_text as tf_text
import tensorflow as tf
# import tensorflow_addons as tfa
import numpy as np

In [118]:
# Directory holding the exported model; the last path component should be "saved_model".
saved_model_path = "/content/drive/Shareddrives/CapBatu/ML Divsion/Named Entity Recognition Location/model/v3/saved_model"
# Archive written next to it. Derived from saved_model_path so the two cannot
# drift apart when the version folder changes.
out_zip_path = f"{saved_model_path}.zip" # With .zip extension

In [119]:
def get_idx(prediction):
  """Collect the positions of `prediction` that hold the value 1.

  Redefined here so the "Testing Save Model" section can run on its own.
  """
  positions = range(len(prediction))
  return list(filter(lambda pos: prediction[pos] == 1, positions))


In [120]:
# Reload the exported model and repeat the sample prediction using only
# objects created in this section.
model = tf.keras.models.load_model(saved_model_path)
text_vectorizator = model.get_layer("text_vectorization")
word_dictionary = text_vectorizator.get_vocabulary()

sample_text = "hotel in bali"
prediction = model.predict([sample_text])[0]
pred_converted = tf.math.argmax(prediction, axis=1)
index_one = get_idx(pred_converted)

# Vectorize with the reloaded model's own layer and look tokens up in its own
# vocabulary. The previous code read `vectorized` and `dictionary` left over
# from the training cells, which fails on a fresh kernel.
vectorized = text_vectorizator([sample_text])[0]
attention = [vectorized[i] for i in index_one]
for i in attention:
  # Skip blank vocabulary entries (padding positions).
  if len(word_dictionary[i].strip()) != 0:
    print(word_dictionary[i])





bali


In [121]:
# Copy the exported model directory into the working directory, zip it with
# bzip2 compression to out_zip_path, then delete the local copy.
!cp -r "{saved_model_path}" "./" && zip -Z bzip2 -r "{out_zip_path}" "./saved_model" && rm -fr "saved_model"

  adding: saved_model/ (stored 0%)
  adding: saved_model/keras_metadata.pb (bzipped 91%)
  adding: saved_model/variables/ (stored 0%)
  adding: saved_model/variables/variables.index (bzipped 48%)
  adding: saved_model/variables/variables.data-00000-of-00001 (bzipped 6%)
  adding: saved_model/saved_model.pb (bzipped 94%)
  adding: saved_model/assets/ (stored 0%)
