In [10]:
# Mount Google Drive into the Colab runtime; the trained model is exported
# under /content/drive at the end of the notebook.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [11]:
# reference
# https://confusedcoders.com/data-science/deep-learning/how-to-build-deep-neural-network-for-custom-ner-with-keras
# https://towardsdatascience.com/named-entity-recognition-ner-using-keras-bidirectional-lstm-28cd3f301f54
# https://www.aitimejournal.com/@akshay.chavan/complete-tutorial-on-named-entity-recognition-ner-using-python-and-keras

In [12]:
# Install the notebook's dependencies into the runtime (versions are not pinned).
!pip install tensorflow tensorflow_text pandas numpy tqdm



In [13]:
import ast
from typing import List, Optional

import numpy as np
import pandas as pd
import sklearn.model_selection
import tensorflow as tf
import tensorflow_text as tf_text
# import tensorflow_addons as tfa
from tqdm import tqdm

In [21]:
# MODEL PARAMS
vocab_size = 25000  # used as TextVectorization max_tokens and as the Embedding input size
sequence_length = 256  # token ids per example; also the sliding-window width used for training

Load Data and Prepare

In [22]:
from typing import List

def tag_normalize(x: List[int], max_len: Optional[int] = None) -> List[int]:
  """Pad or truncate one tag sequence to a fixed length.

  Args:
    x: Tag ids of a single sentence, as a list.
    max_len: Target length. When omitted, the notebook-level
      ``sequence_length`` is used.

  Returns:
    A new list with exactly ``max_len`` items: the first ``max_len`` tags of
    ``x`` when it is longer, otherwise ``x`` followed by zeros.
  """
  if max_len is None:
    max_len = sequence_length

  # Truncate with the same limit that is used for padding; the cut-off used to
  # be a literal 256 and would have diverged from sequence_length on change.
  if len(x) > max_len:
    return x[:max_len]

  missing = max_len - len(x)
  return x + [0] * missing

In [23]:
# Download the dataset from Google Drive by file id (saved as /content/ner_location.csv per the log below).
!gdown 1piwUH_CpsYyqwpLWQKo5UTnvZk8NlYMG

Downloading...
From: https://drive.google.com/uc?id=1piwUH_CpsYyqwpLWQKo5UTnvZk8NlYMG
To: /content/ner_location.csv
  0% 0.00/6.54M [00:00<?, ?B/s]100% 6.54M/6.54M [00:00<00:00, 322MB/s]


In [24]:
# Load the downloaded dataset. The `word` and `tag` columns were stored in the
# CSV as the text form of Python lists, so they are parsed back into lists.
# ast.literal_eval only accepts Python literals, whereas eval() would execute
# any expression that happens to be in the downloaded file.
df = pd.read_csv("/content/ner_location.csv")
df['word'] = df['word'].apply(ast.literal_eval)
df['tag'] = df['tag'].apply(ast.literal_eval)
# Rebuild the plain sentence and a fixed-length copy of the tags.
df["sentence"] = df['word'].apply(lambda x: " ".join(x))
df["tag_normal"] = df['tag'].apply(tag_normalize)
df

Unnamed: 0,word,tag,sentence,tag_normal
0,"[thousands, of, demonstrators, have, marched, ...","[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, ...",thousands of demonstrators have marched throug...,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, ..."
1,"[helicopter, gunships, saturday, pounded, mili...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...",helicopter gunships saturday pounded militant ...,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
2,"[un, relief, coordinator, jan, egeland, said, ...","[1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...",un relief coordinator jan egeland said sunday ...,"[1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ..."
3,"[mr, egeland, said, the, latest, figures, show...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",mr egeland said the latest figures show millio...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,"[he, said, last, week, s, tsunami, and, the, m...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",he said last week s tsunami and the massive un...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...
24404,"[in, an, opinion, piece, in, the, washington, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",in an opinion piece in the washington post fri...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
24405,"[president, bush, last, week, confirmed, he, s...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",president bush last week confirmed he secretly...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
24406,"[iran, s, elite, security, forces, are, warnin...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",iran s elite security forces are warning oppos...,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
24407,"[opposition, activists, have, called, for, pro...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",opposition activists have called for protests ...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [25]:
# Sanity check on tag_normal: report every row whose padded tag list is not
# exactly sequence_length items long.
for row_number, padded_tags in enumerate(df["tag_normal"]):
  if len(padded_tags) != sequence_length:
    print(f"Found wrong length on {row_number}")

In [26]:
# Combine all sentences into one space-separated corpus string
sentences = df["sentence"].tolist()
CORPUS = " ".join(sentences)

In [27]:
# Flatten the per-sentence word and tag lists into two corpus-wide lists that
# are indexed in parallel.
a = df["word"].to_list()
b = df["tag"].to_list()

word_combine = []
for sentence_words in a:
  word_combine.extend(sentence_words)

tag_combine = []
for sentence_tags in b:
  tag_combine.extend(sentence_tags)

# Both totals are printed so they can be compared by eye.
print(len(word_combine))
print(len(tag_combine))

527146
527146


Create Text Vectorizer Layer (Preprocessing Layer)   
This converts raw strings into sequences of integer token ids.

In [28]:
# Reset Keras' global state before any layer is created. Auto-generated layer
# names restart from scratch, which matters because a later cell looks the
# vectorizer up by the name "text_vectorization".
tf.keras.backend.clear_session()

In [29]:
# Preprocessing layer that turns raw strings into integer token ids: the
# vocabulary is capped at `vocab_size` entries and every output is padded or
# truncated to `sequence_length` ids.
vectorize_layer = tf.keras.layers.TextVectorization(
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=sequence_length)

In [30]:
# Build the vocabulary from the whole corpus, passed as a one-element list.
vectorize_layer.adapt([CORPUS])

In [31]:
# Vectorize one sentence and show the length of its padded tag list, to check
# that both sides have `sequence_length` entries.
example = df['sentence'].iloc[1]
print(f"Example: {example}")
print(vectorize_layer(example))
print(np.array(df['tag_normal'].iloc[1]).shape)

Example: helicopter gunships saturday pounded militant hideouts in the orakzai tribal region where many taliban militants are believed to have fled to avoid an earlier military offensive in nearby south waziristan
tf.Tensor(
[1067 4236   98 3554  254 3774    3    2 2779  556  119  149  313  408
  105   33  741    5   16  786    5 1538   21  140   44  565    3 1218
   75  869    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0

Modelling  
1. Text Vectorizer
2. Embedding
3. BiLSTM
4. LSTM
5. TimeDistributed Dense (2 softmax classes for each of the `sequence_length` positions)

In [32]:
def create_model(train: bool):
  """Build the per-token tagger together with its optimizer and loss.

  Layer stack: the shared notebook-level ``vectorize_layer`` -> Embedding ->
  Dropout -> bidirectional LSTM -> BatchNormalization -> Dropout -> LSTM ->
  BatchNormalization -> Dropout -> per-position Dense softmax over 2 classes.

  Args:
    train: Forwarded as the ``training`` argument of every Dropout and
      BatchNormalization call, so the mode is fixed when the model is built.

  Returns:
    A ``(model, optimizer, loss_func)`` tuple: the ``tf.keras.Model``, a new
    Adam optimizer and a ``SparseCategoricalCrossentropy`` loss.
  """
  # `shape` must be a tuple: (1) is just the int 1, (1,) is a one-element tuple.
  # The local is not called `input`, to avoid shadowing the Python builtin.
  text_input = tf.keras.layers.Input(shape=(1,), dtype=tf.string)
  x = vectorize_layer(text_input)
  x = tf.keras.layers.Embedding(vocab_size, 256)(x)
  x = tf.keras.layers.Dropout(0.5)(x, training=train)
  x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(256, return_sequences=True), merge_mode = 'concat')(x)
  x = tf.keras.layers.BatchNormalization()(x, training=train)
  x = tf.keras.layers.Dropout(0.3)(x, training=train)
  x = tf.keras.layers.LSTM(256, return_sequences=True)(x)
  x = tf.keras.layers.BatchNormalization()(x, training=train)
  x = tf.keras.layers.Dropout(0.3)(x, training=train)
  # One 2-way softmax per sequence position (the two classes are tag 0 and tag 1).
  output = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(2, activation="softmax", kernel_initializer='he_normal'))(x)

  model = tf.keras.Model(text_input, output)
  optimizer = tf.keras.optimizers.Adam()
  # The model already outputs probabilities, so the loss keeps its default from_logits=False.
  loss_func = tf.keras.losses.SparseCategoricalCrossentropy()
  return model, optimizer, loss_func

# Training-mode instance: dropout and batch-norm layers are called with training=True.
model, optimizer, loss_func = create_model(True)

In [33]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization (TextVec  (None, 256)              0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, 256, 256)          6400000   
                                                                 
 dropout (Dropout)           (None, 256, 256)          0         
                                                                 
 bidirectional (Bidirectiona  (None, 256, 512)         1050624   
 l)                                                              
                                                                 
 batch_normalization (BatchN  (None, 256, 512)         2048  

Train

In [34]:
batch_size = 64 # number of sliding windows collected before each gradient step
epochs = 3 # NOTE(review): not referenced by any visible cell - confirm whether the training loop should use it

In [35]:
# Custom training loop over the flattened corpus.
# Every start position j yields one example: the `sequence_length` words
# starting at j (joined into a string) and their tags. A gradient step is taken
# each time `batch_size` such windows have been collected.
# NOTE(review): a single pass is hard-coded here; the `epochs` constant defined
# in the cell above is not used - confirm this is intended.
for i in range(1):
  print(f"Epochs: {i}")
  # Sliding window
  num_word = len(word_combine)

  x_batch = []
  y_batch = []

  # The upper bound keeps every window fully inside the corpus.
  p_bar = tqdm(range(num_word - (num_word % sequence_length + sequence_length)))
  for j in p_bar:
    start_idx = j
    end_idx = j + sequence_length

    x_prepare = word_combine[start_idx : end_idx]
    y_prepare = tag_combine[start_idx : end_idx]

    x_batch.append(" ".join(x_prepare))
    y_batch.append(y_prepare)

    if len(x_batch) % batch_size == 0:
      x_feed = tf.convert_to_tensor(x_batch)
      y_feed = tf.convert_to_tensor(y_batch)

      # Only the forward pass and the loss need to be recorded by the tape.
      with tf.GradientTape() as tape:
        y_pred = model(x_feed)
        loss = loss_func(y_feed, y_pred)

      # Backprop
      grad = tape.gradient(loss, model.trainable_weights)
      optimizer.apply_gradients(zip(grad, model.trainable_weights))

      # Token-level accuracy of this batch, computed outside the tape from the
      # same pre-update predictions as the loss.
      pred = tf.math.argmax(y_pred, axis=2)
      acc = tf.reduce_mean(tf.keras.metrics.binary_accuracy(y_feed, tf.cast(pred, dtype=tf.int32)))

      # Plain floats keep the progress bar readable (no tf.Tensor reprs).
      p_bar.set_postfix({
          "loss" : float(loss),
          "acc": float(acc)
      })

      x_batch = []
      y_batch = []

Epochs: 0


100%|██████████| 526848/526848 [16:29<00:00, 532.21it/s, loss=tf.Tensor(0.099455036, shape=(), dtype=float32), acc=tf.Tensor(0.96429443, shape=(), dtype=float32)]


In [36]:
# Transfer weight
new_model, _, _ = create_model(False)

new_model.set_weights(model.get_weights())

In [69]:
# Sample query used for the prediction demo in the next cell.
text = "hotel in bali"

In [70]:
# Run the inference model on the single input and keep its (only) result:
# one row of class probabilities per sequence position.
prediction = new_model.predict([text])[0]
# Pick the most probable class for every position.
pred_converted = tf.math.argmax(prediction, axis=1)
def get_idx(prediction):
  """Return the positions at which `prediction` equals 1.

  Args:
    prediction: An indexable sequence of labels that supports `len()`.

  Returns:
    A list of the matching indices in ascending order; empty when none match.
  """
  return [pos for pos in range(len(prediction)) if prediction[pos] == 1]

# Positions the model labelled as class 1.
index_one = get_idx(pred_converted)

# Re-vectorize the text with the model's own vectorizer so that positions can
# be mapped back to vocabulary entries.
text_vectorizator = new_model.get_layer("text_vectorization")
dictionary = text_vectorizator.get_vocabulary()
vectorized = text_vectorizator([text])[0]

# Token ids found at the flagged positions, then the vocabulary word for each.
attention = [vectorized[position] for position in index_one]
for token_id in attention:
  print(dictionary[token_id])

bali



In [71]:
# Export the inference-mode model in SavedModel format to the mounted shared drive.
tf.saved_model.save(new_model, "/content/drive/Shareddrives/_PercobaanKaenova/NER/v3/saved_model")



INFO:tensorflow:Assets written to: /content/drive/Shareddrives/_PercobaanKaenova/NER/v3/saved_model/assets


INFO:tensorflow:Assets written to: /content/drive/Shareddrives/_PercobaanKaenova/NER/v3/saved_model/assets


In [72]:
# May not use the model because of availability of the dataset