In [1]:
import codecs
import json
import Preprocess
import CRF
from Parameters import *
import numpy as np
import keras
import sys
import math
import tensorflow as tf
import timeit
import keras.backend as K
import gc
import unicodedata
import tensorflow_hub as hub

Using TensorFlow backend.


# LaBSE model

In [2]:
def get_model(model_url, max_seq_length):
  """Wrap a TF Hub layer in a Keras model that emits L2-normalized embeddings.

  Args:
    model_url: Handle passed straight to `hub.KerasLayer`.
    max_seq_length: Length of each of the three int32 input sequences.

  Returns:
    A `(model, labse_layer)` tuple: the Keras model taking
    `[input_word_ids, input_mask, segment_ids]` and returning the normalized
    pooled output, plus the underlying hub layer.
  """
  labse_layer = hub.KerasLayer(model_url, trainable=True)

  # The three inputs share shape and dtype; only their names differ.
  input_names = ("input_word_ids", "input_mask", "segment_ids")
  model_inputs = [
      tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name=name)
      for name in input_names
  ]

  # The hub layer yields two outputs; only the first (pooled) one is kept.
  pooled_output, _ = labse_layer(model_inputs)

  # L2-normalize each embedding along the feature axis.
  normalize = tf.keras.layers.Lambda(lambda x: tf.nn.l2_normalize(x, axis=1))
  embedding = normalize(pooled_output)

  return tf.keras.Model(inputs=model_inputs, outputs=embedding), labse_layer
# Build the LaBSE encoder once and report how long the hub load took.
max_seq_length = 64  # token length shared by get_model and encode
start = timeit.default_timer()
labse_model, labse_layer = get_model(
    "https://tfhub.dev/google/LaBSE/1", max_seq_length)
stop = timeit.default_timer()

print('Time: ', stop - start)

Time:  558.6550425770001


In [14]:
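# Alternative (disabled): reload a previously saved LaBSE model instead of
# rebuilding it from TF Hub.
# NOTE: before uncommenting, make sure `os` is imported and define `model_url`
# at module level -- `os` is not imported explicitly in this notebook, and
# `model_url` exists only as a parameter of get_model.
# start = timeit.default_timer()
# labse_model = keras.models.load_model(os.path.join(WORKING_DIR, 'Model/LaBSE_model'))
# labse_layer = hub.KerasLayer(model_url, trainable=True)
# stop = timeit.default_timer()

# print('Time: ', stop - start)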

Time:  121.4608063


In [23]:
# type(labse_layer)

tensorflow_hub.keras_layer.KerasLayer

In [24]:
# hub_url = "https://tfhub.dev/google/nnlm-en-dim128/2"
# embed = hub.KerasLayer(hub_url)
# embeddings = embed(["A long sentence.", "single-word", "http://example.com"])
# print(embeddings.shape, embeddings.dtype)

In [35]:
# dir(labse_layer)

In [19]:
# labse_layer_weights = labse_layer.weights

In [20]:
# type(labse_layer_weights)

list

In [22]:
# len(labse_layer_weights)

200

In [25]:
# vocab_file = labse_layer.resolved_object.vocab_file.asset_path.numpy()
# do_lower_case = labse_layer.resolved_object.do_lower_case.numpy()

In [36]:
# dir(vocab_file)

In [37]:
# dir(do_lower_case)

In [38]:
# dir(labse_model)

In [3]:
import bert

# Build the tokenizer from the vocabulary and casing flag exposed by the
# loaded hub layer.
_labse_assets = labse_layer.resolved_object
vocab_file = _labse_assets.vocab_file.asset_path.numpy()
do_lower_case = _labse_assets.do_lower_case.numpy()
tokenizer = bert.bert_tokenization.FullTokenizer(vocab_file, do_lower_case)

def create_input(input_strings, tokenizer, max_seq_length):
  """Convert raw strings into fixed-length id, mask and segment arrays.

  Each string is tokenized, wrapped in "[CLS]" ... "[SEP]", mapped to ids and
  then cut or zero-padded to exactly `max_seq_length` entries. Over-long
  inputs are cut from the right, so their trailing tokens (including
  "[SEP]") are dropped.

  Args:
    input_strings: Iterable of strings to encode.
    tokenizer: Object providing `tokenize(str)` and `convert_tokens_to_ids(list)`.
    max_seq_length: Number of positions per output row.

  Returns:
    Tuple `(input_ids, input_mask, segment_ids)` of NumPy arrays with one row
    per input string. The mask is 1 on real tokens and 0 on padding; segment
    ids are all 0.
  """
  all_ids, all_masks, all_segments = [], [], []
  for text in input_strings:
    tokens = ["[CLS]"] + tokenizer.tokenize(text) + ["[SEP]"]

    # Slicing is a no-op for short inputs and truncates long ones.
    ids = tokenizer.convert_tokens_to_ids(tokens)[:max_seq_length]
    n_real = len(ids)
    n_pad = max_seq_length - n_real

    all_ids.append(ids + [0] * n_pad)
    all_masks.append([1] * n_real + [0] * n_pad)
    all_segments.append([0] * max_seq_length)

  return np.array(all_ids), np.array(all_masks), np.array(all_segments)

def encode(input_text):
  """Embed a batch of strings with the module-level LaBSE model.

  Args:
    input_text: Iterable of strings (it is handed to `create_input` as the
      batch of input strings).

  Returns:
    Whatever `labse_model` returns for the `[input_ids, input_mask,
    segment_ids]` triple built with the global `tokenizer` and
    `max_seq_length`.
  """
  model_inputs = create_input(input_text, tokenizer, max_seq_length)
  return labse_model(list(model_inputs))

# Load model

In [4]:
# Load the trained Keras model used by Predict to score candidate addresses.
# AD_MODEL_FILE is not defined in this notebook -- presumably it comes from
# `from Parameters import *` (TODO confirm).
model = tf.keras.models.load_model(AD_MODEL_FILE) 

# Load necessary files

In [5]:
# Reference embeddings that Predict.predict scores a query against. The "NT"
# variant is the one used when the query equals its tone-stripped form.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects;
# only point these paths at trusted files.
norm_embeddings = np.load(NORM_EMBEDDING_FILE, allow_pickle=True)
NT_norm_embeddings = np.load(NT_NORM_EMBEDDING_FILE, allow_pickle=True)

# JSON lookup tables (all UTF-8 encoded).
with open(NORM_ADDS_FILE, 'r', encoding='utf-8') as f:
  NORM_ADDS = json.load(f)

with open(ID2id_FILE, 'r', encoding='utf-8') as f:
  ID2id = json.load(f)

with open(id2ID_FILE, 'r', encoding='utf-8') as f:
  id2ID = json.load(f)

with open(id2norm_add_FILE, 'r', encoding='utf-8') as f:
  id2norm_add = json.load(f)

# Width of one query vector: encoder output plus the 4 address-type flags
# (presumably 768 + 4 -- TODO confirm against the encoder output size).
dim = 772
# Number of reference rows each query is compared with; presumably the row
# count of the embedding arrays above -- TODO confirm.
num_of_norm = 34481

def get_norm_id(sample):
  """Return the id of the normalized address paired with a training sample.

  The id is the first key of the sample's 'std_add' mapping, i.e. the
  normalized address corresponding to that sample's noisy address.
  """
  norm_ids = list(sample['std_add'].keys())
  return norm_ids[0]

In [6]:
# Column of the type vector assigned to each address component.
entities2index = {
  'street': 0,
  'ward': 1,
  'district': 2,
  'city': 3,
}

def create_type_add_vector(noisy_add):
  """Flag which address components CRF detects in `noisy_add`.

  Args:
    noisy_add: Raw address string passed to `CRF.detect_entity`.

  Returns:
    A (1, 4) float array with 1 in the column of every detected entity
    (columns given by `entities2index`). The 'name' entity is ignored; any
    other label missing from `entities2index` raises KeyError.
  """
  flags = np.zeros((1, 4))
  for label in CRF.detect_entity(noisy_add):
    if label != 'name':
      flags[0, entities2index[label]] = 1
  return flags

In [7]:
def concat(v,type_add_vector):
  """Append `type_add_vector` to `v` along axis 1 (column-wise)."""
  parts = [v, type_add_vector]
  return np.concatenate(parts, axis=1)

# Predict class

In [8]:
class Predict:
  """Match a noisy address against the set of normalized addresses.

  The scoring model, both reference-embedding arrays and their dimensions are
  supplied at construction time and are the ones used by `predict`.
  """

  def __init__(self, SNN_model, norm_embeddings, NT_norm_embeddings, dim, num_of_norm):
    """Store the scoring model and reference data.

    Args:
      SNN_model: Object whose `predict([queries, references])` yields one
        score per reference row.
      norm_embeddings: Reference embeddings used when the query has tone marks.
      NT_norm_embeddings: Reference embeddings used when the query equals its
        tone-stripped form.
      dim: Length of one query vector (encoder output plus type flags).
      num_of_norm: Number of reference rows the query is compared with.
    """
    self.SNN_model = SNN_model
    self.norm_embeddings = norm_embeddings
    self.NT_norm_embeddings = NT_norm_embeddings
    self.dim = dim
    self.num_of_norm = num_of_norm

  def predict(self, noisy_add):
    """Print the 'std_add' entry of the best-scoring normalized address.

    Uses the instance's model, embeddings and sizes rather than module-level
    globals, so the arguments given to `__init__` are honoured.

    Args:
      noisy_add: Raw address string.

    Returns:
      None. The best match is written to stdout.
    """
    noisy_add = unicodedata.normalize('NFC', noisy_add)
    # Entity flags are computed on the raw text, before it is cleaned.
    type_add_vector = create_type_add_vector(noisy_add)
    noisy_add = Preprocess.remove_punctuation(CRF.get_better_add(noisy_add)).lower()

    noisy_add_vector = concat(np.array(encode([noisy_add])), type_add_vector).reshape(self.dim,)
    # Repeat the query once per reference row so each pair gets a score.
    noisy_add_vectors = np.full((self.num_of_norm, self.dim), noisy_add_vector)

    # Text unchanged by tone removal is compared with the tone-free references.
    if noisy_add == Preprocess.remove_tone_of_text(noisy_add):
      references = self.NT_norm_embeddings
    else:
      references = self.norm_embeddings

    scores = self.SNN_model.predict([noisy_add_vectors, references]).reshape(self.num_of_norm,)
    best = np.argmax(scores, axis=0)
    print(NORM_ADDS['data'][best]['std_add'])

# Shared predictor instance wired to the model, embeddings and sizes loaded in
# the cells above.
predictor = Predict(model, norm_embeddings, NT_norm_embeddings, dim, num_of_norm)

In [9]:
# Try the predictor on one sample address. The commented-out lines are
# alternative inputs; uncomment exactly one to test it instead.
noisy_add = '4 - 6 Tô Hiến Thành, Tp. Thanh Hóa'
# noisy_add = '28, Cửa Tả, P. Lam Sơn, Thành phố Thanh Hóa, T. Thanh Hóa'
# noisy_add = '36 Lê Lợi, Thành Phố Sầm Sơn, Thanh Hóa'
# noisy_add = 'Jun-98, Cao Bá Quát, P. Đông Thọ, Thành phố Thanh Hóa, T. Thanh Hóa'
# noisy_add = '16, Trần Xuân Soạn, P. Đông Thọ, Thành phố Thanh Hóa, T. Thanh Hóa'
# noisy_add = '191, Tống Duy Tân, P. Lam Sơn, Thành phố Thanh Hóa, T. Thanh Hóa'
# noisy_add = '20 Lê Quý Đôn, Phường Ba Đình, Thanh Hóa'
# noisy_add = 'lê quý đôn đống đa hà nội'
# noisy_add = 'phường ba đinh tỉnh Thanh Hóa'
predictor.predict(noisy_add)

{'street': 'tô hiến thành', 'district': 'thanh hóa', 'city': 'thanh hóa'}


In [10]:
# Input written without tone marks; presumably this takes the tone-free
# branch of Predict.predict (the one using NT_norm_embeddings).
noisy_add = 'vinh hung vinh loc thanh hoa'
predictor.predict(noisy_add)

{'ward': 'vĩnh hùng', 'district': 'vĩnh lộc', 'city': 'thanh hóa'}


In [11]:
# Same input as the previous cell, run a second time.
noisy_add = 'vinh hung vinh loc thanh hoa'
predictor.predict(noisy_add)

{'ward': 'vĩnh hùng', 'district': 'vĩnh lộc', 'city': 'thanh hóa'}
