In [5]:
# Make the notebook's working directory importable so that project-local
# .py modules can be imported by the cells below.
import sys
import os

# os.path.abspath already returns a normalized absolute path, so it only
# needs to be applied once.
ROOT_DIR = os.path.abspath(os.curdir)

# Register the directory only once: re-running this cell must not keep
# appending duplicate entries to sys.path.
if ROOT_DIR not in sys.path:
  sys.path.append(ROOT_DIR)

In [2]:
import codecs
import json
from Parameters import *
import numpy as np
import keras
import tensorflow as tf
import timeit
import keras.backend as K
import gc
import CRF
import unicodedata

#### BERT (LaBSE sentence encoder)

In [None]:
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import json

def get_model(model_url, max_seq_length):
  """Wrap a TF-Hub encoder in a Keras model whose output is L2-normalized.

  Args:
    model_url: handle given to `hub.KerasLayer`; the layer is loaded with
      `trainable=True`.
    max_seq_length: length of each of the three int32 input sequences.

  Returns:
    A tuple `(model, hub_layer)`: the `tf.keras.Model` mapping
    `[input_word_ids, input_mask, segment_ids]` to the normalized pooled
    output, and the underlying hub layer.
  """
  encoder_layer = hub.KerasLayer(model_url, trainable=True)

  def int32_sequence_input(name):
    # The three model inputs only differ by name.
    return tf.keras.layers.Input(
        shape=(max_seq_length,), dtype=tf.int32, name=name)

  input_word_ids = int32_sequence_input("input_word_ids")
  input_mask = int32_sequence_input("input_mask")
  segment_ids = int32_sequence_input("segment_ids")
  encoder_inputs = [input_word_ids, input_mask, segment_ids]

  # The hub layer yields two outputs; only the first (pooled) one is used.
  pooled_output, _ = encoder_layer(encoder_inputs)

  # L2-normalize every pooled vector along axis 1.
  l2_normalize = tf.keras.layers.Lambda(
      lambda x: tf.nn.l2_normalize(x, axis=1))
  normalized_output = l2_normalize(pooled_output)

  keras_model = tf.keras.Model(inputs=encoder_inputs, outputs=normalized_output)
  return keras_model, encoder_layer

# Sequence length of the model inputs; encode() passes the same value to
# create_input() so padded/truncated rows match the model's input shape.
max_seq_length = 64

# Build the LaBSE model and keep the hub layer, which the tokenizer setup
# below reads its vocabulary settings from.
labse_model, labse_layer = get_model(
    "https://tfhub.dev/google/LaBSE/1", max_seq_length)

In [None]:
import bert

# Build the tokenizer from the vocabulary file and lower-casing flag that are
# stored on the loaded hub layer.
_labse_assets = labse_layer.resolved_object
vocab_file = _labse_assets.vocab_file.asset_path.numpy()
do_lower_case = _labse_assets.do_lower_case.numpy()
tokenizer = bert.bert_tokenization.FullTokenizer(vocab_file, do_lower_case)

def create_input(input_strings, tokenizer, max_seq_length):
  """Turn raw strings into fixed-length id / mask / segment arrays.

  Args:
    input_strings: iterable of strings to tokenize.
    tokenizer: object providing `tokenize(text)` and
      `convert_tokens_to_ids(tokens)`.
    max_seq_length: number of entries in every output row.

  Returns:
    A tuple of three numpy arrays `(input_ids, input_mask, segment_ids)` with
    one row per input string. Ids are padded with 0, the mask is 1 for real
    tokens and 0 for padding, and segment ids are all 0.
  """
  id_rows, mask_rows, segment_rows = [], [], []

  for text in input_strings:
    tokens = ["[CLS]", *tokenizer.tokenize(text), "[SEP]"]
    token_ids = tokenizer.convert_tokens_to_ids(tokens)

    kept = min(len(token_ids), max_seq_length)
    padding = max_seq_length - kept

    # NOTE(review): over-long inputs are simply cut to the first
    # max_seq_length ids, so the trailing "[SEP]" is dropped in that case.
    id_rows.append(token_ids[:max_seq_length] + [0] * padding)
    mask_rows.append([1] * kept + [0] * padding)
    segment_rows.append([0] * max_seq_length)

  return np.array(id_rows), np.array(mask_rows), np.array(segment_rows)

def encode(input_text):
  """Embed a batch of strings with the module-level LaBSE model.

  Args:
    input_text: iterable of strings; forwarded to `create_input` together
      with the module-level `tokenizer` and `max_seq_length`.

  Returns:
    The output of `labse_model` for the tokenized batch.
  """
  model_inputs = create_input(input_text, tokenizer, max_seq_length)
  # create_input returns (ids, mask, segments); the model takes them as a list.
  return labse_model(list(model_inputs))

#### Load necessary files

In [3]:
import json
# Test set; the evaluation cells iterate over testset['data'].
with open(TEST_DATA_FILE, encoding='utf8') as test_file:
  testset = json.load(test_file)

# Candidate embeddings fed to the model as its second input. The "NT" variant
# is used when removing tone marks leaves the query unchanged.
# NOTE(review): allow_pickle=True makes np.load unpickle arbitrary objects;
# only load these files from a trusted source.
norm_embeddings = np.load(NORM_EMBEDDING_FILE, allow_pickle=True)
NT_norm_embeddings = np.load(NT_NORM_EMBEDDING_FILE, allow_pickle=True)

# JSON lookup tables.
with open(NORM_ADDS_FILE, 'r', encoding='utf-8') as norm_adds_file:
  NORM_ADDS = json.load(norm_adds_file)

with open(ID2id_FILE, 'r', encoding='utf-8') as id_map_file:
  ID2id = json.load(id_map_file)

with open(id2ID_FILE, 'r', encoding='utf-8') as id_map_file:
  id2ID = json.load(id_map_file)

with open(id2norm_add_FILE, 'r', encoding='utf-8') as norm_add_file:
  id2norm_add = json.load(norm_add_file)

# Length of one query vector: the evaluation cells reshape the concatenation
# of the encoder output and the 4 entity-type flags to (dim,).
dim = 772
# Number of candidates scored per query: the evaluation cells repeat the query
# num_of_norm times and reshape the predictions to (num_of_norm,).
# NOTE(review): presumably equals the number of rows in norm_embeddings - confirm.
num_of_norm = 34481

def get_norm_id(sample):
  """Return the first key of `sample['std_add']`.

  For a sample of the data set this is the id of the normalized address
  corresponding to the sample's noisy address.
  """
  std_adds = sample['std_add']
  norm_ids = list(std_adds.keys())
  return norm_ids[0]

In [None]:
# Column of the type vector that is set for each entity label
# (see create_type_add_vector).
entities2index = dict(street=0, ward=1, district=2, city=3)

def create_type_add_vector(noisy_add):
  """Build a (1, 4) indicator vector of the entity types found in an address.

  Args:
    noisy_add: address text passed to `CRF.detect_entity`.

  Returns:
    A numpy array of shape (1, 4), initially all zeros, with a 1 in the
    `entities2index` column of every detected entity label. The label
    'name' is skipped; any other label missing from `entities2index`
    raises KeyError.
  """
  type_flags = np.zeros((1, 4))
  for label in CRF.detect_entity(noisy_add):
    if label != 'name':
      type_flags[0, entities2index[label]] = 1
  return type_flags

def concat(v, type_add_vector):
  """Join `v` and `type_add_vector` along axis 1 and return the new array."""
  parts = [v, type_add_vector]
  return np.concatenate(parts, axis=1)

#### Test

In [None]:
# Evaluate the model saved under Add_model_1.4.4.2 on the test set: count the
# samples whose top-scoring candidate is one of the sample's standard
# addresses, and save the remaining samples to a JSON file.
model = keras.models.load_model('/content/drive/My Drive/Norm_add_based_recommendation/Full/model_1.2/Add_model_1.4.4.2/SNN_100_epoches.snn')
import timeit
start = timeit.default_timer()
count_ = 0
error_sample = dict()
error_sample['data'] = []
for sample in testset['data']:
  noisy_add = sample['noisy_add']
  noisy_add = unicodedata.normalize('NFC', noisy_add)

  # Entity-type flags are computed from the NFC-normalized text, before the
  # cleaning below.
  type_add_vector = create_type_add_vector(noisy_add)

  # Fixed: this cell used `Preprocess` / `reprocess`, two different spellings
  # of the `preprocess` name used by the other evaluation cells.
  # NOTE(review): `preprocess` is not imported in any visible cell; presumably
  # it comes from `from Parameters import *` - confirm.
  noisy_add = preprocess.remove_punctuation(CRF.get_better_add(noisy_add)).lower()
  # Query vector: encoder output joined with the type flags, flattened to (dim,).
  noisy_add_vector = concat(np.array(encode([noisy_add])), type_add_vector).reshape(dim,)
  # Repeat the query once per candidate so the model scores every pair.
  noisy_add_vectors = np.full((num_of_norm, dim), noisy_add_vector)
  # When removing tone marks leaves the text unchanged, score against the "NT"
  # embeddings; otherwise use the regular ones.
  if noisy_add == preprocess.remove_tone_of_text(noisy_add):
    x = model.predict([noisy_add_vectors, NT_norm_embeddings]).reshape(num_of_norm,)
  else:
    x = model.predict([noisy_add_vectors, norm_embeddings]).reshape(num_of_norm,)

  # Index of the highest-scoring candidate.
  x = np.argmax(x, axis = 0)
  if str(ID2id[str(x)]) in sample['std_add']:
    count_ +=1
  else:
    error_sample['data'].append(sample)
  gc.collect()

stop = timeit.default_timer()

print('Time: ', stop - start)
print(count_)
# Persist the samples that were not predicted correctly.
with open('/content/drive/My Drive/Norm_add_based_recommendation/Full/model_1.2/Add_model_1.4.4.2/error_sample_100_epoches.json', 'w', encoding='utf8') as f:
  json.dump(error_sample, f, ensure_ascii=False)

Time:  12105.015693648
7614


In [None]:
# Evaluate the model saved under Merge_model_1.4.4.2 on the test set: count
# the samples whose top-scoring candidate is one of the sample's standard
# addresses, and save the remaining samples to a JSON file.
import timeit
model = keras.models.load_model('/content/drive/My Drive/Norm_add_based_recommendation/Full/model_1.2/Merge_model_1.4.4.2/SNN_100_epoches.snn')
start = timeit.default_timer()
count_ = 0
error_sample = {'data': []}
for sample in testset['data']:
  noisy_add = unicodedata.normalize('NFC', sample['noisy_add'])

  # Entity-type flags come from the normalized text, before any cleaning.
  type_add_vector = create_type_add_vector(noisy_add)

  better_add = CRF.get_better_add(noisy_add)
  noisy_add = preprocess.remove_punctuation(better_add).lower()

  # Query vector: encoder output joined with the type flags, flattened to
  # (dim,) and repeated once per candidate.
  embedding = np.array(encode([noisy_add]))
  noisy_add_vector = concat(embedding, type_add_vector).reshape(dim,)
  noisy_add_vectors = np.full((num_of_norm, dim), noisy_add_vector)

  # Text that is unchanged by tone removal is scored against the "NT"
  # embeddings; everything else against the regular ones.
  is_toneless = noisy_add == preprocess.remove_tone_of_text(noisy_add)
  candidates = NT_norm_embeddings if is_toneless else norm_embeddings
  scores = model.predict([noisy_add_vectors, candidates]).reshape(num_of_norm,)

  # Index of the highest-scoring candidate.
  x = np.argmax(scores, axis=0)
  if str(ID2id[str(x)]) not in sample['std_add']:
    error_sample['data'].append(sample)
  else:
    count_ += 1
  gc.collect()

stop = timeit.default_timer()

print('Time: ', stop - start)
print(count_)
# Persist the samples that were not predicted correctly.
with open('/content/drive/My Drive/Norm_add_based_recommendation/Full/model_1.2/Merge_model_1.4.4.2/error_sample_100_epoches.json', 'w', encoding='utf8') as f:
  json.dump(error_sample, f, ensure_ascii=False)

Time:  12051.532232747999
7608


In [None]:
# Evaluate the model saved under ElementWise_model_1.4.4.2 on the test set:
# count the samples whose top-scoring candidate is one of the sample's
# standard addresses, and save the remaining samples to a JSON file.
import timeit
model = keras.models.load_model('/content/drive/My Drive/Norm_add_based_recommendation/Full/model_1.2/ElementWise_model_1.4.4.2/SNN_100_epoches.snn')
start = timeit.default_timer()
count_ = 0
error_sample = {'data': []}
for sample in testset['data']:
  noisy_add = unicodedata.normalize('NFC', sample['noisy_add'])

  # Entity-type flags come from the normalized text, before any cleaning.
  type_add_vector = create_type_add_vector(noisy_add)

  better_add = CRF.get_better_add(noisy_add)
  noisy_add = preprocess.remove_punctuation(better_add).lower()

  # Query vector: encoder output joined with the type flags, flattened to
  # (dim,) and repeated once per candidate.
  embedding = np.array(encode([noisy_add]))
  noisy_add_vector = concat(embedding, type_add_vector).reshape(dim,)
  noisy_add_vectors = np.full((num_of_norm, dim), noisy_add_vector)

  # Text that is unchanged by tone removal is scored against the "NT"
  # embeddings; everything else against the regular ones.
  is_toneless = noisy_add == preprocess.remove_tone_of_text(noisy_add)
  candidates = NT_norm_embeddings if is_toneless else norm_embeddings
  scores = model.predict([noisy_add_vectors, candidates]).reshape(num_of_norm,)

  # Index of the highest-scoring candidate.
  x = np.argmax(scores, axis=0)
  if str(ID2id[str(x)]) not in sample['std_add']:
    error_sample['data'].append(sample)
  else:
    count_ += 1
  gc.collect()

stop = timeit.default_timer()

print('Time: ', stop - start)
print(count_)
# Persist the samples that were not predicted correctly.
with open('/content/drive/My Drive/Norm_add_based_recommendation/Full/model_1.2/ElementWise_model_1.4.4.2/error_sample_100_epoches.json', 'w', encoding='utf8') as f:
  json.dump(error_sample, f, ensure_ascii=False)

Time:  11303.41619964
7751


In [None]:
# Evaluate the model saved under AD_model_1.4.4.2 on the test set: count the
# samples whose top-scoring candidate is one of the sample's standard
# addresses, and save the remaining samples to a JSON file.
import timeit
model = keras.models.load_model('/content/drive/My Drive/Norm_add_based_recommendation/Full/model_1.2/AD_model_1.4.4.2/SNN_100_epoches.snn')
start = timeit.default_timer()
count_ = 0
error_sample = {'data': []}
for sample in testset['data']:
  noisy_add = unicodedata.normalize('NFC', sample['noisy_add'])

  # Entity-type flags come from the normalized text, before any cleaning.
  type_add_vector = create_type_add_vector(noisy_add)

  better_add = CRF.get_better_add(noisy_add)
  noisy_add = preprocess.remove_punctuation(better_add).lower()

  # Query vector: encoder output joined with the type flags, flattened to
  # (dim,) and repeated once per candidate.
  embedding = np.array(encode([noisy_add]))
  noisy_add_vector = concat(embedding, type_add_vector).reshape(dim,)
  noisy_add_vectors = np.full((num_of_norm, dim), noisy_add_vector)

  # Text that is unchanged by tone removal is scored against the "NT"
  # embeddings; everything else against the regular ones.
  is_toneless = noisy_add == preprocess.remove_tone_of_text(noisy_add)
  candidates = NT_norm_embeddings if is_toneless else norm_embeddings
  scores = model.predict([noisy_add_vectors, candidates]).reshape(num_of_norm,)

  # Index of the highest-scoring candidate.
  x = np.argmax(scores, axis=0)
  if str(ID2id[str(x)]) not in sample['std_add']:
    error_sample['data'].append(sample)
  else:
    count_ += 1
  gc.collect()

stop = timeit.default_timer()

print('Time: ', stop - start)
print(count_)
# Persist the samples that were not predicted correctly.
with open('/content/drive/My Drive/Norm_add_based_recommendation/Full/model_1.2/AD_model_1.4.4.2/error_sample_100_epoches.json', 'w', encoding='utf8') as f:
  json.dump(error_sample, f, ensure_ascii=False)

Time:  12244.963603577999
7855
