In [None]:
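# Dependencies: uncomment and run once to install the pinned versions this notebook was written against.
# NOTE(review): gensim and pandas are imported below but not listed here - confirm they are preinstalled.
# !pip install annoy

# !pip install tensorflow==1.15.0
# !pip install "tensorflow_hub >= 0.7.0"
# !pip install tensorflow_text==1.15.0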



In [None]:
import os
import json

import time
import sys

import tensorflow as tf
import tensorflow_hub as hub

from annoy import AnnoyIndex
from gensim.parsing.preprocessing import remove_stopwords

import pandas as pd
import numpy as np

In [None]:
# Record the TensorFlow version in use; the cells below call TF1-style APIs
# (hub.Module, tf.compat.v1.placeholder, tf.compat.v1.Session).
print(tf.__version__)

1.15.0


In [None]:
# local_file.py
# Local-filesystem persistence of the index registry (a JSON file).

# TODO - this must be passed
# Directory prefix; it is concatenated directly with file names, so it has to end with a path separator.
model_indexes_path = './'
# Name of the JSON file read and written by the two helpers below.
model_index_reference_file = 'sentence-similarity.json'


def read_sentence_similarity_from_file():
  """Load the index registry from the local JSON reference file.

  :return: the parsed JSON content; if anything goes wrong (file missing,
    invalid JSON, ...) the error is printed and an empty dict is returned
  """
  try:
    registry_path = model_indexes_path + model_index_reference_file
    with open(registry_path, 'r') as registry_file:
      return json.load(registry_file)
  except Exception as err:
    # Best effort: a missing or unreadable registry is treated as "nothing trained yet".
    print('Exception in read_sentence_similarity_from_file: {0}'.format(err))
    return {}


def write_to_sentence_similarity_file(sentence_similarity):
  """Persist the index registry to the local JSON reference file.

  :param sentence_similarity: JSON-serialisable registry; nothing is written when it is None
  :raises Exception: any failure is printed and then re-raised unchanged
  """
  try:
    if sentence_similarity == None:
      return
    registry_path = model_indexes_path + model_index_reference_file
    with open(registry_path, 'w+') as registry_file:
      json.dump(sentence_similarity, registry_file, indent=2)
  except Exception as err:
    print('Exception in write_to_sentence_similarity_file: {0}'.format(err))
    raise

In [None]:
# utils.py
# Storage backend selector; the helpers below branch on the values 'local' and 'aws'.
g_source_type = 'local'
# In-memory cache of the index registry; filled lazily by get_sentence_similarity_dict().
g_sentence_similarity = None


def get_source_type():
  """Return the storage backend name held in the module-level `g_source_type`."""
  return g_source_type


def get_sentence_similarity_dict():
  """Return the index registry, loading it on first use.

  The registry is cached in the module-level `g_sentence_similarity`. While the
  cache is still None it is filled via `read_sentence_similarity_from_aws_s3()`
  when the source type is 'aws', otherwise via `read_sentence_similarity_from_file()`.
  """
  global g_sentence_similarity
  if g_sentence_similarity is not None:
    return g_sentence_similarity

  if get_source_type() == 'aws':
    g_sentence_similarity = read_sentence_similarity_from_aws_s3()
  else:
    g_sentence_similarity = read_sentence_similarity_from_file()
  return g_sentence_similarity


def put_sentence_similarity_dict(sentence_similarity):
  """Persist the registry through the configured backend, then refresh the in-memory cache.

  :param sentence_similarity: registry object handed to the backend writer and
    stored in the module-level `g_sentence_similarity`
  """
  global g_sentence_similarity

  # Only the selected writer name is looked up, so the unused backend need not be defined.
  if get_source_type() == 'aws':
    persist = write_sentence_similarity_to_aws_s3
  else:
    persist = write_to_sentence_similarity_file
  persist(sentence_similarity)

  g_sentence_similarity = sentence_similarity


def read_from_datafile(path, cols):
  """Read the selected columns of a CSV source into a DataFrame.

  :param path: anything `pandas.read_csv` accepts as its first argument
  :param cols: column selection forwarded as `usecols`
  :return: the resulting DataFrame
  :raises Exception: read errors are printed and re-raised unchanged
  """
  try:
    return pd.read_csv(path, usecols=cols)
  except Exception as err:
    print('Exception in read_data: {0}'.format(err))
    raise

def print_with_time(msg):
  """Print `msg` prefixed with the current `time.ctime()` stamp and flush stdout immediately."""
  stamped = '{}: {}'.format(time.ctime(), msg)
  print(stamped, flush=True)


In [None]:
# get_index_files.py

def get_index_files():
  """Return the registry entries of all trained indexes.

  :return: on success a list with the values of the registry dict;
    on failure a dict with a single 'error' key describing the problem
  """
  try:
    return list(get_sentence_similarity_dict().values())
  except Exception as e:
    print('Exception in get_index_files: {0}'.format(e))
    # Python 3 exceptions have no `.message` attribute; format the exception
    # itself so the error path cannot fail with AttributeError.
    return {
        'error': 'Failure in get_index_files: {0}'.format(e)
    }

In [None]:
# doc_indexing.py
def create_index(params, data_frame, content_index):
  """Embed every row of `data_frame`, build an Annoy index from the vectors and register it.

  :param params: mapping read with `.get()` for the keys 'annoy_vector_dimension',
    'index_filename', 'data_file', 'use_model', 'model_name', 'stop_words',
    'default_batch_size', 'num_trees' and 'model_indexes_path'
  :param data_frame: DataFrame holding the documents (converted with `to_numpy()`)
  :param content_index: position of the text inside each row, forwarded to `build_index`
  :return: None; the index is saved to `model_indexes_path + index_filename` and the
    registry entry for `index_filename` is stored via `put_sentence_similarity_dict`
  """
  vector_size = params.get('annoy_vector_dimension')
  index_filename = params.get('index_filename')
  data_file = params.get('data_file')
  module_url = params.get('use_model')
  model_name = params.get('model_name')
  stop_words = params.get('stop_words')
  batch_size = params.get('default_batch_size')
  num_trees = params.get('num_trees')
  indexes_dir = params.get('model_indexes_path')

  started = time.time()
  embed_func = hub.Module(module_url)
  print_with_time('Load the module: {}'.format(time.time() - started))

  started = time.time()
  sentences = tf.compat.v1.placeholder(dtype=tf.string, shape=[None])
  embedding = embed_func(sentences)
  print_with_time(
      'Init sentences embedding: {}'.format(time.time() - started))

  # The frame is supplied by the caller; this only times the conversion to an array.
  started = time.time()
  rows = data_frame.to_numpy()
  print('Read Data Time: {}'.format(time.time() - started))

  started = time.time()
  ann = build_index(vector_size, embedding, batch_size, sentences,
                    rows, stop_words, content_index)
  print('Build Index Time: {}'.format(time.time() - started))

  ann.build(num_trees)

  # create model_indexes folder if it doesn't exist
  if not os.path.exists(indexes_dir):
    os.makedirs(indexes_dir)

  save_index(ann, indexes_dir + index_filename)

  # Replace any previous registry entry for this index file with the new description.
  registry = get_sentence_similarity_dict()
  if registry.get(index_filename) != None:
    registry.pop(index_filename, None)

  registry[index_filename] = {
      'model_name': model_name, 'data_file': data_file,
      'index_filename': index_filename, 'use_model': module_url,
      'vector_size': vector_size, 'stop_words': stop_words
  }
  put_sentence_similarity_dict(registry)


def build_index(annoy_vector_dimension, embedding_fun, batch_size, sentences, content_array, stop_words, content_index):
  """Embed the text of every row in batches and add the vectors to a new Annoy index.

  :param annoy_vector_dimension: dimension passed to `AnnoyIndex`
  :param embedding_fun: tensor evaluated with `sess.run` to obtain the embeddings
  :param batch_size: number of sentences embedded per `sess.run` call
  :param sentences: placeholder that is fed with the list of texts of one batch
  :param content_array: indexable rows; the row position is used as the Annoy item id
  :param stop_words: when truthy, stop words are removed from the text before embedding
  :param content_index: position of the text inside each row
  :return: the populated `AnnoyIndex` (`build()` is left to the caller)
  """
  ann = AnnoyIndex(annoy_vector_dimension, metric='angular')
  batch_sentences = []
  batch_start = 0  # item id of the first sentence in the pending batch
  num_batches = 0

  with tf.compat.v1.Session() as sess:
    sess.run([tf.compat.v1.global_variables_initializer(),
              tf.compat.v1.tables_initializer()])

    for sindex, sentence in enumerate(content_array):
      content = sentence[content_index]
      if stop_words:
        # Strip stop words from the column that is being indexed, not from a fixed column.
        content = remove_stopwords(content)
      batch_sentences.append(content)

      if len(batch_sentences) == batch_size:
        _add_batch_to_index(ann, sess, embedding_fun, sentences,
                            batch_sentences, batch_start)
        batch_sentences = []
        batch_start = sindex + 1

        if num_batches % 10000 == 0:
          print_with_time('sindex: {} annoy_size: {}'.format(
              sindex, ann.get_n_items()))
        num_batches += 1

    # Embed the remainder when the row count is not a multiple of batch_size.
    if batch_sentences:
      _add_batch_to_index(ann, sess, embedding_fun, sentences,
                          batch_sentences, batch_start)

  return ann


def _add_batch_to_index(ann, sess, embedding_fun, sentences, batch_sentences, first_item_id):
  """Embed one batch and add its vectors under consecutive ids starting at `first_item_id`."""
  batch_vectors = sess.run(embedding_fun, feed_dict={sentences: batch_sentences})
  for offset, vector in enumerate(batch_vectors):
    ann.add_item(first_item_id + offset, vector)


def save_index(ann, file_name):
  """Write the index `ann` to `file_name` by calling its `save` method."""
  ann.save(file_name)


def load_index(annoy_vector_dimension, file_name, force_download):
  """Open a previously saved Annoy index.

  :param annoy_vector_dimension: dimension the index was created with
  :param file_name: path of the saved index file
  :param force_download: accepted for interface compatibility; not used by this function
  :return: an `AnnoyIndex` (angular metric) with `file_name` loaded
  """
  loaded = AnnoyIndex(annoy_vector_dimension, metric='angular')
  loaded.load(file_name)
  return loaded

In [None]:
# search.py

def search_query(params, data_frame):
  """Embed a free-text query and return the rows of its nearest neighbours.

  :param params: mapping read with `.get()` for the keys 'input_search_string',
    'annoy_vector_dimension', 'index_filename', 'use_model', 'use_updated_model',
    'stop_words', 'k' and 'model_indexes_path'
  :param data_frame: DataFrame whose row positions are looked up with the ids returned by Annoy
  :return: list with one row of `data_frame.to_numpy()` per neighbour
  """
  input_search_string = params.get('input_search_string')
  annoy_vector_dimension = params.get('annoy_vector_dimension')
  index_filename = params.get('index_filename')
  use_model = params.get('use_model')
  use_updated_model = params.get('use_updated_model')
  stop_words = params.get('stop_words')
  k = params.get('k')
  model_indexes_path = params.get('model_indexes_path')

  start_time = time.time()
  annoy_index = load_index(annoy_vector_dimension, model_indexes_path +
                           index_filename, use_updated_model)
  print_with_time(
      'Annoy Index load time: {}'.format(time.time() - start_time))

  content_array = data_frame.to_numpy()

  if stop_words:
    input_search_string = remove_stopwords(input_search_string)

  # Build the model in a graph of its own so repeated calls do not keep adding
  # modules (and their variables) to the process-wide default graph.
  with tf.Graph().as_default():
    start_time = time.time()
    embed_func = hub.Module(use_model)
    print_with_time('Load the module: {}'.format(time.time() - start_time))

    start_time = time.time()
    sentences = tf.compat.v1.placeholder(dtype=tf.string, shape=[None])
    embedding = embed_func(sentences)
    print_with_time(
        'Init sentences embedding: {}'.format(time.time() - start_time))

    start_time = time.time()
    # The context manager closes the session even when embedding fails.
    with tf.compat.v1.Session() as sess:
      sess.run([tf.compat.v1.global_variables_initializer(),
                tf.compat.v1.tables_initializer()])
      print_with_time(
          'Time to create session: {}'.format(time.time() - start_time))

      start_time = time.time()
      sentence_vector = sess.run(embedding, feed_dict={
                                  sentences: [input_search_string]})
      nns = annoy_index.get_nns_by_vector(sentence_vector[0], k)
      print_with_time('nns done: Time: {}'.format(time.time() - start_time))

  return [content_array[nn] for nn in nns]

In [None]:
# find_similar_docs.py

def find_similar_document(params, data_frame, input_sentence):
  """Embed the text of a source document and return the rows of its nearest neighbours.

  :param params: mapping read with `.get()` for the keys 'annoy_vector_dimension',
    'index_filename', 'use_model', 'use_updated_model', 'stop_words', 'k'
    and 'model_indexes_path'
  :param data_frame: DataFrame whose row positions are looked up with the ids returned by Annoy
  :param input_sentence: a single string or an iterable of strings (e.g. a pandas
    Series); the neighbours of the first entry are returned
  :return: list with one row of `data_frame.to_numpy()` per neighbour
  :raises ValueError: when `input_sentence` is None or empty
  """
  annoy_vector_dimension = params.get('annoy_vector_dimension')
  index_filename = params.get('index_filename')
  use_model = params.get('use_model')
  use_updated_model = params.get('use_updated_model')
  stop_words = params.get('stop_words')
  k = params.get('k')
  model_indexes_path = params.get('model_indexes_path')

  # The placeholder expects a 1-D batch of strings: wrap a bare string and
  # flatten Series/list input into a plain list before touching the model.
  if input_sentence is None:
    query_sentences = []
  elif isinstance(input_sentence, str):
    query_sentences = [input_sentence]
  else:
    query_sentences = [str(text) for text in input_sentence]
  if not query_sentences:
    raise ValueError('find_similar_document: no input sentence given')

  if stop_words:
    # remove_stopwords works on one string at a time.
    query_sentences = [remove_stopwords(text) for text in query_sentences]

  start_time = time.time()
  annoy_index = load_index(annoy_vector_dimension, model_indexes_path +
                           index_filename, use_updated_model)
  print_with_time(
      'Annoy Index load time: {}'.format(time.time() - start_time))

  content_array = data_frame.to_numpy()

  # Build the model in a graph of its own so repeated calls do not keep adding
  # modules (and their variables) to the process-wide default graph.
  with tf.Graph().as_default():
    start_time = time.time()
    embed_func = hub.Module(use_model)
    print_with_time('Load the module: {}'.format(time.time() - start_time))

    start_time = time.time()
    sentences = tf.compat.v1.placeholder(dtype=tf.string, shape=[None])
    embedding = embed_func(sentences)
    print_with_time(
        'Init sentences embedding: {}'.format(time.time() - start_time))

    start_time = time.time()
    # The context manager closes the session even when embedding fails.
    with tf.compat.v1.Session() as sess:
      sess.run([tf.compat.v1.global_variables_initializer(),
                tf.compat.v1.tables_initializer()])
      print_with_time(
          'Time to create session: {}'.format(time.time() - start_time))

      start_time = time.time()
      sentence_vector = sess.run(embedding, feed_dict={
                                  sentences: query_sentences})
      nns = annoy_index.get_nns_by_vector(sentence_vector[0], k)
      print_with_time('nns done: Time: {}'.format(time.time() - start_time))

  return [content_array[nn] for nn in nns]

In [None]:
# globals
# Defaults used by train / search / predict when the caller does not override them.
VECTOR_SIZE = 512
default_k = 10
default_batch_size = 32
default_num_trees = 10

# Column layout of the active data set.
# for new articles (articles1.csv)
g_columns = ['id', 'title', 'publication', 'content']
g_id_index = 0
g_content_index = 3
g_content_key = 'content'

# Alternative layout; swap it in for the block above when working with short-wiki.csv.
# for short-wiki.csv
  # g_columns = ['GUID', 'CONTENT', 'ENTITY']
  # g_id_index = 0
  # g_content_index = 1
  # g_content_key = 'CONTENT'

# Cache of the last data file read by read_data() and the path it was read from.
g_df_docs = None
g_data_file = None
# Same name as the registry cache assigned in the utils cell; running this cell clears that cache.
g_sentence_similarity = None

# NOTE(review): the active column layout targets articles1.csv while the default
# data file and index file below name the wiki data - confirm the defaults are intended.
default_use_model = 'https://tfhub.dev/google/universal-sentence-encoder-large/3?tf-hub-format=compressed'
default_csv_file_path = 'short-wiki.csv'
# Same values as assigned in the local_file cell.
model_indexes_path = './'
model_index_reference_file = 'sentence-similarity.json'
default_index_file = 'wiki.annoy.index'
default_index_filepath = model_indexes_path + default_index_file

In [None]:
# methods called from the APIs

def train(params):
  """Validate the training request, build the Annoy index and report the outcome.

  :param params: dict with the required keys 'index_filename', 'data_file',
    'use_model' and 'model_name', plus the optional keys 'vector_size',
    'data_file_updated', 'stop_words', 'default_batch_size' and 'num_trees'
    (module-level defaults are used for missing or falsy optional values)
  :return: {'message': 'Training successful'} on success,
    {'error': 'Invalid Input'} when a required value is missing or empty,
    {'error': 'Exception in train: ...'} when anything raises
  """
  result = {}

  print('Training', params)

  try:
    params = params or {}

    annoy_vector_dimension = params.get('vector_size') or VECTOR_SIZE
    data_file_updated = params.get('data_file_updated') or False
    stop_words = params.get('stop_words') or False
    batch_size = params.get('default_batch_size') or default_batch_size
    num_trees = params.get('num_trees') or default_num_trees

    # Required values fall back to '' so that a missing key is reported as
    # 'Invalid Input' instead of failing on an unassigned local variable.
    index_filename = params.get('index_filename') or ''
    data_file = params.get('data_file') or ''
    use_model = params.get('use_model') or ''
    model_name = params.get('model_name') or ''

    # required input params
    if not (index_filename and data_file and use_model and model_name):
      result = {
          'error': 'Invalid Input'
      }
    else:
      index_params = {
        'annoy_vector_dimension': annoy_vector_dimension,
        'index_filename': index_filename,
        'data_file': data_file,
        'data_file_updated': data_file_updated,
        'use_model': use_model,
        'model_name': model_name,
        'stop_words': stop_words,
        'default_batch_size': batch_size,
        'num_trees': num_trees,
        'model_indexes_path': model_indexes_path
      }
      data_frame = read_data(data_file, data_file_updated)
      # this index is the content index
      create_index(index_params, data_frame, g_content_index)
      result = {
        'message': 'Training successful'
      }

  except Exception as e:
    print('Exception in train: {0}'.format(e))
    result = {
      'error': 'Exception in train: {0}'.format(e)
    }

  return result


def search(params):
  """Find the documents most similar to a free-text query.

  :param params: dict with the required key 'search_string' and the optional keys
    'vector_size', 'index_filename', 'data_file', 'data_file_updated', 'use_model',
    'use_updated_model', 'k', 'stop_words' and 'filter_values'
    (module-level defaults are used for missing or falsy optional values)
  :return: {'sourceGuid', 'sourceSentence', 'similarDocs'} on success,
    {'error': 'Invalid Input'} when a required value is missing or empty,
    {'error': 'Exception in search: ...'} when anything raises
  """
  result = {}

  print('Search', params)

  try:
    params = params or {}

    # A missing search string becomes '' so it is rejected as 'Invalid Input'
    # rather than failing on len(None).
    input_search_string = params.get('search_string') or ''
    annoy_vector_dimension = params.get('vector_size') or VECTOR_SIZE
    index_filename = params.get('index_filename') or default_index_file
    data_file = params.get('data_file') or default_csv_file_path
    data_file_updated = params.get('data_file_updated') or False
    use_model = params.get('use_model') or default_use_model
    use_updated_model = params.get('use_updated_model') or False
    k = params.get('k') or default_k
    stop_words = params.get('stop_words') or False
    filter_values = params.get('filter_values') or []

    # required input params
    if not (input_search_string and index_filename and data_file and use_model):
      result = {
          'error': 'Invalid Input'
      }
    else:
      search_params = {
        'input_search_string': input_search_string,
        'annoy_vector_dimension': annoy_vector_dimension,
        'index_filename': index_filename,
        'data_file': data_file,
        'data_file_updated': data_file_updated,
        'use_model': use_model,
        'use_updated_model': use_updated_model,
        'k': k,
        'stop_words': stop_words,
        'filter_values': filter_values,
        'model_indexes_path': model_indexes_path
      }
      start_time = time.time()
      data_frame = read_data(data_file, data_file_updated)
      print_with_time(
          'Time to read data file: {}'.format(time.time() - start_time))

      similarities = search_query(search_params, data_frame)
      similar_sentences = []
      for sentence in similarities:
        # Without filter values every neighbour is kept; otherwise the
        # lower-cased third field of the row has to be one of them.
        if not filter_values or sentence[2].lower() in filter_values:
          similar_sentences.append({
            'guid': sentence[g_id_index],
            'content': sentence[g_content_index]
          })
        print(sentence[g_id_index])

      result = {
          'sourceGuid': '000',
          'sourceSentence': input_search_string,
          'similarDocs': similar_sentences
      }

  except Exception as e:
    print('Exception in search: {0}'.format(e))
    result = {
      'error': 'Exception in search: {0}'.format(e)
    }

  return result


def predict(params):
  """Find the documents most similar to an already known document.

  :param params: dict with the required key 'guid' (id of the source document) and
    the optional keys 'vector_size', 'index_filename', 'data_file', 'data_file_updated',
    'use_model', 'use_updated_model', 'k', 'stop_words' and 'filter_values'
    (module-level defaults are used for missing or falsy optional values)
  :return: {'sourceGuid', 'sourceSentence', 'similarDocs'} on success,
    {'error': 'Invalid Input'} when a required value is missing or empty,
    {'error': 'No document found for guid: ...'} when the id is unknown,
    {'error': 'Exception in predict: ...'} when anything raises
  """
  result = {}

  print('Predict', params)

  try:
    params = params or {}

    # A missing guid stays None and is rejected as 'Invalid Input' below
    # rather than failing on len(None).
    input_sentence_id = params.get('guid')
    annoy_vector_dimension = params.get('vector_size') or VECTOR_SIZE
    index_filename = params.get('index_filename') or default_index_file
    data_file = params.get('data_file') or default_csv_file_path
    data_file_updated = params.get('data_file_updated') or False
    use_model = params.get('use_model') or default_use_model
    use_updated_model = params.get('use_updated_model') or False
    k = params.get('k') or default_k
    stop_words = params.get('stop_words') or False
    filter_values = params.get('filter_values') or []

    # required input params
    if not (input_sentence_id and index_filename and data_file and use_model):
      result = {
          'error': 'Invalid Input'
      }
    else:
      search_params = {
        'input_sentence_id': input_sentence_id,
        'annoy_vector_dimension': annoy_vector_dimension,
        'index_filename': index_filename,
        'data_file': data_file,
        'data_file_updated': data_file_updated,
        'use_model': use_model,
        'use_updated_model': use_updated_model,
        'k': k,
        'stop_words': stop_words,
        'filter_values': filter_values,
        'model_indexes_path': model_indexes_path
      }

      start_time = time.time()
      data_frame = read_data(data_file, data_file_updated)
      print_with_time(
          'Time to read data file: {}'.format(time.time() - start_time))

      print_with_time('Input Sentence id: {}'.format(input_sentence_id))
      # Look the document up through the configured id column instead of a
      # hard-coded column name, and compare as text so numeric and string ids
      # both match. A boolean mask also avoids building a query string from
      # caller-supplied input.
      id_column = g_columns[g_id_index]
      id_matches = data_frame[id_column].astype(str) == str(input_sentence_id)
      input_sentence = data_frame.loc[id_matches, g_content_key]

      if input_sentence.empty:
        result = {
          'error': 'No document found for guid: {}'.format(input_sentence_id)
        }
      else:
        similarities = find_similar_document(search_params, data_frame, input_sentence)

        similar_sentences = []
        # The first neighbour is skipped (presumably the source document itself).
        for sentence in similarities[1:]:
          # Without filter values every neighbour is kept; otherwise the
          # lower-cased third field of the row has to be one of them.
          if not filter_values or sentence[2].lower() in filter_values:
            similar_sentences.append({
              'guid': sentence[g_id_index],
              'content': sentence[g_content_index]
            })
          print(sentence[g_id_index])

        result = {
          'sourceGuid': input_sentence_id,
          'sourceSentence': input_sentence.values[0],
          'similarDocs': similar_sentences
        }

  except Exception as e:
    print('Exception in predict: {0}'.format(e))
    result = {
      'error': 'Exception in predict: {0}'.format(e)
    }

  return result

def read_data(path, force_download):
  """Return the documents of `path` as a DataFrame, cached between calls.

  :param path: data file to read (columns restricted to `g_columns`)
  :param force_download: when truthy the cached frame is ignored and the file is
    read again; for the 'aws' source type it is also forwarded to `download_from_s3`
  :return: the DataFrame held in the module-level cache `g_df_docs`
  :raises Exception: read errors propagate from `read_from_datafile`, which
    already prints them
  """
  global g_df_docs, g_data_file

  if get_source_type() == 'aws':
    download_from_s3(path, force_download)

  # Re-read when asked to, when nothing is cached yet, or when a different file is requested.
  if force_download or g_df_docs is None or path != g_data_file:
    g_df_docs = read_from_datafile(path, g_columns)
    g_data_file = path

  return g_df_docs


In [None]:
def get_model_indexes():
  """Print the registry entries of all trained indexes.

  Prints a label line followed by whatever `get_index_files()` returns.
  Nothing is returned.
  """
  print('get_model_indexes')
  print(get_index_files())

def train_model(params):
  """Train an index and print the outcome as JSON.

  :param params: forwarded unchanged to `train`, e.g. {
    model_name: 'name-of-the-model',
    use_model: 'googles-tfhub-model-url',
    index_filename: 'name-of-the-annoy-index-filename',
    data_file: 'data-source-that-needs-to-be-trained',
    vector_size: 'size-of-the-vector' (default: 512),
    stop_words: boolean (default: False)
  }
  See `train` for the complete list of recognised keys.
  :return: None; the result object of `train` (success message or error message)
    is printed as a JSON string
  """
  outcome = train(params)
  print(json.dumps(outcome))


def search_string(params):
  """Find documents similar to a search string and print the outcome as JSON.

  :param params: forwarded unchanged to `search`, e.g. {
    search_string: 'string-to-search',
    use_model: 'googles-tfhub-model-url',
    use_updated_model: Boolean (default: False),
    index_filename: 'name-of-the-annoy-index-filename',
    data_file: 'data-source-the-index-was-trained-on',
    data_file_updated: Boolean (default: False),
    vector_size: 'size-of-the-vector' (default: 512),
    stop_words: boolean (default: False),
    k: Numeric (default: 10),
    filter_values: list (default: [])
  }
  :return: None; the result object of `search` (similar documents or error message)
    is printed as a JSON string
  """
  outcome = search(params)
  print(json.dumps(outcome))


def predict_sentence(params):
  """Find documents similar to a source document id and print the outcome.

  :param params: forwarded unchanged to `predict`, e.g. {
    guid: 'source-document-id',
    use_model: 'googles-tfhub-model-url',
    use_updated_model: Boolean (default: False),
    index_filename: 'name-of-the-annoy-index-filename',
    data_file: 'data-source-the-index-was-trained-on',
    data_file_updated: Boolean (default: False),
    vector_size: 'size-of-the-vector' (default: 512),
    stop_words: boolean (default: False),
    k: Numeric (default: 10),
    filter_values: list (default: [])
  }
  :return: None; the result object of `predict` (similar documents or error message)
    is printed as-is, not JSON-encoded
  """
  outcome = predict(params)
  print(outcome)

In [None]:
# Show the registry entries of the indexes trained so far (the cell output is the returned value).
get_index_files()

[{'data_file': 'articles1.csv',
  'index_filename': 'news.annoy.use.4.index',
  'model_name': 'news-annoy-use-4',
  'stop_words': False,
  'use_model': 'https://tfhub.dev/google/universal-sentence-encoder-large/3',
  'vector_size': 512},
 {'data_file': 'articles1.csv',
  'index_filename': 'news.annoy.use.large.3.index',
  'model_name': 'news-annoy-use-large-3',
  'stop_words': False,
  'use_model': 'https://tfhub.dev/google/universal-sentence-encoder-large/3',
  'vector_size': 512}]

In [None]:
# train
# Parameters of the index to build; the keys are interpreted by train().
training_params = {
    'model_name': 'news-annoy-use-large-3',
    'use_model': 'https://tfhub.dev/google/universal-sentence-encoder-large/3',
    'index_filename': 'news.annoy.use.large.3.index',
    'data_file': 'articles1.csv',
    'stop_words': False
}
# Pass the dict defined above; the previous name `trainingParams` is not defined
# in this notebook and raises NameError on a fresh kernel.
train_model(training_params)

Training {'model_name': 'news-annoy-use-4', 'use_model': 'https://tfhub.dev/google/universal-sentence-encoder-large/3', 'index_filename': 'news.annoy.use.4.index', 'data_file': 'articles1.csv', 'stop_words': False}
1
Mon Dec  7 23:00:57 2020: Load the module: 1.9526698589324951
2
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


Mon Dec  7 23:00:58 2020: Init sentences embedding: 0.788921594619751
Read Data Time: 0.01198720932006836
Mon Dec  7 23:01:20 2020: sindex: 31 annoy_size: 32
Build Index Time: 5014.780691385269
{'message': 'Training successful'}


In [None]:
# search by string
# Query the index built above; the index file, model URL and data file match the
# values used in the training cell.
search_params = {
    'search_string': 'Dr. Anthony Fauci, head of the National Institute of Allergy and Infectious Diseases, said Monday that family gatherings over the holidays should be limited to fewer than 10 people',
    'index_filename': 'news.annoy.use.large.3.index',
    'use_model': 'https://tfhub.dev/google/universal-sentence-encoder-large/3',
    'data_file': 'articles1.csv'
}
search_string(search_params)

Search {'search_string': 'Dr. Anthony Fauci, head of the National Institute of Allergy and Infectious Diseases, said Monday that family gatherings over the holidays should be limited to fewer than 10 people', 'index_filename': 'news.annoy.use.large.3.index', 'use_model': 'https://tfhub.dev/google/universal-sentence-encoder-large/3', 'data_file': 'articles1.csv'}
Tue Dec  8 00:14:48 2020: Time to read data file: 5.4836273193359375e-06
Tue Dec  8 00:14:48 2020: Annoy Index load time: 0.00019311904907226562
Tue Dec  8 00:14:51 2020: Load the module: 2.8809621334075928
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


Tue Dec  8 00:14:52 2020: Init sentences embedding: 0.7818193435668945
Tue Dec  8 00:16:38 2020: Time to create session: 106.5471293926239
Tue Dec  8 00:16:40 2020: nns done: Time: 1.2977993488311768
67105
52409
43543
34703
49476
53594
43585
44839
19525
20803
