In [0]:
!pip uninstall --quiet --yes tensorflow
!pip install --quiet tensorflow-gpu==1.13.1
!pip install --quiet tensorflow-hub
!pip install --quiet seaborn
!pip install --quiet tf-sentencepiece
!pip install --quiet simpleneighbors
!pip install --quiet tqdm



import numpy as np
import os
import pandas as pd
import seaborn as sns
import tensorflow as tf
import tensorflow_hub as hub
import tf_sentencepiece  # Not used directly but needed to import TF ops.

from simpleneighbors import SimpleNeighbors
from tqdm import tqdm
from tqdm import trange


In [0]:

# TF-Hub handle of the multilingual Universal Sentence Encoder module (version 1).
module_url = 'https://tfhub.dev/google/universal-sentence-encoder-multilingual/1'  

# Set up graph.
# A dedicated graph holds the input placeholder, the hub module and its output
# tensor, so later cells only need `session`, `text_input` and `embedded_text`.
g = tf.Graph()
with g.as_default():
    # 1-D batch of raw strings; the batch length is left open (shape=[None]).
    text_input = tf.placeholder(dtype=tf.string, shape=[None])
    multiling_embed = hub.Module(module_url)
    # Output tensor of the module applied to the placeholder.
    embedded_text = multiling_embed(text_input)
    # Single op that runs both the variable and the lookup-table initializers.
    init_op = tf.group([tf.global_variables_initializer(), tf.tables_initializer()])
# Finalize the graph (TensorFlow then treats it as read-only).
g.finalize()

# Initialize session.
# The session is bound to `g` and reused by later cells via session.run(embedded_text, ...).
session = tf.Session(graph=g)
session.run(init_op)

In [0]:
# One entry per corpus: (language code, CSV path, display name).
corpus_metadata = [
    ('en', '/content/test-en.csv', 'English'),
    ('es', '/content/test-es.csv', 'Spanish'),
]

# Load both glossaries; the 'Terms' and 'Definitions' columns are used below.
en_csv = pd.read_csv('./test-en.csv')
es_csv = pd.read_csv('./test-es.csv')

# Per-language lookups keyed by language code.
language_to_sentences = {
    'en': en_csv['Definitions'],
    'es': es_csv['Definitions'],
}
language_to_terms = {
    'en': en_csv['Terms'],
    'es': es_csv['Terms'],
}


In [0]:
## Using a pre-trained model to transform sentences into vectors
import sys  # kept in this cell: later cells rely on sys.exc_info() / sys.maxsize

batch_size = 512  # number of CSV rows embedded per session.run call
language_to_embeddings = {}

for language_code, terms_file, language_name in corpus_metadata:
    print('\nComputing {} embeddings'.format(language_name))
    batch_embeddings = []
    with tqdm(total=len(language_to_sentences[language_code])) as pbar:
        # Stream the CSV in chunks so only `batch_size` definitions are fed at once.
        for batch in pd.read_csv(terms_file, chunksize=batch_size):
            try:
                batch_embeddings.extend(
                    session.run(embedded_text, feed_dict={text_input: batch['Definitions']}))
            except Exception as exc:
                # Report the full exception (type and message) before re-raising;
                # KeyboardInterrupt/SystemExit are deliberately not intercepted.
                print("Unexpected error:", repr(exc))
                raise
            pbar.update(len(batch))
    # One row per definition, for every language listed in corpus_metadata.
    language_to_embeddings[language_code] = np.array(batch_embeddings)

emb_en = language_to_embeddings['en']
emb_es = language_to_embeddings['es']

# Cache the embeddings on disk (np.save appends the '.npy' extension).
np.save('emb_en', emb_en)
np.save('emb_es', emb_es)




In [0]:
# Reload the embeddings cached on disk.
emb_en = np.load('/content/emb_en.npy')
emb_es = np.load('/content/emb_es.npy')

# Create the lookup when this cell runs on a fresh kernel (encoding cell skipped);
# otherwise keep the existing dict and just replace its two entries.
if 'language_to_embeddings' not in globals():
    language_to_embeddings = {}
language_to_embeddings['en'] = emb_en
language_to_embeddings['es'] = emb_es


In [0]:
## Building an index of semantic vectors

# Number of trees passed to SimpleNeighbors.build (an earlier value of 40 was
# immediately overwritten by 60, so only 60 was ever in effect).
num_index_trees = 60
language_name_to_index = {}
# Embedding width, read off the first vector of the first language.
embedding_dimensions = len(list(language_to_embeddings.values())[0][0])

print('Computing mixed-language index')
combined_index = SimpleNeighbors(embedding_dimensions, metric='dot')
for language_code, terms_file, language_name in corpus_metadata:
    # Sentence i is paired with embedding i, so both collections must line up;
    # a stale embedding cache would otherwise silently mislabel vectors.
    if len(language_to_embeddings[language_code]) != len(language_to_sentences[language_code]):
        raise ValueError(
            '{} corpus has {} sentences but {} embeddings'.format(
                language_name,
                len(language_to_sentences[language_code]),
                len(language_to_embeddings[language_code])))

    print('Adding {} embeddings to mixed-language index'.format(language_name))
    for i in trange(len(language_to_sentences[language_code])):
        # Items are stored as '(<Language>) <sentence>' so results can be told apart by language.
        annotated_sentence = '({}) {}'.format(language_name, language_to_sentences[language_code][i])
        combined_index.add_one(annotated_sentence, language_to_embeddings[language_code][i])

print('Building mixed-language index with {} trees...'.format(num_index_trees))
combined_index.build(n=num_index_trees)




In [0]:
#Match EN-ES
def find_parallel_nn(search_results, language_name='Spanish'):
  """Return the first search result annotated with the target language.

  Args:
    search_results: Iterable of annotated sentences of the form
      '(<Language>) <sentence>', scanned in the order given.
    language_name: Language tag to look for; defaults to 'Spanish'.

  Returns:
    The first element of `search_results` that starts with '(<language_name>)'.

  Raises:
    LookupError: If no result carries the requested language tag.
  """
  tag = '({})'.format(language_name)
  for result in search_results:
    # Match the annotation prefix only: a sentence in another language that
    # merely mentions "(Spanish)" in its text must not count as a hit.
    if result.startswith(tag):
      return result
  # Raising a plain string is a TypeError in Python 3; raise a real exception.
  raise LookupError('NO PARALLEL')
    
def find_term_and_sentence_in_corpus(sentence):
  """Locate a Spanish corpus entry from an (annotated) sentence.

  Reads the notebook globals `language_to_sentences['es']` and
  `language_to_terms['es']`.

  Args:
    sentence: A Spanish definition, either raw or annotated as
      '(Spanish) <definition>'.

  Returns:
    A tuple (index, term, definition) for the matching entry, or None when no
    entry matches. An exact match is preferred; otherwise the first entry whose
    definition is a suffix of `sentence` is returned.
  """
  definitions = language_to_sentences['es']
  terms = language_to_terms['es']
  suffix_match = None
  for i in range(len(definitions)):
    definition = definitions[i]
    if not isinstance(definition, str):
      # Missing definitions are read as NaN by pandas; they cannot match.
      continue
    if sentence == definition or sentence == '(Spanish) {}'.format(definition):
      return (i, terms[i], definition)
    # A suffix test alone picks the wrong entry when one definition is the
    # tail of another, so it is only used as a fallback.
    if suffix_match is None and sentence.endswith(definition):
      suffix_match = (i, terms[i], definition)
  return suffix_match
  
en_dict = pd.read_csv('./test-en.csv')

# NOTE(review): `en_es_T` is not defined in the part of the notebook visible
# here - presumably an EN->ES transformation matrix created elsewhere; confirm
# it exists before running this cell on a fresh kernel.

# Neighbours requested per query (loop-invariant).
num_results = 1500

# Rows are collected in a list and turned into a DataFrame once at the end;
# growing a DataFrame with .append() per row is quadratic and the method no
# longer exists in pandas 2.x.
en_es_records = []

with tqdm(total=len(language_to_sentences['en'])) as pbar:
  for index_en, data in en_dict.iterrows():
    term_en = data['Terms']
    definition_en = data['Definitions']

    sample_query = definition_en
    query_embedding = session.run(embedded_text, feed_dict={text_input: [sample_query]})[0]
    transformed_query_embedding = en_es_T.dot(query_embedding.T).T
    search_results = combined_index.nearest(transformed_query_embedding, n=num_results)

    try:
      parallel_nn = find_parallel_nn(search_results)
      corpus_match = find_term_and_sentence_in_corpus(parallel_nn)
      if corpus_match is None:
        # Make the failure explicit instead of an opaque unpacking TypeError.
        raise LookupError('No Spanish corpus entry matches: {}'.format(parallel_nn))
      (index_es, term_es, definition_es) = corpus_match
      en_es_records.append({'en_es_index_es':index_es, 'en_es_index_en':index_en,'en_es_term_es': term_es, 'en_es_term_en':term_en, 'en_es_def_es':definition_es, 'en_es_def_en':definition_en})
    except Exception as exc:
      # Print the failing query and its results, then abort: every English
      # entry is expected to obtain a Spanish match.
      print('####### ERROR ##########')
      print("Unexpected error:", repr(exc))
      print(definition_en)
      print(search_results)
      raise
    pbar.update(1)

df_en_es = pd.DataFrame(en_es_records, columns=['en_es_index_es', 'en_es_index_en','en_es_term_es', 'en_es_term_en', 'en_es_def_es', 'en_es_def_en'])
df_en_es.to_csv('match-en-es.csv')
  

In [0]:
#Match ES-EN

def find_parallel_nn_es_en(search_results):
  """Return the first search result annotated as English.

  Args:
    search_results: Iterable of annotated sentences of the form
      '(<Language>) <sentence>', scanned in the order given.

  Returns:
    The first element of `search_results` that starts with '(English)'.

  Raises:
    LookupError: If no result carries the English tag.
  """
  for result in search_results:
    # Match the annotation prefix only: a Spanish sentence that merely
    # mentions "(English)" in its text must not count as a hit.
    if result.startswith("(English)"):
      return result
  # Raising a plain string is a TypeError in Python 3; raise a real exception.
  raise LookupError('NO PARALLEL')
    
def find_term_and_sentence_in_corpus_es_en(sentence):
  """Locate an English corpus entry from an (annotated) sentence.

  Reads the notebook globals `language_to_sentences['en']` and
  `language_to_terms['en']`.

  Args:
    sentence: An English definition, either raw or annotated as
      '(English) <definition>'.

  Returns:
    A tuple (index, term, definition) for the matching entry, or None when no
    entry matches. An exact match is preferred; otherwise the first entry whose
    definition is a suffix of `sentence` is returned.
  """
  definitions = language_to_sentences['en']
  terms = language_to_terms['en']
  suffix_match = None
  for i in range(len(definitions)):
    definition = definitions[i]
    if not isinstance(definition, str):
      # Missing definitions are read as NaN by pandas; they cannot match.
      continue
    if sentence == definition or sentence == '(English) {}'.format(definition):
      return (i, terms[i], definition)
    # A suffix test alone picks the wrong entry when one definition is the
    # tail of another, so it is only used as a fallback.
    if suffix_match is None and sentence.endswith(definition):
      suffix_match = (i, terms[i], definition)
  return suffix_match
  
es_dict = pd.read_csv('./test-es.csv')

# NOTE(review): `es_en_T` is not defined in the part of the notebook visible
# here - presumably an ES->EN transformation matrix created elsewhere; confirm
# it exists before running this cell on a fresh kernel.

# Neighbours requested per query (loop-invariant).
num_results = 1500

# Rows are collected in a list and turned into a DataFrame once at the end;
# growing a DataFrame with .append() per row is quadratic and the method no
# longer exists in pandas 2.x.
es_en_records = []

with tqdm(total=len(language_to_sentences['es'])) as pbar:
  for index_es, data in es_dict.iterrows():
    term_es = data['Terms']
    definition_es = data['Definitions']

    sample_query = definition_es
    query_embedding = session.run(embedded_text, feed_dict={text_input: [sample_query]})[0]
    transformed_query_embedding = es_en_T.dot(query_embedding.T).T
    search_results = combined_index.nearest(transformed_query_embedding, n=num_results)

    try:
      parallel_nn = find_parallel_nn_es_en(search_results)
      corpus_match = find_term_and_sentence_in_corpus_es_en(parallel_nn)
      if corpus_match is None:
        # Make the failure explicit instead of an opaque unpacking TypeError.
        raise LookupError('No English corpus entry matches: {}'.format(parallel_nn))
      (index_en, term_en, definition_en) = corpus_match
      es_en_records.append({'es_en_index_es':index_es, 'es_en_index_en':index_en,'es_en_term_es': term_es, 'es_en_term_en':term_en, 'es_en_def_es':definition_es, 'es_en_def_en':definition_en})
    except Exception as exc:
      # Best effort: report the Spanish query that failed and move on to the
      # next entry. Only Exception is caught so Ctrl-C still stops the loop.
      print('####### ERROR ##########')
      print("Unexpected error:", repr(exc))
      print(definition_es)
      print(search_results)
    pbar.update(1)

df_es_en = pd.DataFrame(es_en_records, columns=['es_en_index_es', 'es_en_index_en','es_en_term_es', 'es_en_term_en', 'es_en_def_es', 'es_en_def_en'])
df_es_en.to_csv('match-es-en.csv')

In [0]:
# Reload both match tables (ES->EN first, then EN->ES) from their CSV files.
df_es_en, df_en_es = (
    pd.read_csv('/content/match-es-en.csv'),
    pd.read_csv('/content/match-en-es.csv'),
)

In [0]:
# Keep only the pairs on which both directions agree: the ES->EN match of a
# Spanish entry points to an English entry whose EN->ES match is that same
# Spanish entry.

# Global numpy print setting used by the original cell; set once instead of
# once per matching row.
np.set_printoptions(threshold=sys.maxsize)

embeddings_en = []
embeddings_es = []
intersection_records = []

# EN->ES rows keyed by the English entry they describe, so the lookup does not
# depend on the row position inside df_en_es.
en_es_by_index_en = {int(row['en_es_index_en']): row for _, row in df_en_es.iterrows()}

for _, data in df_es_en.iterrows():
    es_en_index_es = int(data['es_en_index_es'])
    es_en_index_en = int(data['es_en_index_en'])
    es_en_term_es = data['es_en_term_es']
    es_en_term_en = data['es_en_term_en']
    es_en_def_es = data['es_en_def_es']
    es_en_def_en = data['es_en_def_en']

    # A single row (Series) yields scalar fields. Selecting with iloc[[i]]
    # gave a one-row DataFrame, so every field below was itself a Series and
    # was written to the CSV as its multi-line repr.
    row_en = en_es_by_index_en.get(es_en_index_en)
    if row_en is None:
        # No EN->ES match recorded for this English entry: nothing to intersect.
        continue

    en_es_index_es = int(row_en['en_es_index_es'])
    en_es_index_en = int(row_en['en_es_index_en'])
    en_es_term_es = row_en['en_es_term_es']
    en_es_term_en = row_en['en_es_term_en']
    en_es_def_es = row_en['en_es_def_es']
    en_es_def_en = row_en['en_es_def_en']

    term_en = str(es_en_term_en)
    term_es = str(es_en_term_es)

    if en_es_index_es != es_en_index_es:
        # The two directions disagree on the Spanish entry.
        continue

    def_en = es_en_def_en
    def_es = es_en_def_es

    # Rebuild the keys under which the sentences were added to the index.
    annotated_sentence_en = '({}) {}'.format('English' ,def_en)
    annotated_sentence_es = '({}) {}'.format('Spanish' ,def_es)
    vec_def_en = combined_index.vec(annotated_sentence_en)
    vec_def_es = combined_index.vec(annotated_sentence_es)

    embeddings_en.append(vec_def_en)
    embeddings_es.append(vec_def_es)
    intersection_records.append({'term_en': term_en, 'term_es': term_es, 'es_en_index_es':es_en_index_es, 'es_en_index_en':es_en_index_en,'es_en_term_es': es_en_term_es, 'es_en_term_en':es_en_term_en, 'es_en_def_es':es_en_def_es, 'es_en_def_en':es_en_def_en, 'en_es_index_es':en_es_index_es, 'en_es_index_en':en_es_index_en,'en_es_term_es': en_es_term_es, 'en_es_term_en':en_es_term_en, 'en_es_def_es':en_es_def_es, 'en_es_def_en':en_es_def_en})

# Build the table once from the collected rows (DataFrame.append per row is
# quadratic and no longer exists in pandas 2.x).
df_intersection = pd.DataFrame(intersection_records, columns=['term_en', 'term_es', 'en_es_index_es', 'en_es_index_en','en_es_term_es', 'en_es_term_en', 'en_es_def_es', 'en_es_def_en', 'es_en_index_es', 'es_en_index_en','es_en_term_es', 'es_en_term_en', 'es_en_def_es', 'es_en_def_en' ])
df_intersection.to_csv('intersection.csv')
