# IIC-3800 Tópicos en CC - NLP UC

- Versiones de librerías, python 3.8.10

- numpy 1.20.3
- nltk 3.7
- gensim 4.1.2
- keras 2.9.0
- tensorflow 2.9.1
- instant-distance 0.3.5


In [1]:
# Language codes of the aligned fastText vectors to download.
# NOTE: 'fr' from the class example was replaced by 'es' here.
LANGS = ('en', 'es')

# Placeholder inside DL_TEMPLATE that gets swapped for a concrete language code.
LANG_REPLACE = '$$lang'

# Output files are named after the joined language codes (e.g. "en_es").
WORD_MAP_PATH = "./data/" + "_".join(LANGS) + ".json"
BUILT_IDX_PATH = "./data/" + "_".join(LANGS) + ".idx"

DL_TEMPLATE = (
    "https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki."
    + LANG_REPLACE
    + ".align.vec"
)

# Containers filled by the download cell:
#   points   -- embeddings that go into the nearest-neighbour index
#   values   -- language-tagged words, parallel to `points`
#   word_map -- word -> embedding lookup for the query language
points, values, word_map = [], [], {}

____________________________________________________________________________________________________________

## Actividad en clase

Construya un traductor de palabras  **inglés-castellano** usando FastText. Luego, construya un traductor **castellano-inglés**. Para esto haga lo siguiente:

- Cree el diccionario **word_map** (vea el ejemplo de clases).
- Construya el motor de vecinos cercanos sobre el embedding space usando la librería **instant_distance**.
- Busque las cinco palabras en castellano más cercanas a **hello**.
- Ahora cree el diccionario **word_map_reverse** para que pueda traducir desde castellano a inglés.
- Construya el motor de vecinos cercanos sobre el embedding space usando la librería **instant_distance**.
- Busque las cinco palabras en inglés más cercanas a **hola**.
- Cuando termine, avíseme para entregarle una **L (logrado)**.
- Recuerde que las L otorgan un bono en la nota final de la asignatura.


***Tiene hasta el final de la clase.***

_________________________________________________________________________________________________________________

# Solución

In [2]:
import os, aiohttp

async with aiohttp.ClientSession() as session:
  for lang in LANGS:
    # Construct a url for each language
    url = DL_TEMPLATE.replace(LANG_REPLACE, lang)

    # Ensure the directory and files exist
    os.makedirs(os.path.dirname(BUILT_IDX_PATH), exist_ok=True)

    lineno = 0
    async with session.get(url) as resp:
      while True:
        lineno += 1
        line = await resp.content.readline()
        if not line:
          # EOF
          break

        linestr = line.decode('utf-8')
        tokens = linestr.split(' ')

        # The first token is the word and the rest
        # are the embedding
        value = tokens[0]
        embedding = [float(p) for p in tokens[1:]]

        # We only go from english to the other two languages
        if lang == 'en':
          word_map[value] = embedding
        else:
          # Don't index words that exist in english
          # to improve the quality of the results.
          if value in word_map:
              continue

          # We track values here to build the instant-distance index
          # Every value is prepended with 2 character language code.
          # This allows us to determine language output later.
          values.append(lang + value)
          points.append(embedding)



TimeoutError: 

In [None]:
import instant_distance, json

# Build the nearest-neighbour index from the collected embeddings (`points`)
# and their labels (`values`), then persist it at BUILT_IDX_PATH.
print('Building index... (this will take a while)')
hnsw = instant_distance.HnswMap.build(
    points,
    values,
    instant_distance.Config(),
)
hnsw.dump(BUILT_IDX_PATH)

# Persist the word -> embedding lookup as JSON at WORD_MAP_PATH.
with open(WORD_MAP_PATH, 'w') as word_map_file:
    json.dump(word_map, word_map_file)

In [None]:
# Look up the nearest neighbours of `word` in the index stored at
# BUILT_IDX_PATH and print the five closest indexed words.
word = 'hello'

# Get an embedding for the given word
embedding = word_map.get(word)
if not embedding:
    print(f"Word not recognized: {word}")
    # `exit()` is a helper meant for the interactive interpreter and is not a
    # reliable way to stop execution here; raising SystemExit always aborts
    # the rest of this cell/script with status 1.
    raise SystemExit(1)

hnsw = instant_distance.HnswMap.load(BUILT_IDX_PATH)
search = instant_distance.Search()
hnsw.search(embedding, search)

# Print the five closest results
for result in list(search)[:5]:
    # The first two characters of every indexed value are the language code
    # added when the index was built; drop them to show only the word.
    print(result.value[2:])

In [None]:
# Reverse direction: Spanish first, English second.
LANGS = ('es', 'en')

# Placeholder inside DL_TEMPLATE that gets swapped for a concrete language code.
LANG_REPLACE = '$$lang'

# Output files are named after the joined language codes (e.g. "es_en").
WORD_MAP_PATH = "./data/" + "_".join(LANGS) + ".json"
BUILT_IDX_PATH = "./data/" + "_".join(LANGS) + ".idx"

DL_TEMPLATE = (
    "https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki."
    + LANG_REPLACE
    + ".align.vec"
)

# Start from empty containers so nothing from a previous run leaks in:
#   points   -- embeddings that go into the nearest-neighbour index
#   values   -- language-tagged words, parallel to `points`
#   word_map -- word -> embedding lookup for the query language
points, values, word_map = [], [], {}

async with aiohttp.ClientSession() as session:
  for lang in LANGS:
    # Construct a url for each language
    url = DL_TEMPLATE.replace(LANG_REPLACE, lang)

    # Ensure the directory and files exist
    os.makedirs(os.path.dirname(BUILT_IDX_PATH), exist_ok=True)

    lineno = 0
    async with session.get(url) as resp:
      while True:
        lineno += 1
        line = await resp.content.readline()
        if not line:
          # EOF
          break

        linestr = line.decode('utf-8')
        tokens = linestr.split(' ')

        # The first token is the word and the rest
        # are the embedding
        value = tokens[0]
        embedding = [float(p) for p in tokens[1:]]

        # We only go from english to the other two languages
        if lang == 'es':
          word_map[value] = embedding
        else:
          # Don't index words that exist in spanish
          # to improve the quality of the results.
          if value in word_map:
              continue

          # We track values here to build the instant-distance index
          # Every value is prepended with 2 character language code.
          # This allows us to determine language output later.
          values.append(lang + value)
          points.append(embedding)


In [None]:
# Build the nearest-neighbour index from the collected embeddings (`points`)
# and their labels (`values`), then persist it at BUILT_IDX_PATH.
print('Building index... (this will take a while)')
hnsw = instant_distance.HnswMap.build(
    points,
    values,
    instant_distance.Config(),
)
hnsw.dump(BUILT_IDX_PATH)

# Persist the word -> embedding lookup as JSON at WORD_MAP_PATH.
with open(WORD_MAP_PATH, 'w') as word_map_file:
    json.dump(word_map, word_map_file)

In [None]:
# Look up the nearest neighbours of `word` in the index stored at
# BUILT_IDX_PATH and print the five closest indexed words.
word = 'hola'

# Get an embedding for the given word
embedding = word_map.get(word)
if not embedding:
    print(f"Word not recognized: {word}")
    # `exit()` is a helper meant for the interactive interpreter and is not a
    # reliable way to stop execution here; raising SystemExit always aborts
    # the rest of this cell/script with status 1.
    raise SystemExit(1)

hnsw = instant_distance.HnswMap.load(BUILT_IDX_PATH)
search = instant_distance.Search()
hnsw.search(embedding, search)

# Print the five closest results (the exercise asks for five, not four)
for result in list(search)[:5]:
    # The first two characters of every indexed value are the language code
    # added when the index was built; drop them to show only the word.
    print(result.value[2:])