# Correspondencia en direcciones postales

In [1]:
import random
import string
import numpy as np
import tensorflow as tf

In [3]:
# Number of synthetic addresses to generate.
n = 10

# Fixed seed so that "Restart Kernel -> Run All" regenerates the same
# synthetic data set every time (the cells below also draw from `random`).
SEED = 42
random.seed(SEED)

# Pools from which street names and street types are sampled.
street_names = ["diagon", "elm", "abbey", "gran", "python"]
street_types = ["callejon", "calle", "carretera", "via", "avenida"]
# Five candidate zip codes, each drawn uniformly from [20000, 29999].
street_zips = [random.randint(20000, 29999) for _ in range(5)]
# One house number per address, each drawn uniformly from [1, 999].
numbers = [random.randint(1, 999) for _ in range(n)]

In [18]:
# Sample (with replacement) one street name, one street type and one zip
# code for each of the n addresses.
streets = [random.choice(street_names) for _ in range(n)]
street_prefixes = [random.choice(street_types) for _ in range(n)]
zips = [random.choice(street_zips) for _ in range(n)]

# Build "<type> <name> <number>" strings, e.g. "via gran 528".
full_streets = [
    " ".join((prefix, name, str(number)))
    for prefix, name, number in zip(street_prefixes, streets, numbers)
]
# Pair each street string with its zip code as a two-element list.
full_directions = [[street, zip_code] for street, zip_code in zip(full_streets, zips)]

In [19]:
# Display the generated [street, zip] pairs as the cell output.
full_directions

[['carretera gran 639', 26103],
 ['via gran 528', 27243],
 ['via elm 815', 23500],
 ['callejon elm 78', 23500],
 ['callejon gran 963', 27243],
 ['avenida diagon 420', 23500],
 ['calle gran 148', 26103],
 ['calle gran 883', 24448],
 ['callejon abbey 146', 24448],
 ['avenida python 369', 27243]]

In [37]:
# Generate a typo
def create_typo(s, probability=0.75):
    """Return ``s`` with, at most, one character replaced at random.

    With chance ``probability`` a single position of ``s`` is overwritten
    with a random lowercase ASCII letter; otherwise ``s`` is returned as is.
    The replacement letter is drawn independently of the original one, so it
    may occasionally coincide with it and leave the string unchanged.

    Args:
        s: The string to corrupt.
        probability: Chance, in [0, 1], of introducing the substitution.

    Returns:
        A string of the same length as ``s``.
    """
    # An empty string has no position to corrupt, so it is returned untouched.
    if s and random.uniform(0, 1) < probability:
        random_idx = random.randrange(len(s))  # position to overwrite
        string_list = list(s)
        string_list[random_idx] = random.choice(string.ascii_lowercase)
        s = "".join(string_list)
    return s

In [39]:
# Pass every clean street name through create_typo (default probability).
typo_streets = list(map(create_typo, streets))
typo_streets

['gran',
 'grnn',
 'elm',
 'elm',
 'nran',
 'diagnn',
 'gran',
 'gxan',
 'ybbey',
 'pythou']

In [41]:
# Rebuild the "<type> <name> <number>" strings using the typo'd street names;
# street types, house numbers and zip codes are the same as in the clean set.
typo_full_streets = [
    " ".join((prefix, name, str(number)))
    for prefix, name, number in zip(street_prefixes, typo_streets, numbers)
]
typo_full_directions = [
    [street, zip_code] for street, zip_code in zip(typo_full_streets, zips)
]
typo_full_directions

[['carretera gran 639', 26103],
 ['via grnn 528', 27243],
 ['via elm 815', 23500],
 ['callejon elm 78', 23500],
 ['callejon nran 963', 27243],
 ['avenida diagnn 420', 23500],
 ['calle gran 148', 26103],
 ['calle gxan 883', 24448],
 ['callejon ybbey 146', 24448],
 ['avenida pythou 369', 27243]]

## Tensorflow

In [42]:
# Create a TensorFlow session (presumably used later to run the ops defined below).
# NOTE(review): tf.Session is the TensorFlow 1.x graph API — confirm the installed version.
session = tf.Session()

In [44]:
# Addresses containing typos (the queries to match)
test_address = tf.sparse_placeholder(dtype=tf.string) # It is a string ==> tf.sparse_placeholder()
test_zip = tf.placeholder(shape=[None,1], dtype=tf.float32)

# Correct (reference) addresses
# NOTE(review): ref_zip and test_zip are both declared as [None, 1]; any later
# reduction over axis=1 of their difference then sees a single column —
# confirm this is the intended shape for the reference zip codes.
ref_address = tf.sparse_placeholder(dtype=tf.string)
ref_zip = tf.placeholder(shape=[None, 1], dtype=tf.float32)

In [47]:
# Distances
# Element-wise squared difference between reference and test zip codes.
zip_distance = tf.square(tf.subtract(ref_zip, test_zip))
# Normalized edit distance between the address strings.
# NOTE(review): tf.edit_distance takes (hypothesis, truth) and, per the TF docs,
# normalizes by the length of truth — here that is test_address. Confirm the
# argument order is intended.
address_distance = tf.edit_distance(ref_address, test_address, normalize=True)

- $ S(x,y) = 0 $ si $x$ e $y$ son totalmente diferentes (no se parecen en nada)


- $ S(x,x) = 1 $ ya que todo objeto es igual a sí mismo.


- $ S(x,y) = \frac{D - d(x,y)}{D-d} $

$S\equiv$ similitud

$D\equiv$ mayor distancia posible entre dos objetos

$d\equiv$ menor distancia posible entre dos objetos

In [50]:
# Gather = pick elements by index
# Squeeze = remove dimensions of size 1
# Note: both values below are taken from the squared zip *distances*, not from the zip codes themselves.
zip_max = tf.gather(tf.squeeze(zip_distance), tf.argmax(zip_distance, axis=1)) # Entry of the squeezed distances at each row's argmax (largest distance per row)
zip_min = tf.gather(tf.squeeze(zip_distance), tf.argmin(zip_distance, axis=1)) # Entry of the squeezed distances at each row's argmin (smallest distance per row)

# Min-max similarity: (max - distance) / (max - min).
# NOTE(review): when zip_max equals zip_min the denominator is zero (this is
# always the case if zip_distance has a single column) — confirm the shapes fed in.
zip_sim = tf.divide(tf.subtract(zip_max, zip_distance), tf.subtract(zip_max, zip_min))

In [51]:
# The address distance is treated as lying in [0, 1] because it is normalized,
# so the similarity is simply 1 - distance.
# NOTE(review): a normalized edit distance can exceed 1 when the hypothesis is
# longer than the truth, which would make this similarity negative — confirm.
address_sim = tf.subtract(1.0, address_distance)

$$ S(x,y) = \sum_{i=1}^k w_i\cdot S_i(x,y) $$
$$ \sum_{i=1}^k w_i = 1 $$

In [53]:
# Weight given to the address similarity; the zip similarity receives the
# remainder so that both weights sum to 1.
address_wi = 0.5
zip_wi = 1.0 - address_wi

In [54]:
# Weighted sum of both similarities. The weighted address similarity is
# transposed before being added to the weighted zip similarity.
# NOTE(review): the result shape depends on broadcasting between the two terms —
# confirm the shapes of address_sim and zip_sim line up as intended.
weighted_sim = tf.add(tf.transpose(tf.multiply(address_wi, address_sim)), tf.multiply(zip_wi, zip_sim))

In [55]:
# The best-matching address is the one with the highest similarity:
# index of the maximum weighted similarity along axis 1.
top_match_idx = tf.argmax(weighted_sim, axis=1)

In [57]:
def sparse_from_word_vector(word_vector):
    """Pack a list of strings into a character-level ``tf.SparseTensorValue``.

    Every character of every word becomes one sparse entry whose index is
    ``[0, word_position, char_position]``; the values are the characters in
    order, and ``dense_shape`` is ``[len(word_vector), 1, 1]``.

    NOTE(review): the indices put the word position in the second dimension
    while ``dense_shape`` puts the word count in the first — confirm this
    layout is what the consumer (presumably ``tf.edit_distance``) expects.

    Args:
        word_vector: Sequence of strings.

    Returns:
        A ``tf.SparseTensorValue`` holding one entry per character.
    """
    # One [0, word, char] index triple per character, in reading order.
    positions = []
    for word_pos, word in enumerate(word_vector):
        for char_pos, _ in enumerate(word):
            positions.append([0, word_pos, char_pos])

    # All characters of all words, flattened in the same order as the indices.
    characters = list("".join(word_vector))

    return tf.SparseTensorValue(
        indices=positions,
        values=characters,
        dense_shape=[len(word_vector), 1, 1],
    )