# Correspondencia en direcciones postales

In [67]:
import random
import string
import numpy as np
import tensorflow as tf

In [68]:
# Number of synthetic addresses to generate.
n = 10

# Vocabulary the random addresses are drawn from.
street_names = ["diagon", "elm", "abbey", "gran", "python"]
street_types = ["callejon", "calle", "carretera", "via", "avenida"]

# A pool of five random postal codes in [20000, 29999], followed by one
# house number in [1, 999] per address.
street_zips = [random.randint(20000, 29999) for _unused in range(5)]
numbers = [random.randint(1, 999) for _unused in range(n)]

In [69]:
# Draw, independently for each of the n addresses, a street name, a street
# type and a postal code from the pools defined earlier.
streets = [random.choice(street_names) for _unused in range(n)]
street_prefixes = [random.choice(street_types) for _unused in range(n)]
zips = [random.choice(street_zips) for _unused in range(n)]

# Assemble "<type> <name> <number>" strings and pair each one with its zip.
full_streets = [
    " ".join([prefix, name, str(number)])
    for prefix, name, number in zip(street_prefixes, streets, numbers)
]
full_directions = [[street, zip_code] for street, zip_code in zip(full_streets, zips)]

In [70]:
# Display the clean addresses as [street, zip] pairs.
full_directions

[['via gran 346', 26244],
 ['via python 759', 22130],
 ['carretera gran 832', 28417],
 ['via diagon 641', 25492],
 ['calle abbey 899', 25492],
 ['callejon elm 723', 28417],
 ['calle python 122', 25492],
 ['callejon diagon 94', 22130],
 ['avenida abbey 827', 25492],
 ['avenida python 83', 28417]]

In [104]:
# Introduce a typo
def create_typo(s, probability=0.75):
    """Return ``s`` with at most one character replaced by a random letter.

    Args:
        s: String to corrupt.
        probability: Chance, in [0, 1], that a replacement is attempted.

    Returns:
        ``s`` itself when no replacement is attempted or when ``s`` is empty;
        otherwise a copy of ``s`` in which one randomly chosen position holds
        a random lowercase ASCII letter. The random letter may coincide with
        the one it replaces, so the result can still equal ``s``.
    """
    # An empty string has no position to corrupt, so it is returned untouched.
    if s and random.uniform(0, 1) < probability:
        random_idx = random.randrange(len(s))  # position to overwrite
        string_list = list(s)
        string_list[random_idx] = random.choice(string.ascii_lowercase)
        s = "".join(string_list)
    return s

In [105]:
# Run every clean street name through create_typo (default probability),
# then display the possibly corrupted names.
typo_streets = list(map(create_typo, streets))
typo_streets

['grvn',
 'pnthon',
 'cran',
 'diagon',
 'abbey',
 'ezm',
 'python',
 'diaton',
 'abbhy',
 'puthon']

In [73]:
# Rebuild the full address strings using the corrupted street names while
# keeping the original street types, house numbers and postal codes.
typo_full_streets = [
    " ".join([prefix, name, str(number)])
    for prefix, name, number in zip(street_prefixes, typo_streets, numbers)
]
typo_full_directions = [[street, zip_code] for street, zip_code in zip(typo_full_streets, zips)]
typo_full_directions

[['via grcn 346', 26244],
 ['via python 759', 22130],
 ['carretera gray 832', 28417],
 ['via diagon 641', 25492],
 ['calle abbes 899', 25492],
 ['callejon elq 723', 28417],
 ['calle pyvhon 122', 25492],
 ['callejon diagon 94', 22130],
 ['avenida abbcy 827', 25492],
 ['avenida pythou 83', 28417]]

## Tensorflow

In [74]:
# Session used by the matching loop to evaluate the graph.
# NOTE(review): no session.close() is visible in this notebook.
session = tf.Session()

In [87]:
# Query side (possibly misspelled): the address is fed as a sparse string
# tensor and the postal code as a single-column float tensor.
test_address = tf.sparse_placeholder(dtype=tf.string)
test_zip = tf.placeholder(dtype=tf.float32, shape=[None, 1])

# Reference side (correct data): sparse string addresses plus one row holding
# the n reference postal codes.
ref_address = tf.sparse_placeholder(dtype=tf.string)
ref_zip = tf.placeholder(dtype=tf.float32, shape=[None, n])

In [88]:
# Distances
# Postal codes: squared difference between each reference zip and the query zip.
zip_diff = tf.subtract(ref_zip, test_zip)
zip_distance = tf.square(zip_diff)

# Addresses: normalized edit distance, with the reference set passed as the
# hypothesis and the query as the truth.
address_distance = tf.edit_distance(hypothesis=ref_address, truth=test_address, normalize=True)

- $ S(x,y) = 0 $ si $x$ e $y$ son totalmente diferentes (no se parecen en nada)


- $ S(x,x) = 1 $ ya que todo objeto es igual a sí mismo.


- $ S(x,y) = \frac{D - d(x,y)}{D-d} $

$S\equiv$ similaridad

$D\equiv$ mayor distancia posible entre dos objetos

$d\equiv$ menor distancia posible entre dos objetos

In [89]:
# Largest and smallest squared zip distance in each row of zip_distance.
# They are kept as a column ([rows, 1]) so they broadcast against
# zip_distance ([rows, n]) for any number of query rows.
zip_max = tf.expand_dims(tf.reduce_max(zip_distance, axis=1), 1)
zip_min = tf.expand_dims(tf.reduce_min(zip_distance, axis=1), 1)

# Min-max similarity S = (D - d(x, y)) / (D - d): 1 for the closest reference
# zip, 0 for the farthest one.
# NOTE(review): when every reference zip is equally far from the query,
# zip_max == zip_min and this divides by zero.
zip_sim = tf.divide(tf.subtract(zip_max, zip_distance), tf.subtract(zip_max, zip_min))

In [90]:
# The edit distance is normalized, so 1 is taken as the largest distance and
# 0 as the smallest; the similarity is then simply 1 - distance.
address_sim = tf.subtract(x=1.0, y=address_distance)

$$ S(x,y) = \sum_{i=1}^k w_i\cdot S_i(x,y) $$
$$ \sum_{i=1}^k w_i = 1 $$

In [108]:
# Weights of each partial similarity; the zip weight is derived from the
# address weight so that the two always sum to 1.
address_wi = 0.5
zip_wi = 1.0 - address_wi

In [109]:
# Weighted sum of the two similarities. The address term is transposed
# before being added to the zip term.
weighted_address_sim = tf.transpose(tf.multiply(address_wi, address_sim))
weighted_zip_sim = tf.multiply(zip_wi, zip_sim)
weighted_sim = tf.add(weighted_address_sim, weighted_zip_sim)

In [93]:
# The best match is the reference entry with the highest combined similarity
# in each row.
top_match_idx = tf.argmax(input=weighted_sim, axis=1)

In [94]:
def sparse_from_word_vector(word_vector):
    """Encode a list of strings as a character-level ``tf.SparseTensorValue``.

    Word ``i`` is stored along ``[i, 0, :]``, one character per position of
    the last dimension, so every index lies inside the declared dense shape.

    Args:
        word_vector: List of strings to encode.

    Returns:
        A ``tf.SparseTensorValue`` whose values are the individual characters
        and whose dense shape is ``[len(word_vector), 1, longest_word]``
        (the last dimension is at least 1).
    """
    num_words = len(word_vector)
    # Last dimension must be able to hold the longest word; the extra [1]
    # keeps it at 1 when the input is empty or contains only empty strings.
    max_len = max([len(word) for word in word_vector] + [1])

    idx = [[word_idx, 0, char_idx]
           for word_idx, word in enumerate(word_vector)
           for char_idx in range(len(word))]
    vals = list("".join(word_vector))

    return tf.SparseTensorValue(indices=idx, values=vals, dense_shape=[num_words, 1, max_len])

In [101]:
# Display the clean addresses again; they serve as the reference set below.
full_directions

[['via gran 346', 26244],
 ['via python 759', 22130],
 ['carretera gran 832', 28417],
 ['via diagon 641', 25492],
 ['calle abbey 899', 25492],
 ['callejon elm 723', 28417],
 ['calle python 122', 25492],
 ['callejon diagon 94', 22130],
 ['avenida abbey 827', 25492],
 ['avenida python 83', 28417]]

In [95]:
# Split the reference [street, zip] pairs into a list of address strings and
# a single-row array of postal codes (the layout fed to ref_zip).
reference_address = [street for street, _zip_code in full_directions]
reference_zip_row = [zip_code for _street, zip_code in full_directions]
reference_zips = np.array([reference_zip_row])

In [96]:
# Sparse character-level encoding of the reference addresses, built once and
# fed to ref_address on every query.
sparse_reference_set = sparse_from_word_vector(reference_address)

In [100]:
# Match every (possibly misspelled) address against the whole reference set
# and print the query next to its best candidate.
for i in range(n):
    test_address_entry, test_zip_value = typo_full_directions[i]
    test_zip_entry = [[test_zip_value]]  # nested as one row, one column for test_zip

    # Repeat the query n times so it pairs one-to-one with the n references.
    test_address_repeated = [test_address_entry] * n
    sparse_test_set = sparse_from_word_vector(test_address_repeated)
    feed_dict = {test_address: sparse_test_set,
                 test_zip: test_zip_entry,
                 ref_address: sparse_reference_set,
                 ref_zip: reference_zips}

    # A single query row is fed, so only the first returned index is used.
    best_match = session.run(top_match_idx, feed_dict=feed_dict)
    best_idx = best_match[0]
    best_address = reference_address[best_idx]
    best_zip = reference_zips[0][best_idx]

    print("Dirección original: {}, {}".format(test_address_entry, test_zip_entry))
    print("Dirección corregida: {}, {}".format(best_address, best_zip))
    print("\n")

Dirección original: via grcn 346, [[26244]]
Dirección corregida: via gran 346, 26244


Dirección original: via python 759, [[22130]]
Dirección corregida: via python 759, 22130


Dirección original: carretera gray 832, [[28417]]
Dirección corregida: carretera gran 832, 28417


Dirección original: via diagon 641, [[25492]]
Dirección corregida: via diagon 641, 25492


Dirección original: calle abbes 899, [[25492]]
Dirección corregida: calle abbey 899, 25492


Dirección original: callejon elq 723, [[28417]]
Dirección corregida: callejon elm 723, 28417


Dirección original: calle pyvhon 122, [[25492]]
Dirección corregida: calle python 122, 25492


Dirección original: callejon diagon 94, [[22130]]
Dirección corregida: callejon diagon 94, 22130


Dirección original: avenida abbcy 827, [[25492]]
Dirección corregida: avenida abbey 827, 25492


Dirección original: avenida pythou 83, [[28417]]
Dirección corregida: avenida python 83, 28417


