# Distancia de Levenshtein (distancia entre palabras)

Es el número mínimo de operaciones (inserción, eliminación o sustitución de un caracter) requeridas para transformar una cadena de caracteres en otra.

Por ejemplo, la distancia entre 'casa' y 'calle' es 3:
1. casa $\rightarrow$ cala (sustitución de 's' por 'l')
2. cala $\rightarrow$ calla (inserción de 'l' entre la 'l' y la 'a'
3. calla $\rightarrow$ calle (sustitución de 'a' por 'e'

In [1]:
import tensorflow as tf
session = tf.Session()

In [2]:
initial_word = list("casa")
final_word = list("calle")

In [14]:
initial_tensor = tf.SparseTensor(indices=[[0,0,0],[0,0,1],[0,0,2],[0,0,3]], # 4 letras en la palabra inicial ('casa')
                     values=initial_word, 
                     dense_shape=[1,1,1])

final_tensor = tf.SparseTensor(indices=[[0,0,0],[0,0,1],[0,0,2],[0,0,3],[0,0,4]], # 5 letras en la palabra final ('calle')
                               values=final_word, 
                               dense_shape=[1,1,1]) 

In [15]:
distance = session.run(tf.edit_distance(hypothesis=initial_tensor, truth=final_tensor, normalize=False))
print(distance)

[[3.]]


In [16]:
print(session.run(tf.edit_distance(hypothesis=initial_tensor, truth=final_tensor, normalize=True)))

[[0.6]]


In [8]:
# 3/5 ==> número de pasos / número de letras

In [11]:
session.run(initial_tensor)

SparseTensorValue(indices=array([[0, 0, 0],
       [0, 0, 1],
       [0, 0, 2],
       [0, 0, 3]]), values=array([b'c', b'a', b's', b'a'], dtype=object), dense_shape=array([1, 1, 1]))

In [43]:
hypothesis2 = list("casacalle")
truth2 = list("callescalles")

In [46]:
h2 = tf.SparseTensor(indices=[[0,0,0],[0,0,1],[0,0,2],[0,0,3],[0,1,0],[0,1,1],[0,1,2],[0,1,3],[0,1,4]],
                     values=hypothesis2, 
                     dense_shape=[1,2,5]) # 2 palabras de 4 letras máximo (la más larga)

t2 = tf.SparseTensor(indices=[[0,0,0],[0,0,1],[0,0,2],[0,0,3],[0,0,4],[0,0,5], 
                             [0,1,0],[0,1,1],[0,1,2],[0,1,3],[0,1,4],[0,1,5]], 
                     values=truth2, 
                     dense_shape=[1,2,6]) # 2 palabras de 6 letras máximo

In [47]:
print(session.run(tf.edit_distance(h2,t2, normalize=False) ))

[[4. 1.]]


casa - calles $\rightarrow$ 4 pasos

calle - calles $\rightarrow$ 1 paso

In [37]:
hypothesis_words = ["casa", "casita", "caseron", "tensor", "python"]
truth_word = "algoritmo"

In [52]:
num_h_words = len(hypothesis_words)
h_idx = [[0, x_idx, y_idx] for x_idx, x in enumerate(hypothesis_words) for y_idx, y in enumerate(x)]
h_idx

[[0, 0, 0],
 [0, 0, 1],
 [0, 0, 2],
 [0, 0, 3],
 [0, 1, 0],
 [0, 1, 1],
 [0, 1, 2],
 [0, 1, 3],
 [0, 1, 4],
 [0, 1, 5],
 [0, 2, 0],
 [0, 2, 1],
 [0, 2, 2],
 [0, 2, 3],
 [0, 2, 4],
 [0, 2, 5],
 [0, 2, 6],
 [0, 3, 0],
 [0, 3, 1],
 [0, 3, 2],
 [0, 3, 3],
 [0, 3, 4],
 [0, 3, 5],
 [0, 4, 0],
 [0, 4, 1],
 [0, 4, 2],
 [0, 4, 3],
 [0, 4, 4],
 [0, 4, 5]]

In [85]:
h_chars = list("".join(hypothesis_words))
h_chars

['c',
 'a',
 's',
 'a',
 'c',
 'a',
 's',
 'i',
 't',
 'a',
 'c',
 'a',
 's',
 'e',
 'r',
 'o',
 'n',
 't',
 'e',
 'n',
 's',
 'o',
 'r',
 'p',
 'y',
 't',
 'h',
 'o',
 'n']

In [78]:
truth_words = []
for i in range(num_h_words):
    truth_words.append(truth_word)

truth_words

['algoritmo', 'algoritmo', 'algoritmo', 'algoritmo', 'algoritmo']

In [82]:
t_idx = [[0, x_idx, y_idx] for x_idx, x in enumerate(truth_words) for y_idx, y in enumerate(x)]
t_chars = list("".join(truth_words))

In [83]:
h3 = tf.SparseTensor(indices=h_idx, values=h_chars, dense_shape=[1, num_h_words, 5])
t3 = tf.SparseTensor(indices=t_idx, values=t_chars, dense_shape=[1, len(truth_words), len(truth_word)])

In [84]:
print(session.run(tf.edit_distance(h3, t3, normalize=False)))

[[9. 7. 8. 8. 8.]]


In [86]:
print(session.run(tf.edit_distance(h3, t3, normalize=True)))

[[1.        0.7777778 0.8888889 0.8888889 0.8888889]]


In [110]:
def create_sparse_words_vect(word_list):
    
    indices = [[0, x_idx, y_idx] for x_idx, x in enumerate(word_list) for y_idx, y in enumerate(x)]
    values = list("".join(word_list))
    num_words = len(word_list)
    
    max_length_word = 0
    
    for word in word_list:
        if len(word) > max_length_word:
            max_length_word = len(word) 
            
    return tf.SparseTensorValue(indices=indices, values=values, dense_shape=[1, num_words, max_length_word])


In [111]:
h4 = create_sparse_words_vect(hypothesis_words)
t4 = create_sparse_words_vect(truth_words)

print(session.run(tf.edit_distance(h4, t4)))

[[1.        0.7777778 0.8888889 0.8888889 0.8888889]]
