# String distance

In [None]:
import nltk
import numpy as np
import pandas as pd

In [None]:
# Load the word list from NLTK's `words` corpus
# (presumably requires the corpus to have been downloaded beforehand — confirm).
words = nltk.corpus.words.words()

In [None]:
# Wrap the word list in a single-column DataFrame and write it to words_nltk.csv.
pd.DataFrame(words).to_csv("words_nltk.csv")

## Assuming same cost for all edits

In [None]:
def create_memoization_table(X,Y):
    """Build the dynamic-programming table of edit distances between X and Y.

    Entry ``[i, j]`` is the number of single-character insertions, removals
    and replacements (each costing 1) needed to turn the first ``i``
    characters of ``X`` into the first ``j`` characters of ``Y``.

    Returns an ``int32`` array of shape ``(len(X) + 1, len(Y) + 1)``; its
    bottom-right entry is the edit distance between the full strings.
    """
    n_rows = len(X) + 1
    n_cols = len(Y) + 1
    table = np.zeros((n_rows, n_cols), dtype=np.int32)

    # Border cells: reaching (or starting from) the empty prefix costs one
    # edit per character.
    table[0, :] = np.arange(n_cols)
    table[:, 0] = np.arange(n_rows)

    for row in range(1, n_rows):
        for col in range(1, n_cols):
            if X[row - 1] == Y[col - 1]:
                # Equal characters: carry the diagonal value over at no cost.
                table[row, col] = table[row - 1, col - 1]
                continue

            cheapest_neighbour = min(table[row, col - 1],      # insert
                                     table[row - 1, col],      # remove
                                     table[row - 1, col - 1])  # replace
            table[row, col] = 1 + cheapest_neighbour
    return table

In [None]:
# Example: build the table for two words; its bottom-right entry is their edit distance.
x = "EXPONENTIAL"
y = "POLYNOMIAL"
D = create_memoization_table(x,y)
D[-1,-1]

In [None]:
# Second example, with words of different lengths.
x = "Elliot"
y = "Elia"
D = create_memoization_table(x,y)
D[-1,-1]

In [None]:
# "hi" is a prefix of "hill", so only the two trailing characters need inserting.
x = "hi"
y = "hill"
D = create_memoization_table(x,y)
print("\nThe distance between {} and {} is {}".format(x,y,D[-1,-1]))

##### Timing implementation

In [None]:
# Inputs used by the timing cells that follow.
x = "EXPONENTIAL"
y = "POLYNOMIAL"

In [None]:
def edit_distance_fast(x,y):
    """Return the edit distance between x and y.

    The value is the bottom-right entry of the table built by
    ``create_memoization_table``.
    """
    return create_memoization_table(x, y)[-1, -1]

In [None]:
%%timeit
# Time the table-based implementation defined in this notebook.
edit_distance_fast(x,y)

In [None]:
%%timeit
# Time NLTK's implementation on the same inputs for comparison.
nltk.edit_distance(x,y)

In [None]:
# Check that both implementations report the same distance for x and y.
nltk.edit_distance(x,y) == edit_distance_fast(x,y)

### Different costs per operation

In [None]:
def memoization_table_weighted(X, Y, w_sub=1, w_del=1, w_ins=1):
    """Build the edit-distance table for X and Y with per-operation costs.

    Entry ``[i, j]`` is the minimum total cost of turning the first ``i``
    characters of ``X`` into the first ``j`` characters of ``Y``.

    Parameters
    ----------
    X, Y : str
        Source and target strings.
    w_sub : int or float, optional
        Cost of replacing one character by a different one (default 1).
    w_del : int or float, optional
        Cost of deleting one character of ``X`` (default 1).
    w_ins : int or float, optional
        Cost of inserting one character of ``Y`` (default 1).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(X) + 1, len(Y) + 1)``. The dtype is ``int32``
        when all weights are integers and ``float64`` otherwise, so that
        fractional costs are not truncated. The bottom-right entry is the
        weighted edit distance.

    Raises
    ------
    ValueError
        If any weight is negative.
    """
    weights = (w_sub, w_del, w_ins)
    if any(w < 0 for w in weights):
        raise ValueError("edit weights must be non-negative")

    len_x = len(X)
    len_y = len(Y)

    # Keep the original int32 table for integer weights; switch to floats
    # only when a fractional weight would otherwise be truncated.
    all_integral = all(isinstance(w, (int, np.integer)) for w in weights)
    dtype = np.int32 if all_integral else np.float64
    D = np.zeros((len_x + 1, len_y + 1), dtype=dtype)

    # Borders: i deletions to reach the empty string, j insertions to leave it.
    D[:, 0] = np.arange(len_x + 1) * w_del
    D[0, :] = np.arange(len_y + 1) * w_ins

    for i in range(1, len_x + 1):
        for j in range(1, len_y + 1):
            del_char = D[i-1, j] + w_del
            ins_char = D[i, j-1] + w_ins
            # Matching characters cost nothing on the diagonal move.
            sub_char = D[i-1, j-1] + (0 if X[i-1] == Y[j-1] else w_sub)

            D[i, j] = min(del_char, ins_char, sub_char)

    return D

In [None]:
# Example: read the distance from the bottom-right entry of the weighted table.
x = "Elliot"
y = "Elia"
D = memoization_table_weighted(x, y)
print("\nThe distance between {} and {} is {}".format(x,y,D[-1,-1]))

# Speeding up code

A simple example with Cython

In [None]:
%load_ext cython

In [None]:
def fib(n):
    """Return the n-th Fibonacci number as a float (``fib(0)`` is ``0.0``)."""
    current, previous = 0., 1.
    for _ in range(n):
        # Advance one step: the new value is the sum of the last two.
        current, previous = current + previous, current
    return current

In [None]:
%%cython --annotate
def cy_fib(int n):
    """Return the n-th Fibonacci number as a float, using C-typed locals.

    ``n`` is converted to a C ``int`` on entry; the loop counter and the
    two running values are C variables, so the loop runs without Python
    object overhead.
    """
    cdef int i
    # double rather than float: a 32-bit float cannot represent integers
    # above 2**24 exactly, whereas Python's own floats are 64-bit.
    cdef double a=0.0, b=1.0
    for i in range(n):
        a, b = a + b, a
    return a

In [None]:
# Result of the pure-Python version.
fib(10)

In [None]:
# Result of the Cython version, for comparison with fib(10).
cy_fib(10)

In [None]:
import timeit

n_times = 100000

# Total wall-clock time for n_times calls of each implementation.
# Each implementation is measured exactly once.
t_fib = timeit.timeit("fib(10)", setup="from __main__ import fib", number=n_times)
t_cyfib = timeit.timeit("cy_fib(10)", setup="from __main__ import cy_fib", number=n_times)

# Average time of a single call.
t_fib_unit = t_fib / n_times
t_cyfib_unit = t_cyfib / n_times

print(" Python version took: {} sec\n Cython version took: {} sec\n Cython is {:.0f}x faster"
      .format(t_fib, t_cyfib, t_fib/t_cyfib))

print("\n Python version 1 run took: {} sec\n Cython version 1 run took: {} sec\n Cython is {:.0f}x faster"
      .format(t_fib_unit, t_cyfib_unit, t_fib_unit/t_cyfib_unit))

### Speeding up edit distance



##### Exercise: fill in `cy_create_memoization_table` so that it returns the filled-in dynamic-programming matrix used to compute the edit distance

In [None]:
%%cython --annotate

import numpy as np

def cy_create_memoization_table(str X, str Y):
    """Build the edit-distance table for X and Y with C-typed loop variables.

    Entry ``[i, j]`` is the number of unit-cost deletions, insertions and
    substitutions needed to turn the first ``i`` characters of ``X`` into
    the first ``j`` characters of ``Y``.

    Returns a 2-D int typed memoryview over an ``int32`` NumPy array of
    shape ``(len(X) + 1, len(Y) + 1)``.
    """
    cdef int row, col, removal, insertion, replacement, mismatch
    cdef int n_rows = len(X) + 1
    cdef int n_cols = len(Y) + 1
    cdef int [:, :] table = np.zeros((n_rows, n_cols), dtype=np.int32)

    # Border cells: one edit per character to reach or leave the empty prefix.
    for col in range(n_cols):
        table[0, col] = col

    for row in range(n_rows):
        table[row, 0] = row

    for row in range(1, n_rows):
        for col in range(1, n_cols):
            # A diagonal move is free when the two characters match.
            mismatch = 0 if X[row-1] == Y[col-1] else 1

            removal = table[row-1, col] + 1
            insertion = table[row, col-1] + 1
            replacement = table[row-1, col-1] + mismatch

            table[row, col] = min(removal, insertion, replacement)

    return table

In [None]:
# Table from the pure-Python implementation, kept for comparison.
D1 = create_memoization_table(x,y)
D1

In [None]:
# The Cython function returns a typed memoryview; np.asarray turns it into a NumPy array.
D2 = cy_create_memoization_table(x,y)
D2 = np.asarray(D2)

In [None]:
# Display the Cython-built table so it can be compared with D1.
D2

In [None]:
# Total time for 5000 runs of the pure-Python table builder on a fixed word pair.
t_create_memoization_table = timeit.timeit("x='exponential'; y='polynomial'; create_memoization_table(x,y)",
                                           setup="import numpy as np; from __main__ import create_memoization_table",
                                           number=5000)

In [None]:
# Total time for 5000 runs of the Cython table builder on the same word pair.
t_cy_create_memoization_table = timeit.timeit("x='exponential'; y='polynomial'; cy_create_memoization_table(x,y)",
                                              setup="from __main__ import cy_create_memoization_table",
                                              number=5000)

In [None]:
# Total time for 5000 runs of nltk.edit_distance on the same word pair.
t_nltk = timeit.timeit("x='exponential'; y='polynomial'; nltk.edit_distance(x,y)",
                        setup="import nltk ",
                        number=5000)

In [None]:
# Report the three total timings and the speed-up of the Cython version
# relative to the pure-Python and nltk versions (ratios of total times).
print(""" 
      Python version took: {} sec
      Cython version took: {} sec
      nltk   version took: {} sec
      Cython is {:.0f}x faster than python
      Cython is {:.0f}x faster than nltk
      """\
      .format(t_create_memoization_table, 
              t_cy_create_memoization_table,
              t_nltk, 
              t_create_memoization_table/t_cy_create_memoization_table,
              t_nltk/t_cy_create_memoization_table))


In [None]:
def edit_distance(x,y):
    """Return the edit distance between x and y.

    The value is the bottom-right entry of the table built by
    ``cy_create_memoization_table``.
    """
    table = cy_create_memoization_table(x, y)
    return table[-1, -1]

### Return to the experiment where we computed closest word


##### Exercise: return the last entry of the dynamic-programming matrix, which contains the edit distance

In [None]:
import editdistance

In [None]:
# Reload the NLTK word list and show how many words it contains.
words = nltk.corpus.words.words()
len(words)

In [None]:
%%time
mistake = "drauing"
# Edit distance from the misspelling to every word, using nltk.edit_distance.
distances = [nltk.edit_distance(mistake, word) for word in words]

# np.argmin returns the index of the first smallest distance.
print("\nthe closest word is", words[np.argmin(distances)])

In [None]:
%%time
mistake = "drauing"
# Edit distance from the misspelling to every word, using the editdistance package.
cy_distances = [editdistance.eval(mistake, word) for word in words]

# np.argmin returns the index of the first smallest distance.
print("\nthe closest word is", words[np.argmin(cy_distances)])

In [None]:
%%time
mistake = "drauing"
# Edit distance from the misspelling to every word, using this notebook's
# Cython-backed edit_distance.
cy_distances = [edit_distance(mistake, word) for word in words]

# np.argmin returns the index of the first smallest distance.
print("\nthe closest word is", words[np.argmin(cy_distances)])

In [None]:
# Check that the nltk distances match the most recently computed cy_distances.
distances == cy_distances

In [None]:
# Sanity check on identical strings: a distance of 0 is expected from both.
editdistance.eval("hi", "hi"), edit_distance("hi","hi")

In [None]:
# Sanity check on strings that differ in one character: a distance of 1 is expected from both.
editdistance.eval("hi", "ho"), edit_distance("hi","ho")

##### Interesting material on string similarities

Approximate string matching:

https://medium.com/@wolfgarbe/fast-approximate-string-matching-with-large-edit-distances-in-big-data-2015-9174a0968c0b

Levenshtein distance using a trie:

http://stevehanov.ca/blog/?id=114

About Jaccard distance:

https://python.gotrained.com/nltk-edit-distance-jaccard-distance/