In [210]:
import unicodedata
import numpy as np
import pandas as pd

In [211]:
from IPython.display import display

In [212]:
def is_valid_unicode(character):
    try:
        unicodedata.name(character)
        return True
    except ValueError:
        return False

# All Kannada Letters

In [213]:
kannada_letters = [chr(codepoint) for codepoint in range(0x0C80, 0x0CFF + 1) if is_valid_unicode(chr(codepoint))]
print(kannada_letters)

['ಀ', 'ಁ', 'ಂ', 'ಃ', '಄', 'ಅ', 'ಆ', 'ಇ', 'ಈ', 'ಉ', 'ಊ', 'ಋ', 'ಌ', 'ಎ', 'ಏ', 'ಐ', 'ಒ', 'ಓ', 'ಔ', 'ಕ', 'ಖ', 'ಗ', 'ಘ', 'ಙ', 'ಚ', 'ಛ', 'ಜ', 'ಝ', 'ಞ', 'ಟ', 'ಠ', 'ಡ', 'ಢ', 'ಣ', 'ತ', 'ಥ', 'ದ', 'ಧ', 'ನ', 'ಪ', 'ಫ', 'ಬ', 'ಭ', 'ಮ', 'ಯ', 'ರ', 'ಱ', 'ಲ', 'ಳ', 'ವ', 'ಶ', 'ಷ', 'ಸ', 'ಹ', '಼', 'ಽ', 'ಾ', 'ಿ', 'ೀ', 'ು', 'ೂ', 'ೃ', 'ೄ', 'ೆ', 'ೇ', 'ೈ', 'ೊ', 'ೋ', 'ೌ', '್', 'ೕ', 'ೖ', 'ೞ', 'ೠ', 'ೡ', 'ೢ', 'ೣ', '೦', '೧', '೨', '೩', '೪', '೫', '೬', '೭', '೮', '೯', 'ೱ', 'ೲ']


In [214]:
for codepoint in kannada_letters:
    print(codepoint, unicodedata.name(codepoint))

ಀ KANNADA SIGN SPACING CANDRABINDU
ಁ KANNADA SIGN CANDRABINDU
ಂ KANNADA SIGN ANUSVARA
ಃ KANNADA SIGN VISARGA
಄ KANNADA SIGN SIDDHAM
ಅ KANNADA LETTER A
ಆ KANNADA LETTER AA
ಇ KANNADA LETTER I
ಈ KANNADA LETTER II
ಉ KANNADA LETTER U
ಊ KANNADA LETTER UU
ಋ KANNADA LETTER VOCALIC R
ಌ KANNADA LETTER VOCALIC L
ಎ KANNADA LETTER E
ಏ KANNADA LETTER EE
ಐ KANNADA LETTER AI
ಒ KANNADA LETTER O
ಓ KANNADA LETTER OO
ಔ KANNADA LETTER AU
ಕ KANNADA LETTER KA
ಖ KANNADA LETTER KHA
ಗ KANNADA LETTER GA
ಘ KANNADA LETTER GHA
ಙ KANNADA LETTER NGA
ಚ KANNADA LETTER CA
ಛ KANNADA LETTER CHA
ಜ KANNADA LETTER JA
ಝ KANNADA LETTER JHA
ಞ KANNADA LETTER NYA
ಟ KANNADA LETTER TTA
ಠ KANNADA LETTER TTHA
ಡ KANNADA LETTER DDA
ಢ KANNADA LETTER DDHA
ಣ KANNADA LETTER NNA
ತ KANNADA LETTER TA
ಥ KANNADA LETTER THA
ದ KANNADA LETTER DA
ಧ KANNADA LETTER DHA
ನ KANNADA LETTER NA
ಪ KANNADA LETTER PA
ಫ KANNADA LETTER PHA
ಬ KANNADA LETTER BA
ಭ KANNADA LETTER BHA
ಮ KANNADA LETTER MA
ಯ KANNADA LETTER YA
ರ KANNADA LETTER RA
ಱ KANNADA LETTER RRA
ಲ

# Random combinations of letters

# Unicode Level MED

In [215]:
def min_edit_distance(s: str, t: str) -> int:
    n = len(s)
    m = len(t)

    prev = [j for j in range(m+1)]
    curr = [0] * (m+1)
    
    table = []
    row_headings    = ['-'] + [st for st in s]
    column_headings = ['-'] + [st for st in t]

    for i in range(1, n+1):
        
        curr[0] = i
        for j in range(1, m+1):
            if s[i-1] == t[j-1]:
                curr[j] = prev[j-1]
            else:
                mn = min(1 + prev[j], 1 + curr[j-1])
                curr[j] = min(mn, 2 + prev[j-1])
        
        table.append(prev)
        prev = curr.copy()
    
    table.append(curr)

    return prev[m], (table, row_headings, column_headings)

In [216]:
words = [
    'ನಾನು',
    'ನನ್ನಾ',
    'ನಿನಗೆ',
    'ನಿನಗಾ',
    'ನೀನಾ',
    'ನೀನ್ನಾ',
]

In [217]:
print(words)
for i in words:
    for j in words:
        print(i, j ,min_edit_distance(i, j)[0])

['ನಾನು', 'ನನ್ನಾ', 'ನಿನಗೆ', 'ನಿನಗಾ', 'ನೀನಾ', 'ನೀನ್ನಾ']
ನಾನು ನಾನು 0
ನಾನು ನನ್ನಾ 5
ನಾನು ನಿನಗೆ 5
ನಾನು ನಿನಗಾ 5
ನಾನು ನೀನಾ 4
ನಾನು ನೀನ್ನಾ 6
ನನ್ನಾ ನಾನು 5
ನನ್ನಾ ನನ್ನಾ 0
ನನ್ನಾ ನಿನಗೆ 6
ನನ್ನಾ ನಿನಗಾ 4
ನನ್ನಾ ನೀನಾ 3
ನನ್ನಾ ನೀನ್ನಾ 1
ನಿನಗೆ ನಾನು 5
ನಿನಗೆ ನನ್ನಾ 6
ನಿನಗೆ ನಿನಗೆ 0
ನಿನಗೆ ನಿನಗಾ 2
ನಿನಗೆ ನೀನಾ 5
ನಿನಗೆ ನೀನ್ನಾ 7
ನಿನಗಾ ನಾನು 5
ನಿನಗಾ ನನ್ನಾ 4
ನಿನಗಾ ನಿನಗೆ 2
ನಿನಗಾ ನಿನಗಾ 0
ನಿನಗಾ ನೀನಾ 3
ನಿನಗಾ ನೀನ್ನಾ 5
ನೀನಾ ನಾನು 4
ನೀನಾ ನನ್ನಾ 3
ನೀನಾ ನಿನಗೆ 5
ನೀನಾ ನಿನಗಾ 3
ನೀನಾ ನೀನಾ 0
ನೀನಾ ನೀನ್ನಾ 2
ನೀನ್ನಾ ನಾನು 6
ನೀನ್ನಾ ನನ್ನಾ 1
ನೀನ್ನಾ ನಿನಗೆ 7
ನೀನ್ನಾ ನಿನಗಾ 5
ನೀನ್ನಾ ನೀನಾ 2
ನೀನ್ನಾ ನೀನ್ನಾ 0


In [218]:
first = 'ಕನ್ನಡ'
second = 'ಕನ್ನಡಕ'

In [219]:
FIRST_WORD = words[-1]
SECOND_WORD = words[-2]
FIRST_WORD, SECOND_WORD

('ನೀನ್ನಾ', 'ನೀನಾ')

In [220]:
MED, t = min_edit_distance(FIRST_WORD, SECOND_WORD)
MED

2

In [221]:
df = pd.DataFrame(t[0], index=t[1], columns=t[2])
df

Unnamed: 0,-,ನ,ೀ,ನ.1,ಾ
-,0,1,2,3,4
ನ,1,0,1,2,3
ೀ,2,1,0,1,2
ನ,3,2,1,0,1
್,4,3,2,1,2
ನ,5,4,3,2,3
ಾ,6,5,4,3,2


### One Optimal Path

In [222]:
def one_path(table):
    path = []

    i, j = len(table)-1, len(table[0])-1
    path.append((i, j))

    while(i!=0 and j!=0):
        values = [
                    table[i-1][j-1],
                    table[i-1][j],
                    table[i][j-1]
                ]
        min_index = values.index(min(values))


        if(min_index == 0):
            i, j = i-1, j-1
        elif(min_index == 1):
            i, j = i-1, j
        else:
            i, j = i, j-1

        path.append((i, j))
#     return path
    return list(reversed(path))
#     return list(reversed(path+ [(0,0)]))

In [223]:
def yield_characters(string):
    for i in string:
        yield i

In [224]:
def alignment(string, path):
#     print(path, string)
    iterator = yield_characters(string)
        
    align = []
    
    for i in range(len(path)-1):
        if(path[i]==path[i+1]):
            align.append('*')
        else:
            align.append(next(iterator))
#             print(align)

    return align

In [225]:
pprint(t[0])

[[0, 1, 2, 3, 4],
 [1, 0, 1, 2, 3],
 [2, 1, 0, 1, 2],
 [3, 2, 1, 0, 1],
 [4, 3, 2, 1, 2],
 [5, 4, 3, 2, 3],
 [6, 5, 4, 3, 2]]


In [226]:
path = one_path(t[0]) 

first_string_path, second_string_path = zip(*path)
print(first_string_path)

first_align = alignment(FIRST_WORD, first_string_path)
second_align = alignment(SECOND_WORD, second_string_path)

df = pd.DataFrame([first_align, second_align])
df

(0, 1, 2, 3, 4, 5, 6)


Unnamed: 0,0,1,2,3,4,5
0,ನ,ೀ,ನ,್,ನ,ಾ
1,ನ,ೀ,ನ,*,*,ಾ


In [227]:
for first_word in words:
    for second_word in words:
        MED, t = min_edit_distance(first_word, second_word)
        path = one_path(t[0]) 

        first_string_path, second_string_path = zip(*path)
#         print(first_string_path)

        first_align = alignment(first_word, first_string_path)
        second_align = alignment(second_word, second_string_path)

        df = pd.DataFrame([first_align, second_align])
        display(df)

Unnamed: 0,0,1,2,3
0,ನ,ಾ,ನ,ು
1,ನ,ಾ,ನ,ು


Unnamed: 0,0,1,2,3,4,5
0,ನ,ಾ,ನ,*,*,ು
1,ನ,*,ನ,್,ನ,ಾ


Unnamed: 0,0,1,2,3,4
0,ನ,ಾ,ನ,*,ು
1,ನ,ಿ,ನ,ಗ,ೆ


Unnamed: 0,0,1,2,3,4
0,ನ,ಾ,ನ,*,ು
1,ನ,ಿ,ನ,ಗ,ಾ


Unnamed: 0,0,1,2,3
0,ನ,ಾ,ನ,ು
1,ನ,ೀ,ನ,ಾ


Unnamed: 0,0,1,2,3,4,5
0,ನ,ಾ,ನ,*,*,ು
1,ನ,ೀ,ನ,್,ನ,ಾ


Unnamed: 0,0,1,2,3,4,5
0,ನ,*,ನ,್,ನ,ಾ
1,ನ,ಾ,ನ,*,*,ು


Unnamed: 0,0,1,2,3,4
0,ನ,ನ,್,ನ,ಾ
1,ನ,ನ,್,ನ,ಾ


Unnamed: 0,0,1,2,3,4,5
0,ನ,*,ನ,್,ನ,ಾ
1,ನ,ಿ,ನ,*,ಗ,ೆ


Unnamed: 0,0,1,2,3,4,5
0,ನ,*,ನ,್,ನ,ಾ
1,ನ,ಿ,ನ,*,ಗ,ಾ


Unnamed: 0,0,1,2,3,4,5
0,ನ,*,ನ,್,ನ,ಾ
1,ನ,ೀ,ನ,*,*,ಾ


Unnamed: 0,0,1,2,3,4,5
0,ನ,*,ನ,್,ನ,ಾ
1,ನ,ೀ,ನ,್,ನ,ಾ


Unnamed: 0,0,1,2,3,4
0,ನ,ಿ,ನ,ಗ,ೆ
1,ನ,ಾ,ನ,*,ು


Unnamed: 0,0,1,2,3,4,5
0,ನ,ಿ,ನ,*,ಗ,ೆ
1,ನ,*,ನ,್,ನ,ಾ


Unnamed: 0,0,1,2,3,4
0,ನ,ಿ,ನ,ಗ,ೆ
1,ನ,ಿ,ನ,ಗ,ೆ


Unnamed: 0,0,1,2,3,4
0,ನ,ಿ,ನ,ಗ,ೆ
1,ನ,ಿ,ನ,ಗ,ಾ


Unnamed: 0,0,1,2,3,4
0,ನ,ಿ,ನ,ಗ,ೆ
1,ನ,ೀ,ನ,*,ಾ


Unnamed: 0,0,1,2,3,4,5
0,ನ,ಿ,ನ,*,ಗ,ೆ
1,ನ,ೀ,ನ,್,ನ,ಾ


Unnamed: 0,0,1,2,3,4
0,ನ,ಿ,ನ,ಗ,ಾ
1,ನ,ಾ,ನ,*,ು


Unnamed: 0,0,1,2,3,4,5
0,ನ,ಿ,ನ,*,ಗ,ಾ
1,ನ,*,ನ,್,ನ,ಾ


Unnamed: 0,0,1,2,3,4
0,ನ,ಿ,ನ,ಗ,ಾ
1,ನ,ಿ,ನ,ಗ,ೆ


Unnamed: 0,0,1,2,3,4
0,ನ,ಿ,ನ,ಗ,ಾ
1,ನ,ಿ,ನ,ಗ,ಾ


Unnamed: 0,0,1,2,3,4
0,ನ,ಿ,ನ,ಗ,ಾ
1,ನ,ೀ,ನ,*,ಾ


Unnamed: 0,0,1,2,3,4,5
0,ನ,ಿ,ನ,*,ಗ,ಾ
1,ನ,ೀ,ನ,್,ನ,ಾ


Unnamed: 0,0,1,2,3
0,ನ,ೀ,ನ,ಾ
1,ನ,ಾ,ನ,ು


Unnamed: 0,0,1,2,3,4,5
0,ನ,ೀ,ನ,*,*,ಾ
1,ನ,*,ನ,್,ನ,ಾ


Unnamed: 0,0,1,2,3,4
0,ನ,ೀ,ನ,*,ಾ
1,ನ,ಿ,ನ,ಗ,ೆ


Unnamed: 0,0,1,2,3,4
0,ನ,ೀ,ನ,*,ಾ
1,ನ,ಿ,ನ,ಗ,ಾ


Unnamed: 0,0,1,2,3
0,ನ,ೀ,ನ,ಾ
1,ನ,ೀ,ನ,ಾ


Unnamed: 0,0,1,2,3,4,5
0,ನ,ೀ,ನ,*,*,ಾ
1,ನ,ೀ,ನ,್,ನ,ಾ


Unnamed: 0,0,1,2,3,4,5
0,ನ,ೀ,ನ,್,ನ,ಾ
1,ನ,ಾ,ನ,*,*,ು


Unnamed: 0,0,1,2,3,4,5
0,ನ,ೀ,ನ,್,ನ,ಾ
1,ನ,*,ನ,್,ನ,ಾ


Unnamed: 0,0,1,2,3,4,5
0,ನ,ೀ,ನ,್,ನ,ಾ
1,ನ,ಿ,ನ,*,ಗ,ೆ


Unnamed: 0,0,1,2,3,4,5
0,ನ,ೀ,ನ,್,ನ,ಾ
1,ನ,ಿ,ನ,*,ಗ,ಾ


Unnamed: 0,0,1,2,3,4,5
0,ನ,ೀ,ನ,್,ನ,ಾ
1,ನ,ೀ,ನ,*,*,ಾ


Unnamed: 0,0,1,2,3,4,5
0,ನ,ೀ,ನ,್,ನ,ಾ
1,ನ,ೀ,ನ,್,ನ,ಾ


### All Optimal Paths

# Character Level MED

In [228]:
dheerga_swaras = \
    list(range(3201, 3201+3)) + \
    [3260] + \
    list(range(3262, 3277)) + \
    list(range(3278, 3287)) + \
    list(range(3298, 3300))

virama = 3277

In [229]:
from pprint import pprint
pprint([(ch, chr(ch)) for ch in dheerga_swaras if is_valid_unicode(chr(ch))])

[(3201, 'ಁ'),
 (3202, 'ಂ'),
 (3203, 'ಃ'),
 (3260, '಼'),
 (3262, 'ಾ'),
 (3263, 'ಿ'),
 (3264, 'ೀ'),
 (3265, 'ು'),
 (3266, 'ೂ'),
 (3267, 'ೃ'),
 (3268, 'ೄ'),
 (3270, 'ೆ'),
 (3271, 'ೇ'),
 (3272, 'ೈ'),
 (3274, 'ೊ'),
 (3275, 'ೋ'),
 (3276, 'ೌ'),
 (3285, 'ೕ'),
 (3286, 'ೖ'),
 (3298, 'ೢ'),
 (3299, 'ೣ')]


In [230]:
a = 'ನಾ'
b = 'ನ್ನಾ'
c = 'ನಿನಗೆನ್ನಾ'
for i in a:
    print(i)

print()

for i in b:
    print(i)

ನ
ಾ

ನ
್
ನ
ಾ


### Unicode to Character

In [231]:
def yield_characters(string):
    for i in string:
        yield i


In [232]:
def u2c(string):
    char_level_text = []
    current = []
    n = len(string)
    
    i=0
    while(i<n):
        character = string[i]
        current = [character]
        
        i+=1
        
        if(i-1<n and ord(string[i])==virama):
            current.append((string[i], string[i+1]))
            i+=2
        
        if(i<n and ord(string[i]) in dheerga_swaras):
            current.append(string[i])
            i+=1
        
        print(current)
        char_level_text.append(current)
    
    return char_level_text

In [355]:
def min_edit_distance_character_level(s: str, t: str) -> int:
    n = len(s)
    m = len(t)

    prev = [j for j in range(m+1)]
    curr = [0] * (m+1)
    
    table = []
    row_headings    = ['-'] + [st for st in s]
    column_headings = ['-'] + [st for st in t]

    for i in range(1, n+1):
        
        curr[0] = i
        for j in range(1, m+1):
#             print('Currently comparing:')
            
            if s[i-1] == t[j-1]:
                curr[j] = prev[j-1]
            else:
                cost_from_previous_row = len(s[i-1])
                cost_from_current_row  = len(t[j-1])
                
                mn = min(1 + prev[j], 1 + curr[j-1])
                curr[j] = min(mn, 2 + prev[j-1])
        
        table.append(prev)
        prev = curr.copy()
    
    table.append(curr)

    return prev[m], (table, row_headings, column_headings)

In [357]:
min_edit_distance_character_level(u2c(FIRST_WORD), u2c(SECOND_WORD))

['ನ', 'ಾ']
['ನ', 'ು']
['ನ']
['ನ', ('್', 'ನ'), 'ಾ']


(4,
 ([[0, 1, 2], [1, 2, 3], [2, 3, 4]],
  ['-', ['ನ', 'ಾ'], ['ನ', 'ು']],
  ['-', ['ನ'], ['ನ', ('್', 'ನ'), 'ಾ']]))

In [233]:
for i in c:
    print(i)

ನ
ಿ
ನ
ಗ
ೆ
ನ
್
ನ
ಾ


In [234]:
c = 'ನಿನಗೆನ್ನಾ'
char_level_text = u2c(c)

['ನ', 'ಿ']
['ನ']
['ಗ', 'ೆ']
['ನ', ('್', 'ನ'), 'ಾ']


In [235]:
char_level_text

[['ನ', 'ಿ'], ['ನ'], ['ಗ', 'ೆ'], ['ನ', ('್', 'ನ'), 'ಾ']]

In [349]:
def min_edit_distance_char_fine(s: str, t: str) -> int:
    n = len(s)
    m = len(t)

    prev = [j for j in range(m+1)]
    curr = [0] * (m+1)
    
    table = []
    row_headings    = ['-'] + [st for st in s]
    column_headings = ['-'] + [st for st in t]

    for i in range(1, n+1):
        
        curr[0] = i
        for j in range(1, m+1):
#             print('Currently comparing:')
            
            if s[i-1] == t[j-1]:
                curr[j] = prev[j-1]
            else:
                cost_from_previous_row = len(s[i-1])
                cost_from_current_row  = len(t[j-1])
                
                mn = min(cost_from_previous_row + prev[j], cost_from_current_row + curr[j-1])
                cost_of_replacing = len(s[i-1]) + len(t[j-1]) - 2 * len(set(s[i-1]) & set(t[j-1]))
                curr[j] = min(mn, cost_of_replacing + prev[j-1])
                
                print()
                print(s[i-1], t[j-1])
                print('Intersection', list(set(s[i-1]) & set(t[j-1])))
                print('Inside else:', '\ncost_from_previous_row', cost_from_previous_row, 
                      '\ncost_from_current_row', cost_from_current_row,
                      '\ncost_of_replacing', cost_of_replacing)
        
        table.append(prev)
        prev = curr.copy()
    
    table.append(curr)

    return prev[m], (table, row_headings, column_headings)

In [350]:
FIRST_WORD = words[0]
SECOND_WORD = words[1]

In [351]:
MED2, t2 = min_edit_distance_char(u2c(FIRST_WORD), u2c(SECOND_WORD))
MED1, t1 = min_edit_distance(FIRST_WORD, SECOND_WORD)

['ನ', 'ಾ']
['ನ', 'ು']
['ನ']
['ನ', ('್', 'ನ'), 'ಾ']

['ನ', 'ಾ'] ['ನ']
Intersection ['ನ']
Inside else: 
cost_from_previous_row 2 
cost_from_current_row 1 
cost_of_replacing 1

['ನ', 'ಾ'] ['ನ', ('್', 'ನ'), 'ಾ']
Intersection ['ಾ', 'ನ']
Inside else: 
cost_from_previous_row 2 
cost_from_current_row 3 
cost_of_replacing 1

['ನ', 'ು'] ['ನ']
Intersection ['ನ']
Inside else: 
cost_from_previous_row 2 
cost_from_current_row 1 
cost_of_replacing 1

['ನ', 'ು'] ['ನ', ('್', 'ನ'), 'ಾ']
Intersection ['ನ']
Inside else: 
cost_from_previous_row 2 
cost_from_current_row 3 
cost_of_replacing 3


In [352]:
FIRST_WORD

'ನಾನು'

In [341]:
SECOND_WORD

'ನನ್ನಾ'

In [342]:
indices = []
print(t2[1])
for i in t2[1]:
    temp = ''
    for j in i:
        if isinstance(j, tuple):
            for k in j:
                temp += k
        else:
            temp += j
    indices.append(temp)
indices

['-', ['ನ', 'ಾ'], ['ನ', 'ು']]


['-', 'ನಾ', 'ನು']

In [343]:
col_indices = []
print(t2[2])
for i in t2[2]:
    temp = ''
    for j in i:
        if isinstance(j, tuple):
            for k in j:
                temp += k
        else:
            temp += j
    col_indices.append(temp)
col_indices

['-', ['ನ'], ['ನ', ('್', 'ನ'), 'ಾ']]


['-', 'ನ', 'ನ್ನಾ']

In [344]:
df = pd.DataFrame(t2[0], index=indices, columns=col_indices)
df

Unnamed: 0,-,ನ,ನ್ನಾ
-,0,1,2
ನಾ,1,1,2
ನು,2,2,4


In [345]:
for i in t[0]:
    print(i)

[0, 1, 2, 3, 4, 5, 6]
[1, 0, 1, 2, 3, 4, 5]
[2, 1, 0, 1, 2, 3, 4]
[3, 2, 1, 0, 1, 2, 3]
[4, 3, 2, 1, 0, 1, 2]
[5, 4, 3, 2, 1, 0, 1]
[6, 5, 4, 3, 2, 1, 0]


In [346]:
FIRST_WORD, SECOND_WORD

('ನಾನು', 'ನನ್ನಾ')

In [347]:
list1 = ['ನ', 'ೀ'] 
list2 =['ನ', 'ಾ']

intersection = list(set(list1) & set(list2))
print(intersection)

['ನ']


In [348]:
len(set(list2) & set(list1))

1