In [2]:
import pandas as pd
from glob import glob
from utils.graphemeParser import GraphemeParser
import numpy as np
from utils.clean_word import clean_word

In [3]:
path = 'unique_words/unique_words_anondoB.xlsx'
df = pd.read_excel(path)

In [4]:
word_list  = list(df.word.values)

In [5]:
word_list[:5]

['অ', 'অঁসম্বল', 'অংক', 'অংশ', 'অংশই']

In [6]:
path_class_map = 'utils/class_map.csv'
df_map = pd.read_csv(path_class_map, encoding = 'utf-8')
df_root = df_map.groupby('component_type').get_group('grapheme_root')
df_root.set_index('label', inplace = True)
df_root.drop(columns=['component_type'], inplace = True)
df_vd = df_map.groupby('component_type').get_group('vowel_diacritic')
df_vd.set_index('label', inplace = True)
df_vd.drop(columns=['component_type'], inplace = True)
df_cd = df_map.groupby('component_type').get_group('consonant_diacritic')
df_cd.set_index('label', inplace = True)
df_cd.drop(columns=['component_type'], inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [7]:
graphemeparser = GraphemeParser(df_root.values, df_vd.values, df_cd.values)

In [11]:
for word in word_list:
    if type(word) is str:
        word = clean_word(word)
        graphemes = graphemeparser.word2grapheme(word)
        comps_all = [graphemeparser.grapheme2component(grp) for grp in graphemes]

        grapheme_rec_all = [graphemeparser.comp2grapheme(comp)  for comp in comps_all]

        word_rec = graphemeparser.grapheme2word(grapheme_rec_all)

    #     if True:

        if word != word_rec:
            # print(repr('শ্য়পূ'))

            print('==================================================================')

            print('word: {}, unicodes: {}'.format(word, [uc for uc in word]))
            print('word_reconstructed: {}, unicodes: {}'.format(word_rec,[uc for uc in word_rec]))
            print('extracted graphemes', graphemes)
            for grp in graphemes:
                print('grapheme: {}, unicodes: {}, components: {}'.format(
                    grp,
                    [uc for uc in grp],
                    graphemeparser.grapheme2component(grp)
                ))
    #             break
            # print('reconstructed')

word: অনুগ্র্হ, unicodes: ['অ', 'ন', 'ু', 'গ', '্', 'র', '্', 'হ']
word_reconstructed: অনুগ্হ্র, unicodes: ['অ', 'ন', 'ু', 'গ', '্', 'হ', '্', 'র']
extracted graphemes ['অ', 'নু', 'গ্র্হ']
grapheme: অ, unicodes: ['অ'], components: ('অ', '', '')
grapheme: নু, unicodes: ['ন', 'ু'], components: ('ন', 'ু', '')
grapheme: গ্র্হ, unicodes: ['গ', '্', 'র', '্', 'হ'], components: ('গ্হ', '', '্র')
word: অন্ধ্র্প্রদেশের, unicodes: ['অ', 'ন', '্', 'ধ', '্', 'র', '্', 'প', '্', 'র', 'দ', 'ে', 'শ', 'ে', 'র']
word_reconstructed: অন্ধ্প্রদেশের, unicodes: ['অ', 'ন', '্', 'ধ', '্', 'প', '্', 'র', 'দ', 'ে', 'শ', 'ে', 'র']
extracted graphemes ['অ', 'ন্ধ্র্প্র', 'দে', 'শে', 'র']
grapheme: অ, unicodes: ['অ'], components: ('অ', '', '')
grapheme: ন্ধ্র্প্র, unicodes: ['ন', '্', 'ধ', '্', 'র', '্', 'প', '্', 'র'], components: ('ন্ধ্প', '', '্র্র')
grapheme: দে, unicodes: ['দ', 'ে'], components: ('দ', 'ে', '')
grapheme: শে, unicodes: ['শ', 'ে'], components: ('শ', 'ে', '')
grapheme: র, unicodes: ['র'], componen

word: পারিপা্র্শ্বিক, unicodes: ['প', 'া', 'র', 'ি', 'প', 'া', '্', 'র', '্', 'শ', '্', 'ব', 'ি', 'ক']
word_reconstructed: পারিপা্শ্ব্রিক, unicodes: ['প', 'া', 'র', 'ি', 'প', 'া', '্', 'শ', '্', 'ব', '্', 'র', 'ি', 'ক']
extracted graphemes ['পা', 'রি', 'পা', '্র্শ্বি', 'ক']
grapheme: পা, unicodes: ['প', 'া'], components: ('প', 'া', '')
grapheme: রি, unicodes: ['র', 'ি'], components: ('র', 'ি', '')
grapheme: পা, unicodes: ['প', 'া'], components: ('প', 'া', '')
grapheme: ্র্শ্বি, unicodes: ['্', 'র', '্', 'শ', '্', 'ব', 'ি'], components: ('্শ্ব', 'ি', '্র')
grapheme: ক, unicodes: ['ক'], components: ('ক', '', '')
word: পূ্র্ণ, unicodes: ['প', 'ূ', '্', 'র', '্', 'ণ']
word_reconstructed: পূ্ণ্র, unicodes: ['প', 'ূ', '্', 'ণ', '্', 'র']
extracted graphemes ['পূ', '্র্ণ']
grapheme: পূ, unicodes: ['প', 'ূ'], components: ('প', 'ূ', '')
grapheme: ্র্ণ, unicodes: ['্', 'র', '্', 'ণ'], components: ('্ণ', '', '্র')
word: পূ্র্বাভাস, unicodes: ['প', 'ূ', '্', 'র', '্', 'ব', 'া', 'ভ', 'া', 'স']
word

word: র্যাঁবো, unicodes: ['র', '্', 'য', 'া', 'ঁ', 'ব', 'ো']
word_reconstructed: যাঁবো, unicodes: ['য', 'া', 'ঁ', 'ব', 'ো']
extracted graphemes ['র্যাঁ', 'বো']
grapheme: র্যাঁ, unicodes: ['র', '্', 'য', 'া', 'ঁ'], components: ('য', 'া', 'র্ঁ')
grapheme: বো, unicodes: ['ব', 'ো'], components: ('ব', 'ো', '')
Out of dictionary symbol encountered:  ঽ
word: শাশ্বতোঽয়ং, unicodes: ['শ', 'া', 'শ', '্', 'ব', 'ত', 'ো', 'ঽ', 'য়', 'ং']
word_reconstructed: শাশ্বতোয়ং, unicodes: ['শ', 'া', 'শ', '্', 'ব', 'ত', 'ো', 'য়', 'ং']
extracted graphemes ['শা', 'শ্ব', 'তো', 'ঽয়', 'ং']
grapheme: শা, unicodes: ['শ', 'া'], components: ('শ', 'া', '')
grapheme: শ্ব, unicodes: ['শ', '্', 'ব'], components: ('শ্ব', '', '')
grapheme: তো, unicodes: ['ত', 'ো'], components: ('ত', 'ো', '')
Out of dictionary symbol encountered:  ঽ
grapheme: ঽয়, unicodes: ['ঽ', 'য়'], components: ('য়', '', '')
grapheme: ং, unicodes: ['ং'], components: ('ং', '', '')
word: শ্র্দ্ধায়, unicodes: ['শ', '্', 'র', '্', 'দ', '্', 'ধ', 'া', 'য়']
word_re