In [13]:
import pandas as pd
from glob import glob
from utils.graphemeParser import GraphemeParser
import numpy as np
from utils.wordCleaner import WordCleaner

In [14]:
path = 'unique_words/unique_words_anondoB.xlsx'
df = pd.read_excel(path)

In [15]:
word_list  = list(df.word.values)

In [16]:
word_list[:5]

['অ', 'অঁসম্বল', 'অংক', 'অংশ', 'অংশই']

In [17]:
path_class_map = 'utils/class_map.csv'
df_map = pd.read_csv(path_class_map, encoding = 'utf-8')
df_root = df_map.groupby('component_type').get_group('grapheme_root')
df_root.set_index('label', inplace = True)
df_root.drop(columns=['component_type'], inplace = True)
df_vd = df_map.groupby('component_type').get_group('vowel_diacritic')
df_vd.set_index('label', inplace = True)
df_vd.drop(columns=['component_type'], inplace = True)
df_cd = df_map.groupby('component_type').get_group('consonant_diacritic')
df_cd.set_index('label', inplace = True)
df_cd.drop(columns=['component_type'], inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [18]:
graphemeparser = GraphemeParser(df_root.values, df_vd.values, df_cd.values)

In [19]:
wordCleaner = WordCleaner()

In [21]:
for word in word_list:
    if type(word) is str:
        word = wordCleaner.clean(word)
        graphemes = graphemeparser.word2grapheme(word)
        comps_all = [graphemeparser.grapheme2component(grp) for grp in graphemes]

        grapheme_rec_all = [graphemeparser.comp2grapheme(comp)  for comp in comps_all]

        word_rec = graphemeparser.grapheme2word(grapheme_rec_all)

    #     if True:

        if word != word_rec:
            # print(repr('শ্য়পূ'))

            print('==================================================================')

            print('word: {}, unicodes: {}'.format(word, [uc for uc in word]))
            print('word_reconstructed: {}, unicodes: {}'.format(word_rec,[uc for uc in word_rec]))
            print('extracted graphemes', graphemes)
            for grp in graphemes:
                print('grapheme: {}, unicodes: {}, components: {}'.format(
                    grp,
                    [uc for uc in grp],
                    graphemeparser.grapheme2component(grp)
                ))
    #             break
            # print('reconstructed')

word: অগ্, unicodes: ['অ', 'গ', '্']
word_reconstructed: অ, unicodes: ['অ']
extracted graphemes ['অ']
grapheme: অ, unicodes: ['অ'], components: ('অ', '', '')
word: অজানা্, unicodes: ['অ', 'জ', 'া', 'ন', 'া', '্']
word_reconstructed: অজানা, unicodes: ['অ', 'জ', 'া', 'ন', 'া']
extracted graphemes ['অ', 'জা', 'না']
grapheme: অ, unicodes: ['অ'], components: ('অ', '', '')
grapheme: জা, unicodes: ['জ', 'া'], components: ('জ', 'া', '')
grapheme: না, unicodes: ['ন', 'া'], components: ('ন', 'া', '')
word: অঞ্, unicodes: ['অ', 'ঞ', '্']
word_reconstructed: অ, unicodes: ['অ']
extracted graphemes ['অ']
grapheme: অ, unicodes: ['অ'], components: ('অ', '', '')
word: অদ্বৈতম্, unicodes: ['অ', 'দ', '্', 'ব', 'ৈ', 'ত', 'ম', '্']
word_reconstructed: অদ্বৈত, unicodes: ['অ', 'দ', '্', 'ব', 'ৈ', 'ত']
extracted graphemes ['অ', 'দ্বৈ', 'ত']
grapheme: অ, unicodes: ['অ'], components: ('অ', '', '')
grapheme: দ্বৈ, unicodes: ['দ', '্', 'ব', 'ৈ'], components: ('দ্ব', 'ৈ', '')
grapheme: ত, unicodes: ['ত'], componen

  elif word[i] in list(self.roots) + ['়']:


word: অর্থাত্, unicodes: ['অ', 'র', '্', 'থ', 'া', 'ত', '্']
word_reconstructed: অর্থা, unicodes: ['অ', 'র', '্', 'থ', 'া']
extracted graphemes ['অ', 'র্থা']
grapheme: অ, unicodes: ['অ'], components: ('অ', '', '')
grapheme: র্থা, unicodes: ['র', '্', 'থ', 'া'], components: ('থ', 'া', 'র্')
word: অসুস্থ্, unicodes: ['অ', 'স', 'ু', 'স', '্', 'থ', '্']
word_reconstructed: অসু, unicodes: ['অ', 'স', 'ু']
extracted graphemes ['অ', 'সু']
grapheme: অ, unicodes: ['অ'], components: ('অ', '', '')
grapheme: সু, unicodes: ['স', 'ু'], components: ('স', 'ু', '')
word: অস্ত্, unicodes: ['অ', 'স', '্', 'ত', '্']
word_reconstructed: অ, unicodes: ['অ']
extracted graphemes ['অ']
grapheme: অ, unicodes: ['অ'], components: ('অ', '', '')
word: অ্যান্, unicodes: ['অ', '্', 'য', 'া', 'ন', '্']
word_reconstructed: অ্যা, unicodes: ['অ', '্', 'য', 'া']
extracted graphemes ['অ্যা']
grapheme: অ্যা, unicodes: ['অ', '্', 'য', 'া'], components: ('অ', 'া', '্য')
word: অ্যারোসেটিকস্, unicodes: ['অ', '্', 'য', 'া', 'র', '

word: একাগ্, unicodes: ['এ', 'ক', 'া', 'গ', '্']
word_reconstructed: একা, unicodes: ['এ', 'ক', 'া']
extracted graphemes ['এ', 'কা']
grapheme: এ, unicodes: ['এ'], components: ('এ', '', '')
grapheme: কা, unicodes: ['ক', 'া'], components: ('ক', 'া', '')
word: এক্সপ্লোরেশনস্, unicodes: ['এ', 'ক', '্', 'স', 'প', '্', 'ল', 'ো', 'র', 'ে', 'শ', 'ন', 'স', '্']
word_reconstructed: এক্সপ্লোরেশন, unicodes: ['এ', 'ক', '্', 'স', 'প', '্', 'ল', 'ো', 'র', 'ে', 'শ', 'ন']
extracted graphemes ['এ', 'ক্স', 'প্লো', 'রে', 'শ', 'ন']
grapheme: এ, unicodes: ['এ'], components: ('এ', '', '')
grapheme: ক্স, unicodes: ['ক', '্', 'স'], components: ('ক্স', '', '')
grapheme: প্লো, unicodes: ['প', '্', 'ল', 'ো'], components: ('প্ল', 'ো', '')
grapheme: রে, unicodes: ['র', 'ে'], components: ('র', 'ে', '')
grapheme: শ, unicodes: ['শ'], components: ('শ', '', '')
grapheme: ন, unicodes: ['ন'], components: ('ন', '', '')
word: এল্, unicodes: ['এ', 'ল', '্']
word_reconstructed: এ, unicodes: ['এ']
extracted graphemes ['এ']
grap

word: কিমাশ্চর্যম্, unicodes: ['ক', 'ি', 'ম', 'া', 'শ', '্', 'চ', 'র', '্', 'য', 'ম', '্']
word_reconstructed: কিমাশ্চর্য, unicodes: ['ক', 'ি', 'ম', 'া', 'শ', '্', 'চ', 'র', '্', 'য']
extracted graphemes ['কি', 'মা', 'শ্চ', 'র্য']
grapheme: কি, unicodes: ['ক', 'ি'], components: ('ক', 'ি', '')
grapheme: মা, unicodes: ['ম', 'া'], components: ('ম', 'া', '')
grapheme: শ্চ, unicodes: ['শ', '্', 'চ'], components: ('শ্চ', '', '')
grapheme: র্য, unicodes: ['র', '্', 'য'], components: ('য', '', 'র্')
word: কিম্, unicodes: ['ক', 'ি', 'ম', '্']
word_reconstructed: কি, unicodes: ['ক', 'ি']
extracted graphemes ['কি']
grapheme: কি, unicodes: ['ক', 'ি'], components: ('ক', 'ি', '')
word: কুন্ড্রম্, unicodes: ['ক', 'ু', 'ন', '্', 'ড', '্', 'র', 'ম', '্']
word_reconstructed: কুন্ড্র, unicodes: ['ক', 'ু', 'ন', '্', 'ড', '্', 'র']
extracted graphemes ['কু', 'ন্ড্র']
grapheme: কু, unicodes: ['ক', 'ু'], components: ('ক', 'ু', '')
grapheme: ন্ড্র, unicodes: ['ন', '্', 'ড', '্', 'র'], components: ('ন্ড', '', 

KeyboardInterrupt: 