In [37]:
import sys
sys.path.append('..')
import tkseem as tk

In [38]:
origianl_morph = tk.MorphologicalTokenizer()
origianl_morph.train()

Training MorphologicalTokenizer ...


In [139]:
class ModMorph(tk.MorphologicalTokenizer):
    def _split_word(self, word):
        """Split a word into a specific number of sub-words

        Args:
            word (str): word input
            number_of_subwords (int): number of subtokens to generate from the word 
        
        Returns:
            list: list of subwords 
        """
        def _split(_word):
            if not _word:
                return
            yield [f'##{_word}',]
            for i in range(1, len(_word)):
                for subwords in self._split_word(_word[i:]):
                    yield [f'##{word[:i]}'] +subwords
        subwords = _split(word)
        out_subwords = []
        for group in subwords:
            group[0] = group[0].replace('##','')
            out_subwords.append(group)
        return out_subwords

        
    def _tokenize_from_dict(self, text, freq_dict, cache=False, max_size=20):
        """Tokenize using frequency based approach given a dictionary

        Args:
            text (str): input string
            freq_dict (dict): frequency dictionary
            cache (bool, optional): faster approach. Defaults to False.
            max_size (int, optional): maximum word size. Defaults to 20.

        Returns:
            [type]: [description]
        """
        assert freq_dict
        tokens = []
        output_tokens = []
        for word in text.split():
            if len(word) >= max_size:
                print(f"{word} is too long ...")
                output_tokens.append(self.unk_token)
                continue
            if word in freq_dict:
                output_tokens.append(word)
            else:
                groups_of_subwords = self._split_word(word)
                groups_of_valid_subwords = list(
                        filter(
                            lambda group: all(
                                subword in freq_dict for subword in group
                            ),
                            groups_of_subwords,
                        )
                    )
                if groups_of_valid_subwords:
                    break
                if not next(groups_of_valid_subwords):
                    output_tokens.append(self.unk_token)
                else:
                    print(list(groups_of_valid_subwords))
                    sorted_groups_of_valid_subwords = sorted(
                        groups_of_valid_subwords,
                        key=lambda group: sum(freq_dict[subword] for subword in group),
                    )
                    tokens = sorted_groups_of_valid_subwords[-1]
                    for token in tokens:
                        output_tokens.append(str(token))
        return output_tokens

In [140]:
mod_morph = ModMorph()
mod_morph.train()

Training MorphologicalTokenizer ...


In [141]:
word = 'السلام'

In [155]:
%%time
subs = list()
for i in range(1,len(word)):
    subs += origianl_morph._split_word(word,i)

CPU times: user 380 µs, sys: 36 µs, total: 416 µs
Wall time: 426 µs


In [156]:
%%time
mod_morph._split_word(word)
''

CPU times: user 608 µs, sys: 59 µs, total: 667 µs
Wall time: 684 µs


''