In [1]:
import six
import sys
# sys.path.append('..')
import tkseem as tk

In [9]:
# Sample Arabic word passed to every tokenizer in the timing cells below.
word = 'حالكم'

In [3]:
class MorphBert(tk.MorphologicalTokenizer):
    """Morphological tokenizer that segments words WordPiece-style.

    Every whitespace-separated word is split greedily, longest match first,
    against ``self.vocab``.  Pieces that do not start the word are looked up
    (and emitted) with a ``##`` prefix.

    ``self.vocab`` and ``self.unk_token`` are not defined in this class;
    presumably they are provided by the base class after ``train()`` --
    TODO confirm.
    """

    # Words with more characters than this are emitted as the unknown token
    # without attempting a split.
    max_input_chars_per_word = 10

    def _tokenize_from_dict(self):
        """No-op: `tokenize` in this class calls `_split_word` directly."""

    def convert_to_unicode(self, text):
        """Return `text` as `str`, decoding `bytes` as UTF-8.

        Bytes that cannot be decoded are dropped (``errors="ignore"``).

        Args:
            text (str | bytes): the text to normalise.

        Returns:
            str: `text` unchanged if it already is a `str`, otherwise the
            decoded bytes.

        Raises:
            ValueError: if `text` is neither `str` nor `bytes`.
        """
        # The notebook only runs on Python 3 (f-strings are used elsewhere in
        # the file), so the former Python 2 branches were unreachable.
        if isinstance(text, str):
            return text
        if isinstance(text, bytes):
            return text.decode("utf-8", "ignore")
        raise ValueError("Unsupported string type: %s" % (type(text)))

    def whitespace_tokenize(self, text):
        """Split `text` on runs of whitespace.

        Leading/trailing whitespace is ignored; empty or blank input gives
        an empty list.
        """
        # str.split() with no separator already strips the ends and returns
        # [] for empty input.
        return text.split()

    def tokenize(self, word):
        """Tokenize `word` (one word or whitespace-separated words).

        Returns:
            list: the word pieces produced by `_split_word`.
        """
        return self._split_word(word)

    def _split_word(self, text):
        """Tokenize a piece of text into its word pieces.

        Uses a greedy longest-match-first algorithm against ``self.vocab``.
        For example, with a suitable vocabulary::

            input  = "unaffable"
            output = ["un", "##aff", "##able"]

        Args:
            text (str | bytes): a single word or whitespace-separated words.

        Returns:
            list: word-piece tokens.  A word that is longer than
            `max_input_chars_per_word`, or that cannot be fully covered by
            vocabulary entries, contributes a single ``self.unk_token``.
        """
        text = self.convert_to_unicode(text)

        output_tokens = []
        for token in self.whitespace_tokenize(text):
            if len(token) > self.max_input_chars_per_word:
                output_tokens.append(self.unk_token)
                continue

            sub_tokens = []
            start = 0
            while start < len(token):
                # Try the longest remaining candidate first and shrink it
                # from the right until the vocabulary contains it.
                for end in range(len(token), start, -1):
                    piece = token[start:end]
                    if start > 0:
                        # Continuation pieces are stored with a "##" prefix.
                        piece = "##" + piece
                    if piece in self.vocab:
                        sub_tokens.append(piece)
                        start = end
                        break
                else:
                    # Nothing starting at `start` is in the vocabulary, so
                    # the whole word is treated as unknown.
                    sub_tokens = None
                    break

            if sub_tokens is None:
                output_tokens.append(self.unk_token)
            else:
                output_tokens.extend(sub_tokens)
        return output_tokens

In [4]:
class MorphGenerators(tk.MorphologicalTokenizer):
    """Morphological tokenizer that enumerates every split of a word.

    For a word that is not itself in the frequency dictionary, all ways of
    cutting it into contiguous pieces are generated; among the candidates
    whose pieces are all known, the one with the highest total frequency is
    kept.

    ``self.unk_token`` is not defined in this class; presumably it is
    provided by the base class -- TODO confirm.
    """

    def _tokenize_from_dict(self, text, freq_dict, cache=False, max_size=20):
        """Tokenize `text` with a frequency-based approach.

        Args:
            text (str): input string; it is split on whitespace into words.
            freq_dict (dict): maps known (sub)words to their frequency.  The
                first piece of a word is looked up as-is, every following
                piece with a ``##`` prefix.  Must be non-empty.
            cache (bool, optional): accepted but not used by this
                implementation. Defaults to False.
            max_size (int, optional): words with at least this many
                characters are replaced by the unknown token. Defaults to 20.

        Returns:
            list: the tokens of all words, in input order.  A word
            contributes itself when it is in `freq_dict`, otherwise its
            best-scoring split, otherwise ``self.unk_token``.
        """
        assert freq_dict
        output_tokens = []
        for word in text.split():
            if len(word) >= max_size:
                print(f"{word} is too long ...")
                output_tokens.append(self.unk_token)
                continue
            if word in freq_dict:
                output_tokens.append(word)
                continue

            candidate_groups = self._split_word(word)
            for group in candidate_groups:
                # `_split_word` prefixes every piece with "##"; the piece
                # that starts the word is looked up without it.  Slice the
                # prefix off rather than using replace(), which would also
                # delete any "##" occurring inside the word itself.
                group[0] = group[0][2:]

            valid_groups = [
                group
                for group in candidate_groups
                if all(subword in freq_dict for subword in group)
            ]

            # Resolve this word before moving on to the next one, so that
            # every word of `text` contributes to the output.
            if not valid_groups:
                output_tokens.append(self.unk_token)
                continue

            # Stable sort + last element: highest total frequency, and on
            # ties the candidate generated last.
            best_group = sorted(
                valid_groups,
                key=lambda group: sum(freq_dict[subword] for subword in group),
            )[-1]
            output_tokens.extend(str(token) for token in best_group)
        return output_tokens

    def _split_word(self, word):
        """Enumerate every way of cutting `word` into contiguous pieces.

        Every piece, including the first one, is returned with a ``##``
        prefix.  A word of ``n`` characters yields ``2 ** (n - 1)`` groups
        (each gap between characters is either cut or not), so the cost
        grows exponentially with the word length.

        Args:
            word (str): word input

        Returns:
            list: list of groups, each group being a list of ``##``-prefixed
            pieces that concatenate back to `word`; empty for an empty word.
        """
        if not word:
            return []

        # The un-split word comes first, then every split whose first piece
        # is word[:i], for increasing i.
        groups = [[f'##{word}']]
        for i in range(1, len(word)):
            head = f'##{word[:i]}'
            for tail in self._split_word(word[i:]):
                groups.append([head] + tail)
        return groups


In [10]:
# Instantiate and train each tokenizer under comparison: the two subclasses
# defined above and the stock tkseem MorphologicalTokenizer.
# NOTE(review): train() is called without arguments, so it presumably falls
# back to a default vocabulary shipped with tkseem -- confirm.
morph_generators = MorphGenerators()
morph_generators.train()

morph_bert = MorphBert()
morph_bert.train()

morph = tk.MorphologicalTokenizer()
morph.train()

Training AutoTokenizer ...
Training AutoTokenizer ...
Training AutoTokenizer ...


In [6]:
%%timeit
# Timed statement: the exhaustive-split tokenizer on the sample word.
morph_generators.tokenize(word)

89.4 µs ± 10 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [7]:
%%timeit
# Timed statement: the greedy longest-match tokenizer on the sample word.
morph_bert.tokenize(word)

3.87 µs ± 344 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [8]:
%%timeit
# Timed statement: the unmodified tkseem tokenizer on the sample word.
morph.tokenize(word)

2.93 µs ± 54.4 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
