## Training a Hindi Tokenizer

In [1]:
!pip install tokenizers



In [2]:
from pathlib import Path
from tokenizers import ByteLevelBPETokenizer

In [9]:
# Recursively gather every .txt file below ./hindi_data/ as plain string paths
# and show what was found.
paths = list(map(str, Path("./hindi_data/").glob("**/*.txt")))
print(paths)

['hindi_data/hin_wikipedia_2021_30K-co_n.txt', 'hindi_data/hin_wikipedia_2021_30K-sentences.txt', 'hindi_data/hin_wikipedia_2021_30K-inv_so.txt', 'hindi_data/hin_wikipedia_2021_30K-sources.txt', 'hindi_data/hin_wikipedia_2021_30K-words.txt', 'hindi_data/hin_wikipedia_2021_30K-inv_w.txt', 'hindi_data/hin_wikipedia_2021_30K-co_s.txt']


In [4]:
# Create a byte-level BPE tokenizer with no vocabulary or merges file supplied;
# it is trained on the corpus in a later cell.
tokenizer = ByteLevelBPETokenizer()

In [12]:
# Train the BPE model: target vocabulary of 52,000 entries, pairs must occur
# at least twice to be merged, and the five special tokens are registered
# in the order listed.
# NOTE(review): the `paths` list collected earlier is not used here; only the
# `-words.txt` file is passed. Confirm that training on this single file
# (rather than e.g. the `-sentences.txt` file) is intended.
tokenizer.train(
    files='hindi_data/hin_wikipedia_2021_30K-words.txt',
    vocab_size=52000,
    min_frequency=2,
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
)

In [13]:
# Persist the trained model into the current directory using the "hindiBERTo"
# prefix. The call returns the written paths, which the cell displays:
# ./hindiBERTo-vocab.json and ./hindiBERTo-merges.txt.
tokenizer.save_model(".", "hindiBERTo")

['./hindiBERTo-vocab.json', './hindiBERTo-merges.txt']

In [8]:
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

# Reload the tokenizer from the files written by `save_model(".", "hindiBERTo")`,
# i.e. ./hindiBERTo-vocab.json and ./hindiBERTo-merges.txt in the working
# directory (the same locations HindiDataset reads from).
tokenizer = ByteLevelBPETokenizer(
    './hindiBERTo-vocab.json',
    './hindiBERTo-merges.txt',
)

# Post-process every encoding so it is wrapped as <s> ... </s>.
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)

# Truncate encodings to at most 512 tokens.
tokenizer.enable_truncation(max_length=512)

# Sanity check: encode one Hindi sentence and inspect the produced tokens.
tokens = tokenizer.encode("इंद्रधनुष में 7 रंग होते हैं")

print(tokens.tokens)

['<s>', 'à¤ĩ', 'à¤Ĥ', 'à¤¦', 'à¥į', 'à¤°à¤§à¤¨', 'à¥ģ', 'à¤·', 'Ġà¤®', 'à¥ĩà¤Ĥ', 'Ġ7', 'Ġà¤°', 'à¤Ĥ', 'à¤Ĺ', 'Ġà¤¹', 'à¥ĭ', 'à¤¤', 'à¥ĩ', 'Ġà¤¹', 'à¥Īà¤Ĥ', '</s>']


In [11]:
import torch
from torch.utils.data import Dataset

class HindiDataset(Dataset):
    """In-memory dataset of tokenized text lines.

    On construction the trained byte-level BPE tokenizer is loaded from
    ``./hindiBERTo-vocab.json`` and ``./hindiBERTo-merges.txt``, configured to
    wrap each sequence in ``<s>`` ... ``</s>`` and to truncate at 512 tokens,
    and every line of the selected files under ``./data/`` is encoded into a
    list of token ids.

    Args:
        evaluate: If True, read the files matching ``./data/*-eval.txt``;
            otherwise read the files matching ``./data/*-train.txt``.
    """

    def __init__(self, evaluate: bool = False):
        tokenizer = ByteLevelBPETokenizer(
            './hindiBERTo-vocab.json',
            './hindiBERTo-merges.txt',
        )

        # Add the <s> / </s> markers around every encoded sequence.
        tokenizer._tokenizer.post_processor = BertProcessing(
            ("</s>", tokenizer.token_to_id("</s>")),
            ("<s>", tokenizer.token_to_id("<s>")),
        )

        tokenizer.enable_truncation(max_length=512)
        # or use the RobertaTokenizer from `transformers` directly.

        # One entry per input line: the list of token ids for that line.
        self.examples = []

        pattern = "*-eval.txt" if evaluate else "*-train.txt"
        # glob() yields files in arbitrary order; sort so that the order of
        # examples is the same on every run.
        src_files = sorted(Path("./data/").glob(pattern))
        for src_file in src_files:
            print("🔥", src_file)
            lines = src_file.read_text(encoding="utf-8").splitlines()
            self.examples += [x.ids for x in tokenizer.encode_batch(lines)]

    def __len__(self):
        """Return the number of encoded lines."""
        return len(self.examples)

    def __getitem__(self, i):
        """Return the token ids of example ``i`` as a 1-D integer tensor.

        Sequences are returned unpadded; padding is left to the batch level.
        """
        # Explicit dtype keeps the result an integer tensor regardless of
        # the contents of the id list.
        return torch.tensor(self.examples[i], dtype=torch.long)

In [12]:
from transformers import pipeline

# Build a fill-mask pipeline that loads both the model and the tokenizer from
# the ./hindiBERTo directory, then ask it to complete the masked last word.
# NOTE(review): the recorded output of this cell is an OSError stating that
# ./hindiBERTo must be a directory containing a config.json file; no cell
# visible here trains or saves a model into that directory, so this cell
# presumably needs a trained model saved there first — confirm.
fill_mask = pipeline('fill-mask', model='./hindiBERTo', tokenizer='./hindiBERTo')

fill_mask("इंद्रधनुष में 7 रंग होते <mask>")

OSError: Can't load config for './hindiBERTo'. Make sure that:

- './hindiBERTo' is a correct model identifier listed on 'https://huggingface.co/models'

- or './hindiBERTo' is the correct path to a directory containing a config.json file

