## Training a Hindi Tokenizer

In [2]:
!pip install tokenizers



In [3]:
from pathlib import Path
from tokenizers import ByteLevelBPETokenizer

In [4]:
paths = [str(x) for x in Path("./hindi_data/").glob("**/*.txt")]
print(paths)

['hindi_data/hin_wikipedia_2021_30K-co_n.txt', 'hindi_data/hin_wikipedia_2021_30K-sentences.txt', 'hindi_data/hin_wikipedia_2021_30K-inv_so.txt', 'hindi_data/hin_wikipedia_2021_30K-sources.txt', 'hindi_data/hin_wikipedia_2021_30K-words.txt', 'hindi_data/hin_wikipedia_2021_30K-inv_w.txt', 'hindi_data/hin_wikipedia_2021_30K-co_s.txt']


In [5]:
tokenizer = ByteLevelBPETokenizer()

In [6]:
tokenizer.train(files='hindi_data/hin_wikipedia_2021_30K-words.txt', vocab_size=52000, min_frequency=2, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])






In [15]:
tokenizer.save_pretrained(".", "hindiBERTo")

AttributeError: 'ByteLevelBPETokenizer' object has no attribute 'save_pretrained'

In [8]:
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

tokenizer = ByteLevelBPETokenizer(
    './hindiBERTo/hindiBERTo-vocab.json',
    './hindiBERTo/hindiBERTo-merges.txt',
)

tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)

tokenizer.enable_truncation(max_length=512)

tokens = (tokenizer.encode("इंद्रधनुष में 7 रंग होते हैं"))

print(tokens.tokens)

['<s>', 'à¤ĩ', 'à¤Ĥ', 'à¤¦', 'à¥į', 'à¤°à¤§à¤¨', 'à¥ģ', 'à¤·', 'Ġà¤®', 'à¥ĩà¤Ĥ', 'Ġ7', 'Ġà¤°', 'à¤Ĥ', 'à¤Ĺ', 'Ġà¤¹', 'à¥ĭ', 'à¤¤', 'à¥ĩ', 'Ġà¤¹', 'à¥Īà¤Ĥ', '</s>']


In [11]:
from torch.utils.data import Dataset

class HindiDataset(Dataset):
    def __init__(self, evaluate: bool = False):
        tokenizer = ByteLevelBPETokenizer(
            './hindiBERTo-vocab.json',
            './hindiBERTo-merges.txt',
        )

        tokenizer._tokenizer.post_processor = BertProcessing(
            ("</s>", tokenizer.token_to_id("</s>")),
            ("<s>", tokenizer.token_to_id("<s>")),
        )

        tokenizer.enable_truncation(max_length=512)
        # or use the RobertaTokenizer from `transformers` directly.

        self.examples = []

        src_files = Path("./data/").glob("*-eval.txt") if evaluate else Path("./data/").glob("*-train.txt")
        for src_file in src_files:
            print("🔥", src_file)
            lines = src_file.read_text(encoding="utf-8").splitlines()
            self.examples += [x.ids for x in tokenizer.encode_batch(lines)]

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        # We’ll pad at the batch level.
        return torch.tensor(self.examples[i])

In [16]:
!python run_lm_training.py \
    --output_dir=./models/hindiBERTo-v1 \
    --model_type=roberta \
    --model_name_or_path=distilbert-base-multilingual-cased \
    --do_train \
    --train_data_file=./hindi_data/hin_wikipedia_2021_30K-sentences.txt \
    --do_eval \
    --eval_data_file=./hindi_data/hin_wikipedia_2021_30K-sentences.txt \
    --mlm

02/05/2022 15:08:09 - INFO - requests.packages.urllib3.connectionpool -   Starting new HTTPS connection (1): s3.amazonaws.com
02/05/2022 15:08:10 - INFO - transformers.file_utils -   https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-multilingual-cased-config.json not found in cache or force_download set to True, downloading to /Users/anjaneyatripathi/.cache/torch/transformers/tmpbjzbja_5
02/05/2022 15:08:10 - INFO - requests.packages.urllib3.connectionpool -   Starting new HTTPS connection (1): s3.amazonaws.com
Downloading: 100%|█████████████████████████████| 466/466 [00:00<00:00, 75.9kB/s]
02/05/2022 15:08:12 - INFO - transformers.file_utils -   storing https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-multilingual-cased-config.json in cache at /Users/anjaneyatripathi/.cache/torch/transformers/aee7490b1a48646df683dee12f25d9c63ebbf8dce1b7e1a656ce28830d9a7e86.bc76a47cb1c1c2984e48f23afbd3473a944ac1a2be9a8c8200092f5bf62153c9
02/05/2022 15:08:12 - INFO -