In [3]:
# We won't need TensorFlow here
!pip uninstall -y tensorflow
# Install `transformers` from master
!pip install git+https://github.com/huggingface/transformers
!pip list | grep -E 'transformers|tokenizers'
# transformers version at notebook update --- 2.11.0
# tokenizers version at notebook update --- 0.8.0rc1

Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-hakl1zn7
  Running command git clone -q https://github.com/huggingface/transformers /tmp/pip-req-build-hakl1zn7
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
tokenizers                    0.11.4
transformers                  4.17.0.dev0


In [4]:
%%time 
from pathlib import Path

from tokenizers import ByteLevelBPETokenizer

paths = [str(x) for x in Path(".").glob("**/*.txt")]

# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

# Customize training
tokenizer.train(files='hin_wikipedia_2021_30K-words.txt', vocab_size=52_000, min_frequency=2, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])

CPU times: user 3.55 s, sys: 167 ms, total: 3.72 s
Wall time: 2.08 s


In [7]:
!mkdir HindiBERTo
tokenizer.save_model("HindiBERTo")

mkdir: cannot create directory ‘HindiBERTo’: File exists


['HindiBERTo/vocab.json', 'HindiBERTo/merges.txt']

In [8]:
import json
config = {
"architectures": [
"RobertaForMaskedLM"
],
"attention_probs_dropout_prob": 0.1,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 768,    
"initializer_range": 0.02,
"intermediate_size": 3072,
"layer_norm_eps": 1e-05,
"max_position_embeddings": 514,
"model_type": "roberta",
"num_attention_heads": 12,
"num_hidden_layers": 12,
"type_vocab_size": 1,
"vocab_size": 50265
}

with open("HindiBERTo/config.json", 'w') as fp:
    json.dump(config, fp)

tokenizer_config = {"max_len": 512}

with open("HindiBERTo/tokenizer_config.json", 'w') as fp:
    json.dump(tokenizer_config, fp)

In [9]:
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing


tokenizer = ByteLevelBPETokenizer(
    "./HindiBERTo/vocab.json",
    "./HindiBERTo/merges.txt",
)

In [10]:
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=512)

In [26]:
tokens = (tokenizer.encode("इंद्रधनुष में 7 रंग होते हैं"))

print(tokens)

[0, 331, 275, 282, 265, 266, 612, 284, 316, 438, 349, 225, 27, 453, 275, 283, 904, 279, 271, 276, 904, 424, 2]


In [11]:
# Check that we have a GPU
!nvidia-smi

Sat Feb 12 15:01:08 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P8    29W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [12]:
# Check that PyTorch sees it
import torch
torch.cuda.is_available()

True

In [13]:
from transformers import RobertaConfig

config = RobertaConfig(
    vocab_size=52_000,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)

In [15]:
from transformers import RobertaTokenizerFast

tokenizer = RobertaTokenizerFast.from_pretrained("./HindiBERTo", max_len=512)

In [16]:
from transformers import RobertaForMaskedLM

model = RobertaForMaskedLM(config=config)

In [17]:
model.num_parameters()
# => 84 million parameters

83504416

In [20]:
%%time
from transformers import LineByLineTextDataset

dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="./hin_wikipedia_2021_30K-words.txt",
    block_size=128,
)



CPU times: user 4.22 s, sys: 345 ms, total: 4.56 s
Wall time: 3.35 s


In [21]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [22]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./HindiBERTo",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_gpu_train_batch_size=64,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

In [23]:
%%time
trainer.train()

Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
***** Running training *****
  Num examples = 60669
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 948
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.


Step,Training Loss
500,4.5153




Training completed. Do not forget to share your model on huggingface.co/models =)




CPU times: user 7min 2s, sys: 1min 23s, total: 8min 26s
Wall time: 8min 25s


TrainOutput(global_step=948, training_loss=4.061725857891614, metrics={'train_runtime': 505.8145, 'train_samples_per_second': 119.943, 'train_steps_per_second': 1.874, 'total_flos': 361804756105728.0, 'train_loss': 4.061725857891614, 'epoch': 1.0})

In [24]:
trainer.save_model("./HindiBERTo")

Saving model checkpoint to ./HindiBERTo
Configuration saved in ./HindiBERTo/config.json
Model weights saved in ./HindiBERTo/pytorch_model.bin


In [27]:
from transformers import pipeline

fill_mask = pipeline(
    "fill-mask",
    model="./HindiBERTo",
    tokenizer="./HindiBERTo"
)

loading configuration file ./HindiBERTo/config.json
Model config RobertaConfig {
  "_name_or_path": "./HindiBERTo",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.17.0.dev0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 52000
}

loading configuration file ./HindiBERTo/config.json
Model config RobertaConfig {
  "_name_or_path": "./HindiBERTo",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dro

In [29]:
# The sun <mask>.
# =>
fill_mask("इंद्रधनुष में 7 रंग होते <mask>")

[{'score': 0.9234524369239807,
  'sequence': 'इंद्रधनुष में 7 रंग होते\t',
  'token': 202,
  'token_str': '\t'},
 {'score': 0.03927748650312424,
  'sequence': 'इंद्रधनुष में 7 रंग होते1',
  'token': 21,
  'token_str': '1'},
 {'score': 0.003578753210604191,
  'sequence': 'इंद्रधनुष में 7 रंग होते2',
  'token': 22,
  'token_str': '2'},
 {'score': 0.0016390023520216346,
  'sequence': 'इंद्रधनुष में 7 रंग होते3',
  'token': 23,
  'token_str': '3'},
 {'score': 0.0009601011406630278,
  'sequence': 'इंद्रधनुष में 7 रंग होतेा',
  'token': 264,
  'token_str': 'ा'}]