# Pretraining RoBERTa for Bulgarian Masked-Language-Modeling

The notebook is based on [this guide](https://huggingface.co/blog/how-to-train).

In [None]:
# Install a pinned transformers release (2.11.0); the later cells import from it.
!pip install transformers==2.11.0

Collecting transformers==2.11.0
[?25l  Downloading https://files.pythonhosted.org/packages/48/35/ad2c5b1b8f99feaaf9d7cdadaeef261f098c6e1a6a2935d4d07662a6b780/transformers-2.11.0-py3-none-any.whl (674kB)
[K     |████████████████████████████████| 675kB 2.8MB/s 
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 16.0MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 19.0MB/s 
[?25hCollecting tokenizers==0.7.0
[?25l  Downloading https://files.pythonhosted.org/packages/14/e5/a26eb4716523808bb0a799fcfdceb6ebf77a18169d9591b2f46a9adb87d9/tokenizers-0.7.0-cp36-cp36m-manylinux1_x86_64.whl (3.8MB

In [None]:
import torch

# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Using device:', device)

Using device: cuda


# Training the tokenizer

In [None]:
# Switch the working directory to the Drive folder so the relative paths used
# below (corpus files, MODEL_DIR) resolve from there.
%cd '/content/drive/My Drive/ml_hw/NLP/bulgarian/'

from tokenizers import ByteLevelBPETokenizer

# Corpus files the tokenizer is trained on.
datapaths= ['bul_wikipedia_2016_1M-sentences_notab.txt',
            'bul_newscrawl_2017_1M-sentences_notab.txt']

# Untrained byte-level BPE tokenizer; its vocabulary is learned by a later train() call.
tokenizer = ByteLevelBPETokenizer()

/content/drive/My Drive/ml_hw/NLP/bulgarian


In [None]:
# Directory (relative to the current working directory) that receives the
# tokenizer files, the training checkpoints and the final model.
MODEL_DIR = "./roberta-small-bg"

In [None]:
# Learn the BPE vocabulary from the corpus files listed in `datapaths`.
# NOTE(review): the special tokens presumably receive ids in the order listed
# (the sample encoding printed further down starts with 0 and ends with 2) — confirm.
tokenizer.train(
    files=datapaths,
    vocab_size=52_000,
    min_frequency=2,
    show_progress=True,
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
)

In [None]:
import os

# Create the output directory from MODEL_DIR instead of repeating its name in a
# shell command; exist_ok=True keeps the cell re-runnable when the directory
# already exists (a plain `mkdir` errors out in that case).
os.makedirs(MODEL_DIR, exist_ok=True)

# Write the trained tokenizer's files into MODEL_DIR.
tokenizer.save(MODEL_DIR)

In [None]:
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

# Rebuild the tokenizer from the vocab.json / merges.txt files under MODEL_DIR.
tokenizer = ByteLevelBPETokenizer(
    MODEL_DIR + "/vocab.json",
    MODEL_DIR + "/merges.txt",
)
# Attach a post-processor configured with the "</s>" and "<s>" tokens and their ids
# (set through the wrapper's private `_tokenizer` attribute).
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
# Cap encoded sequences at 512 tokens.
tokenizer.enable_truncation(max_length=512)

# Sanity check: encode a short Bulgarian phrase and show its token ids.
print(tokenizer.encode("аз съм мъж").ids)

[0, 964, 1081, 2115, 2]


# Pretraining the model

In [None]:
from transformers import RobertaTokenizerFast

# Rebind `tokenizer` to a transformers RobertaTokenizerFast loaded from MODEL_DIR;
# from here on the name no longer refers to the `tokenizers` object built earlier.
tokenizer = RobertaTokenizerFast.from_pretrained(MODEL_DIR, model_max_length=512)

## Dataset preprocessing

In [None]:
from torch.utils.data import Dataset
import transformers

# Token budget per example: TextDataset packs lines up to this length and
# truncates to it when encoding an example.
MAX_SEQ_LEN = 512

class TextDataset(Dataset):
    """Text corpus whose lines are packed into examples of bounded token length.

    The non-blank lines of every file are read into memory, consecutive lines are
    merged into longer strings whose estimated token count stays below the
    sequence budget, and each packed string is tokenized on access.

    Args:
        tokenizer: Object exposing ``encode_plus(text, **kwargs)`` that returns a
            mapping with an ``'input_ids'`` list. It is stored on the instance and
            used for both packing and item encoding.
        file_paths: Sequence of paths to UTF-8 text files, one sentence per line.
        max_seq_len: Token budget per example. ``None`` (the default) falls back
            to the module-level ``MAX_SEQ_LEN``.

    Attributes:
        examples: List of packed text strings, one per dataset item.
    """

    def __init__(self, tokenizer: "transformers.PreTrainedTokenizer", file_paths, max_seq_len=None):
        # Use the tokenizer that was passed in rather than whatever global
        # happens to be named `tokenizer` at call time.
        self.tokenizer = tokenizer
        self.max_seq_len = MAX_SEQ_LEN if max_seq_len is None else max_seq_len

        lines = []
        for i, path in enumerate(file_paths, start=1):
            print('Reading {}/{} {}'.format(i, len(file_paths), path))
            with open(path, encoding="utf-8") as f:
                # Drop empty and whitespace-only lines.
                lines.extend(
                    line for line in f.read().splitlines() if line and not line.isspace()
                )
        print('Packing sequences...')
        self.examples = self.pack_sequences(lines)
        print('Dataset ready.')

    def pack_sequences(self, text):
        """Greedily merge consecutive lines into strings that fit the token budget.

        Each line is tokenized on its own to measure its length. Lines are
        collected while the running total stays below ``self.max_seq_len``; when
        the next line would reach the budget, the collected lines are joined with
        a single space and emitted. A line that is too long on its own becomes its
        own example (it is truncated later in ``__getitem__``).

        Args:
            text: List of text lines.

        Returns:
            List of packed strings. Empty input yields an empty list; no empty
            strings are ever emitted.
        """
        data = []
        total = len(text)

        pending = []      # lines collected for the sequence being built
        pending_len = 0   # summed per-line token counts of `pending`
        next_checkpoint = 10

        for idx, line in enumerate(text):
            # Report progress once per 10% step that has been crossed.
            percent = round((idx / total) * 100)
            if percent >= next_checkpoint:
                reached = (percent // 10) * 10
                print('{}% complete'.format(reached))
                next_checkpoint = reached + 10

            # Measure the current line on its own. The count includes whatever
            # special tokens the tokenizer adds, so the running total can only
            # overestimate the packed length.
            encoding = self.tokenizer.encode_plus(
                line,
                truncation=True
            )
            line_len = len(encoding['input_ids'])

            # Flush the current sequence if this line would fill the budget.
            # Guarding on `pending` avoids emitting an empty example when the
            # very first line is already too long.
            if pending and pending_len + line_len >= self.max_seq_len:
                data.append(' '.join(pending))
                pending = []
                pending_len = 0

            pending.append(line)
            pending_len += line_len

        # Emit the unfinished sequence left over after the loop, if any. Joining
        # with a space keeps adjacent sentences from fusing into one word.
        if pending:
            data.append(' '.join(pending))

        return data

    def __len__(self):
        """Return the number of packed examples."""
        return len(self.examples)

    def __getitem__(self, i) -> torch.Tensor:
        """Tokenize packed example ``i`` and return its token ids as a 1-D long tensor."""
        encoding = self.tokenizer.encode_plus(
            self.examples[i],
            add_special_tokens=True,
            truncation=True,
            max_length=self.max_seq_len)
        return torch.tensor(encoding['input_ids'], dtype=torch.long)

# Training corpus: the two 1M-sentence files plus the bg_text_<N>.txt shards
# for N = 1,000,000 .. 8,000,000 in steps of 1,000,000 (ten files in total).
datapaths = [
    'bul_newscrawl_2017_1M-sentences_notab.txt',
    'bul_wikipedia_2016_1M-sentences_notab.txt',
]
datapaths.extend(map('bg_text_{}.txt'.format, range(1_000_000, 9_000_000, 1_000_000)))

# A separate shard, not part of the training list, serves as evaluation data.
train_dataset = TextDataset(tokenizer, datapaths)
eval_dataset = TextDataset(tokenizer, ['bg_text_26000000.txt'])

Reading 1/10 bul_newscrawl_2017_1M-sentences_notab.txt
Reading 2/10 bul_wikipedia_2016_1M-sentences_notab.txt
Reading 3/10 bg_text_1000000.txt
Reading 4/10 bg_text_2000000.txt
Reading 5/10 bg_text_3000000.txt
Reading 6/10 bg_text_4000000.txt
Reading 7/10 bg_text_5000000.txt
Reading 8/10 bg_text_6000000.txt
Reading 9/10 bg_text_7000000.txt
Reading 10/10 bg_text_8000000.txt
Packing sequences...
10% complete
20% complete
30% complete
40% complete
50% complete
60% complete
70% complete
80% complete
90% complete
100% complete
Dataset ready.
Reading 1/1 bg_text_26000000.txt
Packing sequences...
10% complete
20% complete
30% complete
40% complete
50% complete
60% complete
70% complete
80% complete
90% complete
100% complete
Dataset ready.


## Model Configuration

In [None]:
from transformers import RobertaConfig, RobertaForMaskedLM

# Architecture of the model to pretrain; settings not listed keep RobertaConfig's defaults.
config = RobertaConfig(
    vocab_size=52_000,            # same value that was passed to tokenizer.train()
    max_position_embeddings=514,  # NOTE(review): presumably 512 tokens + 2 reserved positions — confirm
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)

# Model built from the config alone — no pretrained weights are loaded here.
model = RobertaForMaskedLM(config)

## Training

In [None]:
from transformers import DataCollatorForLanguageModeling, Trainer, TrainingArguments

# Run settings: one epoch, batches of 8 per device, a log record every 1,000
# steps and a checkpoint every 20,000 steps, all written under MODEL_DIR.
training_args = TrainingArguments(
    output_dir=MODEL_DIR,
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=8,
    logging_steps=1_000,
    save_steps=20_000,
)

# Collator that prepares masked-language-modeling batches (mlm_probability=0.15).
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    prediction_loss_only=True,
)

In [None]:
# Run the training loop configured above, then write the resulting model to MODEL_DIR.
trainer.train()
trainer.save_model(MODEL_DIR)

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=1.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=168709.0, style=ProgressStyle(description…

{"loss": 7.940483513355255, "learning_rate": 4.970363169718272e-05, "epoch": 0.005927366056345542, "step": 1000}
{"loss": 7.51009136724472, "learning_rate": 4.940726339436545e-05, "epoch": 0.011854732112691084, "step": 2000}
{"loss": 7.38911885356903, "learning_rate": 4.911089509154817e-05, "epoch": 0.017782098169036625, "step": 3000}
{"loss": 7.304487721443176, "learning_rate": 4.88145267887309e-05, "epoch": 0.023709464225382167, "step": 4000}
{"loss": 7.253735353946686, "learning_rate": 4.8518158485913614e-05, "epoch": 0.02963683028172771, "step": 5000}
{"loss": 7.201186093330383, "learning_rate": 4.8221790183096335e-05, "epoch": 0.03556419633807325, "step": 6000}
{"loss": 7.176096982955933, "learning_rate": 4.792542188027906e-05, "epoch": 0.04149156239441879, "step": 7000}
{"loss": 7.1339053525924685, "learning_rate": 4.7629053577461784e-05, "epoch": 0.047418928450764335, "step": 8000}
{"loss": 7.118884016990662, "learning_rate": 4.7332685274644505e-05, "epoch": 0.05334629450710988,



{"loss": 6.792776386737824, "learning_rate": 4.377626564083718e-05, "epoch": 0.12447468718325638, "step": 21000}
{"loss": 6.782762974262238, "learning_rate": 4.3479897338019904e-05, "epoch": 0.1304020532396019, "step": 22000}
{"loss": 6.743408578395844, "learning_rate": 4.318352903520263e-05, "epoch": 0.13632941929594747, "step": 23000}
{"loss": 6.686858028411865, "learning_rate": 4.288716073238535e-05, "epoch": 0.142256785352293, "step": 24000}
{"loss": 6.60186936712265, "learning_rate": 4.2590792429568074e-05, "epoch": 0.14818415140863855, "step": 25000}
{"loss": 6.503564147949219, "learning_rate": 4.2294424126750795e-05, "epoch": 0.15411151746498408, "step": 26000}
{"loss": 6.417073035240174, "learning_rate": 4.1998055823933516e-05, "epoch": 0.16003888352132964, "step": 27000}
{"loss": 6.301224122524261, "learning_rate": 4.1701687521116243e-05, "epoch": 0.16596624957767517, "step": 28000}
{"loss": 6.214125962257385, "learning_rate": 4.1405319218298964e-05, "epoch": 0.171893615634020

Training was interrupted before the epoch finished, so we reload the model from the saved `checkpoint-60000` and resume training from there.

In [None]:
# Reload the weights saved at step 60000. `from_pretrained` is a classmethod, so
# it is called on the class directly; constructing a throwaway, randomly
# initialised RobertaForMaskedLM(config=config) first only cost time and memory.
checkpoint_dir = MODEL_DIR + '/checkpoint-60000'
model = RobertaForMaskedLM.from_pretrained(checkpoint_dir)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    prediction_loss_only=True,
)

# Passing the checkpoint path makes the Trainer continue from that checkpoint
# (the log below picks up at step 61000) instead of starting over.
trainer.train(checkpoint_dir)
trainer.save_model(MODEL_DIR)



HBox(children=(FloatProgress(value=0.0, description='Epoch', max=1.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=168709.0, style=ProgressStyle(description…

{"loss": 3.925383972167969, "learning_rate": 3.1921533528146096e-05, "epoch": 0.36156932943707804, "step": 61000}
{"loss": 3.8892468149662016, "learning_rate": 3.1625165225328824e-05, "epoch": 0.36749669549342356, "step": 62000}
{"loss": 3.861357157945633, "learning_rate": 3.1328796922511545e-05, "epoch": 0.37342406154976915, "step": 63000}
{"loss": 3.8234396750926973, "learning_rate": 3.103242861969427e-05, "epoch": 0.3793514276061147, "step": 64000}
{"loss": 3.8271664266586303, "learning_rate": 3.073606031687699e-05, "epoch": 0.3852787936624602, "step": 65000}
{"loss": 3.782004462480545, "learning_rate": 3.043969201405971e-05, "epoch": 0.39120615971880573, "step": 66000}
{"loss": 3.7677099952697755, "learning_rate": 3.014332371124244e-05, "epoch": 0.3971335257751513, "step": 67000}
{"loss": 3.7563066110610963, "learning_rate": 2.984695540842516e-05, "epoch": 0.40306089183149685, "step": 68000}
{"loss": 3.7345009541511534, "learning_rate": 2.955058710560788e-05, "epoch": 0.40898825788



{"loss": 3.5365067932605743, "learning_rate": 2.5994167471800556e-05, "epoch": 0.4801166505639889, "step": 81000}
{"loss": 3.51975986289978, "learning_rate": 2.5697799168983277e-05, "epoch": 0.4860440166203344, "step": 82000}
{"loss": 3.513608780860901, "learning_rate": 2.5401430866166004e-05, "epoch": 0.49197138267667995, "step": 83000}
{"loss": 3.4903784427642823, "learning_rate": 2.5105062563348725e-05, "epoch": 0.49789874873302553, "step": 84000}
{"loss": 3.494800758123398, "learning_rate": 2.480869426053145e-05, "epoch": 0.503826114789371, "step": 85000}
{"loss": 3.4653803877830507, "learning_rate": 2.451232595771417e-05, "epoch": 0.5097534808457166, "step": 86000}
{"loss": 3.460529038906097, "learning_rate": 2.4215957654896895e-05, "epoch": 0.5156808469020622, "step": 87000}
Buffered data was truncated after reaching the output size limit.

In [None]:
# Reload the weights saved at step 160000. `from_pretrained` is a classmethod, so
# it is called on the class directly; constructing a throwaway, randomly
# initialised RobertaForMaskedLM(config=config) first only cost time and memory.
checkpoint_dir = MODEL_DIR + '/checkpoint-160000'
model = RobertaForMaskedLM.from_pretrained(checkpoint_dir)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    prediction_loss_only=True,
)

# Passing the checkpoint path makes the Trainer continue from that checkpoint
# (the log below picks up at step 161000) instead of starting over.
trainer.train(checkpoint_dir)
trainer.save_model(MODEL_DIR)



HBox(children=(FloatProgress(value=0.0, description='Epoch', max=1.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=168709.0, style=ProgressStyle(description…

{"loss": 3.0524137647151948, "learning_rate": 2.284703246418389e-06, "epoch": 0.9543059350716322, "step": 161000}
{"loss": 3.0630431191921232, "learning_rate": 1.988334943601112e-06, "epoch": 0.9602333011279778, "step": 162000}
{"loss": 3.055609819173813, "learning_rate": 1.6919666407838351e-06, "epoch": 0.9661606671843233, "step": 163000}
{"loss": 3.054229326248169, "learning_rate": 1.395598337966558e-06, "epoch": 0.9720880332406688, "step": 164000}
{"loss": 3.0462097029685973, "learning_rate": 1.0992300351492807e-06, "epoch": 0.9780153992970144, "step": 165000}
{"loss": 3.051473555088043, "learning_rate": 8.028617323320036e-07, "epoch": 0.9839427653533599, "step": 166000}
{"loss": 3.05576734995842, "learning_rate": 5.064934295147266e-07, "epoch": 0.9898701314097055, "step": 167000}
{"loss": 3.0550967285633086, "learning_rate": 2.101251266974495e-07, "epoch": 0.9957974974660511, "step": 168000}


