# RoBERTa LM training

In [1]:
import wandb
# Authenticate this session with Weights & Biases; the training arguments further down
# report metrics to it (report_to="wandb").
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mbmarcin[0m (use `wandb login --relogin` to force relogin)


True

In [2]:
import os
# Explicitly enable parallelism for the Hugging Face `tokenizers` library.
# NOTE(review): the variable is read by that library, not by this notebook — confirm it is still wanted.
os.environ['TOKENIZERS_PARALLELISM'] = 'true'

In [3]:
# Target vocabulary size: passed to the BPE trainer and reused as the model config's vocab_size.
vocab_size = 16_000

In [4]:
# Placeholder tokens that are registered as special tokens with both the BPE trainer and
# the Transformers tokenizer (presumably emitted by upstream text normalisation — TODO confirm).
special_tokens = ['<url>', '<email>', '<number>', '<date>']

In [5]:
import os

In [6]:
# One plain-text corpus file per split, all following the same path layout.
dev_ds, test_ds, train_ds = (
    f"../data/{split}/lm.txt" for split in ("dev", "test", "train")
)

# Directory that receives the tokenizer files; also the stem of the Trainer output directory.
notebook_path_prefix = "roberta_lm"

## Building the tokenizer

In [7]:
from tokenizers import ByteLevelBPETokenizer

In [8]:
# Start from an untrained byte-level BPE tokenizer; it is trained on the training split next.
bpe = ByteLevelBPETokenizer()

In [9]:
# Train the BPE vocabulary on the training split only.
# NOTE(review): the leading "<s>", "<pad>", "</s>" order presumably has to line up with the
# model config's bos/pad/eos ids (0/1/2 in the saved config) — confirm before re-ordering.
bpe.train(
    files=[train_ds],
    vocab_size=vocab_size,
    min_frequency=2,
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>", *special_tokens],
)






In [10]:
# Make sure the target directory exists, then write the trained tokenizer files into it
# (vocab.json and merges.txt, per the cell output).
os.makedirs(notebook_path_prefix, exist_ok=True)
bpe.save_model(notebook_path_prefix)

['roberta_lm/vocab.json', 'roberta_lm/merges.txt']

## Load the trained tokenizer

In [11]:
from transformers import RobertaConfig, RobertaTokenizerFast

In [12]:
# Load the trained vocab.json / merges.txt as a fast RoBERTa tokenizer.
# `model_max_length` is the current name of the deprecated `max_len` alias. `use_fast` is
# dropped because it only selects the implementation in `AutoTokenizer`; here the fast
# class is already named explicitly.
tokenizer = RobertaTokenizerFast.from_pretrained(notebook_path_prefix, model_max_length=512)

file roberta_lm/config.json not found
file roberta_lm/config.json not found


In [13]:
# Register the placeholder tokens as additional special tokens.
# The displayed return value is the number of tokens newly added to the vocabulary.
tokenizer.add_special_tokens(dict(additional_special_tokens=special_tokens))

0

In [14]:
# Inspect the special-token mapping to verify that the placeholder tokens were registered.
tokenizer.special_tokens_map

{'bos_token': '<s>',
 'eos_token': '</s>',
 'unk_token': '<unk>',
 'sep_token': '</s>',
 'pad_token': '<pad>',
 'cls_token': '<s>',
 'mask_token': '<mask>',
 'additional_special_tokens': ['<url>', '<email>', '<number>', '<date>']}

## Build dataset

In [15]:
from datasets import load_dataset

In [16]:
# Load the three splits with the generic `text` builder; examples are read via the "text" column.
dataset = load_dataset(
    'text',
    data_files={
        'train': [train_ds],
        'test': [test_ds],
        'dev': [dev_ds],
    },
)

Using custom data configuration default-799b7100722ede0b
Reusing dataset text (/home/mborzymowski/.cache/huggingface/datasets/text/default-799b7100722ede0b/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5)


  0%|          | 0/3 [00:00<?, ?it/s]

In [17]:
def encode(examples):
    """Tokenize a batch of raw text examples.

    Args:
        examples: Batch mapping that holds the raw strings under the "text" key.

    Returns:
        The result of calling the module-level `tokenizer` on the batch, with
        truncation and padding to a fixed length of 512 requested.
    """
    tokenize_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 512,
    }
    return tokenizer(examples["text"], **tokenize_options)

In [18]:
# Tokenize the dataset in batches and drop the raw "text" column afterwards.
# Cached results from an earlier run are reused when available.
tokenized_datasets = dataset.map(
    encode, batched=True, remove_columns=['text'], load_from_cache_file=True
)

Loading cached processed dataset at /home/mborzymowski/.cache/huggingface/datasets/text/default-799b7100722ede0b/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5/cache-a9f852216702fb6b.arrow
Loading cached processed dataset at /home/mborzymowski/.cache/huggingface/datasets/text/default-799b7100722ede0b/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5/cache-94b0d63627c9f3f7.arrow
Loading cached processed dataset at /home/mborzymowski/.cache/huggingface/datasets/text/default-799b7100722ede0b/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5/cache-29039055cb7e9074.arrow


## Data collator

In [19]:
from transformers import DataCollatorForLanguageModeling

In [20]:
# Collator for masked-language-model training, configured with a masking probability of 0.2.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.2,
)

## Build model

In [21]:
import os
# Restrict this process to GPUs 3 and 4.
# NOTE(review): this presumably only takes effect if it runs before CUDA is first
# initialised in the process — confirm before re-ordering cells.
os.environ["CUDA_VISIBLE_DEVICES"]='3,4'

In [22]:
from transformers import RobertaForMaskedLM

In [23]:
# Encoder with 12 layers, 12 attention heads and hidden size 768 over the custom vocabulary.
# NOTE(review): 514 positions presumably covers 512 tokens plus RoBERTa's position offset — confirm.
config = RobertaConfig(
    vocab_size=vocab_size,
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    hidden_dropout_prob=0.1,
    max_position_embeddings=514,
    type_vocab_size=1,
    layer_norm_eps=1e-5,
)

In [24]:
# Build the model from the config alone (no pretrained checkpoint is loaded here).
model = RobertaForMaskedLM(config=config)

In [25]:
# Display the parameter count as a quick sanity check on model size.
model.num_parameters()

98347648

## Training config

In [26]:
import torch
# Confirm that PyTorch can see a CUDA device before starting the training run.
torch.cuda.is_available()

True

In [27]:
from transformers import Trainer, TrainingArguments

In [28]:
training_args = TrainingArguments(
    # Where checkpoints and the final model are written.
    output_dir=f"{notebook_path_prefix}_lm",
    overwrite_output_dir=True,
    # Training length and per-device batch sizes.
    num_train_epochs=180,
    per_device_train_batch_size=18,
    per_device_eval_batch_size=32,
    do_train=True,
    do_eval=True,
    no_cuda=False,
    # Evaluate and log every 2,500 steps; checkpoint every 10,000 with a limit of three kept.
    evaluation_strategy="steps",
    eval_steps=2500,
    logging_steps=2500,
    save_steps=10_000,
    save_total_limit=3,
    # Experiment tracking.
    report_to="wandb",
    run_name="roberta-lm",
)

In [29]:
# Wire model, arguments, tokenizer and collator together.
# Training uses the "train" split; the "dev" split serves as the evaluation set.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['dev'],
)

## Train

In [30]:
# Run the full training loop. This is long-running: the recorded train_runtime is about 110,000 s.
trainer.train()

***** Running training *****
  Num examples = 23999
  Num Epochs = 180
  Instantaneous batch size per device = 18
  Total train batch size (w. parallel, distributed & accumulation) = 36
  Gradient Accumulation steps = 1
  Total optimization steps = 120060
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"




Step,Training Loss,Validation Loss
2500,5.2905,3.825563
5000,3.3831,2.45777
7500,2.4362,1.921545
10000,1.9489,1.533559
12500,1.6162,1.340742
15000,1.4336,1.228006
17500,1.3126,1.154202
20000,1.2205,1.086407
22500,1.1511,1.040299
25000,1.0894,0.998654


***** Running Evaluation *****
  Num examples = 3427
  Batch size = 64
***** Running Evaluation *****
  Num examples = 3427
  Batch size = 64
***** Running Evaluation *****
  Num examples = 3427
  Batch size = 64
***** Running Evaluation *****
  Num examples = 3427
  Batch size = 64
Saving model checkpoint to roberta_lm_lm/checkpoint-10000
Configuration saved in roberta_lm_lm/checkpoint-10000/config.json
Model weights saved in roberta_lm_lm/checkpoint-10000/pytorch_model.bin
tokenizer config file saved in roberta_lm_lm/checkpoint-10000/tokenizer_config.json
Special tokens file saved in roberta_lm_lm/checkpoint-10000/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 3427
  Batch size = 64
***** Running Evaluation *****
  Num examples = 3427
  Batch size = 64
***** Running Evaluation *****
  Num examples = 3427
  Batch size = 64
***** Running Evaluation *****
  Num examples = 3427
  Batch size = 64
Saving model checkpoint to roberta_lm_lm/checkpoint-20000
Configurat

TrainOutput(global_step=120060, training_loss=0.9726481260547196, metrics={'train_runtime': 110275.0351, 'train_samples_per_second': 39.173, 'train_steps_per_second': 1.089, 'total_flos': 1.1368047004169011e+18, 'train_loss': 0.9726481260547196, 'epoch': 180.0})

In [31]:
# Evaluate on the test split; the metrics dict is kept for the perplexity computation later on.
eval_output = trainer.evaluate(tokenized_datasets["test"])
eval_output

***** Running Evaluation *****
  Num examples = 6858
  Batch size = 64


{'eval_loss': 0.8390029668807983,
 'eval_runtime': 67.215,
 'eval_samples_per_second': 102.031,
 'eval_steps_per_second': 1.607,
 'epoch': 180.0}

In [32]:
# Persist the final model to the Trainer's output directory
# (the log shows config, weights and tokenizer files being written).
trainer.save_model()

Saving model checkpoint to roberta_lm_lm
Configuration saved in roberta_lm_lm/config.json
Model weights saved in roberta_lm_lm/pytorch_model.bin
tokenizer config file saved in roberta_lm_lm/tokenizer_config.json
Special tokens file saved in roberta_lm_lm/special_tokens_map.json


## Perplexity

In [34]:
import math

In [35]:
# Perplexity is computed as the exponential of the evaluation loss on the test split.
perplexity = math.exp(eval_output["eval_loss"])
print(perplexity)  # the value varies between runs; see the cell output for the current one

2.314058633127775


In [36]:
from transformers import pipeline

# Reload the saved model and tokenizer from the output directory for a qualitative check.
fill_mask = pipeline(
    task="fill-mask",
    model=f"{notebook_path_prefix}_lm",
    tokenizer=f"{notebook_path_prefix}_lm",
)

loading configuration file roberta_lm_lm/config.json
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.11.3",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 16000
}

loading configuration file roberta_lm_lm/config.json
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_drop

In [37]:
# Smoke test: show the predictions for the masked token in a short greeting.
fill_mask("Hello <mask>.")

[{'sequence': 'Hello impact.',
  'score': 0.15509869158267975,
  'token': 1036,
  'token_str': ' impact'},
 {'sequence': 'Helloticle.',
  'score': 0.10845522582530975,
  'token': 7591,
  'token_str': 'ticle'},
 {'sequence': 'Helloex.',
  'score': 0.09869387745857239,
  'token': 572,
  'token_str': 'ex'},
 {'sequence': 'Hellotices.',
  'score': 0.06232593581080437,
  'token': 2233,
  'token_str': 'tices'},
 {'sequence': 'Hello ex.',
  'score': 0.043094780296087265,
  'token': 376,
  'token_str': ' ex'}]

In [38]:
# Second smoke test with a different prompt.
fill_mask("European <mask>.")

[{'sequence': 'European ex.',
  'score': 0.4437962770462036,
  'token': 376,
  'token_str': ' ex'},
 {'sequence': 'European post.',
  'score': 0.09270468354225159,
  'token': 2084,
  'token_str': ' post'},
 {'sequence': 'European ante.',
  'score': 0.052729833871126175,
  'token': 6676,
  'token_str': ' ante'},
 {'sequence': 'European n.',
  'score': 0.029660899192094803,
  'token': 322,
  'token_str': ' n'},
 {'sequence': 'European no.',
  'score': 0.011745907366275787,
  'token': 651,
  'token_str': ' no'}]