# RoBERTa LM training

In [1]:
# Authenticate with Weights & Biases before anything else: the TrainingArguments
# further down set report_to="wandb", so training metrics are sent there.
import wandb
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
wandb: Currently logged in as: bmarcin (use `wandb login --relogin` to force relogin)


True

In [2]:
# Target vocabulary size; passed both to the BPE trainer and to RobertaConfig.
vocab_size = 16_000

In [3]:
# Placeholder tokens handed to the BPE trainer and registered as additional
# special tokens on the Transformers tokenizer.
# NOTE(review): presumably the corpus was pre-processed to contain these
# markers - confirm against the data preparation step.
special_tokens = ['<url>', '<email>', '<number>', '<date>']

In [4]:
import os

In [5]:
# Plain-text corpora, one file per split (relative paths).
train_ds, dev_ds, test_ds = (
    "../data/train/lm.txt",
    "../data/dev/lm.txt",
    "../data/test/lm.txt",
)

# Directory that receives the trained tokenizer files; it is also the stem of
# the Trainer output directory (notebook_path_prefix + "_lm").
notebook_path_prefix = "roberta_lm"

## Building the tokenizer

In [6]:
from tokenizers import ByteLevelBPETokenizer

In [7]:
# Start from an empty byte-level BPE tokenizer; it is fitted on the training
# corpus in the next cell.
bpe = ByteLevelBPETokenizer()

In [8]:
# Fit the BPE vocabulary on the training split only. The five RoBERTa control
# tokens are listed first, followed by the project-specific placeholders.
# NOTE(review): the saved model config reports bos=0, pad=1, eos=2, which
# matches this ordering - presumably ids are assigned in list order; confirm.
bpe.train(
    files=[train_ds],
    vocab_size=vocab_size,
    min_frequency=2,
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>", *special_tokens],
)

In [9]:
# Create the target directory (no error if it already exists), then write the
# trained tokenizer files there; the cell output lists vocab.json and merges.txt.
os.makedirs(notebook_path_prefix, exist_ok=True)
bpe.save_model(notebook_path_prefix)

['roberta_lm\\vocab.json', 'roberta_lm\\merges.txt']

## Load the trained tokenizer into Transformers

In [10]:
from transformers import RobertaConfig, RobertaTokenizerFast

In [11]:
# Wrap the vocab.json / merges.txt written above in a Transformers fast tokenizer.
# `model_max_length` is the documented name of the length limit; `max_len` is only
# accepted as a legacy alias. `use_fast` is an AutoTokenizer option and has no
# effect when RobertaTokenizerFast is instantiated directly, so it is omitted.
tokenizer = RobertaTokenizerFast.from_pretrained(notebook_path_prefix, model_max_length=512)

file roberta_lm\config.json not found
file roberta_lm\config.json not found


In [12]:
# Register the placeholder tokens as additional special tokens. The displayed
# return value is the number of tokens newly added to the vocabulary; the
# recorded 0 indicates they were already present from BPE training.
tokenizer.add_special_tokens({'additional_special_tokens': special_tokens})

0

In [13]:
# Display the special-token mapping to confirm the placeholder tokens are
# listed under 'additional_special_tokens'.
tokenizer.special_tokens_map

{'bos_token': '<s>',
 'eos_token': '</s>',
 'unk_token': '<unk>',
 'sep_token': '</s>',
 'pad_token': '<pad>',
 'cls_token': '<s>',
 'mask_token': '<mask>',
 'additional_special_tokens': ['<url>', '<email>', '<number>', '<date>']}

## Build dataset

In [14]:
from datasets import load_dataset

In [15]:
# Load the three plain-text files with the generic 'text' builder, keyed by split.
dataset = load_dataset(
    'text',
    data_files={
        'train': [train_ds],
        'test': [test_ds],
        'dev': [dev_ds],
    },
)

Using custom data configuration default-abbb67285606e5bb


Downloading and preparing dataset text/default to C:\Users\Marcin Borzymowski\.cache\huggingface\datasets\text\default-abbb67285606e5bb\0.0.0\e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5...


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Dataset text downloaded and prepared to C:\Users\Marcin Borzymowski\.cache\huggingface\datasets\text\default-abbb67285606e5bb\0.0.0\e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [16]:
def encode(examples):
    """Tokenize a batch of raw text examples for masked-LM training.

    Reads ``examples["text"]`` and passes it to the notebook-level
    ``tokenizer``. Every sequence is truncated to, and padded up to,
    512 tokens.

    Args:
        examples: A batch as handed over by ``dataset.map(..., batched=True)``;
            only the ``"text"`` entry is read.

    Returns:
        Whatever ``tokenizer(...)`` returns for the batch.

    NOTE(review): padding every example to the full 512 tokens is costly;
    the data collator may be able to pad per batch instead - confirm before
    changing.
    """
    tokenizer_kwargs = dict(padding="max_length", truncation=True, max_length=512)
    return tokenizer(examples["text"], **tokenizer_kwargs)

In [17]:
# Tokenize every split batch-wise and drop the raw 'text' column so that only
# the tokenizer outputs remain. Cached results are reused when available.
tokenized_datasets = dataset.map(encode, batched=True, remove_columns=['text'], load_from_cache_file=True)

  0%|          | 0/24 [00:00<?, ?ba/s]

  0%|          | 0/7 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

## Data collator

In [18]:
from transformers import DataCollatorForLanguageModeling

In [19]:
# Collator for masked-language-modelling batches; tokens are selected for
# masking with probability 0.2.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.2,
)

## Build model

In [20]:
# Expose only GPU index 0 to this process.
# NOTE(review): this must run before CUDA is first initialised to take effect.
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [21]:
from transformers import RobertaForMaskedLM

In [22]:
# Architecture of the RoBERTa encoder trained from scratch in this notebook.
config = RobertaConfig(
    # Embedding tables.
    vocab_size=vocab_size,
    type_vocab_size=1,
    # NOTE(review): presumably 512 usable positions plus the 2 slots RoBERTa
    # reserves for its padding offset - confirm.
    max_position_embeddings=514,
    # Encoder size.
    hidden_size=512,
    num_hidden_layers=6,
    num_attention_heads=8,
    # Regularisation / numerics.
    hidden_dropout_prob=0.1,
    layer_norm_eps=1e-5,
)

In [23]:
# Build the masked-LM model from the config alone, i.e. without loading any
# pretrained checkpoint.
model = RobertaForMaskedLM(config=config)

In [24]:
# Display the model's parameter count as a size sanity check.
model.num_parameters()

33948288

## Training config

In [25]:
# Confirm that PyTorch can see a CUDA device before starting the training run.
import torch
torch.cuda.is_available()

True

In [26]:
from transformers import Trainer, TrainingArguments

In [27]:
# Hyper-parameters for the masked-LM training run.
training_args = TrainingArguments(
    # Checkpoint location and retention.
    output_dir=f"{notebook_path_prefix}_lm",
    overwrite_output_dir=True,
    save_steps=10_000,
    save_total_limit=3,
    # What to run and on which device.
    do_train=True,
    do_eval=True,
    no_cuda=False,
    # Schedule and batch sizes.
    num_train_epochs=180,
    per_device_train_batch_size=12,
    per_device_eval_batch_size=24,
    # Log and evaluate every 2500 optimisation steps.
    evaluation_strategy="steps",
    eval_steps=2500,
    logging_steps=2500,
    # Experiment tracking.
    report_to="wandb",
    run_name="roberta-lm",
)

In [28]:
# Wire model, data and collator together. The dev split is used for the
# periodic evaluation during training; the test split is scored separately
# after training.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["dev"],
)

## Train

In [None]:
# Run the full training loop; per the recorded log this is 360000 optimisation
# steps with evaluation on the dev split every 2500 steps.
trainer.train()

***** Running training *****
  Num examples = 23999
  Num Epochs = 180
  Instantaneous batch size per device = 12
  Total train batch size (w. parallel, distributed & accumulation) = 12
  Gradient Accumulation steps = 1
  Total optimization steps = 360000
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
wandb: wandb version 0.12.6 is available!  To upgrade, please run:
wandb:  $ pip install wandb --upgrade


Step,Training Loss,Validation Loss
2500,5.8262,5.270166
5000,5.171,4.771377
7500,4.5733,3.736655
10000,3.7961,3.150887
12500,3.3277,2.743751
15000,2.9447,2.442991
17500,2.6435,2.219984
20000,2.4261,2.053136
22500,2.2398,1.896559
25000,2.0733,1.741439


***** Running Evaluation *****
  Num examples = 3427
  Batch size = 24
***** Running Evaluation *****
  Num examples = 3427
  Batch size = 24
***** Running Evaluation *****
  Num examples = 3427
  Batch size = 24
***** Running Evaluation *****
  Num examples = 3427
  Batch size = 24
Saving model checkpoint to roberta_lm_lm\checkpoint-10000
Configuration saved in roberta_lm_lm\checkpoint-10000\config.json
Model weights saved in roberta_lm_lm\checkpoint-10000\pytorch_model.bin
tokenizer config file saved in roberta_lm_lm\checkpoint-10000\tokenizer_config.json
Special tokens file saved in roberta_lm_lm\checkpoint-10000\special_tokens_map.json
Deleting older checkpoint [roberta_lm_lm\checkpoint-340000] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 3427
  Batch size = 24
***** Running Evaluation *****
  Num examples = 3427
  Batch size = 24
***** Running Evaluation *****
  Num examples = 3427
  Batch size = 24
***** Running Evaluation *****
  Num examples = 34

In [30]:
# Score the trained model on the held-out test split; the metrics dict is kept
# for the perplexity computation below and displayed as the cell result.
eval_output = trainer.evaluate(tokenized_datasets["test"])
eval_output

***** Running Evaluation *****
  Num examples = 6858
  Batch size = 24


{'eval_loss': 0.8642981648445129,
 'eval_runtime': 49.1668,
 'eval_samples_per_second': 139.484,
 'eval_steps_per_second': 5.817,
 'epoch': 180.0}

In [31]:
# Persist the final model; per the cell output the config, weights and tokenizer
# files are written to the Trainer's output directory (roberta_lm_lm).
trainer.save_model()

Saving model checkpoint to roberta_lm_lm
Configuration saved in roberta_lm_lm\config.json
Model weights saved in roberta_lm_lm\pytorch_model.bin
tokenizer config file saved in roberta_lm_lm\tokenizer_config.json
Special tokens file saved in roberta_lm_lm\special_tokens_map.json


## Perplexity

In [32]:
import math

In [33]:
# Perplexity as exp(eval_loss) on the test split. The loss comes from the
# masked-LM evaluation above, so it is computed over masked tokens only.
perplexity = math.exp(eval_output["eval_loss"])
print(perplexity)  # NOTE(review): an earlier inline value of 2.756616 did not match the recorded output; presumably from a previous run

2.373339808006104


In [34]:
from transformers import pipeline

# Reload the saved model and tokenizer from the Trainer's output directory
# into a fill-mask pipeline for a quick qualitative check.
fill_mask = pipeline("fill-mask", model=f"{notebook_path_prefix}_lm", tokenizer=f"{notebook_path_prefix}_lm")

loading configuration file roberta_lm_lm\config.json
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 512,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 8,
  "num_hidden_layers": 6,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.11.3",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 16000
}

loading configuration file roberta_lm_lm\config.json
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropou

In [36]:
# Smoke test: display the pipeline's scored candidates for the masked position.
fill_mask("Hello <mask>.")

[{'sequence': 'Hello II.',
  'score': 0.10519658774137497,
  'token': 1064,
  'token_str': ' II'},
 {'sequence': 'Hello Commission.',
  'score': 0.05819733068346977,
  'token': 484,
  'token_str': ' Commission'},
 {'sequence': 'Hellotices.',
  'score': 0.05515122041106224,
  'token': 2233,
  'token_str': 'tices'},
 {'sequence': 'Hello speech.',
  'score': 0.02830634079873562,
  'token': 15420,
  'token_str': ' speech'},
 {'sequence': 'Hello all.',
  'score': 0.02700468897819519,
  'token': 673,
  'token_str': ' all'}]