In [1]:
# Train a ROBERTA from scratch

In [2]:
import os
import torch
from tokenizers import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

In [3]:
print(torch.__version__)
torch.cuda.is_available()

1.7.0+cu101


True

In [4]:
base_dir = '../'

model_dir = base_dir + 'model/'
bert_model_dir = model_dir + 'cc100_kn_ROBERTA_pt/'
tokenizer_mdl = model_dir + 'cc100_kn_tokenizer_pt'
if not os.path.exists(tokenizer_mdl):
    os.makedirs(tokenizer_mdl, exist_ok=True)
    
#training_file = base_dir + 'data/news-data.txt'
# Bigger file
training_file = base_dir + '../cc-100_dataset/kn.txt'

ROBERTA_PRETRAINED = 'roberta-base' #125M params
#ROBERTA_PRETRAINED = 'distilroberta-base' #80M params

In [11]:
VOCAB_SIZE = 50265
MAX_POS_EMB = 514

In [12]:
IS_TRAIN_TOKENIZER = False

In [13]:
# Train and save the tokenizer
if IS_TRAIN_TOKENIZER:
    kn_tokenizer = ByteLevelBPETokenizer()
    print('Saving tokenizer at:', tokenizer_mdl)
    kn_tokenizer.train(files=training_file, vocab_size=VOCAB_SIZE, min_frequency=2, special_tokens=["<s>","<pad>","</s>","<unk>","<mask>"])
    kn_tokenizer.save_model(tokenizer_mdl)
else:
    print('NOT Training the tokenizer now!')

Saving tokenizer at: ../../cc-100_dataset/kn.txt


In [14]:
# Fine-tune the pre-trained BERT model using RoBERTa configurations
from transformers import RobertaConfig, RobertaForMaskedLM
config = RobertaConfig(vocab_size=VOCAB_SIZE,
    max_position_embeddings=MAX_POS_EMB,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1)

# Load the model
roberta_model = RobertaForMaskedLM.from_pretrained(ROBERTA_PRETRAINED, config=config)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForMaskedLM: ['roberta.encoder.layer.6.attention.self.query.weight', 'roberta.encoder.layer.6.attention.self.query.bias', 'roberta.encoder.layer.6.attention.self.key.weight', 'roberta.encoder.layer.6.attention.self.key.bias', 'roberta.encoder.layer.6.attention.self.value.weight', 'roberta.encoder.layer.6.attention.self.value.bias', 'roberta.encoder.layer.6.attention.output.dense.weight', 'roberta.encoder.layer.6.attention.output.dense.bias', 'roberta.encoder.layer.6.attention.output.LayerNorm.weight', 'roberta.encoder.layer.6.attention.output.LayerNorm.bias', 'roberta.encoder.layer.6.intermediate.dense.weight', 'roberta.encoder.layer.6.intermediate.dense.bias', 'roberta.encoder.layer.6.output.dense.weight', 'roberta.encoder.layer.6.output.dense.bias', 'roberta.encoder.layer.6.output.LayerNorm.weight', 'roberta.encoder.layer.6.output.LayerNorm.bias', 'roberta.encoder.layer.7.attention.self.query.

In [15]:
# Load the trained tokenizer
from transformers import RobertaTokenizer
roberta_tokenizer = RobertaTokenizer.from_pretrained(tokenizer_mdl)

In [16]:
# Dataset loaders
from transformers import DataCollatorForLanguageModeling
from transformers import LineByLineTextDataset

dataset = LineByLineTextDataset(tokenizer=roberta_tokenizer, file_path=training_file, block_size=128)
data_collator = DataCollatorForLanguageModeling(tokenizer=roberta_tokenizer, mlm=True, mlm_probability=0.15)

In [18]:
# Start training the language model
from transformers import Trainer, TrainingArguments

EPOCH=50
BATCH_SIZE=48 #decrease this number for out of memory issues

training_args = TrainingArguments(
    output_dir=bert_model_dir, #dir to save the model
    overwrite_output_dir=True,
    num_train_epochs=EPOCH,
    per_gpu_train_batch_size=BATCH_SIZE,
    max_steps=(EPOCH * 13000000), # EPOCH * Train size
    save_steps=10000,
    save_total_limit=2)

#define trainer object with above training args
trainer = Trainer(model=roberta_model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
    tokenizer=roberta_tokenizer,
    prediction_loss_only=True)



In [19]:
print('Number of parameters:', roberta_model.num_parameters())

Number of parameters: 82170201


In [20]:
# save the model summary
from torch.utils.tensorboard import SummaryWriter
# default `log_dir` is "runs" - we'll be more specific here
writer = SummaryWriter('runs/model_summary')

#To create the tensor on GPU
cuda0 = torch.device('cuda')

# Sample input
tok_in = roberta_tokenizer.encode(["ತೆರೆ ಮೇಲೆ ಯಜಮಾನನ <mask>"])
in_tensor = torch.tensor([tok_in], dtype=torch.long, device=cuda0)

# Add the graph
writer.add_graph(roberta_model, in_tensor)
writer.close()

  input_tensor.shape == tensor_shape for input_tensor in input_tensors


In [None]:
#Train and save the model 
trainer.train()
trainer.save_model(bert_model_dir)

Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.


Step,Training Loss
500,3.786863
1000,2.964228
1500,2.658411
2000,2.458144
2500,2.307055
3000,2.20337
3500,2.100064
4000,2.022754
4500,1.94975
5000,1.89057


IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceed