In [1]:
# Train a ROBERTA from scratch

In [2]:
import os
import torch
from tokenizers import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

In [3]:
print(torch.__version__)
torch.cuda.is_available()

1.7.0+cu101


True

In [4]:
base_dir = '../'

model_dir = base_dir + 'model/'
bert_model_dir = model_dir + 'cc100_kn_ROBERTA/'
tokenizer_mdl = model_dir + 'cc100_kn_tokenizer'
if not os.path.exists(tokenizer_mdl):
    os.makedirs(tokenizer_mdl, exist_ok=True)
    
#training_file = base_dir + 'data/news-data.txt'
# Bigger file
training_file = base_dir + '../cc-100_dataset/kn.txt'

In [5]:
VOCAB_SIZE = 52000
MAX_POS_EMB = 514

In [6]:
IS_TRAIN_TOKENIZER = False

In [7]:
# Train and save the tokenizer
if IS_TRAIN_TOKENIZER:
    kn_tokenizer = ByteLevelBPETokenizer()
    print('Saving tokenizer at:', training_file)
    kn_tokenizer.train(files=training_file, vocab_size=VOCAB_SIZE, min_frequency=2, special_tokens=["<s>","<pad>","</s>","<unk>","<mask>"])
    kn_tokenizer.save_model(tokenizer_mdl)
else:
    print('NOT Training the tokenizer now!')

NOT Training the tokenizer now!


In [8]:
# Fine-tune the pre-trained BERT model using RoBERTa configurations
from transformers import RobertaConfig, RobertaForMaskedLM
config = RobertaConfig(vocab_size=VOCAB_SIZE,
    max_position_embeddings=MAX_POS_EMB,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1)

# Load the model
roberta_model = RobertaForMaskedLM(config)

In [9]:
# Load the trained tokenizer
from transformers import RobertaTokenizer
roberta_tokenizer = RobertaTokenizer.from_pretrained(tokenizer_mdl)

In [10]:
# Dataset loaders
from transformers import DataCollatorForLanguageModeling
from transformers import LineByLineTextDataset

dataset = LineByLineTextDataset(tokenizer=roberta_tokenizer, file_path=training_file, block_size=128)
data_collator = DataCollatorForLanguageModeling(tokenizer=roberta_tokenizer, mlm=True, mlm_probability=0.15)

In [11]:
# Start training the language model
from transformers import Trainer, TrainingArguments

EPOCH=50
BATCH_SIZE=48 #decrease this number for out of memory issues

training_args = TrainingArguments(
    output_dir=bert_model_dir, #dir to save the model
    overwrite_output_dir=True,
    num_train_epochs=EPOCH,
    per_gpu_train_batch_size=BATCH_SIZE,
    save_steps=1000,
    save_total_limit=2)

#define trainer object with above training args
trainer = Trainer(model=roberta_model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
    tokenizer=roberta_tokenizer,
    prediction_loss_only=True)



In [12]:
print('Number of parameters:', roberta_model.num_parameters())

Number of parameters: 83504416


In [13]:
# save the model summary
from torch.utils.tensorboard import SummaryWriter
# default `log_dir` is "runs" - we'll be more specific here
writer = SummaryWriter('runs/model_summary')

#To create the tensor on GPU
cuda0 = torch.device('cuda')

# Sample input
tok_in = roberta_tokenizer.encode(["ತೆರೆ ಮೇಲೆ ಯಜಮಾನನ <mask>"])
in_tensor = torch.tensor([tok_in], dtype=torch.long, device=cuda0)

# Add the graph
writer.add_graph(roberta_model, in_tensor)
writer.close()

  input_tensor.shape == tensor_shape for input_tensor in input_tensors


In [None]:
#Train and save the model 
trainer.train()
trainer.save_model(bert_model_dir)

Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.


Step,Training Loss
500,4.94405
1000,4.181748
1500,3.882671
2000,3.676838
2500,3.515695
3000,3.369197
3500,3.201846
4000,3.024055
4500,2.850354
5000,2.715285
