In [1]:
# Reference: https://medium.com/analytics-vidhya/byolm-32d728efbf21

In [2]:
import os
import torch
from tokenizers import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

In [3]:
print(torch.__version__)
torch.cuda.is_available()

1.7.0+cu101


True

In [4]:
base_dir = '../'

model_dir = base_dir + 'model/'
bert_model_dir = model_dir + 'newsBERT/'
tokenizer_mdl = model_dir + 'kn_tokenizer'
if not os.path.exists(tokenizer_mdl):
    os.makedirs(tokenizer_mdl, exist_ok=True)
    
training_file = base_dir + 'data/news-data.txt'

In [5]:
VOCAB_SIZE = 52000
MAX_POS_EMB = 514

In [6]:
# Train and save the tokenizer
kn_tokenizer = ByteLevelBPETokenizer()
print('Saving tokenizer at:', training_file)
kn_tokenizer.train(files=training_file, vocab_size=VOCAB_SIZE, min_frequency=2, special_tokens=["<s>","<pad>","</s>","<unk>","<mask>"])
kn_tokenizer.save_model(tokenizer_mdl)

Saving tokenizer at: ../data/news-data.txt


['../model/kn_tokenizer/vocab.json', '../model/kn_tokenizer/merges.txt']

In [7]:
# Fine-tune the pre-trained BERT model using RoBERTa configurations
from transformers import RobertaConfig, RobertaForMaskedLM
config = RobertaConfig(vocab_size=VOCAB_SIZE,
    max_position_embeddings=MAX_POS_EMB,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1)

# Load the model
roberta_model = RobertaForMaskedLM(config)

In [8]:
# # Initialize the tokenizer
# from tokenizers import ByteLevelBPETokenizer
# kn_tokenizer = ByteLevelBPETokenizer(tokenizer_mdl + "/vocab.json", tokenizer_mdl + "/merges.txt")
# kn_tokenizer._tokenizer.post_processor = BertProcessing(
#     ("</s>", kn_tokenizer.token_to_id("</s>")),
#     ("<s>", kn_tokenizer.token_to_id("<s>")),
# )
# kn_tokenizer.enable_truncation(max_length=MAX_POS_EMB)
# kn_tokenizer.enable_padding()

# kn_tokenizer._pad_token = '<PAD>'
# kn_tokenizer.mask_token = '<mask>'

In [9]:
# Load the trained tokenizer
from transformers import RobertaTokenizer
roberta_tokenizer = RobertaTokenizer(tokenizer_mdl + "/vocab.json", tokenizer_mdl + "/merges.txt")

In [10]:
# # Define a custom dataset loader
# from torch.utils.data import Dataset
# class CustomDataset(Dataset):
#     def __init__(self, tokenizer, data_file_paths, block_size=512):
#         self.tokenizer = tokenizer
#         # Load the data
#         self.sentences = []
#         for data_file in data_file_paths:
#             with open(data_file, encoding="utf-8") as f:
#                 self.sentences += [line for line in f.read().splitlines() if (len(line) > 0 and not line.isspace())]
#         self.sentences = tokenizer.encode_batch(self.sentences)
            
#     def __len__(self):
#         return len(self.sentences)

#     def __getitem__(self, i):
#         # Return the tokenized sentence as tensor
#         return torch.tensor(self.sentences[i].ids, dtype=torch.long)
    
#dataset = CustomDataset(roberta_tokenizer, data_file_paths=[training_file])

In [11]:
# Dataset loaders
from transformers import DataCollatorForLanguageModeling
from transformers import LineByLineTextDataset

dataset = LineByLineTextDataset(tokenizer=roberta_tokenizer, file_path=training_file, block_size=128)
data_collator = DataCollatorForLanguageModeling(tokenizer=roberta_tokenizer, mlm=True, mlm_probability=0.15)

In [12]:
# Start training the language model
from transformers import Trainer, TrainingArguments

EPOCH=200
BATCH_SIZE=48 #decrease this number for out of memory issues

training_args = TrainingArguments(
    output_dir=bert_model_dir, #dir to save the model
    overwrite_output_dir=True,
    num_train_epochs=EPOCH,
    per_gpu_train_batch_size=BATCH_SIZE,
    save_steps=1000,
    save_total_limit=2)

#define trainer object with above training args
trainer = Trainer(model=roberta_model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
    tokenizer=roberta_tokenizer,
    prediction_loss_only=True)



In [16]:
torch.cuda.get_device_name(0)

'GeForce RTX 2080 Ti'

In [18]:
# save the model summary
from torch.utils.tensorboard import SummaryWriter
# default `log_dir` is "runs" - we'll be more specific here
writer = SummaryWriter('runs/model_summary')

#To create the tensor on GPU
cuda0 = torch.device('cuda')

# Sample input
tok_in = roberta_tokenizer.encode(["ತೆರೆಯಮೇಲೆ ಯಜನಮಾನನ <mask>"])
in_tensor = torch.tensor([tok_in], dtype=torch.long, device=cuda0)

# Add the graph
writer.add_graph(roberta_model, in_tensor)
writer.close()

In [None]:
#Train and save the model 
trainer.train()
trainer.save_model(bert_model_dir)

Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.


Step,Training Loss
500,4.764899
1000,3.747816
1500,3.455454
2000,3.165751
2500,2.843078
3000,2.590426


In [None]:
for parameter in roberta_model.parameters():
    print(parameter)