In [1]:
import torch

from transformers import RobertaConfig
from transformers import RobertaForMaskedLM
from transformers import LineByLineTextDataset
from transformers import Trainer, TrainingArguments
from transformers import DataCollatorForLanguageModeling
from transformers import AutoModelForCausalLM

from utils.tokenizer import get_tokenizer

import os
import glob
import warnings

In [2]:
# Export TOKENIZERS_PARALLELISM='true' into this process's environment.
# NOTE(review): presumably read by the Hugging Face `tokenizers` backend to
# decide whether to tokenize in parallel -- confirm against the library docs.
os.environ.update(TOKENIZERS_PARALLELISM='true')

In [3]:
## Get the checkpoint with the highest number of steps
# Directory in which the training run writes its `checkpoint-<step>` folders.
checkpoint_path = 'data/model/custom-lm/'
model_checkpoints = glob.glob(os.path.join(checkpoint_path, 'checkpoint-*'))

# Map numeric step -> checkpoint directory name.
# The step suffix must be compared as an integer: comparing the raw strings is
# lexicographic, so '90000' would beat '100000' and training would silently
# resume from a stale checkpoint once the step count gains a digit.
checkpoint_steps = {}
for checkpoint in model_checkpoints:
    checkpoint_name = os.path.basename(os.path.normpath(checkpoint))
    step_suffix = checkpoint_name.rsplit('-', 1)[-1]
    # Skip stray matches such as 'checkpoint-tmp' or plain files.
    if step_suffix.isdecimal() and os.path.isdir(checkpoint):
        checkpoint_steps[int(step_suffix)] = checkpoint_name

if not checkpoint_steps:
    raise ValueError(
        f"No 'checkpoint-<step>' directories found under {checkpoint_path!r}; "
        "there is nothing to resume training from."
    )

# Name and full path of the checkpoint with the largest step count. Every
# candidate was verified to be an existing directory above, so the path is
# known to exist at this point.
latest_checkpoint = checkpoint_steps[max(checkpoint_steps)]
latest_checkpoint_path = os.path.join(checkpoint_path, latest_checkpoint)

In [4]:
# Build the tokenizer from the serialized tokenizer file via the project
# helper `utils.tokenizer.get_tokenizer`.
tokenizer_file = 'data/tokenizer/trained_tokenizer.json'
tokenizer = get_tokenizer(tokenizer_file)

In [5]:
# Architecture settings for the RoBERTa model. Any option not listed here
# keeps the `RobertaConfig` default.
# NOTE(review): vocab_size is hard-coded; presumably it matches the trained
# tokenizer's vocabulary -- verify against the tokenizer file.
roberta_settings = {
    'vocab_size': 1000,
    'max_position_embeddings': 514,
    'num_attention_heads': 12,
    'num_hidden_layers': 6,
    'type_vocab_size': 1,
}
config = RobertaConfig(**roberta_settings)

In [6]:
# Instantiate a masked-language-model RoBERTa from the configuration above.
# No pretrained weights are requested here; the later `trainer.train(...)`
# call is given a checkpoint path to resume from.
model = RobertaForMaskedLM(config=config)

In [7]:
# Training corpus, read through `LineByLineTextDataset` with the tokenizer
# loaded earlier and a block size of 128.
train_corpus_file = "data/internal/train_lm.txt"
dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path=train_corpus_file,
    block_size=128,
)



In [8]:
# Collator for the masked-language-modelling objective (mlm=True) with a
# masking probability of 0.15.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

In [9]:
# Training run settings, collected in one mapping and expanded into
# `TrainingArguments`. The output directory is the same folder the
# checkpoint-discovery cell scans for `checkpoint-*` entries.
training_options = dict(
    output_dir='data/model/custom-lm',
    overwrite_output_dir=False,
    num_train_epochs=100,
    per_device_train_batch_size=512,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
    dataloader_num_workers=2,
    bf16=True,
)
training_args = TrainingArguments(**training_options)

In [10]:
# Wire the model, training arguments, collator and training dataset into a
# single `Trainer`. No evaluation dataset is supplied.
trainer_components = {
    'model': model,
    'args': training_args,
    'data_collator': data_collator,
    'train_dataset': dataset,
}
trainer = Trainer(**trainer_components)

Using amp half precision backend


In [None]:
# Start training, resuming from the checkpoint directory selected earlier in
# the notebook (`latest_checkpoint_path`).
trainer.train(resume_from_checkpoint=latest_checkpoint_path)

Loading model from data/model/custom-lm/checkpoint-70000).
***** Running training *****
  Num examples = 1530382
  Num Epochs = 100
  Instantaneous batch size per device = 512
  Total train batch size (w. parallel, distributed & accumulation) = 512
  Gradient Accumulation steps = 1
  Total optimization steps = 299000
  Continuing training from checkpoint, will skip to saved global_step
  Continuing training from epoch 23
  Continuing training from global step 70000
  Will skip the first 23 epochs then the first 1230 batches in the first epoch. If this takes a lot of time, you can add the `--ignore_data_skip` flag to your launch command, but you will resume the training on data already seen by your model.


HBox(children=(FloatProgress(value=0.0, max=1230.0), HTML(value='')))