In [1]:
import torch

from transformers import RobertaConfig
from transformers import RobertaForMaskedLM
from transformers import LineByLineTextDataset
from transformers import Trainer, TrainingArguments
from transformers import DataCollatorForLanguageModeling
from transformers import AutoModelForCausalLM

from utils.tokenizer import get_tokenizer

import os
import glob
import warnings

In [2]:
os.environ['TOKENIZERS_PARALLELISM'] = 'true'

In [3]:
##Get the checkpoint with the highest number of steps
checkpoint_path = 'data/model/custom-lm/'
model_checkpoints = glob.glob(os.path.join(checkpoint_path, 'checkpoint-*'))
latest_checkpoint = 'checkpoint-' + max([checkpoint.split('/')[-1].split('-')[-1] for checkpoint in model_checkpoints])

latest_checkpoint_path = os.path.join(checkpoint_path, latest_checkpoint)

if os.path.exists(latest_checkpoint_path) == False:
    raise ValueError("Cannot find the latest checkpoint path. Bug in code.")

In [4]:
tokenizer = get_tokenizer('data/tokenizer/trained_tokenizer.json')

In [5]:
config = RobertaConfig(vocab_size = 1000, 
                       max_position_embeddings = 514, 
                       num_attention_heads = 12, 
                       num_hidden_layers = 6, 
                       type_vocab_size = 1)

In [6]:
model = RobertaForMaskedLM(config)

In [7]:
dataset = LineByLineTextDataset(tokenizer = tokenizer, 
                                file_path = "data/internal/train_lm.txt", 
                                block_size=128)



In [8]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm = True, mlm_probability = 0.15)

In [9]:
training_args = TrainingArguments(output_dir = 'data/model/custom-lm', 
                                  overwrite_output_dir = False, 
                                  num_train_epochs = 116, 
                                  per_device_train_batch_size = 512, 
                                  save_steps = 10_000, 
                                  save_total_limit = 2, 
                                  prediction_loss_only=True, 
                                  dataloader_num_workers = 2,
                                  bf16 = True)

In [10]:
trainer = Trainer(model = model, 
                  args = training_args, 
                  data_collator = data_collator, 
                  train_dataset = dataset)

Using amp half precision backend


In [11]:
trainer.train(latest_checkpoint_path)

Loading model from data/model/custom-lm/checkpoint-190000).
***** Running training *****
  Num examples = 1530382
  Num Epochs = 116
  Instantaneous batch size per device = 512
  Total train batch size (w. parallel, distributed & accumulation) = 512
  Gradient Accumulation steps = 1
  Total optimization steps = 346840
  Continuing training from checkpoint, will skip to saved global_step
  Continuing training from epoch 63
  Continuing training from global step 190000
  Will skip the first 63 epochs then the first 1630 batches in the first epoch. If this takes a lot of time, you can add the `--ignore_data_skip` flag to your launch command, but you will resume the training on data already seen by your model.


HBox(children=(FloatProgress(value=0.0, max=1630.0), HTML(value='')))




Step,Training Loss
190500,0.8563
191000,0.8559
191500,0.8516
192000,0.8531
192500,0.8541
193000,0.8514
193500,0.8545
194000,0.856
194500,0.8517
195000,0.8545


Saving model checkpoint to data/model/custom-lm/checkpoint-200000
Configuration saved in data/model/custom-lm/checkpoint-200000/config.json
Model weights saved in data/model/custom-lm/checkpoint-200000/pytorch_model.bin
Deleting older checkpoint [data/model/custom-lm/checkpoint-180000] due to args.save_total_limit
Saving model checkpoint to data/model/custom-lm/checkpoint-210000
Configuration saved in data/model/custom-lm/checkpoint-210000/config.json
Model weights saved in data/model/custom-lm/checkpoint-210000/pytorch_model.bin
Deleting older checkpoint [data/model/custom-lm/checkpoint-190000] due to args.save_total_limit
Saving model checkpoint to data/model/custom-lm/checkpoint-220000
Configuration saved in data/model/custom-lm/checkpoint-220000/config.json
Model weights saved in data/model/custom-lm/checkpoint-220000/pytorch_model.bin
Deleting older checkpoint [data/model/custom-lm/checkpoint-200000] due to args.save_total_limit
Saving model checkpoint to data/model/custom-lm/chec

TrainOutput(global_step=346840, training_loss=0.35650673286544454, metrics={'train_runtime': 32408.3525, 'train_samples_per_second': 5477.733, 'train_steps_per_second': 10.702, 'total_flos': 2.2651898525182333e+18, 'train_loss': 0.35650673286544454, 'epoch': 116.0})

In [12]:
import torch

In [13]:
seq = "This is "

In [46]:
batch_size = 2
inpts = torch.Tensor([tokenizer.bos_token_id]).repeat(batch_size).to("cuda")


In [71]:
seq_tokens = tokenizer("this is", return_tensors="pt", return_attention_mask=False, return_token_type_ids=False).to("cuda")

In [74]:
seq_tokens

{'input_ids': tensor([[  1, 200, 114,   2]], device='cuda:0')}

In [73]:
with torch.no_grad():
    a = model.generate(seq_tokens["input_ids"], 
                       max_length = 10,    
                       num_beams=5, 
                       no_repeat_ngram_size=2, 
                       num_return_sequences=5, 
                       early_stopping=True
)
    
    print(a)
    print(tokenizer.batch_decode(a))

tensor([[  1, 200, 114,   2,   2,   1,   1,   1],
        [  1, 200, 114,   2,  41,   2,   1,   1],
        [  1, 200, 114,   2,  41,   8,   8,   2],
        [  1, 200, 114,   2,  41,  18,  18,   2],
        [  1, 200, 114,   2,  41,   8, 362,   2]], device='cuda:0')
['[BOS] this is [EOS] [EOS] [BOS] [BOS] [BOS]', '[BOS] this is [EOS] n [EOS] [BOS] [BOS]', '[BOS] this is [EOS] n!! [EOS]', '[BOS] this is [EOS] n.. [EOS]', '[BOS] this is [EOS] n! bor [EOS]']


In [48]:
# with torch.no_grad():
#     logits = model(inpts).logits[:, -1,:]

ValueError: not enough values to unpack (expected 2, got 1)

In [33]:
inpts['input_ids']

tensor([[  1, 200, 114,   2]], device='cuda:0')

In [41]:
tokenizer.batch_decode(inpts['input_ids'])

['[BOS] this is [EOS]']

In [29]:
logits.shape

torch.Size([1, 1000])