In [2]:
from transformers import BertConfig, BertForMaskedLM, DataCollatorForWholeWordMask,\
    BertTokenizer, TrainingArguments, Trainer
from src.train_utils import set_seed,TrainParams, get_torch_device
from dataset import data_loader, SeqMlmDataset
import torch 
import math

import warnings
warnings.filterwarnings(action='ignore')
os.environ["WANDB_DISABLED"] = "true"

In [3]:
tp = TrainParams(
    log_steps = 10,
    save_steps = 50,
    epoch_size=20,
    max_seq_len=512,
    batch_size=12,
    pretrain_model = 'hfl/chinese-roberta-wwm-ext',
    max_to_save=3
) 
device = get_torch_device()
set_seed()

There are 1 GPU(s) available.
Device name: Tesla P100-PCIE-16GB


In [8]:
tokenizer = BertTokenizer.from_pretrained(tp.pretrain_model, do_lower_case=True)

train_dataset = SeqMlmDataset(data_loader('./trainsample/train_mlm.txt'), tp.max_seq_len, tokenizer)
valid_dataset = SeqMlmDataset(data_loader('./trainsample/valid_mlm.txt'), tp.max_seq_len, tokenizer)

Downloading:   0%|          | 0.00/107k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/19.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/689 [00:00<?, ?B/s]

In [9]:
training_args = TrainingArguments(
    output_dir='./checkpoint/tapt',
    overwrite_output_dir=True,
    num_train_epochs=tp.epoch_size,
    per_device_train_batch_size=tp.batch_size,
    save_steps=tp.save_steps,
    save_total_limit=tp.max_to_save
)


model = BertForMaskedLM.from_pretrained(tp.pretrain_model).to(device)
data_collator = DataCollatorForWholeWordMask(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset
)

Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Downloading:   0%|          | 0.00/393M [00:00<?, ?B/s]

Some weights of the model checkpoint at hfl/chinese-roberta-wwm-ext were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
***** Running training *****
  Num examples = 3984
  Num Epochs = 20
  Instantaneous batch size per device = 12
  Total train batch size (w. parallel, distributed & accumulation) = 12
  Gradient Accumulation steps = 1
  Total optimization steps = 6640
The following columns in the training set don't have a corresponding argument in `BertForMaskedLM.forward

Step,Training Loss
500,0.5165
1000,0.4414
1500,0.4023
2000,0.3847
2500,0.3724
3000,0.3465
3500,0.3273
4000,0.325
4500,0.3022
5000,0.2927


Saving model checkpoint to ./checkpoint/tapt/checkpoint-50
Configuration saved in ./checkpoint/tapt/checkpoint-50/config.json
Model weights saved in ./checkpoint/tapt/checkpoint-50/pytorch_model.bin
Saving model checkpoint to ./checkpoint/tapt/checkpoint-100
Configuration saved in ./checkpoint/tapt/checkpoint-100/config.json
Model weights saved in ./checkpoint/tapt/checkpoint-100/pytorch_model.bin
Saving model checkpoint to ./checkpoint/tapt/checkpoint-150
Configuration saved in ./checkpoint/tapt/checkpoint-150/config.json
Model weights saved in ./checkpoint/tapt/checkpoint-150/pytorch_model.bin
Saving model checkpoint to ./checkpoint/tapt/checkpoint-200
Configuration saved in ./checkpoint/tapt/checkpoint-200/config.json
Model weights saved in ./checkpoint/tapt/checkpoint-200/pytorch_model.bin
Deleting older checkpoint [checkpoint/tapt/checkpoint-50] due to args.save_total_limit
Saving model checkpoint to ./checkpoint/tapt/checkpoint-250
Configuration saved in ./checkpoint/tapt/checkpo

NameError: name 'logger' is not defined

In [15]:
output_train_file = os.path.join(training_args.output_dir, "train_results.txt")

with open(output_train_file, "w") as writer:
    for key, value in sorted(train_result.metrics.items()):
        writer.write(f"{key} = {value}\n")


eval_result = trainer.evaluate() 
result = {}
output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")
perplexity = math.exp(eval_result["eval_loss"])
result["perplexity"] = perplexity
with open(output_eval_file, "w") as writer:
    for key, value in sorted(result.items()):
        writer.write(f"{key} = {value}\n")

***** Running Evaluation *****
  Num examples = 996
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: chinese_ref. If chinese_ref are not expected by `BertForMaskedLM.forward`,  you can safely ignore this message.


In [16]:
trainer.save_model('tapt_20epoch')

Saving model checkpoint to tapt_20epoch
Configuration saved in tapt_20epoch/config.json
Model weights saved in tapt_20epoch/pytorch_model.bin
