In [None]:
import warnings
warnings.filterwarnings(action='ignore')
os.environ["WANDB_DISABLED"] = "true"
import pandas as pd 
import numpy as np 
from transformers import BertConfig, BertForMaskedLM, DataCollatorForWholeWordMask,\
    BertTokenizer, TrainingArguments, Trainer
from src.train_utils import set_seed,TrainParams, get_torch_device
from dataset import data_loader, SeqMlmDataset
import torch 
import math

In [6]:
tp = TrainParams(
    log_steps = 10,
    save_steps = 50,
    epoch_size= 10,
    max_seq_len=512,
    batch_size=12,
    pretrain_model = 'hfl/chinese-roberta-wwm-ext',
    max_to_save=3
) 
device = get_torch_device()
set_seed()

There are 1 GPU(s) available.
Device name: Tesla P100-PCIE-16GB


## Task Adaptive Continue Train

In [10]:

tokenizer = BertTokenizer.from_pretrained(tp.pretrain_model, do_lower_case=True)
train_dataset = SeqMlmDataset(data_loader('./trainsample/train_mlm.txt'), tp.max_seq_len, tokenizer)
training_args = TrainingArguments(
    output_dir='./checkpoint/tapt',
    overwrite_output_dir=True,
    num_train_epochs=tp.epoch_size,
    per_device_train_batch_size=tp.batch_size,
    save_steps=tp.save_steps,
    save_total_limit=tp.max_to_save
)


model = BertForMaskedLM.from_pretrained(tp.pretrain_model).to(device)
data_collator = DataCollatorForWholeWordMask(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset
)

Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Some weights of the model checkpoint at hfl/chinese-roberta-wwm-ext were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [11]:
train_result = trainer.train()
output_train_file = os.path.join(training_args.output_dir, "train_results.txt")


***** Running training *****
  Num examples = 9962
  Num Epochs = 10
  Instantaneous batch size per device = 12
  Total train batch size (w. parallel, distributed & accumulation) = 12
  Gradient Accumulation steps = 1
  Total optimization steps = 8310
The following columns in the training set don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: chinese_ref. If chinese_ref are not expected by `BertForMaskedLM.forward`,  you can safely ignore this message.


Step,Training Loss
500,0.5052
1000,0.4635
1500,0.4336
2000,0.4252
2500,0.3928
3000,0.3926
3500,0.3594
4000,0.3654
4500,0.3473
5000,0.3466


Saving model checkpoint to ./checkpoint/tapt/checkpoint-50
Configuration saved in ./checkpoint/tapt/checkpoint-50/config.json
Model weights saved in ./checkpoint/tapt/checkpoint-50/pytorch_model.bin
Saving model checkpoint to ./checkpoint/tapt/checkpoint-100
Configuration saved in ./checkpoint/tapt/checkpoint-100/config.json
Model weights saved in ./checkpoint/tapt/checkpoint-100/pytorch_model.bin
Saving model checkpoint to ./checkpoint/tapt/checkpoint-150
Configuration saved in ./checkpoint/tapt/checkpoint-150/config.json
Model weights saved in ./checkpoint/tapt/checkpoint-150/pytorch_model.bin
Saving model checkpoint to ./checkpoint/tapt/checkpoint-200
Configuration saved in ./checkpoint/tapt/checkpoint-200/config.json
Model weights saved in ./checkpoint/tapt/checkpoint-200/pytorch_model.bin
Deleting older checkpoint [checkpoint/tapt/checkpoint-50] due to args.save_total_limit
Saving model checkpoint to ./checkpoint/tapt/checkpoint-250
Configuration saved in ./checkpoint/tapt/checkpo

In [12]:
trainer.save_model('tapt_10epoch')

Saving model checkpoint to tapt_10epoch
Configuration saved in tapt_10epoch/config.json
Model weights saved in tapt_10epoch/pytorch_model.bin


## Multitask finetune

In [14]:
import collections 
from itertools import chain
import torch 
import time 

from torch.utils.data.dataset import Dataset
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data.sampler import SequentialSampler, RandomSampler
from torch.utils.data.dataloader import DataLoader
from torch.utils.tensorboard import SummaryWriter
from torch.nn import CrossEntropyLoss

from src.train_utils import set_seed, ModelSave, get_torch_device, EarlyStop, TrainParams
from src.evaluation import binary_cls_report, classification_inference
from src.metric import  binary_cls_metrics, binary_cls_log

from models import BertClassifier,BertMtl
from dataset import SeqPairMtlDataset, data_loader
from evaluation import overall_f1
import transformers 
transformers.logging.set_verbosity_error()
from transformers import BertTokenizer,AdamW, get_linear_schedule_with_warmup
device = get_torch_device()
set_seed()

There are 1 GPU(s) available.
Device name: Tesla P100-PCIE-16GB


In [15]:
tp = TrainParams(
    log_steps = 10,
    save_steps = 10000,
    epoch_size=20,
    loss_fn=nn.CrossEntropyLoss(),
    max_seq_len=512,
    batch_size=20,
    lr=5e-6,
    weight_decay=0.0,
    epsilon=1e-6,
    warmup_steps=100,
    dropout_rate=0.5,
    label_size=2,
    gradient_clip=1.0,
    hidden_s=200,
    hidden_e=200,
    early_stop_params = {
        'monitor':'f1',
        'mode':'max',
        'min_delta': 0,
        'patience':3,
        'verbose':False
    },
    tokenizer = 'hfl/chinese-roberta-wwm-ext', 
    pretrain_model = './checkpoint/tapt_10epoch',
    continue_train=False
)

In [16]:
tokenizer = BertTokenizer.from_pretrained('hfl/chinese-roberta-wwm-ext', do_lower_case=True)
special_tokens_dict = {'additional_special_tokens':['[t]','[c]','[o]','[e]']}
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
print('We have added', num_added_toks, 'tokens')

train_dataset = SeqPairMtlDataset(data_loader('./trainsample/train4.txt'), tp.max_seq_len, tokenizer)
valid_dataset = SeqPairMtlDataset(data_loader('./trainsample/valid4.txt'), tp.max_seq_len, tokenizer)
train_sampler = RandomSampler(train_dataset)
valid_sampler = SequentialSampler(valid_dataset)
train_loader = DataLoader(train_dataset, sampler=train_sampler, batch_size=tp.batch_size)
valid_loader = DataLoader(valid_dataset, sampler=valid_sampler, batch_size=tp.batch_size)

We have added 4 tokens


In [57]:
tp.update({'num_train_steps': int(len(train_loader)*tp.epoch_size)})

CKPT = './checkpoint/single_task_bert5'
saver = ModelSave(CKPT, continue_train=False)
saver.init()
es = EarlyStop(**tp.early_stop_params)
global_step = 0
tb = SummaryWriter(CKPT)

model = BertMtl(tp)
model.bert.resize_token_embeddings(len(tokenizer))

model.to(device)
optimizer, scheduler = model.get_optimizer()

./checkpoint/single_task_bert5 model cleaned


In [60]:
for epoch_i in range(1, tp['epoch_size']):
    print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10}  | {'Elapsed':^9}")
    print("-"*60)

    # Measure the elapsed time of each epoch
    t0_epoch, t0_batch = time.time(), time.time()
    total_loss, batch_loss, batch_counts = 0, 0, 0

    model.train()
    for step, batch in enumerate(train_loader):
        global_step +=1
        batch_counts +=1

        #Forward propogate
        model.zero_grad()
        feature = {k:v.to(device) for k, v in batch.items()}
        logits = model(feature)
        loss = model.compute_loss(feature, logits)
        batch_loss += loss.item()
        total_loss += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), tp.gradient_clip)
        optimizer.step()
        scheduler.step()
        # Log steps for train loss logging
        if (step % tp.log_steps == 0 and step != 0) or (step == len(train_loader) - 1):
            time_elapsed = time.time() - t0_batch
            print(f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {'-':^9} | {time_elapsed:^9.2f}")
            tb.add_scalar('loss/batch_train', batch_loss / batch_counts, global_step=global_step)
            batch_loss, batch_counts = 0, 0
            t0_batch = time.time()

        # Save steps for ckpt saving and dev evaluation
        if (step % tp.save_steps == 0 and step != 0) or (step == len(train_loader) - 1):
            val_metrics = binary_cls_metrics(model, valid_loader, device, label_name='label1')
            for key, val in val_metrics.items():
                tb.add_scalar(f'metric/{key}', val, global_step=global_step)
            avg_train_loss = total_loss / step
            tb.add_scalars('loss/train_valid',{'train': avg_train_loss,
                                                'valid': val_metrics['val_loss']}, global_step=global_step)
            saver(total_loss / step, val_metrics['val_loss'], epoch_i, global_step, model, optimizer, scheduler)

    # On Epoch End: calcualte train & valid loss and log overall metrics
    time_elapsed = time.time() - t0_epoch
    val_metrics = binary_cls_metrics(model, valid_loader, device, label_name='label1')
    avg_train_loss = total_loss / step

    print("-"*70)
    print(f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {val_metrics['val_loss']:^10.6f} | {time_elapsed:^9.2f}")
    binary_cls_log(epoch_i, val_metrics)
    print("\n")
    if es.check(val_metrics):
        break 

 Epoch  |  Batch  |  Train Loss  |  Val Loss   |  Elapsed 
------------------------------------------------------------
   2    |   10    |   0.398738   |     -     |   11.32  
   2    |   20    |   0.380345   |     -     |   10.23  
   2    |   30    |   0.298078   |     -     |   10.23  
   2    |   40    |   0.366752   |     -     |   10.22  
   2    |   50    |   0.356135   |     -     |   10.24  
   2    |   60    |   0.323243   |     -     |   10.24  
   2    |   70    |   0.345801   |     -     |   10.22  
   2    |   80    |   0.360564   |     -     |   10.22  
   2    |   90    |   0.326366   |     -     |   10.28  
   2    |   100   |   0.280422   |     -     |   10.22  
   2    |   110   |   0.407383   |     -     |   10.22  
   2    |   120   |   0.323768   |     -     |   10.26  
   2    |   130   |   0.315418   |     -     |   10.26  
   2    |   140   |   0.342691   |     -     |   10.23  
   2    |   150   |   0.392780   |     -     |   10.23  
   2    |   160   |   0.3

In [61]:
result = classification_inference(model, valid_loader, device)
valid = pd.read_csv('./trainsample/valid.csv')
valid['pred'] = result['pred']
valid['prob'] = result['prob']
valid.loc[:,['id','single_entity','pred','prob']].to_csv('valid5.csv')
overall_f1(valid)

{'f1_s': 0.9561561561561561,
 'f1_e': 0.9499736703528172,
 'f1': 0.9524466646741527}