In [4]:
import collections 
from itertools import chain
import torch 
import time 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import warnings
warnings.filterwarnings(action='ignore')

from torch.utils.data.dataset import Dataset
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data.sampler import SequentialSampler, RandomSampler
from torch.utils.data.dataloader import DataLoader
from torch.utils.tensorboard import SummaryWriter
from torch.nn import CrossEntropyLoss

## Project-local modules
from dataset import SeqLabelDataset
from models import BertCrf
from evaluation import islegal, aggregate_f1

from src.dataset.converter import data_loader 
from src.train_utils import set_seed, ModelSave, get_torch_device, EarlyStop, TrainParams
from src.evaluation import seqlabel_inference, seqlabel_report
from src.metric import  seq_tag_metrics, tag_cls_log
from src.seqlabel_utils import extract_entity, get_entity_bio


import transformers 
from transformers import BertTokenizer
# Lower the transformers library's log verbosity to error level so that
# informational and warning messages do not clutter the cell output.
transformers.logging.set_verbosity_error()
# Device handle used by the later cells for the model and for every batch tensor.
# NOTE(review): `set_seed` is imported above but never called in the visible
# cells — confirm whether seeding was intended for reproducible runs.
device = get_torch_device()

There are 1 GPU(s) available.
Device name: Tesla P100-PCIE-16GB


In [5]:
# Tag vocabulary for the single entity type (FIN). The reverse map and the
# label count are derived from it below, so the three values cannot drift
# apart when a tag is added or renamed.
LABEL2IDX = {'O': 0, 'B-FIN': 1, 'I-FIN': 2}
IDX2LABEL = {idx: label for label, idx in LABEL2IDX.items()}

# Base learning rate; the CRF learning rate is defined as 10x this value.
BASE_LR = 5e-5

# Single container for every hyper-parameter used by the cells below.
# Settings whose consumer is not visible in this notebook (loss_fn, lr, crf_lr,
# weight_decay, epsilon, warmup_steps, dropout_rate, label_size) are presumably
# read inside BertCrf, which receives this object — confirm against the model code.
tp = TrainParams(
    log_steps=10,          # train-loss logging interval (in batches) of the training loop
    save_steps=10000,      # mid-epoch evaluation / checkpoint interval (in batches)
    epoch_size=20,         # number of epochs the training loop iterates over
    max_seq_len=512,       # passed to SeqLabelDataset
    batch_size=20,         # passed to both DataLoaders
    loss_fn=nn.CrossEntropyLoss(),
    lr=BASE_LR,
    crf_lr=BASE_LR * 10,
    weight_decay=0.0,
    epsilon=1e-6,
    warmup_steps=100,
    dropout_rate=0.2,
    gradient_clip=5.0,     # max gradient norm given to clip_grad_norm_
    # Keyword arguments forwarded verbatim to EarlyStop(**...).
    early_stop_params={
        'monitor': 'f1_micro',
        'mode': 'max',
        'min_delta': 0,
        'patience': 3,
        'verbose': False
    },
    pretrain_model='bert-base-chinese',  # also used to load the tokenizer
    continue_train=False,
    label2idx=LABEL2IDX,
    idx2label=IDX2LABEL,
    label_size=len(LABEL2IDX),
    schema='BIO'
)

In [20]:
# One tokenizer, shared by the training and validation datasets.
tokenizer = BertTokenizer.from_pretrained(tp.pretrain_model, do_lower_case=True)

# Training split: read the BIO file, wrap it in a dataset, draw batches in random order.
train_samples = data_loader('/kaggle/input/finent/train_bio.txt')
train_dataset = SeqLabelDataset(train_samples, tokenizer, tp.max_seq_len, tp.label2idx)

# Validation split: same pipeline, but iterated in file order.
valid_samples = data_loader('/kaggle/input/finent/valid_bio.txt')
valid_dataset = SeqLabelDataset(valid_samples, tokenizer, tp.max_seq_len, tp.label2idx)

train_sampler = RandomSampler(train_dataset)
valid_sampler = SequentialSampler(valid_dataset)

train_loader = DataLoader(
    train_dataset,
    sampler=train_sampler,
    batch_size=tp.batch_size,
)
valid_loader = DataLoader(
    valid_dataset,
    sampler=valid_sampler,
    batch_size=tp.batch_size,
)

In [21]:
# Total number of optimizer steps for the whole run (batches per epoch x epochs),
# stored on the params object before the model is built from it.
# NOTE(review): presumably read when model.get_optimizer() builds the scheduler — confirm.
tp.update({'num_train_steps': int(len(train_loader) * tp.epoch_size)})

# Directory shared by the checkpoint saver and the TensorBoard writer.
CKPT = './checkpoint/bio_tag'
# Take the resume flag from the training params instead of repeating a literal,
# so changing `continue_train` in the config cell actually reaches the saver.
saver = ModelSave(CKPT, continue_train=tp.continue_train)
saver.init()
es = EarlyStop(**tp.early_stop_params)
# Counts optimizer steps across epochs; used as the x-axis for TensorBoard scalars.
global_step = 0
tb = SummaryWriter(CKPT)

model = BertCrf(tp)
model.to(device)
optimizer, scheduler = model.get_optimizer()



Downloading:   0%|          | 0.00/393M [00:00<?, ?B/s]

In [22]:
# Training loop: one pass over train_loader per epoch, with periodic train-loss
# logging, periodic validation + checkpointing, and early stopping per epoch.
steps_per_epoch = len(train_loader)

for epoch_i in range(tp.epoch_size):
    print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10}  | {'Elapsed':^9}")
    print("-"*60)

    # Timers: whole epoch, and the window since the last train-loss log line.
    t0_epoch, t0_batch = time.time(), time.time()
    # total_loss accumulates over the epoch; batch_loss/batch_counts over the log window.
    total_loss, batch_loss, batch_counts = 0, 0, 0
    # Metrics of the most recent validation pass in this epoch.
    val_metrics = None
    avg_train_loss = None

    model.train()
    for step, batch in enumerate(train_loader):
        global_step += 1
        batch_counts += 1
        is_last_step = step == steps_per_epoch - 1

        # Forward propagate
        model.zero_grad()
        feature = {k: v.to(device) for k, v in batch.items()}
        logits = model(feature)
        loss = model.compute_loss(feature, logits)
        loss_value = loss.item()
        batch_loss += loss_value
        total_loss += loss_value
        loss.backward()

        # Clip the global gradient norm before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), tp.gradient_clip)
        optimizer.step()
        scheduler.step()

        # Log steps for train loss logging
        if (step % tp.log_steps == 0 and step != 0) or is_last_step:
            time_elapsed = time.time() - t0_batch
            print(f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {'-':^9} | {time_elapsed:^9.2f}")
            tb.add_scalar('loss/batch_train', batch_loss / batch_counts, global_step=global_step)
            batch_loss, batch_counts = 0, 0
            t0_batch = time.time()

        # Save steps for ckpt saving and dev evaluation
        if (step % tp.save_steps == 0 and step != 0) or is_last_step:
            val_metrics = seq_tag_metrics(model, valid_loader, tp.idx2label, tp.schema, device)
            for key, val in val_metrics.items():
                tb.add_scalar(f'metric/{key}', val, global_step=global_step)
            # `step` is 0-based, so step + 1 batches have been accumulated so far.
            # Dividing by `step` over-states the mean and fails on a one-batch loader.
            avg_train_loss = total_loss / (step + 1)
            tb.add_scalars('loss/train_valid', {'train': avg_train_loss,
                                                'valid': val_metrics['val_loss']}, global_step=global_step)
            saver(avg_train_loss, val_metrics['val_loss'], epoch_i, global_step, model, optimizer, scheduler)
            # seq_tag_metrics is defined outside this notebook and may leave the
            # model in eval mode; re-enter train mode for the remaining batches.
            model.train()

    if val_metrics is None:
        raise RuntimeError('train_loader yielded no batches; nothing to train or evaluate')

    # On epoch end: report train & valid loss and log overall metrics.
    # The last batch of every epoch always runs the evaluation branch above and no
    # parameter update happens after it, so its metrics (and the epoch-average
    # train loss computed there) are reused instead of running validation again.
    time_elapsed = time.time() - t0_epoch

    print("-"*70)
    print(f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {val_metrics['val_loss']:^10.6f} | {time_elapsed:^9.2f}")
    tag_cls_log(epoch_i, val_metrics)
    print("\n")
    if es.check(val_metrics):
        break

 Epoch  |  Batch  |  Train Loss  |  Val Loss   |  Elapsed 
------------------------------------------------------------
   1    |   10    |  509.616402  |     -     |   16.03  
   1    |   20    |  169.999390  |     -     |   14.22  
   1    |   30    |  56.893949   |     -     |   14.08  
   1    |   40    |  32.519269   |     -     |   13.97  
   1    |   50    |  24.801992   |     -     |   13.96  
   1    |   60    |  23.853972   |     -     |   14.01  
   1    |   70    |  17.577939   |     -     |   13.85  
   1    |   80    |  17.861504   |     -     |   13.93  
   1    |   90    |  20.950386   |     -     |   14.26  
   1    |   100   |  15.709005   |     -     |   14.02  
   1    |   110   |  16.964316   |     -     |   13.91  
   1    |   120   |  17.720364   |     -     |   13.83  
   1    |   130   |  17.901567   |     -     |   14.19  
   1    |   140   |  20.489372   |     -     |   14.06  
   1    |   150   |  18.284148   |     -     |   14.11  
   1    |   160   |  16.1

## Evaluation 

In [24]:
# Load both raw splits. A missing `entities` cell is replaced by '' so that the
# `.split(';')` below always operates on a string.
valid = pd.read_csv('/kaggle/input/finent/valid.csv').fillna({'entities': ''})
train = pd.read_csv('/kaggle/input/finent/train.csv').fillna({'entities': ''})

# Collect the entities already seen in the training set (cells are ';'-separated).
# Note: a row whose cell is '' contributes the empty string '' to this set.
known_entity = {
    entity
    for cell in train['entities']
    for entity in cell.split(';')
}

In [None]:
def _fin_entities(row):
    """Return the 'FIN' entry of extract_entity for one row's corpus and predicted positions."""
    return extract_entity(row.corpus, row.pred_pos)['FIN']


def _keep_legal(entities):
    """Keep only the entities accepted by islegal, preserving their order."""
    return [entity for entity in entities if islegal(entity)]


# Run the model over the validation loader; one prediction per element of `pred`
# is attached to the validation frame, so `pred` must line up with its rows.
pred = seqlabel_inference(model, valid_loader, device)
valid['pred_pos'] = [get_entity_bio(tag_seq, tp.idx2label) for tag_seq in pred]
# First store the raw extracted entities, then overwrite the column with the filtered lists.
valid['pred_entity'] = valid.apply(_fin_entities, axis=1)
valid['pred_entity'] = valid['pred_entity'].map(_keep_legal)

In [37]:
# Gather the evaluation inputs as plain arrays: row ids, gold entity strings,
# and the filtered predicted entity lists.
eval_ids = valid['id'].values
gold_entities = valid['entities'].values
predicted_entities = valid['pred_entity'].values

# Score the predictions (known_entity is the set of entities seen in training)
# and persist the returned table next to the notebook.
stat = aggregate_f1(eval_ids, gold_entities, predicted_entities, known_entity)
stat.to_csv('bert_crf.csv')

All Entity Evaluation
   precision    recall        f1
0   0.559633  0.683621  0.615444
Unknown Entity Evalutation
   precision    recall        f1
0    0.37467  0.546154  0.444444
