In [2]:
# --- standard library ---
import collections
import time
from itertools import chain

# --- third-party ---
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import CrossEntropyLoss
from torch.utils.data.dataloader import DataLoader
from torch.utils.data.dataset import Dataset
from torch.utils.data.sampler import SequentialSampler, RandomSampler
from torch.utils.tensorboard import SummaryWriter

import transformers
transformers.logging.set_verbosity_error()
from transformers import BertTokenizer,AdamW, get_linear_schedule_with_warmup

# --- project-local ---
from src.train_utils import set_seed, ModelSave, get_torch_device, EarlyStop, TrainParams
from src.evaluation import binary_cls_report, classification_inference
from src.metric import  binary_cls_metrics, binary_cls_log

from models import BertClassifier
# BertMtl is instantiated in the model-construction cell but was never imported,
# so the notebook failed with NameError on a fresh kernel.
# NOTE(review): assumes BertMtl lives in `models` next to BertClassifier — confirm.
from models import BertMtl
from dataset import SeqPairMtlDataset, data_loader
from evaluation import overall_f1

device = get_torch_device()

No GPU available, using the CPU instead.


In [3]:
# Single source of truth for every tunable of this run; later cells read these
# values as attributes of `tp`.
tp = TrainParams(
    # Cadence (in batches within an epoch) used by the training loop for
    # loss logging and for checkpoint + validation.
    log_steps=10,
    save_steps=10000,
    # Upper bound on epochs; the training loop may stop sooner via EarlyStop.
    epoch_size=20,
    loss_fn=nn.CrossEntropyLoss(),
    # Passed to SeqPairMtlDataset / DataLoader when the data cell builds them.
    max_seq_len=512,
    batch_size=20,
    # Optimisation settings.
    # NOTE(review): presumably consumed by model.get_optimizer() — confirm.
    lr=5e-6,
    weight_decay=0.0,
    epsilon=1e-6,
    warmup_steps=100,
    # Model settings.
    # NOTE(review): semantics of hidden_s / hidden_e are defined by the model class — confirm.
    dropout_rate=0.5,
    label_size=2,
    # Max gradient norm handed to clip_grad_norm_ in the training loop.
    gradient_clip=1.0,
    hidden_s=200,
    hidden_e=200,
    # Forwarded verbatim as keyword arguments to EarlyStop(**tp.early_stop_params).
    early_stop_params=dict(
        monitor='f1',
        mode='max',
        min_delta=0,
        patience=3,
        verbose=False,
    ),
    # Hugging Face checkpoint name; also used to load the tokenizer.
    pretrain_model='hfl/chinese-roberta-wwm-ext',
    continue_train=False,
)

In [None]:
# Load the tokenizer matching the pretrained checkpoint and register the four
# extra marker tokens so they are kept as single tokens.
special_tokens_dict = {'additional_special_tokens':['[t]','[c]','[o]','[e]']}
tokenizer = BertTokenizer.from_pretrained(tp.pretrain_model, do_lower_case=True)
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
print('We have added', num_added_toks, 'tokens')

# Build the train / validation datasets the same way from their sample files
# (train first, then valid — same order as before).
train_dataset, valid_dataset = (
    SeqPairMtlDataset(data_loader(sample_path), tp.max_seq_len, tokenizer)
    for sample_path in ('./trainsample/train4.txt', './trainsample/valid4.txt')
)

# Training batches are drawn in random order; validation keeps file order so
# predictions can later be matched back to rows positionally.
train_sampler, valid_sampler = RandomSampler(train_dataset), SequentialSampler(valid_dataset)
train_loader = DataLoader(train_dataset, batch_size=tp.batch_size, sampler=train_sampler)
valid_loader = DataLoader(valid_dataset, batch_size=tp.batch_size, sampler=valid_sampler)

In [None]:
# Total number of optimizer steps for the whole run (batches per epoch * epochs),
# stored on `tp` before the model is built.
# NOTE(review): presumably read by model.get_optimizer() to size the LR schedule — confirm.
tp.update({'num_train_steps': int(len(train_loader)*tp.epoch_size)})

# NOTE(review): `set_seed` is imported but never called in the visible cells; seed
# here (before the model is constructed) if reproducible runs are required.

# Checkpoints and TensorBoard event files share one directory.
CKPT = './checkpoint/single_task_bert4'
# Take the resume flag from the config instead of a second hard-coded literal,
# so flipping tp.continue_train actually affects checkpoint handling.
saver = ModelSave(CKPT, continue_train=tp.continue_train)
saver.init()
es = EarlyStop(**tp.early_stop_params)
global_step = 0  # counts optimizer steps across all epochs (x-axis for TensorBoard)
tb = SummaryWriter(CKPT)

model = BertMtl(tp)
# The tokenizer gained extra special tokens, so the embedding matrix must be
# resized to the new vocabulary size before training.
model.bert.resize_token_embeddings(len(tokenizer))

model.to(device)
optimizer, scheduler = model.get_optimizer()

In [9]:
# Training loop: one pass over `train_loader` per epoch with periodic loss logging,
# periodic checkpoint + validation, and early stopping on the validation metrics.
for epoch_i in range(tp.epoch_size):
    print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10}  | {'Elapsed':^9}")
    print("-"*60)

    # Measure the elapsed time of each epoch
    t0_epoch, t0_batch = time.time(), time.time()
    # total_loss accumulates over the epoch; batch_loss / batch_counts only since the last log line
    total_loss, batch_loss, batch_counts = 0, 0, 0
    num_batches = len(train_loader)

    model.train()
    for step, batch in enumerate(train_loader):
        global_step +=1
        batch_counts +=1
        is_last_step = step == num_batches - 1

        # Forward propagate
        model.zero_grad()
        feature = {k:v.to(device) for k, v in batch.items()}
        logits = model(feature)
        loss = model.compute_loss(feature, logits)
        # Read the scalar once; each .item() call copies the value back to the host.
        loss_value = loss.item()
        batch_loss += loss_value
        total_loss += loss_value
        loss.backward()

        # Clip before the optimizer step so the update uses the bounded gradient.
        torch.nn.utils.clip_grad_norm_(model.parameters(), tp.gradient_clip)
        optimizer.step()
        scheduler.step()

        # Log steps for train loss logging
        if (step % tp.log_steps == 0 and step != 0) or is_last_step:
            time_elapsed = time.time() - t0_batch
            print(f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {'-':^9} | {time_elapsed:^9.2f}")
            tb.add_scalar('loss/batch_train', batch_loss / batch_counts, global_step=global_step)
            batch_loss, batch_counts = 0, 0
            t0_batch = time.time()

        # Save steps for ckpt saving and dev evaluation
        if (step % tp.save_steps == 0 and step != 0) or is_last_step:
            val_metrics = binary_cls_metrics(model, valid_loader, device, label_name='label1')
            for key, val in val_metrics.items():
                tb.add_scalar(f'metric/{key}', val, global_step=global_step)
            # `step` is 0-based, so step + 1 batches have contributed to total_loss.
            # Dividing by `step` overstated the loss and raised ZeroDivisionError
            # when the loader had a single batch.
            avg_train_loss = total_loss / (step + 1)
            tb.add_scalars('loss/train_valid',{'train': avg_train_loss,
                                                'valid': val_metrics['val_loss']}, global_step=global_step)
            saver(avg_train_loss, val_metrics['val_loss'], epoch_i, global_step, model, optimizer, scheduler)
            # binary_cls_metrics presumably switches the model to eval mode; restore
            # training mode so dropout stays active for the remaining batches.
            model.train()

    # On Epoch End: calculate train & valid loss and log overall metrics
    time_elapsed = time.time() - t0_epoch
    val_metrics = binary_cls_metrics(model, valid_loader, device, label_name='label1')
    avg_train_loss = total_loss / num_batches

    print("-"*70)
    print(f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {val_metrics['val_loss']:^10.6f} | {time_elapsed:^9.2f}")
    binary_cls_log(epoch_i, val_metrics)
    print("\n")
    # Stop once the monitored validation metric has stopped improving.
    if es.check(val_metrics):
        break

 Epoch  |  Batch  |  Train Loss  |  Val Loss   |  Elapsed 
------------------------------------------------------------
   1    |   10    |   1.468846   |     -     |   12.05  
   1    |   20    |   1.468075   |     -     |   10.14  
   1    |   30    |   1.393609   |     -     |   10.13  
   1    |   40    |   1.369234   |     -     |   10.10  
   1    |   50    |   1.361736   |     -     |   10.11  
   1    |   60    |   1.277964   |     -     |   10.17  
   1    |   70    |   1.310135   |     -     |   10.10  
   1    |   80    |   1.212889   |     -     |   10.13  
   1    |   90    |   1.216510   |     -     |   10.13  
   1    |   100   |   1.070289   |     -     |   10.16  
   1    |   110   |   0.996813   |     -     |   10.12  
   1    |   120   |   0.861399   |     -     |   10.15  
   1    |   130   |   0.790482   |     -     |   10.10  
   1    |   140   |   0.672316   |     -     |   10.11  
   1    |   150   |   0.656919   |     -     |   10.14  
   1    |   160   |   0.5

In [10]:
# Run the trained model over the validation loader and attach its predictions
# to the validation table so entity-level F1 can be computed.
result = classification_inference(model, valid_loader, device)
valid = pd.read_csv('./trainsample/valid.csv')

# Predictions are matched to rows purely by position, so a size mismatch between
# the loader's source file and valid.csv must fail loudly instead of silently
# misaligning (or NaN-filling) the columns.
assert len(result['pred']) == len(valid), (
    f"got {len(result['pred'])} predictions for {len(valid)} validation rows"
)
assert len(result['prob']) == len(valid), (
    f"got {len(result['prob'])} probabilities for {len(valid)} validation rows"
)
valid['pred'] = result['pred']
valid['prob'] = result['prob']

# Write where the version-comparison cell reads from ('./submit/valid4.csv');
# previously the file landed in the working directory and the comparison could
# pick up a stale or missing file.
valid.loc[:,['id','single_entity','pred','prob']].to_csv('./submit/valid4.csv')
overall_f1(valid)

{'f1_s': 0.9552058111380144,
 'f1_e': 0.9480381760339343,
 'f1': 0.9509052300755663}

### Benefit from multitask: compare version 3 & 4

In [5]:
# Load the saved validation predictions of model versions 3 and 4 and place them
# side by side with the ground-truth validation table (rows are aligned on index).
v3 = pd.read_csv('./submit/valid3.csv')
v4 = pd.read_csv('./submit/valid4.csv')
valid = pd.read_csv('./trainsample/valid.csv').assign(
    pred_v3=v3['pred'],
    pred_v4=v4['pred'],
)

In [19]:
# Rows where version 4 disagrees with version 3 AND version 4 matches the label,
# i.e. the cases that version 4 corrected.
v4_differs_from_v3 = valid['pred_v4'] != valid['pred_v3']
v4_matches_label = valid['pred_v4'] == valid['label']
shown_columns = ['pred_v4','pred_v3','label','negative','entity','single_entity','title','text']
valid.loc[v4_differs_from_v3 & v4_matches_label, shown_columns]

Unnamed: 0,pred_v4,pred_v3,label,negative,entity,single_entity,title,text
176,0,1,0,0,['钱牛牛'],钱牛牛,,"钱牛牛从成立之初就拥抱合规,不搞自融、不设立资金池、不做期限错配,根据监管要求做好合规备案"
226,0,1,0,0,"['宜信', '小额贷']",宜信,,"但营业收入增长的比例远远大于净利润,说明营业成本也在不断攀升,宜信还在扩张阶段;2、宜信的模..."
339,1,0,1,1,"['融和贷', '上海银行', '合肥安易贷投资管理有限公司']",上海银行,"上海银行,存管融和贷失联,原股东涉骗局;网传270亿理财产品爆仓为假","上海银行:网传我行270亿理财产品爆仓系谣言。公告:近日,网络传言上海银行浦东分行(张..."
407,0,1,0,1,"['微交易', '坚固环球']",微交易,,【图】坚固环球微交易违法吗?为什么很多人都说是违法的?
426,1,0,1,1,"['网贷天眼', '生菜金融', '弘坤资产管理(上海)有限公司']",弘坤资产管理(上海)有限公司,网贷天眼早报:P2P爆雷风险或已基本释出 生菜金融被立案,"弘坤资产17.4亿基金兑付延期,法人涉嫌行贿被留置 ? 近日,有投资人反映,本应于4月12..."
501,0,1,0,1,"['钱爸爸', '财迷之家']",财迷之家,,钱爸爸老板跑路。有通过财迷之家渠道的吗
503,1,0,1,1,"['正聚源', '小资钱包', '资易贷(北京)金融信息服务有限公司']",正聚源,"经侦立案抓人没有控钱 正聚源资产端逍遥法外 219年4月26日,北京市公安局海淀分局对资易贷...","经侦立案抓人没有控钱 正聚源资产端逍遥法外 2019年4月26日,北京市公安局海淀分局对资易..."
763,1,0,1,1,['海象理财'],海象理财,海象理财最新消息 海象理财公布详细的兑付方案,"现在网上出现了很多投资理财平台,海象理财就是其中之一,有很多人将钱投到这个平台了。但是,有..."
825,1,0,1,1,"['九鼎投资', '君安湘合', '北京银行']",九鼎投资,"公司新闻: 1、一汽集团发布“龙腾行动”:投54亿元支持自主,红旗3年达成40万辆。 2...","公司新闻: 1、一汽集团发布龙腾行动:投54亿元支持自主,红旗3年达成40万辆。 2、*..."
846,1,0,1,1,['宜贷网'],宜贷网,网贷天眼早报:银河集团违规被立案调查 宜贷网退出新进展,"新闻每天都在发生,行业日新月异。每天清晨,网贷天眼读早报,为您带来最新鲜、劲爆的行业新..."
