In [2]:
import collections 
from itertools import chain
import torch 
import time 
import pandas as pd 

from torch.utils.data.dataset import Dataset
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data.sampler import SequentialSampler, RandomSampler
from torch.utils.data.dataloader import DataLoader
from torch.utils.tensorboard import SummaryWriter

## 项目folder
from dataset import SeqMultiLabelDataset
from process import Schema2Label
from evaluation import multilabel_inference,extract_multilabel,event_evaluation

## 通用folder
from src.dataset.converter import data_loader 
from src.train_utils import set_seed, ModelSave, get_torch_device, EarlyStop, TrainParams
from src.models.bert import BertClassifier
from src.metrics import multilabel_metrics, multilabel_log

import transformers 
from transformers import BertTokenizer
transformers.logging.set_verbosity_error()
device = get_torch_device()

There are 1 GPU(s) available.
Device name: Tesla P100-PCIE-16GB


In [3]:

schema = Schema2Label(DIR+'/duee_event_schema.json')

tp = TrainParams(
    log_steps = 50,
    save_steps = 10,
    epoch_size=20,
    max_seq_len=500,
    batch_size=16,
    loss_fn=nn.BCEWithLogitsLoss(),
    lr=5e-5,
    weight_decay=0.0,
    epsilon=1e-6,
    warmup_steps=100,
    dropout_rate=0.2,
    gradient_clip=5.0,
    early_stop_params = {
        'monitor':'acc',
        'mode':'max',
        'min_delta': 0,
        'patience':3,
        'verbose':False
    },
    slot_positions = [1,10,17,24,31,38,45,52,59,66,73,80,87,94,101,106,111,116,121,126,133,140,147,154,161,168,174,179,184,189,194,199,204,209,214,219,224,229,236,243,250,257,264,271,278,285,292,299,306,313,320,328,335,342,349,357,364,371,378,385,392,399,406,413,420],
    pretrain_model = 'bert-base-chinese',
    continue_train=False,
    label2idx = schema.event_label,
    idx2label = {j:i for i,j in schema.event_label.items()},
    label_size=len(schema.event_label)
)


In [5]:
from torch import nn
from transformers import BertModel
from transformers import AdamW, get_linear_schedule_with_warmup

class BertSlotClassifier(nn.Module):
    """Bert Model for Classification Tasks. with certain slots embedding enhancement 
    """

    def __init__(self, tp):
        super(BertSlotClassifier, self).__init__()
        self.tp = tp
        self.label_size = tp.label_size
        self.loss_fn = tp.loss_fn
        self.bert = BertModel.from_pretrained(tp.pretrain_model)
        self.dropout_layer = nn.Dropout(tp.dropout_rate)
        self.classifier = nn.Linear(self.bert.config.hidden_size, 1) ## binary logits

    def forward(self, features):
        """
        features: {input_ids, token_type_ids, attention_mask, label_ids}
        """
        outputs = self.bert(input_ids=features['input_ids'],
                            token_type_ids=features['token_type_ids'],
                            attention_mask=features['attention_mask'])
        sequence_output = outputs[0]
        slot_output = sequence_output[:, self.tp.slot_positions,: ]
        batch_size, label_size, emb_size = slot_output.shape
        slot_output = slot_output.reshape(batch_size * label_size, emb_size)
        logits = self.classifier(slot_output) # batch* label_sizse * 1
        logits = logits.squeeze() 
        logits = logits.reshape(batch_size, label_size)
        return logits 

    def compute_loss(self, features, logits):
        loss = self.loss_fn(logits, features['label'])
        return loss

    def get_optimizer(self):
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        params = [
            {'params': [p for n, p in self.named_parameters() if not any(nd in n for nd in no_decay)],
             'weight_decay': self.tp.weight_decay},
            {'params': [p for n, p in self.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]

        optimizer = AdamW(params, lr=self.tp.lr, eps=self.tp.epsilon)
        scheduler = get_linear_schedule_with_warmup(optimizer, num_training_steps=self.tp.num_train_steps,
                                                    num_warmup_steps=self.tp.warmup_steps)
        return optimizer, scheduler

In [6]:
tokenizer = BertTokenizer.from_pretrained(tp.pretrain_model, do_lower_case=True)
special_tokens_dict = {'additional_special_tokens':['[E{}]'.format(i) for i in range(len(schema.event_label))]}
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)

train_dataset = SeqMultiLabelDataset(data_loader(DIR+'/train_event_slot.txt'),
                                tokenizer, tp.max_seq_len, tp.label2idx)
valid_dataset = SeqMultiLabelDataset(data_loader(DIR+'/valid_event_slot.txt'), 
                                tokenizer, tp.max_seq_len,  tp.label2idx)
train_sampler = RandomSampler(train_dataset)
valid_sampler = SequentialSampler(valid_dataset)
train_loader = DataLoader(train_dataset, sampler=train_sampler, batch_size=tp.batch_size)
valid_loader = DataLoader(valid_dataset, sampler=valid_sampler, batch_size=tp.batch_size)

Downloading:   0%|          | 0.00/107k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/624 [00:00<?, ?B/s]

In [7]:
tp.update({'num_train_steps': int(len(train_loader)*tp.epoch_size)})

CKPT = './checkpoint/event'
saver = ModelSave(CKPT, continue_train=False)
saver.init()
es = EarlyStop(**tp.early_stop_params)
global_step = 0
tb = SummaryWriter(CKPT)

model = BertSlotClassifier(tp)
model.bert.resize_token_embeddings(len(tokenizer))
model.to(device)
optimizer, scheduler = model.get_optimizer()



Downloading:   0%|          | 0.00/393M [00:00<?, ?B/s]

In [8]:
for epoch_i in range(tp['epoch_size']):
    print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10}  | {'Elapsed':^9}")
    print("-"*60)

    # Measure the elapsed time of each epoch
    t0_epoch, t0_batch = time.time(), time.time()
    total_loss, batch_loss, batch_counts = 0, 0, 0

    model.train()
    for step, batch in enumerate(train_loader):
        global_step +=1
        batch_counts +=1

        #Forward propogate
        model.zero_grad()
        feature = {k:v.to(device) for k, v in batch.items()}
        logits = model(feature)
        loss = model.compute_loss(feature, logits)
        batch_loss += loss.item()
        total_loss += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), tp.gradient_clip)
        optimizer.step()
        scheduler.step()
        # Log steps for train loss logging
        if (step % tp.log_steps == 0 and step != 0) or (step == len(train_loader) - 1):
            time_elapsed = time.time() - t0_batch
            print(f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {'-':^9} | {time_elapsed:^9.2f}")
            tb.add_scalar('loss/batch_train', batch_loss / batch_counts, global_step=global_step)
            batch_loss, batch_counts = 0, 0
            t0_batch = time.time()

    # On Epoch End: calcualte train & valid loss and log overall metrics
    time_elapsed = time.time() - t0_epoch
    val_metrics = multilabel_metrics(model, valid_loader, device)
    avg_train_loss = total_loss / step

    print("-"*70)
    print(f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {val_metrics['val_loss']:^10.6f} | {time_elapsed:^9.2f}")
    multilabel_log(epoch_i, val_metrics)
    print("\n")
    if es.check(val_metrics):
        break 

 Epoch  |  Batch  |  Train Loss  |  Val Loss   |  Elapsed 
------------------------------------------------------------
   1    |   50    |   0.166924   |     -     |   42.38  
   1    |   100   |   0.081180   |     -     |   40.60  
   1    |   150   |   0.074701   |     -     |   40.54  
   1    |   200   |   0.042176   |     -     |   40.53  
   1    |   250   |   0.022432   |     -     |   40.61  
   1    |   300   |   0.017627   |     -     |   40.56  
   1    |   350   |   0.015358   |     -     |   40.53  
   1    |   400   |   0.014137   |     -     |   40.55  
   1    |   450   |   0.011127   |     -     |   40.53  
   1    |   500   |   0.011375   |     -     |   40.56  
   1    |   550   |   0.011874   |     -     |   40.50  
   1    |   600   |   0.011201   |     -     |   40.56  
   1    |   650   |   0.010526   |     -     |   40.53  
   1    |   700   |   0.009680   |     -     |   40.55  
   1    |   744   |   0.010575   |     -     |   35.10  
-------------------------

In [10]:
test_dataset = SeqMultiLabelDataset(data_loader(DIR+'/test_event_mrc.txt'), 
                                tokenizer, tp.max_seq_len,  tp.label2idx)
test_sampler = SequentialSampler(test_dataset)
test_loader = DataLoader(test_dataset, sampler=test_sampler, batch_size=tp.batch_size)
test = pd.read_csv(DIR+'/test.csv')
pred = multilabel_inference(model, test_loader, device)
test['pred'] = pred 
test['pred_label'] = test['pred'].map(lambda x: extract_multilabel(x, tp.idx2label, 0.5))

In [11]:
valid = pd.read_csv(DIR+"/valid.csv")
pred = multilabel_inference(model, valid_loader, device)
valid['pred'] = pred
valid['pred_label'] = valid['pred'].map(lambda x: extract_multilabel(x, tp.idx2label, 0.5))

In [12]:
valid.to_csv('./trainsample/valid_event_slot_pred.csv')
test.to_csv('./trainsample/test_event_slot_pred.csv')

In [13]:
valid['event_label'] = valid['event_label'].map(lambda x: ast.literal_eval(x))
event_evaluation(valid['event_label'].values, valid['pred_label'].values)

{'n_sample': 1492,
 'n_pos': 1651,
 'precision': 0.9413524835427888,
 'recall': 0.952755905511811,
 'f1': 0.9470198675496688,
 'accuracy': 0.89937106918239}