In [1]:
import json
import os

In [2]:
def read_by_lines(path):
    """Read a UTF-8 text file and return its lines with surrounding whitespace removed.

    Args:
        path: Path of the text file to read.

    Returns:
        list[str]: One entry per line, in file order. Each line is passed
        through ``str.strip()``, so blank lines come back as ``''``.
    """
    # Decode explicitly as UTF-8 (as load_dict does) so the result does not
    # depend on the platform's default encoding.
    with open(path,'r',encoding='utf-8') as infile:
        return [line.strip() for line in infile]

def write_by_lines(path,data):
    """Write every string of ``data`` to ``path`` on its own line, encoded as UTF-8.

    Args:
        path: Destination file; it is created or truncated.
        data: Iterable of strings. A ``'\\n'`` is appended to each item.
    """
    # Explicit UTF-8 keeps the output readable by load_dict, which decodes as UTF-8.
    with open(path,'w',encoding='utf-8') as outfile:
        outfile.writelines(d+'\n' for d in data)

def load_dict(dict_path):
    """Load a ``<id>TAB<label>`` dictionary file into a ``{label: id}`` mapping.

    Args:
        dict_path: Path of a UTF-8 file with one ``id<TAB>label`` pair per line.

    Returns:
        dict[str, int]: Maps each label (second column) to its integer id
        (first column).

    Raises:
        ValueError: If a non-blank line does not contain exactly two
            tab-separated fields, or the id is not an integer.
    """
    vocab={}
    # The context manager guarantees the handle is closed.
    with open(dict_path,'r',encoding='utf-8') as dict_file:
        for line in dict_file:
            line=line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing empty line).
                continue
            value,key=line.split('\t')
            vocab[key]=int(value)
    return vocab

In [3]:
def data_process(path,mode='trigger',is_predict=False):
    """Convert a JSON-lines event file into character-level sequence-labelling rows.

    Each input line is a JSON object with a ``'text'`` field. The text is
    lowercased and split into characters, and space, newline and tab
    characters are replaced by ``','``. Characters (and labels) are joined
    with the U+0002 control character.

    Args:
        path: UTF-8 JSON-lines file to read.
        mode: ``'trigger'`` emits one row per record, labelled from each
            event's ``trigger`` / ``trigger_start_index`` / ``event_type``.
            ``'role'`` emits one row per event, labelled from each argument's
            ``argument`` / ``argument_start_index`` / ``role``. Any other
            value yields only the header row.
        is_predict: When true, emit text-only rows and ignore ``mode``.

    Returns:
        list[str]: A header row (``"text_a"`` or ``"text_a<TAB>label"``)
        followed by the data rows.
    """
    def label_data(data,start,l,_type):
        """Tag ``data[start:start+l]`` in place as B-<type>, I-<type>, ... and return ``data``."""
        for i in range(start,start+l):
            suffix='B-' if i==start else "I-"
            data[i]=suffix+str(_type)
        return data

    output=["text_a"] if is_predict else ["text_a\tlabel"]
    # Explicit UTF-8: the text is Chinese and must not depend on the locale's default encoding.
    with open(path,encoding='utf-8') as f:
        for line in f:
            line=line.strip()
            if not line:
                # Skip blank lines instead of failing inside json.loads.
                continue
            d_json=json.loads(line)
            text_a=["," if t==' ' or t=='\n' or t=='\t' else t for t in list(d_json['text'].lower())]
            joined_text='\002'.join(text_a)

            if is_predict:
                output.append(joined_text)
            elif mode=='trigger':
                labels=["O"]*len(text_a)
                for event in d_json.get('event_list',[]):
                    event_type=event['event_type']
                    start=event['trigger_start_index']
                    trigger=event['trigger']
                    labels=label_data(labels,start,len(trigger),event_type)
                output.append("{}\t{}".format(joined_text,'\002'.join(labels)))
            elif mode=='role':
                # Records without 'event_list' (e.g. test.json) skip this loop and
                # contribute no rows; the test split is not used for training.
                for event in d_json.get('event_list',[]):
                    labels=["O"]*len(text_a)
                    for arg in event['arguments']:
                        role_type=arg['role']
                        argument=arg["argument"]
                        start=arg['argument_start_index']
                        labels=label_data(labels,start,len(argument),role_type)
                    output.append("{}\t{}".format(joined_text,'\002'.join(labels)))
    return output

In [4]:
def schema_process(path,mode='trigger'):
    """Build the BIO tag dictionary rows for the event schema at ``path``.

    Args:
        path: JSON-lines schema file; each line is parsed with ``json.loads``.
        mode: ``'trigger'`` collects tags from each entry's ``'event_type'``;
            ``'role'`` collects them from the ``'role'`` of every item in
            each entry's ``'role_list'``.

    Returns:
        list[str]: ``"<index><TAB><tag>"`` rows. Every distinct type
        contributes a ``B-`` and an ``I-`` tag in first-seen order, and the
        final row is the ``O`` tag.
    """
    tag_names=[]

    def register(type_name):
        """Append the B-/I- pair for ``type_name`` unless its B- tag is already present."""
        begin_tag='B-{}'.format(type_name)
        if begin_tag in tag_names:
            return
        tag_names.append(begin_tag)
        tag_names.append("I-{}".format(type_name))

    for raw in read_by_lines(path):
        entry=json.loads(raw)
        if mode=='trigger':
            register(entry['event_type'])
        elif mode=='role':
            for role_item in entry['role_list']:
                register(role_item['role'])

    # The outside tag always takes the last index.
    tag_names.append("O")
    return ["{}\t{}".format(idx,name) for idx,name in enumerate(tag_names)]

In [5]:
# Dataset root: the schema and the train/dev/test JSON files are read from
# here, and the generated tag dictionaries and TSV folders are written here.
conf_dir='./data/data158120'
schema_path="{}/event_schema.json".format(conf_dir)
tags_trigger_path="{}/trigger_tag.dict".format(conf_dir)
tags_role_path="{}/role_tag.dict".format(conf_dir)

# Derive the trigger tag dictionary from the schema and write it to disk.
tags_trigger=schema_process(schema_path,'trigger')
write_by_lines(tags_trigger_path,tags_trigger)

# Same for the argument-role tag dictionary.
tags_role=schema_process(schema_path,'role')
write_by_lines(tags_role_path,tags_role)


data_dir=conf_dir
trigger_save_dir="{}/trigger".format(data_dir)
role_save_dir="{}/role".format(data_dir)

# Create the per-task output directories if they do not exist yet.
if not os.path.exists(trigger_save_dir):
    os.makedirs(trigger_save_dir)
if not os.path.exists(role_save_dir):
    os.makedirs(role_save_dir)

In [6]:
# Convert each raw split into trigger-labelled TSV rows under trigger_save_dir.
train_tri=data_process("{}/train.json".format(data_dir),"trigger")
write_by_lines("{}/train.tsv".format(trigger_save_dir),train_tri)

dev_tri=data_process("{}/dev.json".format(data_dir),"trigger")
write_by_lines("{}/dev.tsv".format(trigger_save_dir),dev_tri)

# In trigger mode, records without an 'event_list' still produce a row (all "O" labels).
test_tri=data_process("{}/test.json".format(data_dir),"trigger")
write_by_lines("{}/test.tsv".format(trigger_save_dir),test_tri)

In [7]:
# Convert each raw split into argument-role TSV rows (one row per event) under role_save_dir.
train_role=data_process("{}/train.json".format(data_dir),"role")
write_by_lines("{}/train.tsv".format(role_save_dir),train_role)

dev_role=data_process("{}/dev.json".format(data_dir),"role")
write_by_lines("{}/dev.tsv".format(role_save_dir),dev_role)


# Not used for training. In role mode, records without an 'event_list' produce no rows.
test_role=data_process("{}/test.json".format(data_dir),'role')
write_by_lines("{}/test.tsv".format(role_save_dir),test_role)

# 构建词典

In [8]:
def get_vocab(paths=('data/data158120/train.json','data/data158120/dev.json')):
    """Build a character-to-id vocabulary from the ``'text'`` field of JSON-lines files.

    Args:
        paths: Iterable of UTF-8 JSON-lines files to scan. Defaults to the
            train and dev splits of the dataset.

    Returns:
        dict[str, int]: ``'<pad>'`` maps to 0, ``'<unk>'`` to 1, and every
        distinct character found gets an id starting at 2, assigned in
        sorted character order.
    """
    chars=set()
    for path in paths:
        # Context manager so every input file is closed after reading.
        with open(path,'r',encoding='utf-8') as infile:
            for line in infile:
                line=line.strip()
                if not line:
                    continue
                chars.update(json.loads(line)['text'])

    # Sort before numbering: set iteration order for strings is not stable
    # across interpreter sessions, which would give characters different ids
    # than the ones a saved checkpoint was trained with.
    vocab={c:i+2 for i,c in enumerate(sorted(chars))}
    vocab['<pad>'],vocab['<unk>']=0,1
    return vocab

In [9]:
# Build the character vocabulary once; its size is the number of entries in
# the mapping, special tokens included.
vocab=get_vocab()
vocab_size=len(vocab)

In [10]:
# Convert one string (or sequence of tokens) into a fixed-length list of ids.
def word2id(line,vocab,max_len=145):
    """Map every item of ``line`` to its id, then truncate/pad to ``max_len``.

    Args:
        line: String or sequence of tokens to encode.
        vocab: Mapping from token to id. ``vocab['<unk>']`` is looked up for
            any token that is missing from the mapping.
        max_len: Length of the returned id list.

    Returns:
        tuple[list[int], int]: The id list, right-padded with 0 up to
        ``max_len``, and the number of real (unpadded) ids after truncation.
    """
    ids=[vocab[c] if c in vocab else vocab['<unk>'] for c in line]
    ids=ids[:max_len]
    # Length after truncation and before padding.
    lens=len(ids)
    padding=[0]*(max_len-lens)
    return ids+padding,lens

# 配置BiLSTM+CRF模型

In [11]:
import paddle
import paddle.nn as nn

In [12]:
class LSTM_Model(nn.Layer):
    """Embedding -> bidirectional LSTM -> linear emission layer, with CRF/Viterbi decoding.

    Args:
        vocab_num: Number of rows of the embedding table.
        emb_size: Embedding dimension fed to the LSTM.
        hidden_size: LSTM hidden size per direction.
        num_layers: Number of stacked LSTM layers.
        num_labels: Number of tag labels handed to ``LinearChainCrf``.
        dropout: Dropout rate passed to the LSTM and to ``self.dropout``.
    """
    def __init__(self,vocab_num,emb_size,hidden_size,num_layers,num_labels,dropout):
        super().__init__()
        self.embedding=nn.Embedding(vocab_num,emb_size)
        self.lstm=nn.LSTM(
            emb_size,
            hidden_size,
            num_layers=num_layers,
            direction='bidirect',
            dropout=dropout,
        )
        # The input width is doubled because both LSTM directions are concatenated.
        # The two extra outputs presumably correspond to the CRF's start/stop
        # tags -- TODO confirm against the LinearChainCrf documentation.
        self.linear=nn.Linear(hidden_size*2,num_labels+2)
        # NOTE(review): this layer is created but forward() never applies it.
        self.dropout=nn.Dropout(dropout)
        self.crf=LinearChainCrf(num_labels)
        # The decoder uses the CRF's transition parameters.
        self.decoder=ViterbiDecoder(self.crf.transitions)

    def forward(self,input_ids,seq_lens=None,target=None):
        """Return ``(emissions, decoded)`` for a batch of token ids.

        Args:
            input_ids: Token id tensor passed to the embedding layer.
            seq_lens: Sequence lengths passed to the Viterbi decoder.
            target: Accepted for call compatibility; not used here.

        Returns:
            The linear layer's output, and the second value returned by the
            Viterbi decoder (the first one is discarded).
        """
        embedded=self.embedding(input_ids)
        lstm_out,_=self.lstm(embedded)
        emissions=self.linear(lstm_out)
        _,decoded=self.decoder(emissions,seq_lens)
        return emissions,decoded

# 模型训练

In [13]:
# Module-level settings; the training/prediction functions defined later read
# the ones they need as globals.
num_epoch=10
learning_rate=0.001
base_dir='./data/data158120'
tag_path="{}/trigger_tag.dict".format(base_dir)

# NOTE: this rebinds data_dir, which the preprocessing cells above use as the
# raw-data directory; re-running those cells after this one would resolve
# their paths under trigger/ instead.
data_dir="{}/trigger".format(base_dir)
train_data="{}/train.tsv".format(data_dir)
dev_data="{}/dev.tsv".format(data_dir)
test_data="{}/test.tsv".format(data_dir)

# Raw JSON-lines file that do_predict reads.
predict_data="{}/test.json".format(base_dir)

# Directory do_train saves checkpoints into, and the checkpoint do_predict
# loads. The train/predict cells further down reassign both before use.
checkpoints="{}/trigger/".format(base_dir)
init_ckpt="{}/trigger/best.pdparams".format(base_dir)

weight_decay=0.01
warmup_proportion=0.1
max_seq_len=145
# do_train evaluates on the dev set every valid_step steps and prints the
# training loss every skip_step steps.
valid_step=500
skip_step=50
batch_size=32
predict_save_path=None
seed=1024

In [14]:
def convert_example_to_feature(example,label_vocab=None,max_seq_len=145,no_entity_label='O',ignore_label=-1,is_test=False):
    """Encode one ``(tokens, labels, seq_len)`` example into fixed-length id lists.

    Tokens are encoded with ``word2id`` using the module-level ``vocab``.

    Args:
        example: Three-item sequence ``(tokens, labels, seq_len)``. The third
            item is ignored; the length is recomputed after truncation.
        label_vocab: Mapping from label string to label id. Required unless
            ``is_test`` is true.
        max_seq_len: Length that token ids and label ids are truncated/padded to.
        no_entity_label: Accepted for interface compatibility; not used.
        ignore_label: Id used to pad the label sequence up to ``max_seq_len``.
        is_test: When true, labels are not encoded.

    Returns:
        ``(input_ids, seq_len)`` when ``is_test`` is true,
        ``(input_ids, label_ids, seq_len)`` when ``label_vocab`` is given,
        otherwise ``None``.
    """
    tokens,labels,_=example
    # Forward max_seq_len so token ids and label ids always have the same
    # length, also when the caller does not use the default of 145.
    input_ids,seq_lens=word2id(tokens,vocab,max_len=max_seq_len)
    if is_test:
        return input_ids,seq_lens
    if label_vocab is not None:
        encoded_label=[label_vocab[x] for x in labels[:seq_lens]]
        # Pad with the configurable ignore id instead of a hard-coded -1.
        encoded_label=encoded_label+[ignore_label]*(max_seq_len-seq_lens)
        return input_ids,encoded_label,seq_lens
    return None

In [15]:
class DuEventExtraction(paddle.io.Dataset):
    """In-memory dataset over a ``text<TAB>labels`` TSV file.

    Tokens and labels inside each column are separated by the U+0002 control
    character. Items are ``(tokens, labels, seq_len)`` tuples of raw strings;
    conversion to ids happens elsewhere.

    Args:
        data_path: TSV file whose first line is a header row.
        tag_path: Tag dictionary file loaded with ``load_dict``.
        max_seq_len: Upper bound applied to the stored sequence lengths.
    """
    def __init__(self,data_path,tag_path,max_seq_len=145):
        self.label_vocab=load_dict(tag_path)
        self.word_ids=[]
        self.label_ids=[]
        self.seq_lens=[]
        with open(data_path,'r',encoding='utf-8') as fp:
            # Skip the header row ("text_a" / "text_a\tlabel"); the default
            # keeps an empty file from raising StopIteration.
            next(fp,None)
            for line in fp:
                line=line.strip()
                if not line:
                    # Blank rows carry no example.
                    continue
                words,labels=line.split('\t')
                words=words.split('\002')
                labels=labels.split('\002')
                self.word_ids.append(words)
                self.label_ids.append(labels)
                self.seq_lens.append(len(words[:max_seq_len]))
        # Label ids are assumed to be 0-based, so the count is max id + 1.
        self.label_num=max(self.label_vocab.values())+1

    def __len__(self):
        """Number of examples loaded from the file."""
        return len(self.word_ids)

    def __getitem__(self,index):
        """Return ``(tokens, labels, seq_len)`` for the example at ``index``."""
        return self.word_ids[index],self.label_ids[index],self.seq_lens[index]

In [16]:
def do_train():
    """Train the BiLSTM-CRF tagger and save checkpoints under ``checkpoints``.

    Configuration is read from module-level names: ``tag_path``,
    ``train_data``, ``dev_data``, ``vocab_size``, ``max_seq_len``,
    ``batch_size``, ``learning_rate``, ``weight_decay``, ``num_epoch``,
    ``skip_step``, ``valid_step``, ``seed`` and ``checkpoints``.

    Every ``valid_step`` steps the model is evaluated on the dev set, and
    ``best.pdparams`` is written whenever the dev F1 improves.
    ``final.pdparams`` is written once after the last epoch.
    """
    paddle.set_device('gpu')
    # Apply the configured seed before the model is built so runs are
    # repeatable as far as Paddle's random generator is concerned.
    paddle.seed(seed)
    no_entity_label="O"
    ignore_label=-1
    label_map=load_dict(tag_path)
    id2label={val:key for key,val in label_map.items()}
    vocab_num,emb_size,hidden_size,num_layers,num_labels,dropout=vocab_size,256,256,2,len(id2label),0.1

    model=LSTM_Model(vocab_num,emb_size,hidden_size,num_layers,num_labels,dropout)

    # Only the train and dev splits are used here; the test split is not loaded.
    train_ds=DuEventExtraction(train_data,tag_path)
    dev_ds=DuEventExtraction(dev_data,tag_path)

    trans_func=partial(
        convert_example_to_feature,
        label_vocab=train_ds.label_vocab,
        max_seq_len=max_seq_len,
        no_entity_label=no_entity_label,
        ignore_label=ignore_label,
        is_test=False
    )

    # Each sample becomes (input_ids, label_ids, seq_len): the first two are
    # padded per batch, the lengths are stacked.
    batchify_fn=lambda samples,fn=Tuple(
        Pad(axis=0,pad_val=0),
        Pad(axis=0,pad_val=0),
        Stack()
    ):fn(list(map(trans_func,samples)))

    batch_sampler=paddle.io.DistributedBatchSampler(train_ds,batch_size=batch_size,shuffle=True)

    train_loader=paddle.io.DataLoader(
        dataset=train_ds,
        batch_sampler=batch_sampler,
        collate_fn=batchify_fn
    )

    dev_loader=paddle.io.DataLoader(
        dataset=dev_ds,
        batch_size=batch_size,
        collate_fn=batchify_fn
    )

    num_training_steps=len(train_loader)*num_epoch
    # Weight decay is applied to every parameter whose name contains neither 'bias' nor 'norm'.
    decay_params=[p.name for n,p in model.named_parameters() if not any(nd in n for nd in ['bias','norm'])]

    optimizer=paddle.optimizer.AdamW(
        learning_rate=learning_rate,
        parameters=model.parameters(),
        weight_decay=weight_decay,
        apply_decay_param_fun=lambda x : x in decay_params
    )

    metric=ChunkEvaluator(label_list=train_ds.label_vocab.keys(),suffix=False)
    criterion=LinearChainCrfLoss(model.crf)

    step,best_f1=0,0.0
    model.train()
    for epoch in range(num_epoch):
        for idx,(input_ids,labels,seq_lens) in enumerate(train_loader):
            outputs,logits=model(input_ids,seq_lens,labels)
            loss=criterion(inputs=outputs,lengths=seq_lens,labels=labels)
            loss=paddle.mean(loss)
            loss.backward()
            optimizer.step()
            optimizer.clear_grad()
            loss_item=loss.numpy().item()

            if step>0 and step%skip_step==0:
                print(f'train epoch:{epoch} - step:{step} (total:{num_training_steps}) - loss:{loss_item:.6f}')

            if step>0 and step%valid_step==0:
                p,r,f1,avg_loss=evaluate(model,criterion,metric,len(label_map),dev_loader)
                print(f'dev step:{step} - loss:{avg_loss:.5f},precision:{p:.5f},recall:{r:.5f},f1:{f1:.5f},current best {best_f1:.5f}')
                if f1>best_f1:
                    best_f1=f1
                    print(f'=======================save best model \nbest performance {best_f1:.5f}')
                    paddle.save(model.state_dict(),'{}/best.pdparams'.format(checkpoints))

            step+=1

    paddle.save(model.state_dict(),"{}/final.pdparams".format(checkpoints))
    


In [17]:
@paddle.no_grad()
def evaluate(model,criterion,metric,num_label,data_loader):
    """Evaluate ``model`` on ``data_loader`` and return chunk-level metrics and mean loss.

    Args:
        model: Callable as ``model(input_ids, seq_lens, labels)`` returning
            ``(outputs, predictions)``.
        criterion: Loss called as ``criterion(inputs=, lengths=, labels=)``.
        metric: Chunk evaluator; it is reset at the start and updated per batch.
        num_label: Accepted for interface compatibility; not used.
        data_loader: Iterable of ``(input_ids, labels, seq_lens)`` batches.

    Returns:
        tuple: ``(precision, recall, f1_score, avg_loss)``.
    """
    model.eval()
    metric.reset()
    losses=[]
    try:
        for input_ids,labels,seq_lens in data_loader:
            outputs,preds=model(input_ids,seq_lens,labels)
            n_infer,n_label,n_correct=metric.compute(None,seq_lens,preds,labels)
            metric.update(n_infer.numpy(),n_label.numpy(),n_correct.numpy())

            loss=paddle.mean(
                criterion(inputs=outputs,lengths=seq_lens,labels=labels)
            )
            # .item() works for both a 1-element and a 0-D loss array, and
            # matches how do_train reads the loss value.
            losses.append(loss.numpy().item())

        avg_loss=np.mean(losses)
        precision,recall,f1_score=metric.accumulate()
    finally:
        # Always hand the model back in training mode, even if evaluation failed.
        model.train()
    return precision,recall,f1_score,avg_loss

# 训练事件识别模型

In [18]:
import numpy as np

In [19]:
from paddlenlp.layers import LinearChainCrfLoss,ViterbiDecoder,LinearChainCrf
from functools import partial
from paddlenlp.data import Stack,Tuple,Pad
from paddlenlp.metrics import ChunkEvaluator

In [20]:
def do_predict(max_samples=100):
    """Tag the sentences of ``predict_data`` with the checkpoint at ``init_ckpt``.

    Configuration is read from module-level names: ``tag_path``,
    ``vocab_size``, ``init_ckpt``, ``predict_data``, ``max_seq_len``,
    ``batch_size``, ``skip_step`` and ``predict_save_path``.

    Each input record gets a ``"pred"`` entry of the form
    ``{"label": [...]}`` with one tag per (truncated) character. The first
    50 serialized records are printed. If ``predict_save_path`` is set, all
    serialized records are also written there, one JSON object per line.

    Args:
        max_samples: Only the first ``max_samples`` records are predicted
            (default 100). Pass ``None`` to predict the whole file.

    Raises:
        Exception: If ``init_ckpt`` is empty or is not an existing file.
    """
    paddle.set_device('gpu')
    label_map=load_dict(tag_path)
    id2label={val:key for key,val in label_map.items()}
    vocab_num,emb_size,hidden_size,num_layers,num_labels,dropout=vocab_size,256,256,2,len(id2label),0.1
    model=LSTM_Model(vocab_num,emb_size,hidden_size,num_layers,num_labels,dropout)

    if not init_ckpt or not os.path.isfile(init_ckpt):
        raise Exception("init checkpoints {} not exist".format(init_ckpt))
    state_dict=paddle.load(init_ckpt)
    model.set_dict(state_dict)
    print("Loaded parameters from {}".format(init_ckpt))

    sentences=[json.loads(sent) for sent in read_by_lines(predict_data)]
    if max_samples is not None:
        # Cut the input before encoding so skipped records cost nothing.
        sentences=sentences[:max_samples]

    encoded_inputs_list=[]
    for sent in sentences:
        # Apply the same normalisation data_process applies to the training
        # text (lowercase; space/newline/tab -> ','), so inference sees the
        # same character ids the model was trained on.
        chars=["," if t==' ' or t=='\n' or t=='\t' else t for t in sent['text'].lower()]
        # convert_example_to_feature returns (input_ids, seq_len) in test mode.
        encoded_inputs_list.append(
            convert_example_to_feature([chars,[],len(chars)],max_seq_len=max_seq_len,is_test=True)
        )

    batchify_fn=lambda samples,fn=Tuple(
        Pad(axis=0,pad_val=0),
        Stack()
    ):fn(samples)

    results=[]
    step=0
    model.eval()
    # Inference only: no gradients are needed.
    with paddle.no_grad():
        for begin in range(0,len(encoded_inputs_list),batch_size):
            input_ids,seq_lens=batchify_fn(encoded_inputs_list[begin:begin+batch_size])
            input_ids=paddle.to_tensor(input_ids)
            seq_lens=paddle.to_tensor(seq_lens)

            _,decoded=model(input_ids,seq_lens)
            for p_ids,seq_len in zip(decoded.numpy().tolist(),seq_lens.numpy().tolist()):
                # Keep only the tags of real (unpadded) positions.
                results.append({'label':[id2label[pid] for pid in p_ids[:seq_len]]})
                if step>0 and step%skip_step==0:
                    print(f' step:{step}')
                step+=1

    assert len(results) == len(sentences)
    for sent,ret in zip(sentences,results):
        sent["pred"]=ret

    sentences=[json.dumps(sent,ensure_ascii=False) for sent in sentences]

    if predict_save_path:
        write_by_lines(predict_save_path,sentences)

    for sent in sentences[:50]:
        print('='*10)
        print(sent)



In [21]:
# Point the shared module-level settings at the trigger task; do_train and
# do_predict read them as globals.
base_dir='./data/data158120'
tag_path="{}/trigger_tag.dict".format(base_dir)

data_dir="{}/trigger".format(base_dir)
train_data="{}/train.tsv".format(data_dir)
dev_data="{}/dev.tsv".format(data_dir)
test_data="{}/test.tsv".format(data_dir)

predict_data="{}/test.json".format(base_dir)

# do_predict loads init_ckpt, i.e. the final.pdparams file that do_train
# writes into the checkpoints directory after the last epoch.
checkpoints="{}/trigger/".format(base_dir)
init_ckpt="{}/trigger/final.pdparams".format(base_dir)


# Train the trigger tagger, then run prediction with the freshly saved weights.
do_train()
do_predict()

W0802 19:38:27.820214  1251 gpu_resources.cc:61] Please NOTE: device: 0, GPU Compute Capability: 7.0, Driver API Version: 11.2, Runtime API Version: 10.1
W0802 19:38:27.825275  1251 gpu_resources.cc:91] device: 0, cuDNN Version: 7.6.


train epoch:0 - step:50 (total:3740) - loss:19.060974
train epoch:0 - step:100 (total:3740) - loss:17.893379
train epoch:0 - step:150 (total:3740) - loss:12.975210
train epoch:0 - step:200 (total:3740) - loss:12.161092
train epoch:0 - step:250 (total:3740) - loss:11.108285
train epoch:0 - step:300 (total:3740) - loss:10.496643
train epoch:0 - step:350 (total:3740) - loss:10.395264
train epoch:1 - step:400 (total:3740) - loss:12.870819
train epoch:1 - step:450 (total:3740) - loss:6.155119
train epoch:1 - step:500 (total:3740) - loss:8.062029




dev step:500 - loss:7.47829,precision:0.46992,recall:0.39898,f1:0.43155current best 0.00000
best performerence 0.43155
train epoch:1 - step:550 (total:3740) - loss:5.317844
train epoch:1 - step:600 (total:3740) - loss:5.892819
train epoch:1 - step:650 (total:3740) - loss:4.675808
train epoch:1 - step:700 (total:3740) - loss:4.136090
train epoch:2 - step:750 (total:3740) - loss:2.785054
train epoch:2 - step:800 (total:3740) - loss:5.592870
train epoch:2 - step:850 (total:3740) - loss:2.949601
train epoch:2 - step:900 (total:3740) - loss:3.880248
train epoch:2 - step:950 (total:3740) - loss:4.166456
train epoch:2 - step:1000 (total:3740) - loss:1.769350
dev step:1000 - loss:3.57513,precision:0.76248,recall:0.67594,f1:0.71661current best 0.43155
best performerence 0.71661
train epoch:2 - step:1050 (total:3740) - loss:3.462531
train epoch:2 - step:1100 (total:3740) - loss:3.368574
train epoch:3 - step:1150 (total:3740) - loss:1.503188
train epoch:3 - step:1200 (total:3740) - loss:2.465120


# 训练论元角色识别模型

In [22]:
# Re-point the shared module-level settings at the argument-role task; this
# overwrites the trigger-task values set by the previous train/predict cell.
base_dir='./data/data158120'
tag_path="{}/role_tag.dict".format(base_dir)

data_dir="{}/role".format(base_dir)
train_data="{}/train.tsv".format(data_dir)
dev_data="{}/dev.tsv".format(data_dir)
test_data="{}/test.tsv".format(data_dir)

predict_data="{}/test.json".format(base_dir)

# do_predict loads init_ckpt, i.e. the final.pdparams file that do_train
# writes into the checkpoints directory after the last epoch.
checkpoints="{}/role/".format(base_dir)
init_ckpt="{}/role/final.pdparams".format(base_dir)

# Train the role tagger, then run prediction with the freshly saved weights.
do_train()
do_predict()

train epoch:0 - step:50 (total:4350) - loss:89.512520
train epoch:0 - step:100 (total:4350) - loss:74.403778
train epoch:0 - step:150 (total:4350) - loss:85.160172
train epoch:0 - step:200 (total:4350) - loss:94.533218
train epoch:0 - step:250 (total:4350) - loss:73.825500
train epoch:0 - step:300 (total:4350) - loss:70.997116
train epoch:0 - step:350 (total:4350) - loss:62.695999
train epoch:0 - step:400 (total:4350) - loss:73.339340
train epoch:1 - step:450 (total:4350) - loss:54.493034
train epoch:1 - step:500 (total:4350) - loss:48.933960




dev step:500 - loss:51.74525,precision:0.13764,recall:0.06313,f1:0.08656current best 0.00000
best performerence 0.08656
train epoch:1 - step:550 (total:4350) - loss:54.520622
train epoch:1 - step:600 (total:4350) - loss:41.895187
train epoch:1 - step:650 (total:4350) - loss:40.107841
train epoch:1 - step:700 (total:4350) - loss:43.367210
train epoch:1 - step:750 (total:4350) - loss:40.634727
train epoch:1 - step:800 (total:4350) - loss:30.044048
train epoch:1 - step:850 (total:4350) - loss:31.468550
train epoch:2 - step:900 (total:4350) - loss:29.002758
train epoch:2 - step:950 (total:4350) - loss:27.489361
train epoch:2 - step:1000 (total:4350) - loss:32.105904
dev step:1000 - loss:35.12398,precision:0.27836,recall:0.22701,f1:0.25008current best 0.08656
best performerence 0.25008
train epoch:2 - step:1050 (total:4350) - loss:30.729368
train epoch:2 - step:1100 (total:4350) - loss:32.741585
train epoch:2 - step:1150 (total:4350) - loss:29.099920
train epoch:2 - step:1200 (total:4350) -

# 预测

# 触发器预测

In [23]:
# base_dir='./data/data158120'
# tag_path="{}/trigger_tag.dict".format(base_dir)

# data_dir="{}/trigger".format(base_dir)
# train_data="{}/train.tsv".format(data_dir)
# dev_data="{}/dev.tsv".format(data_dir)
# test_data="{}/test.tsv".format(data_dir)

# predict_data="{}/test.json".format(base_dir)

# checkpoints="{}/trigger/".format(base_dir)
# init_ckpt="{}/trigger/best.pdparams".format(base_dir)



# import numpy as np
# from paddlenlp.layers import LinearChainCrfLoss,ViterbiDecoder,LinearChainCrf
# from functools import partial
# from paddlenlp.data import Stack,Tuple,Pad
# from paddlenlp.metrics import ChunkEvaluator
# import paddle
# do_predict()

# 角色预测

In [24]:
# base_dir='./data/data158120'
# tag_path="{}/role_tag.dict".format(base_dir)

# data_dir="{}/role".format(base_dir)
# train_data="{}/train.tsv".format(data_dir)
# dev_data="{}/dev.tsv".format(data_dir)
# test_data="{}/test.tsv".format(data_dir)

# predict_data="{}/test.json".format(base_dir)

# checkpoints="{}/role/".format(base_dir)
# init_ckpt="{}/role/final.pdparams".format(base_dir)

# do_predict()