In [1]:
from process import event_preprocess, argument_preprocess,load_event, load_argument, Schema2Label
from src.seqlabel_utils import pos2bio
from src.preprocess.str_utils import get_useless_chars
from process import text_preprocess, gen_pos
from src.dataset.converter import single_text, double_text
import pandas as pd 
import ast 
import numpy as np 

In [2]:
# Load the three DuEE event splits (train / dev / test) in that order.
train_e, valid_e, test_e = map(load_event, (
    './trainsample/duee_train.json',
    './trainsample/duee_dev.json',
    './trainsample/duee_test2.json',
))
# Schema object; later cells read its `event_label` mapping.
schema = Schema2Label('./trainsample/duee_event_schema.json')
# Cell output: the (rows, columns) shape of each split.
tuple(frame.shape for frame in (train_e, valid_e, test_e))

((11908, 3), (1492, 3), (34904, 2))

In [3]:
# Pool the raw `text` column of every split (train, then dev, then test) into
# one flat list and hand it to get_useless_chars.
# NOTE(review): presumably the result is the set of characters the preprocess
# helpers strip — confirm against src.preprocess.str_utils.
useless_chars = get_useless_chars([
    text
    for frame in (train_e, valid_e, test_e)
    for text in frame['text'].values.tolist()
])

## pipeline: EventType + Arguments

In [4]:
# Run event_preprocess on each split, in train / dev / test order.
# The names are rebound to the processed frames, so re-running this cell
# feeds already-processed frames back into event_preprocess.
train_e, valid_e, test_e = [
    event_preprocess(frame, useless_chars)
    for frame in (train_e, valid_e, test_e)
]

0 out of 11908 trigger not match
0 out of 11908 even pos exceed text line
0 out of 1492 trigger not match
0 out of 1492 even pos exceed text line


### 事件抽取方案1：Trigger抽取（序列标注问题）

In [None]:
# Export every split for trigger extraction framed as sequence labelling.
# `single_text` is already imported in the notebook's first cell, so it is not
# re-imported here (imports stay in one place at the top).
# NOTE(review): presumably single_text writes one dataset file per call, named
# by the last argument inside the given directory — confirm in
# src.dataset.converter.
single_text(train_e['id'].values, train_e['clean_text'].values, train_e['event_pos'].values,
            './trainsample', 'train_event_bio')
single_text(valid_e['id'].values, valid_e['clean_text'].values, valid_e['event_pos'].values,
            './trainsample', 'valid_event_bio')
# No labels are passed for the test split.
single_text(test_e['id'].values, test_e['clean_text'].values, None,
            './trainsample', 'test_event_bio')

### 事件抽取方案2：多标签分类问题

In [None]:
# Export every split for event detection framed as multi-label classification.
# Each entry: (frame, label column or None, output name). The test split is
# exported without labels.
for frame, label_col, out_name in (
    (train_e, 'event_label', 'train_event_cls'),
    (valid_e, 'event_label', 'valid_event_cls'),
    (test_e, None, 'test_event_cls'),
):
    labels = None if label_col is None else frame[label_col].values
    single_text(frame['id'].values, frame['clean_text'].values, labels,
                './trainsample', out_name)

### 事件抽取方案3：column selection

In [6]:
# Build the "column selection" prompt: every event name from the schema,
# each preceded by the '[unused1]' marker, concatenated into one string.
# Schema keys look like '<class>-<event>'; only the part after the first '-'
# is kept.
event_names = [label.split('-')[1] for label in schema.event_label]

# pos_list[k] is the running offset of the k-th marker. Every entry advances
# by len(event) + 1, i.e. the marker is counted as a single position.
# NOTE(review): presumably because the tokenizer maps '[unused1]' to one
# token — confirm against the model vocabulary.
pos_list = [0]
for name in event_names:
    pos_list.append(pos_list[-1] + len(name) + 1)
# Kept for backward compatibility: `pos` holds the final cumulative offset.
pos = pos_list[-1]

# `enhancement` is assigned exactly once, as the final prompt string.
enhancement = ''.join('[unused1]' + name for name in event_names)
print(','.join(str(i) for i in pos_list))
print(enhancement)

0,6,9,12,15,18,21,24,27,30,33,36,39,42,45,48,51,54,57,60,63,66,69,72,75,78,83,86,89,92,95,98,101,104,107,110,113,116,119,122,125,128,131,134,137,140,143,146,149,152,155,160,163,166,169,174,177,180,183,186,189,192,195,198,201,204
[unused1]出售/收购[unused1]跌停[unused1]加息[unused1]降价[unused1]降息[unused1]融资[unused1]上市[unused1]涨价[unused1]涨停[unused1]发布[unused1]获奖[unused1]上映[unused1]下架[unused1]召回[unused1]道歉[unused1]点赞[unused1]感谢[unused1]会见[unused1]探班[unused1]夺冠[unused1]晋级[unused1]禁赛[unused1]胜负[unused1]退赛[unused1]退役[unused1]产子/女[unused1]出轨[unused1]订婚[unused1]分手[unused1]怀孕[unused1]婚礼[unused1]结婚[unused1]离婚[unused1]庆生[unused1]求婚[unused1]失联[unused1]死亡[unused1]罚款[unused1]拘捕[unused1]举报[unused1]开庭[unused1]立案[unused1]起诉[unused1]入狱[unused1]约谈[unused1]爆炸[unused1]车祸[unused1]地震[unused1]洪灾[unused1]起火[unused1]坍/垮塌[unused1]袭击[unused1]坠机[unused1]裁员[unused1]辞/离职[unused1]加盟[unused1]解雇[unused1]解散[unused1]解约[unused1]停职[unused1]退出[unused1]罢工[unused1]闭幕[unused1]开幕[unused1]游行


In [7]:
# Export every split for the column-selection formulation: the same
# `enhancement` prompt is repeated once per row and paired with the cleaned
# sentence.
double_text(
    train_e['id'].values,
    [enhancement] * len(train_e),
    train_e['clean_text'].values,
    train_e['event_label'].values,
    './trainsample',
    'train_event_slot',
)
double_text(
    valid_e['id'].values,
    [enhancement] * len(valid_e),
    valid_e['clean_text'].values,
    valid_e['event_label'].values,
    './trainsample',
    'valid_event_slot',
)
# No labels are passed for the test split.
double_text(
    test_e['id'].values,
    [enhancement] * len(test_e),
    test_e['clean_text'].values,
    None,
    './trainsample',
    'test_event_slot',
)

### Argument抽取

In [None]:
# Load the argument-level view of the train and dev splits.
train_a = load_argument('./trainsample/duee_train.json')
valid_a = load_argument('./trainsample/duee_dev.json')
# Only the two frames defined so far are shown: `test_a` is created in a
# later cell from the event predictions, so referencing it here would raise
# NameError on a fresh kernel (Restart & Run All).
train_a.shape, valid_a.shape

In [None]:
# Run argument_preprocess on the train and dev argument frames, in that order.
# The names are rebound to the processed frames, so re-running this cell
# feeds already-processed frames back into argument_preprocess.
train_a, valid_a = [
    argument_preprocess(frame, useless_chars)
    for frame in (train_a, valid_a)
]

In [None]:
# Build the test-side argument input from the event model's predictions.
# NOTE(review): `extract_multilabel` is not imported in any cell visible in
# this notebook; it must be in scope for this cell to run on a fresh kernel.
test_a = pd.read_csv('./trainsample/test_event_pred.csv', index_col=None)
# The `pred` column is stored as the string form of a Python literal.
test_a['pred'] = test_a['pred'].map(ast.literal_eval)

# Invert schema.event_label (value -> key) once, instead of rebuilding the
# same dict for every row inside the lambda.
label_lookup = {j: i for i, j in schema.event_label.items()}
# NOTE(review): 0.5 is presumably the score threshold — confirm against
# extract_multilabel's signature.
test_a['pred_label'] = test_a['pred'].map(
    lambda scores: extract_multilabel(scores, label_lookup, 0.5, greedy=True)
)

# One row per predicted label.
test_a = test_a.explode('pred_label')
# Prefix each sentence with its predicted label.
# NOTE(review): this reads column `text_clean`, while the event frames use
# `clean_text` — confirm the CSV's column name.
test_a['event_text'] = test_a.apply(
    lambda row: row.pred_label + '-' + row.text_clean, axis=1
)


In [None]:
# Export every split for argument extraction. Each entry: (frame, label
# column or None, output name). The test split is exported without labels.
for frame, label_col, out_name in (
    (train_a, 'argument_bio_label', 'train_argument'),
    (valid_a, 'argument_bio_label', 'valid_argument'),
    (test_a, None, 'test_argument'),
):
    labels = None if label_col is None else frame[label_col].values
    single_text(frame['id'].values, frame['event_text'].values, labels,
                './trainsample', out_name)