In [2]:
import argparse
import random

import torch
import torch.nn.functional as F
import torch.optim as optim
from fastNLP import Trainer, LossBase

from model.callback import MyCallBack
from model.casRel import CasRel
from model.config import Config
from model.data import load_data, get_data_iterator
from model.evaluate import metric

# Seed every random number generator this notebook draws from so that a
# Restart & Run All is repeatable: torch on the model side, and Python's
# `random`, which the data-cleaning and train/dev/test split cells use.
seed = 226
random.seed(seed)
torch.manual_seed(seed)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hyper-parameters are declared through argparse and handed to Config below.
parser = argparse.ArgumentParser(description='Model Controller')
parser.add_argument('--lr', type=float, default=1e-5, help='learning rate')
parser.add_argument('--batch_size', type=int, default=8)
parser.add_argument('--max_epoch', type=int, default=10)
parser.add_argument('--max_len', type=int, default=300)
parser.add_argument('--dataset', default='finData', type=str, help='define your own dataset names')
parser.add_argument("--bert_name", default='./pretrained_models/bert-base-chinese/', type=str, help='choose pretrained bert name')
parser.add_argument('--bert_dim', default=768, type=int)
# An explicit empty argument list keeps argparse away from the notebook
# kernel's own command line, so every option takes its default value.
args = parser.parse_args(args=[])
con = Config(args)

In [3]:
def loss_fn(pred, gold, mask):
    """Masked binary cross-entropy, normalised by the number of unmasked positions.

    Args:
        pred: predicted probabilities; a trailing singleton dimension, if
            present, is squeezed away before comparison with ``gold``.
        gold: target tensor with the shape of the squeezed ``pred``.
        mask: weights multiplied into the element-wise loss. When its shape
            differs from the loss it gets a trailing axis so that it
            broadcasts over the last dimension of the loss.

    Returns:
        A scalar tensor: sum of the masked element-wise loss divided by the
        sum of the mask.
    """
    per_element = F.binary_cross_entropy(pred.squeeze(-1), gold, reduction='none')
    weights = mask if per_element.shape == mask.shape else mask.unsqueeze(-1)
    return torch.sum(per_element * weights) / torch.sum(weights)

def get_loss(predict, target):
    """Total CasRel training loss.

    Adds up the masked loss of the four tagging outputs (subject heads,
    subject tails, object heads, object tails), all weighted with the same
    ``target['mask']``.

    Args:
        predict: mapping holding the model outputs under the keys
            'sub_heads', 'sub_tails', 'obj_heads' and 'obj_tails'.
        target: mapping holding the gold tensors under the same keys plus
            'mask'.

    Returns:
        The sum of the four ``loss_fn`` values.
    """
    mask = target['mask']
    total = None
    for key in ('sub_heads', 'sub_tails', 'obj_heads', 'obj_tails'):
        term = loss_fn(predict[key], target[key], mask)
        total = term if total is None else total + term
    return total



In [4]:
# Build the network from the shared config and move it to the chosen device.
model = CasRel(con).to(device)

# Load the three splits together with the relation vocabulary, then wrap each
# split in an iterator; dev and test are built with is_test=True.
data_bundle, rel_vocab = load_data(
    con.train_path,
    con.dev_path,
    con.test_path,
    con.rel_path,
)
train_dataset = get_data_iterator(con, data_bundle.get_dataset('train'), rel_vocab)
dev_dataset = get_data_iterator(
    con, data_bundle.get_dataset('dev'), rel_vocab, is_test=True
)
test_dataset = get_data_iterator(
    con, data_bundle.get_dataset('test'), rel_vocab, is_test=True
)

# Only parameters that still require gradients are handed to Adam.
optimizer = optim.Adam(
    (param for param in model.parameters() if param.requires_grad),
    lr=con.lr,
)

Some weights of the model checkpoint at ./pretrained_models/bert-base-chinese/ were not used when initializing BertModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at ./pretrained_models/bert-base-chinese/ and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
from tqdm import tqdm
def train_epoch(train_loader, model, optimizer, epoch):
    """Run one optimisation pass over ``train_loader``, then score the dev set.

    Args:
        train_loader: iterable of batches; element 0 of a batch holds the
            model inputs and element 1 the gold targets.
        model: network called with token ids, mask and the sampled subject
            head/tail tensors of each batch.
        optimizer: optimizer stepped once per batch.
        epoch: epoch number, used only in the progress message.

    Side effects:
        Prints the mean training loss of the epoch and calls ``metric`` on the
        notebook-level ``dev_dataset`` / ``rel_vocab`` / ``con``; the value
        returned by ``metric`` is not returned to the caller.
    """
    # Declared global so the most recent batch stays inspectable from other cells.
    global train_data, batch_samples

    model.train()  # switch the model to training mode
    train_losses = 0
    for batch_samples in tqdm(train_loader):  # tqdm only renders the progress bar
        train_data, target_data = batch_samples[0], batch_samples[1]

        # Forward pass and loss for this batch.
        predict = model(
            train_data['token_ids'],
            train_data['mask'],
            train_data['sub_head'],
            train_data['sub_tail'],
        )
        loss = get_loss(predict, target_data)
        train_losses += loss.item()

        # Clear stale gradients, back-propagate, then update the weights.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    train_loss = float(train_losses) / len(train_loader)
    print("Epoch: {}, train loss: {}".format(epoch, train_loss))

    # Evaluate on the dev split after every epoch.
    metric(dev_dataset, rel_vocab, con, model)


    

    



#best_val_f1 = 0.0
#patience_counter = 0
# start training
# Train for a fixed number of epochs and checkpoint the weights after every
# second epoch.
epoch_num = 8
for epoch in range(1, epoch_num + 1):
    train_epoch(train_dataset, model, optimizer, epoch)
    if epoch % 2 == 0:
        # Name the file after the epoch that just finished. Formatting with
        # `epoch_num` made every checkpoint overwrite the same file.
        torch.save(model.state_dict(), './net_epoch{}.pth'.format(epoch))
    # val_metrics = evaluate(dev_loader, model, mode='dev')
    # val_f1 = val_metrics['f1']
    # val_p = val_metrics['p']
    # val_r = val_metrics['r']
#     logging.info("Epoch: {}, dev loss: {}, f1 score: {}, precision: {}, recall: {}".format(epoch, val_metrics['loss'], val_f1, val_p, val_r))
#     improve_f1 = val_f1 - best_val_f1
#     if improve_f1 > 1e-5:
#         best_val_f1 = val_f1
#         model.save_pretrained(model_dir)
#         logging.info("--------Save best model!--------")
#         if improve_f1 < config.patience:
#             patience_counter += 1
#         else:
#             patience_counter = 0
#     else:
#         patience_counter += 1
#     # Early stopping and logging best f1
#     if (patience_counter >= config.patience_num and epoch > config.min_epoch_num) or epoch == config.epoch_num:
#         logging.info("Best val f1: {}".format(best_val_f1))
#         break
# logging.info("Training Finished!")

  1%|          | 4/776 [00:18<1:00:28,  4.70s/it]


KeyboardInterrupt: 

100%|██████████| 36/36 [00:02<00:00, 13.10it/s]

correct_num:   0, predict_num:   0, gold_num:  63
f1: 0.00, precision: 0.00, recall: 0.00





(0.0, 0.0, 0.0)

In [20]:
from transformers import BertTokenizer
# Exploratory cell: run the subject tagger over the dev iterator and decode the
# predicted subject spans back to text.
# `orders` and the three counters are initialised here but never updated in
# this cell.
orders = ['subject', 'relation', 'object']
correct_num, predict_num, gold_num = 0, 0, 0
tokenizer = BertTokenizer.from_pretrained(con.bert_name)

for batch_x, batch_y in tqdm(dev_dataset):  # batch_x: model inputs, batch_y: gold annotations
    with torch.no_grad():
        token_ids = batch_x['token_ids']
        mask = batch_x['mask']
        encoded_text = model.get_encoded_text(token_ids, mask)
        pred_sub_heads, pred_sub_tails = model.get_subs(encoded_text)  # predict subject heads/tails
        # Only element 0 of the batch is decoded.
        # NOTE(review): heads use a 0.05 threshold while tails use 0.5 - looks
        # like a temporary debugging value; confirm which one is intended.
        sub_heads = torch.where(pred_sub_heads[0] > 0.05)[0]
        # if len(sub_heads)>0:
        #     print(sub_heads)
        sub_tails = torch.where(pred_sub_tails[0] > 0.5)[0]
        # Re-created for every batch, so after the loop it only holds the
        # spans found in the last batch.
        subjects = []
        for sub_head in sub_heads:
            # Pair each head with the nearest tail at or after it.
            sub_tail = sub_tails[sub_tails >= sub_head]
            if len(sub_tail) > 0:
                sub_tail = sub_tail[0]
                # Decode the token span and strip the whitespace left by decode().
                subject = ''.join(tokenizer.decode(token_ids[0][sub_head: sub_tail + 1]).split())
                subjects.append((subject, sub_head, sub_tail))

100%|██████████| 36/36 [00:03<00:00, 11.06it/s]


[]

In [3]:
from random import choice
from transformers import BertTokenizer
from transformers import BertModel
from collections import defaultdict
def find_head_idx(source, target, start=0):
    """Locate the first occurrence of the sequence ``target`` inside ``source``.

    Args:
        source: sequence to search in (e.g. the token ids of a sentence).
        target: sub-sequence to look for (e.g. the token ids of an entity).
        start: index at which the search begins; defaults to 0. Passing the
            position after a previous hit finds later occurrences. Negative
            values are treated as 0.

    Returns:
        The index in ``source`` where ``target`` begins, or -1 when there is
        no match. An empty ``target`` never matches: treating it as a hit at
        index 0 would yield a span whose end index is -1.
    """
    target_len = len(target)
    if target_len == 0:
        return -1
    # Stop at the last position where the whole target can still fit.
    last_start = len(source) - target_len
    for i in range(max(start, 0), last_start + 1):
        if source[i: i + target_len] == target:
            return i
    return -1

# Walk through the label encoding of a single training sample by hand.
# NOTE(review): `bert` is loaded here but not referenced in the visible part
# of this cell.
tokenizer = BertTokenizer.from_pretrained(con.bert_name)
bert = BertModel.from_pretrained(con.bert_name)

# Sample 223 of the training split is used as the worked example.
json_data = data_bundle.get_dataset('train')[223]
tokenized = tokenizer(json_data['text'])
tokens = tokenized['input_ids'] # token ids of the whole sentence
masks = tokenized['attention_mask']
text_len = len(tokens)

token_ids = torch.tensor(tokens, dtype=torch.long)
masks = torch.tensor(masks, dtype=torch.bool)
# The bare string below is a no-op note meaning "records of the start/end
# positions of subjects and objects".
"""主体和客体起始位置的记录"""
# sub_heads / sub_tails mark every subject of the sentence; sub_head / sub_tail
# mark only the one subject sampled further down in this cell.
sub_heads, sub_tails = torch.zeros(text_len), torch.zeros(text_len)
sub_head, sub_tail = torch.zeros(text_len), torch.zeros(text_len)
# One column per relation: (text_len, num_relations).
obj_heads = torch.zeros((text_len, con.num_relations))
obj_tails = torch.zeros((text_len, con.num_relations))

# Map each subject span (head_idx, tail_idx) to the list of
# (object_head_idx, object_tail_idx, relation_id) triples attached to it.
s2ro_map = defaultdict(list)  # missing keys start out as an empty list, so append works directly
for spo in json_data['spo_list']:
    triple = (tokenizer(spo['subject'], add_special_tokens=False)['input_ids'], 
                rel_vocab.to_index(spo['predicate']),
                tokenizer(spo['object'], add_special_tokens=False)['input_ids']) # text -> ids for the triple, without the [CLS]/[SEP] special tokens
    # The bare string below is a no-op note. In English:
    # - ISSUE: when a word occurs several times only its first position is found.
    # - SOLUTION: remember whether the word was already seen and, if so, keep
    #   searching from the recorded position onwards.
    # - WHY: leaving it as is may be acceptable because a subject should mean
    #   the same thing throughout the text, but since BERT is context
    #   sensitive it is probably better to fix; unsure, check the validation
    #   results - not a serious problem.
    """
    - ISSUE: 如果某一个词语多次出现则只能找到第一个位置，这里有问题
    - SOLUTION: 再加一个变量记录这个词语是否出现过，如果出现过就记录他的位置，然后从这个位置开始往后找
    - WHY: 其实不改也可以，因为一个主体在文本中的意思应该是一样的。但是因为BERT模型会考虑前后文信息所以最好还是改一下？不确定，看验证结果，不是很严重的问题。
    """
    sub_head_idx = find_head_idx(tokens, triple[0])
    obj_head_idx = find_head_idx(tokens, triple[2])
    # No-op note below: "could try an assert check + abort here".
    """可以试一下assert判断+终止"""
    # Triples whose subject or object cannot be located in the sentence are skipped.
    if sub_head_idx != -1 and obj_head_idx != -1:
        sub = (sub_head_idx, sub_head_idx + len(triple[0]) - 1) # subject span (inclusive end)
        s2ro_map[sub].append(
            (obj_head_idx, obj_head_idx + len(triple[2]) - 1, triple[1]))  # append lets one subject carry several objects: object span + relation id

if s2ro_map:  # the sample may have produced no usable triple
    # Mark the head and tail position of every subject.
    for s in s2ro_map:
        sub_heads[s[0]] = 1
        sub_tails[s[1]] = 1
    # Pick one subject at random and encode only its objects.
    sub_head_idx, sub_tail_idx = choice(list(s2ro_map.keys()))
    sub_head[sub_head_idx] = 1
    sub_tail[sub_tail_idx] = 1
    for ro in s2ro_map.get((sub_head_idx, sub_tail_idx), []):
        # Row = token position, column = relation id.
        obj_heads[ro[0]][ro[2]] = 1
        obj_tails[ro[1]][ro[2]] = 1

Some weights of the model checkpoint at ./pretrained_models/bert-base-chinese/ were not used when initializing BertModel: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at ./pretrained_models/bert-base-chinese/ and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [35]:
import json
# Collect every annotated sample of the JSON-lines export: one JSON document
# per line, relation samples under result.data.normal and event samples under
# result.data.cangu.
relation = []
event = []
with open('./data/data2.0.json', 'r', encoding='utf-8') as json_file:
    # Stream the file line by line instead of materialising it with readlines().
    for line in json_file:
        line = line.strip()
        if not line:
            # A blank line is not a JSON document; json.loads would raise on it.
            continue
        json_line = json.loads(line)
        relation += json_line['result']['data']['normal']
        event += json_line['result']['data']['cangu']

In [43]:
len(relation)
import random


In [45]:
def clean_relations(relations, generic_heads=('公司', '本公司'), keep_prob=0.2, rng=None):
    """Drop unlabelled samples and down-sample those with a generic head entity.

    A sample without any entry in ``item['results']`` is always discarded.
    For the remaining samples, every relation whose ``head`` is one of
    ``generic_heads`` triggers one random draw; the sample is discarded as
    soon as a draw exceeds ``keep_prob``. A sample carrying several
    generic-head relations therefore has to survive one draw per relation.

    Args:
        relations: iterable of dicts with a ``'results'`` list whose entries
            each have a ``'head'`` key.
        generic_heads: head strings treated as uninformative placeholders.
        keep_prob: probability that a single generic-head relation is let
            through.
        rng: object exposing ``random()`` (e.g. ``random.Random(seed)``) for
            reproducible sampling; the module-level ``random`` is used when
            omitted.

    Returns:
        A new list with the surviving samples, in their original order.
    """
    rng = random if rng is None else rng
    generic = set(generic_heads)
    kept = []
    for item in relations:
        results = item['results']
        if not results:
            continue
        # any() stops at the first failed draw, and the draw only happens for
        # generic heads, so the RNG is consumed exactly as needed.
        dropped = any(
            rel['head'] in generic and rng.random() > keep_prob
            for rel in results
        )
        if not dropped:
            kept.append(item)
    return kept
# Cleaned relation samples; the following cells inspect this list.
a = clean_relations(relation)

In [46]:
len(a)

883

In [47]:
a[:10]

[{'text': '交易完成后，公司占华高世纪股权比例为99.56%，华高世纪成为公司控股子公司。',
  'results': [{'head': '公司', 'relation': '子公司', 'tail': '华高世纪'}]},
 {'text': '基于神州高铁整线智能运营优势以及在天津市轨道交通的深耕布局,经与中国交建等方面洽商一致,2020年7月15日神州高铁及子公司神州高铁轨道交通运营管理有限公司(以下简称“神铁运营”,神州高铁与神铁运营合称“神州高铁方”)与中国交建签署了《天津地铁2、3号线存量PPP项目股权转让框架协议》,中国交建同意按照天津市政府转让的相关条件,将其持有的2号线和3号线项目公司部分股权转让给神州高铁方,神州高铁方同意按照相关条件受让前述股权。',
  'results': [{'head': '神州高铁',
    'relation': '子公司',
    'tail': '神州高铁轨道交通运营管理有限公司'}]},
 {'text': '为解决宝安鸿基地产集团股份有限公司（以下简称“宝安地产”）与本公司在惠州地区存在的同业竞争，宝安地产拟以人民币17,000万元受让本公司之全资子公司惠州市宝安房地产开发有限公司（以下简称“惠州地产公司”）100%股权。',
  'results': [{'head': '本公司', 'relation': '子公司', 'tail': '惠州市宝安房地产开发有限公司'},
   {'head': '本公司', 'relation': '子公司', 'tail': '惠州市宝安房地产开发有限公司'}]},
 {'text': '中国宝安集团股份有限公司（以下简称“公司”）控股子公司深圳市贝特瑞新能源材料股份有限公司（证券简称：贝特瑞，证券代码：835185）于2018年10月26日召开了第四届董事会第三十三次会议，审议通过了《关于转让广东芳源环保股份有限公司部分股权的议案》，同意贝特瑞在未来12个月内通过全国中小企业股份转让系统转让所持有的广东芳源环保股份有限公司（证券简称：芳源环保，证券代码：839247）股份不超过1,000万股，详见贝特瑞在全国中小企业股份转让系统发布的《第四届董事会第三十三次会议决议公告》（公告编号：2018-089）。',
  'results': [{'head

In [37]:
len(event)

9864

In [13]:
event[:5]

[{'text': '2009年6月12日,信息披露义务人与中国平安签署了《股份购买协议》,协议主要内容如下:转让方将向受让方转让其持有的深发展520,414,439股股份,占深发展总股本的16.76%。',
  'pre_events': [{'trigger': '购买',
    'event_type': '转让事件',
    'arguments': [{'argument': '深发展',
      'role': '被转让方|公司',
      'argument_start_index': 58}]},
   {'trigger': '持有',
    'event_type': '参股事件',
    'arguments': [{'argument': '深发展',
      'role': '被参股方|公司',
      'argument_start_index': 58},
     {'argument': '16.76%', 'role': '参股比例|数值', 'argument_start_index': 84}]}]},
 {'text': '深圳发展银行股份有限公司拟向中国平安保险（集团）股份有限公司（“中国平安”）发行股份（“本次发行”），中国平安拟以其持有的平安银行股份有限公司（“平安银行”）约90.75%的股份以及部分现金认购公司本次发行的股份。',
  'pre_events': [{'trigger': '持有',
    'event_type': '参股事件',
    'arguments': [{'argument': '中国平安',
      'role': '参股方|公司',
      'argument_start_index': 51},
     {'argument': '平安银行股份有限公司', 'role': '被参股方|公司', 'argument_start_index': 61},
     {'argument': '90.75%', 'role': '参股比例|数值', 'argument_start_index': 80}]}]},
 {'text': '公司之全资子公司万科置业（香港）有限公司（简称“万科置业”）于2012年5月13日与永泰地产有限公司（简称“永泰地产

In [29]:
def clean_parse_events(events, event_type='购买事件', min_trigger_count=5, min_arguments=2):
    """Keep the event samples whose events are all of one well-formed type.

    Triggers are first counted over the whole of ``events``; a trigger is
    trusted when it occurs at least ``min_trigger_count`` times. A sample is
    kept only if every entry of its ``'pre_events'`` list has a trusted
    trigger, at least ``min_arguments`` arguments and the requested
    ``event_type``. A sample with an empty ``'pre_events'`` list passes
    the check trivially and is kept.

    Args:
        events: sequence of dicts with a ``'pre_events'`` list whose entries
            have ``'trigger'``, ``'event_type'`` and ``'arguments'`` keys.
            It is traversed twice.
        event_type: event type every event of a kept sample must have.
        min_trigger_count: minimum corpus frequency of a trusted trigger.
        min_arguments: minimum number of arguments per event.

    Returns:
        A new list holding the accepted samples (the original dict objects),
        in their original order.
    """
    from collections import Counter

    trigger_counts = Counter(
        e['trigger'] for item in events for e in item['pre_events']
    )
    # A set gives constant-time membership tests in the filter below.
    white_triggers = {
        trigger for trigger, count in trigger_counts.items()
        if count >= min_trigger_count
    }

    def is_clean(e):
        return (
            e['trigger'] in white_triggers
            and len(e['arguments']) >= min_arguments
            and e['event_type'] == event_type
        )

    return [
        item for item in events
        if all(is_clean(e) for e in item['pre_events'])
    ]


In [30]:
# Filter the event samples and report how many survive.
b = clean_parse_events(event)
print(len(b))

856


In [34]:
b[20:30]

[{'text': '本次交易中深天马拟通过向特定对象非公开发行股份购买上海天马70%股权、成都天马40%股权、武汉天马90%股权、上海光电子100%股权、深圳光电子100%股权，并募集配套资金。',
  'pre_events': [{'trigger': '购买',
    'event_type': '购买事件',
    'arguments': [{'argument': '中深天马',
      'role': '买方|公司',
      'argument_start_index': 4},
     {'argument': '上海天马', 'role': '卖方|公司', 'argument_start_index': 25},
     {'argument': '70%', 'role': '股权类数据|数值', 'argument_start_index': 29},
     {'argument': '股权', 'role': '股权类产品|产品', 'argument_start_index': 32},
     {'argument': '成都天马', 'role': '卖方|公司', 'argument_start_index': 35},
     {'argument': '40%', 'role': '股权类数据|数值', 'argument_start_index': 39},
     {'argument': '武汉天马', 'role': '卖方|公司', 'argument_start_index': 45},
     {'argument': '上海光电子', 'role': '卖方|公司', 'argument_start_index': 55},
     {'argument': '深圳光电子', 'role': '卖方|公司', 'argument_start_index': 67}]}]},
 {'text': '根据初步预估，标的资产上海天马70%股权、成都天马40%股权、武汉天马90%股权、上海光电子100%股权、深圳光电子100%股权的预估值分别为122,875.99万元、58,058.23万元、163,594.20万元、167,111.90万元、29,962.13万元，总计541,602.

In [9]:
relation[:20]

[{'text': '交易概述。', 'results': []},
 {'text': '没有单位。', 'results': []},
 {'text': '转让方有权选择以现金对价人民币11,449,117,658元(“现金对价”)或以中国平安新发行的299,088,758股H股股份(“对价股份”)收取交易对价。',
  'results': []},
 {'text': '2012-05-15：1港币=0.813人民币元。', 'results': []},
 {'text': '本次询价转让存托凭证的数量为17,600,000份，占九号公司存托凭证总数量的比例为2.50%，转让原因为自身资金需求。',
  'results': []},
 {'text': '转让存托凭证持有人名称转让存托凭证数量（份）占公司存托凭证总数量比例占所持存托凭证数量的比例转让原因SequoiaCapitalChinaGFHoldcoIII-A,Ltd.6,300,0000.89%5.92%自身资金需求PeopleBetterLimited4,085,0000.58%5.91%自身资金需求ShunweiTMTIIILimited4,085,0000.58%5.91%自身资金需求WestSummitGlobalTechnologyFund,L.P.300,0000.04%0.85%自身资金需求WtmtechLimited1,450,0000.21%4.66%自身资金需求IntelCapitalCorporation600,0000.09%2.85%自身资金需求ZhaoduanLimited10,0000.00%0.07%自身资金需求CliffInvestmentPte.Ltd.730,0000.10%5.89%自身资金需求WltechLimited10,0000.00%0.16%自身资金需求WestOriginFTLP30,0000.00%0.73%自身资金需求。',
  'results': []},
 {'text': '2009年6月12日,信息披露义务人与中国平安签署了《股份购买协议》,协议主要内容如下:转让方将向受让方转让其持有的深发展520,414,439股股份,占深发展总股本的16.76%。',
  'results': []},
 {'text': '深圳发展银行股

In [10]:
def clean_relations(relations):
    """Return the samples that carry at least one annotated relation.

    Samples whose ``'results'`` list is empty are dropped; everything else is
    kept unchanged and in order. This variant applies no filter on the head
    entity.
    """
    return [sample for sample in relations if sample['results']]
# Relation samples with at least one annotation; inspected in the next cells.
a = clean_relations(relation)

In [11]:
len(a)

1644

In [6]:
a[:5]

[{'text': '基于神州高铁整线智能运营优势以及在天津市轨道交通的深耕布局,经与中国交建等方面洽商一致,2020年7月15日神州高铁及子公司神州高铁轨道交通运营管理有限公司(以下简称“神铁运营”,神州高铁与神铁运营合称“神州高铁方”)与中国交建签署了《天津地铁2、3号线存量PPP项目股权转让框架协议》,中国交建同意按照天津市政府转让的相关条件,将其持有的2号线和3号线项目公司部分股权转让给神州高铁方,神州高铁方同意按照相关条件受让前述股权。',
  'results': [{'head': '神州高铁',
    'relation': '子公司',
    'tail': '神州高铁轨道交通运营管理有限公司'}]},
 {'text': '为解决宝安鸿基地产集团股份有限公司（以下简称“宝安地产”）与本公司在惠州地区存在的同业竞争，宝安地产拟以人民币17,000万元受让本公司之全资子公司惠州市宝安房地产开发有限公司（以下简称“惠州地产公司”）100%股权。',
  'results': [{'head': '本公司', 'relation': '子公司', 'tail': '惠州市宝安房地产开发有限公司'},
   {'head': '本公司', 'relation': '子公司', 'tail': '惠州市宝安房地产开发有限公司'}]},
 {'text': '惠州市宝安房地产开发有限公司股东结构：本公司持股99.8％，本公司之全资子公司中国宝安集团投资有限公司（以下简称“中宝投资”）持股0.2％。',
  'results': [{'head': '本公司', 'relation': '子公司', 'tail': '中国宝安集团投资有限公司'},
   {'head': '本公司', 'relation': '子公司', 'tail': '中国宝安集团投资有限公司'}]},
 {'text': '本公司于2015年12月14日召开第十二届董事局第三十七次会议,审议通过了《关于全资子公司拟有条件全面要约收购国际精密集团有限公司股权的议案》,本公司全资子公司宝安科技有限公司拟以每股1.5港元的价格有条件全面要约收购国际精密集团有限公司(以下简称“目标公司”或“国际精密“)股权,并拟注销目标公司尚未行使的全部期权,注销价格根据要约价与期权行使价的差额来确定。',

In [None]:
# CLEAN ORIGINAL DATA

import json
# Collect every annotated sample of the JSON-lines file: one JSON document per
# line, relation samples under result.data.normal and event samples under
# result.data.cangu.
relation = []
event = []
with open('./JD_example.json', 'r', encoding='utf-8') as json_file:
    # Stream the file line by line instead of materialising it with readlines().
    for line in json_file:
        line = line.strip()
        if not line:
            # A blank line is not a JSON document; json.loads would raise on it.
            continue
        json_line = json.loads(line)
        relation += json_line['result']['data']['normal']
        event += json_line['result']['data']['cangu']
def clean_relations(relations):
    """Keep labelled samples that never use the placeholder head '公司'.

    A sample is dropped when its ``'results'`` list is empty or when any of
    its relations has ``'公司'`` as ``head``. Surviving samples are returned
    in a new list, in their original order.
    """
    kept = []
    for sample in relations:
        triples = sample['results']
        if not triples:
            continue
        if any(triple['head'] == '公司' for triple in triples):
            continue
        kept.append(sample)
    return kept
# Relation samples left after cleaning; merged with the event-derived samples below.
a = clean_relations(relation)

def clean_parse_events(events, event_type='参股事件', relation='参股',
                       head_role='参股方|公司', tail_role='被参股方|公司',
                       min_trigger_count=5, min_arguments=2):
    """Turn clean event samples into relation samples (text plus triples).

    Triggers are first counted over the whole of ``events``; a trigger is
    trusted when it occurs at least ``min_trigger_count`` times. A sample is
    converted only if every entry of its ``'pre_events'`` list has a trusted
    trigger, at least ``min_arguments`` arguments and the requested
    ``event_type``.

    For each event of an accepted sample the arguments are scanned in order.
    An argument with ``head_role`` updates the current head, one with
    ``tail_role`` updates the current tail, and a triple is emitted after
    every such update once both head and tail are known. Arguments with any
    other role are ignored. ``head_role`` and ``tail_role`` are expected to
    differ.

    Args:
        events: sequence of dicts with ``'text'`` and ``'pre_events'`` keys;
            each event has ``'trigger'``, ``'event_type'`` and ``'arguments'``
            (dicts with ``'role'`` and ``'argument'``). Traversed twice.
        event_type: event type every event of an accepted sample must have.
        relation: relation label written into every emitted triple.
        head_role: argument role supplying the triple's head.
        tail_role: argument role supplying the triple's tail.
        min_trigger_count: minimum corpus frequency of a trusted trigger.
        min_arguments: minimum number of arguments per event.

    Returns:
        A list of ``{'text': ..., 'results': [...]}`` dicts, one per accepted
        sample, where each result is ``{'relation', 'head', 'tail'}``. The
        ``'results'`` list may be empty.
    """
    from collections import Counter

    trigger_counts = Counter(
        e['trigger'] for item in events for e in item['pre_events']
    )
    # A set gives constant-time membership tests in the filter below.
    white_triggers = {
        trigger for trigger, count in trigger_counts.items()
        if count >= min_trigger_count
    }

    def is_clean(e):
        return (
            e['trigger'] in white_triggers
            and len(e['arguments']) >= min_arguments
            and e['event_type'] == event_type
        )

    samples = []
    for item in events:
        es = item['pre_events']
        if not all(is_clean(e) for e in es):
            continue
        triples = []
        for e in es:
            sop = {'relation': relation}
            for arg in e['arguments']:
                role = arg['role']
                if role == head_role:
                    sop['head'] = arg['argument']
                elif role == tail_role:
                    sop['tail'] = arg['argument']
                else:
                    continue
                # relation + head + tail present: the triple is complete.
                if len(sop) == 3:
                    triples.append(sop.copy())
        samples.append({'text': item['text'], 'results': triples})
    return samples
# Convert the event samples to relation samples, pass them through
# clean_relations as well, and merge them with the cleaned relation samples.
b = clean_parse_events(event)
c = clean_relations(b)
data = a + c
import random
# Route every sample to exactly one split with a single uniform draw:
# below 0.2 -> test, from 0.2 up to (but excluding) 0.92 -> train, the rest -> dev.
train, test, dev = [], [], []
for tmp in data:
    r = random.random()
    if r >= 0.92:
        dev.append(tmp)
    elif r >= 0.2:
        train.append(tmp)
    else:
        test.append(tmp)
# Shuffle each split in place, in the order train, test, dev.
for split in (train, test, dev):
    random.shuffle(split)

In [48]:
# Route every cleaned relation sample to exactly one split with a single
# uniform draw: below 0.2 -> test, from 0.2 up to (but excluding) 0.92 ->
# train, the rest -> dev.
train, test, dev = [], [], []
for tmp in a:
    r = random.random()
    if r >= 0.92:
        dev.append(tmp)
    elif r >= 0.2:
        train.append(tmp)
    else:
        test.append(tmp)
# Shuffle each split in place, in the order train, test, dev.
for split in (train, test, dev):
    random.shuffle(split)

In [49]:
len(train)

642