In [1]:
import torch
import transformers

print(torch.__version__)
print(transformers.__version__)

2.0.1+cpu
4.12.1


In [6]:
# model

import torch
import torch.nn as nn
from transformers import BertTokenizer, BertConfig, BertForQuestionAnswering, BertModel
from transformers import InputExample
from transformers.data.processors.squad import SquadV2Processor, squad_convert_examples_to_features
from transformers.data.datasets.squad import SquadDataset

config = BertConfig.from_pretrained('../NLP_models/bert-base-chinese')
# config = BertConfig.from_pretrained('chinese-bert-wwm')
print(config.hidden_size)
# config.to_json_file("self_QA/config.json")
tokenizer = BertTokenizer.from_pretrained("../NLP_models/bert-base-chinese", config=config)

768


In [7]:
import json
import torch
from torch.utils.data import Dataset

class MyDataset(Dataset):
    def __init__(self, data_path, mode='train'):
        
        with open(data_path, "r", encoding="utf-8") as f:
            self.data = json.load(f)["data"]
        self.samples = []
        self.long_samples = []  # 记录输入文本长度大于512的样本的索引
        for item in self.data:
            for paragraph in item["paragraphs"]:
                context = paragraph["context"]
                for qa in paragraph["qas"]:
                    question = qa["question"]
                    id = qa["id"]
                    if mode == 'train':
                        answer_start = [answer["answer_start"] for answer in qa["answers"]]
                        answer_text = [answer["text"] for answer in qa["answers"]]
                    else: # 验证集会有多个答案(不用这样也可以，不传入mode参数即可)
                        answer_start = [max(answer["answer_start"] for answer in qa["answers"])]
                        answer_text = [max(answer["text"] for answer in qa["answers"])]
                    if len(answer_start) == 0 or len(answer_text) == 0: # 如果答案为空，忽略该样本
                        continue
#                     input_dict = tokenizer.encode_plus(
#                         question, context, max_length=512, padding="max_length", truncation=True, return_tensors="pt"
#                     )
#                     if input_dict["input_ids"].shape[1] > 512:  # 如果输入文本长度大于512，记录索引
#                         self.long_samples.append(len(self.samples))
                    self.samples.append({
                        "id": id,
                        "context": context,
                        "question": question,
                        "answer_start": answer_start,
                        "answer_text": answer_text
                    })

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, index):
        sample = self.samples[index]
        # # 这里通过Tokenizer的encode_plus方法将question和context拼接并转化为input_ids, attention_mask, token_type_ids这三个参数
        input_dict = tokenizer.encode_plus(
            sample["question"], sample["context"], max_length=512, padding="max_length", truncation=True, return_tensors="pt"
        )
        start_positions = torch.tensor(sample["answer_start"], dtype=torch.long)
        end_positions = torch.tensor([start_position + len(answer_text) - 1 for start_position, answer_text in zip(sample["answer_start"], sample["answer_text"])], dtype=torch.long)

        # 截断设置，可能不太合理
        start_positions[(start_positions >= 512) | (start_positions < 0)] = 510
        end_positions[(end_positions >= 512) | (end_positions < 0)] = 511        
        
        input_dict["start_positions"] = start_positions
        input_dict["end_positions"] = end_positions
        
        return input_dict

In [35]:
data_path ="./CJRC/transfered/big_train_data.json"

with open(data_path, "r", encoding="utf-8") as f:
    data = json.load(f)["data"]
samples = []
for item in data:
    for paragraph in item["paragraphs"]:
        context = paragraph["context"]
        for qa in paragraph["qas"]:
            question = qa["question"]
            id = qa["id"]
            answer_start = [answer["answer_start"] for answer in qa["answers"]]
            answer_text = [answer["text"] for answer in qa["answers"]]
            samples.append({
                "id": id,
                "context": context,
                "question": question,
                "answer_start": answer_start,
                "answer_text": answer_text
            })
#     break

samples[2]

{'id': 'e139eef6-fc0c-4953-acec-a83a0095ce4e.txt_003',
 'context': '经审查,原告提供的证据1-3、被告中华联合广东分公司提供的证据4-5、被告万友公司提供的证据6,各方对其真实性均没有异议,本院对其真实性予以确认综合本院采信的证据及当事人的陈述,本院认定以下事实:2015年6月1日,田x17驾驶粤A×××××号车辆与严x3驾驶的赣C×××××号重型仓栅式货车发生碰撞,造成两车不同程度损坏的交通事故交警部门作出事故认定书,认定严x3承担事故的全部责任,田x17不负事故责任粤A×××××号车辆在原告处投保了保险金额为908000元的机动车损失保险,事故发生在保险期间内事故发生后,粤A×××××号车辆的被保险人陈x18就该车辆的损失以财产保险合同纠纷起诉至佛山市禅城区人民法院案经审理,佛山市禅城区人民法院于2015年8月18日作出(2015)佛城法民二初字第1006号民事判决,查明粤A×××××号车辆经广州市华盟价格事务所有限公司评估,损失价格为241541元,陈x18支付了粤A×××××号车辆的维修费241541元、评估费9050元;本案原告在庭审中明确表示不申请重新对车辆损失进行评估鉴定并判决原告向陈x18支付粤A×××××号车辆损失保险理赔款250591元2015年10月11日,原告向陈x18赔付了250591元及诉讼费用2529元后原告提起本案之诉并查明,赣C×××××号车辆的所有人为被告万友公司,该车辆在被告中华联合广东分公司处投保了交强险,事故发生在保险期内事故发生后,被告中华联合广东分公司向该车辆的被保险人许x19赔付了2000元诉讼中,被告徐11确认其为该车辆的实际支配人,严x3是被告徐11雇请,是从事派遣工作过程中发生案涉交通事故被告徐11与被告万友公司签订《车辆挂靠合同书》,被告万友公司同意被告徐11就赣C×××××号车辆挂靠被告万友公司名下',
 'question': '投保人所投保险险种？',
 'answer_start': [233],
 'answer_text': ['机动车损失保险']}

In [38]:
type(samples[2]['answer_start'])

list

In [40]:
l = [ast['answer_start'] for ast in samples if len(ast['answer_start']) > 1]
l

[]

In [32]:
path ="./CJRC/transfered/big_train_data.json"

train_dataset = MyDataset(path)

train_data_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=16,
    num_workers=0 # 这个是多线程数，最好设为0
)

next(iter(train_data_loader))

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

{'input_ids': tensor([[[ 101,  752, 3125,  ..., 5468, 1394,  102]],

        [[ 101,  752, 3125,  ..., 6158, 1440,  102]],

        [[ 101, 2832,  924,  ..., 1440,  704,  102]],

        ...,

        [[ 101, 6158, 1440,  ...,    0,    0,    0]],

        [[ 101, 1333, 6158,  ...,    0,    0,    0]],

        [[ 101, 1352, 3175,  ...,    0,    0,    0]]]), 'token_type_ids': tensor([[[0, 0, 0,  ..., 1, 1, 1]],

        [[0, 0, 0,  ..., 1, 1, 1]],

        [[0, 0, 0,  ..., 1, 1, 1]],

        ...,

        [[0, 0, 0,  ..., 0, 0, 0]],

        [[0, 0, 0,  ..., 0, 0, 0]],

        [[0, 0, 0,  ..., 0, 0, 0]]]), 'attention_mask': tensor([[[1, 1, 1,  ..., 1, 1, 1]],

        [[1, 1, 1,  ..., 1, 1, 1]],

        [[1, 1, 1,  ..., 1, 1, 1]],

        ...,

        [[1, 1, 1,  ..., 0, 0, 0]],

        [[1, 1, 1,  ..., 0, 0, 0]],

        [[1, 1, 1,  ..., 0, 0, 0]]]), 'start_positions': tensor([[153],
        [180],
        [233],
        [225],
        [ 30],
        [ 54],
        [180],
       

In [8]:
class BertQA(nn.Module):
    
    def __init__(self, pretrained_path, config_path):
        super(BertQA, self).__init__()
        self.config = BertConfig.from_pretrained(config_path)
        self.bert = BertModel.from_pretrained(pretrained_path, config=self.config)
        self.dropout = torch.nn.Dropout(self.bert.config.hidden_dropout_prob)
        self.qa_outputs = torch.nn.Linear(self.bert.config.hidden_size, 2)
        
    def forward(self, input_ids, attention_mask, token_type_ids):
        outputs = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        sequence_output = outputs[0]
        pooled_output = outputs[1]
        
        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)
        return start_logits, end_logits

In [26]:
# test

context = "依据上述有效证据,本院确认以下案件事实:原、被告双方于××××年××月份结婚,婚后生一女孩,取名马62,2013年11月21日,\
双方在新民市民政局协议离婚,协议内容为:婚生女孩马62由女方别x2抚养,男方每月给孩子抚养费1000元(从2013年12月1日起至2020年12月止)\
双方离婚后,小孩马62一直随被告生活××××年××月××日,婚生女孩马62从内乡县第二小学转入沈阳大学新民师范学院附属小学上学至今,\
现原告以小孩马62已年满十周岁,自愿提出与原告一起生活为由,诉至法院,请求依法变更小孩马62由原告抚养。"
question = "离婚协议约定马62由谁抚养？"

print(context)

依据上述有效证据,本院确认以下案件事实:原、被告双方于××××年××月份结婚,婚后生一女孩,取名马62,2013年11月21日,双方在新民市民政局协议离婚,协议内容为:婚生女孩马62由女方别x2抚养,男方每月给孩子抚养费1000元(从2013年12月1日起至2020年12月止)双方离婚后,小孩马62一直随被告生活××××年××月××日,婚生女孩马62从内乡县第二小学转入沈阳大学新民师范学院附属小学上学至今,现原告以小孩马62已年满十周岁,自愿提出与原告一起生活为由,诉至法院,请求依法变更小孩马62由原告抚养。


In [24]:
model = BertQA('../NLP_models/bert-base-chinese',config_path="../NLP_models/bert-base-chinese/config.json")
checkpoint = torch.load('../NLP_models/self_models/cjrc_model_dict.pth.bar',map_location=torch.device('cpu'))
model.load_state_dict(checkpoint['state_dict'])

Some weights of the model checkpoint at ../NLP_models/bert-base-chinese were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<All keys matched successfully>

In [21]:
# 测试

question = "谁胜诉了？"
context = '''一、被告 东营美利达新型材料科技有限公司 于本判决生效之日起十日内支付原告 山东富润润滑油股份有限公司 货款153540元、违约金23584.5及自2021年3月20日起至清偿之日止的违约金（违约金计算方式：以153540元为基数，按日万分之五计算）；\r\n二、驳回原告<a href="https://www.tianyancha.com/company/3366658716" target="_blank" data-type="company">山东富润润滑油股份有限公司</a>的其他诉讼请求。\r\n如果未按本判决指定的期间履行给付金钱义务，应当依照《中华人民共和国民事诉讼法》第二百六十条规定，加倍支付迟延履行期间的债务利息。\r\n案件受理费5357元，减半收取计2678.5元，由被告<a href="https://www.tianyancha.com/company/2339951049" target="_blank" data-type="company">东营美利达新型材料科技有限公司</a>负担，于本判决生效后七日内向本院交纳。\r\n如不服本判决，可以在判决书送达之日起十五日内，向本院递交上诉状，并按对方当事人的人数提出副本，同时按照不服本判决部分的上诉请求数额，交纳案件受理费，上诉于山东省东营市中级人民法院。上诉期满后七日内仍未交纳上诉案件受理费的，按自动撤回上诉处理。'''

inputs = tokenizer.encode_plus(question, context, add_special_tokens=True, return_tensors="pt")
input_ids = inputs["input_ids"].tolist()[0]
# 获取答案开始和结束位置
start_logits, end_logits = model(**inputs)
# print(start_logits, end_logits)

start = torch.argmax(start_logits)
end = torch.argmax(end_logits) + 1
print(start,end)

answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[start:end]))
print(answer)

TypeError: argmax(): argument 'input' (position 1) must be Tensor, not str

In [28]:
question = "北京大学成立于何时？"
context = "北京大学创立于1898年，前身为京师大学堂，是中国近现代第一所国立综合性大学..."

In [30]:
from transformers import pipeline

model_path = "../NLP_models/bert-base-chinese"
model = BertForQuestionAnswering.from_pretrained(model_path)

qa_pipeline = pipeline('question-answering', model=model, tokenizer=tokenizer)
result = qa_pipeline(question=question, context=context)
print("model answer question: \n", result)

Some weights of the model checkpoint at ../NLP_models/bert-base-chinese were not used when initializing BertForQuestionAnswering: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at ../NL

model answer question: 
 {'score': 0.002553990576416254, 'start': 0, 'end': 40, 'answer': '北京大学创立于1898年，前身为京师大学堂，是中国近现代第一所国立综合性大学...'}


In [23]:
import torch
from transformers import BertForQuestionAnswering, BertTokenizer


# 加载已训练好的模型
model_path = "../NLP_models/bert-base-chinese"
model = BertForQuestionAnswering.from_pretrained(model_path)
# 加载tokenizer
tokenizer = BertTokenizer.from_pretrained("../NLP_models/bert-base-chinese")

# 设置测试用例
question = "北京大学成立于何时？"
context = "北京大学创立于1898年，前身为京师大学堂，是中国近现代第一所国立综合性大学..."

# 对测试用例进行编码
inputs = tokenizer.encode_plus(question, context, add_special_tokens=True, return_tensors="pt")
input_ids = inputs["input_ids"].tolist()[0]
# 获取答案开始和结束位置
outputs = model(**inputs)
start = torch.argmax(outputs.start_logits)
end = torch.argmax(outputs.end_logits) + 1
print(start,end)

answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[start:end]))
print(answer)

Some weights of the model checkpoint at ../NLP_models/bert-base-chinese were not used when initializing BertForQuestionAnswering: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at ../NL

tensor(38) tensor(1)

