In [30]:
import json
from pathlib import Path

加载和合并原始数据

In [31]:
# Relative directory that holds the raw JSON inputs read below and receives the cleaned data.jsonl.
data_dir = Path('../data')

In [32]:
# Load the CCF records. The context manager closes the file handle as soon as
# parsing finishes instead of leaving it open until garbage collection.
with open(data_dir / 'CCF.json', 'r', encoding='utf-8') as f:
    ccf = json.load(f)
len(ccf)

200000

In [33]:
# Load the CS-SCI records. The context manager closes the file handle as soon as
# parsing finishes instead of leaving it open until garbage collection.
with open(data_dir / 'CS-SCI_1-2.json', 'r', encoding='utf-8') as f:
    sci = json.load(f)
len(sci)

100000

In [34]:
# Parse arxiv.json one line at a time: leading/trailing commas and newlines are
# stripped from every line before it is decoded as a JSON document.
with open(data_dir / 'arxiv.json', 'r', encoding='utf-8') as f:
    arxiv = list(map(json.loads, (raw_line.strip(',\n') for raw_line in f)))
len(arxiv)

200000

In [35]:
from itertools import chain
from tqdm import tqdm

In [36]:
# Concatenate the three sources into a single list, keeping the order CCF -> CS-SCI -> arXiv.
data = [*ccf, *sci, *arxiv]
len(data)

500000

In [37]:
# The merged `data` list already references every record, so the per-source names are no longer needed.
del ccf, sci, arxiv

所有数据共50万条，每条数据均包含_id、title、abstract、author四个字段

数据清洗
1. 去除缺失值（_id、title、abstract、author 任一字段为空的记录）
2. 去除重复值（_id 或 title 重复的记录）
3. 去除各字段中的换行符

In [38]:
# Keep only the records whose four fields are all truthy (non-empty).
data1 = [
    record
    for record in data
    if all(record[field] for field in ('_id', 'title', 'abstract', 'author'))
]
len(data1)

491862

In [39]:
# De-duplicate: a record survives only if neither its _id nor its title has been
# kept before; the first occurrence wins.
unique_id, unique_title = set(), set()
data2 = []
for d in tqdm(data1):
    if d['_id'] in unique_id or d['title'] in unique_title:
        continue  # already have a record with this id or this title
    unique_id.add(d['_id'])
    unique_title.add(d['title'])
    data2.append(d)

len(data2)

100%|██████████| 491862/491862 [00:00<00:00, 1893975.74it/s]


449383

In [43]:
# Replacement used for '\n' in each field: newlines are dropped everywhere except
# in the abstract, where they become a single space.
newline_replacement = {'_id': '', 'title': '', 'abstract': ' ', 'author': ''}
for d in tqdm(data2):
    for field, replacement in newline_replacement.items():
        d[field] = d[field].replace('\n', replacement)

100%|██████████| 449383/449383 [00:00<00:00, 1151321.36it/s]


In [44]:
# Persist the cleaned records as JSON Lines: one object per line, non-ASCII text kept as-is.
with open(data_dir / 'data.jsonl', 'w', encoding='utf-8') as f:
    f.writelines(json.dumps(record, ensure_ascii=False) + '\n' for record in tqdm(data2))

100%|██████████| 449383/449383 [00:13<00:00, 32194.50it/s]


In [41]:
import math

In [None]:
# Template for the user message sent with every extraction request.
# `{0}` is filled with the paper abstract through str.format(); the braces of the
# output skeleton are doubled ({{ }}) so that format() emits them literally.
# NOTE(review): the skeleton uses single quotes and a trailing comma after the last
# entity, so it is not strict JSON — confirm that whatever parses the model output
# tolerates replies written in this shape.
prompt = '''
# 任务：从给定的论文摘要文本中提取命名实体和实体之间的关系。
# 任务步骤：
1. 归纳总结出摘要文本中的研究问题、研究方法、实验结果和结论。
2. 分别从总结的文本中提取出对应的一个或多个关键词。
3. 按规定的json格式输出
# 输入：{0}
# 输出格式：
{{
    'entities': [
        {{'id': '', 'type': 'Problem', 'name': ''}},
        {{'id': '', 'type': 'Method', 'name': ''}},
        {{'id': '', 'type': 'Result', 'name': ''}},
        {{'id': '', 'type': 'Conclusion', 'name': ''}},
    ],
    'relations': [
        {{'type': '', 'from_entity_id': '', 'to_entity_id': ''}}
    ]
}}
'''

In [None]:
# Build one chat-completion request per cleaned record and shard the requests into
# up to NUM_BATCHES x NUM_SUB_BATCHES files named data{i}-{j}.jsonl inside data_dir.
NUM_BATCHES = 4       # number of top-level splits
NUM_SUB_BATCHES = 4   # number of splits inside each top-level batch


def _chunk(items, size):
    """Return consecutive slices of `items`, each holding at most `size` elements."""
    return [items[start:start + size] for start in range(0, len(items), size)]


# Only the parsing needs the file; it is closed before the request payloads are built.
with open(data_dir / 'data.jsonl', 'r', encoding='utf8') as f:
    data = [json.loads(line) for line in f]

data = [{
    'custom_id': d['_id'],
    'method': 'POST',
    'url': '/v4/chat/completions',
    'body': {
        'model': 'glm-4-flash',
        'messages': [
            {'role': 'system', 'content': '你好，你是一个实体和关系提取的专家，你需要从以下给定的论文摘要文本中提取出命名实体以及它们的关系。'},
            {'role': 'user', 'content': prompt.format(d['abstract'])}
        ]
    }
} for d in data]

# max(1, ...) keeps the slice step valid when there are no records at all;
# a step of 0 would make range() raise ValueError.
batch_size = max(1, math.ceil(len(data) / NUM_BATCHES))
print(f'batch_size: {batch_size}')
data = _chunk(data, batch_size)

# The sub-batch size is derived from the first batch and reused for every batch.
# With no batches at all there is nothing to index, so fall back to 1.
batch_size = max(1, math.ceil(len(data[0]) / NUM_SUB_BATCHES)) if data else 1
print(f'sub_batch_size: {batch_size}')
data = [_chunk(batch, batch_size) for batch in data]

# File names are 1-based: data1-1.jsonl, data1-2.jsonl, ...
for i, batch in enumerate(data, start=1):
    for j, sub_batch in enumerate(batch, start=1):
        with open(data_dir / f'data{i}-{j}.jsonl', 'w', encoding='utf8') as f:
            for item in tqdm(sub_batch):
                f.write(json.dumps(item, ensure_ascii=False) + '\n')