In [None]:
import json
from uuid import uuid4

from tqdm import tqdm

从data.json中提取所有的论文实体和作者实体及其关系

In [None]:
def load_data():
    """Yield the items stored in ``../data/data.json`` one at a time.

    The whole document is parsed by ``json.load`` on first iteration; each
    element of the parsed top-level value is then yielded in order.
    """
    with open('../data/data.json', 'r', encoding='utf-8') as f:
        for record in json.load(f):
            yield record

In [None]:
# Build Paper and Author entities plus Author -[Write]-> Paper relations from
# the raw records, then dump each collection as JSON Lines under ../output/.
paper_entities = []
author_entities = []
relations = []
author_dict = {}  # author name -> generated author id (one id per distinct name)
for paper in load_data():
    paper_id = paper['_id']
    paper_name = paper['title']
    paper_properties = {'abstract': paper['abstract']}
    paper_node = {
        'id': paper_id,
        'name': paper_name,
        'type': 'Paper',
        'properties': paper_properties
    }
    paper_entities.append(paper_node)
    for author in paper['author'].split(','):
        # Strip surrounding whitespace so "A, B" and "A,B" resolve to the same
        # author, and skip empty pieces (empty author field, trailing comma)
        # so no nameless Author node is created.
        author = author.strip()
        if not author:
            continue
        if author not in author_dict:
            author_dict[author] = uuid4().hex
            author_node = {
                'id': author_dict[author],
                'name': author,
                'type': 'Author',
            }
            author_entities.append(author_node)
        relation = {
            'type': 'Write',
            'from_entity_id': author_dict[author],
            'to_entity_id': paper_id,
            'from_entity_type': 'Author',
            'to_entity_type': 'Paper'
        }
        relations.append(relation)

print(f'paper entities: {len(paper_entities)}, authors: {len(author_entities)}, relations: {len(relations)}')

with open('../output/paper_entities.jsonl', 'w', encoding='utf-8') as f:
    for paper_entity in tqdm(paper_entities):
        f.write(json.dumps(paper_entity, ensure_ascii=False) + '\n')

with open('../output/author_entities.jsonl', 'w', encoding='utf-8') as f:
    for author_entity in tqdm(author_entities):
        f.write(json.dumps(author_entity, ensure_ascii=False) + '\n')

with open('../output/author_paper_relations.jsonl', 'w', encoding='utf-8') as f:
    for relation_entity in tqdm(relations):
        f.write(json.dumps(relation_entity, ensure_ascii=False) + '\n')


重新设置模型提取结果中的实体id

In [None]:
def set_paper_id(content):
    """Replace the entity ids in one extraction result with fresh unique ids.

    ``content`` is mutated in place: every entity in ``content['entities']``
    gets a new 32-character hex id, and the relations in
    ``content['relations']`` are rewritten to use the new ids. Relations that
    reference an id not present among the entities are removed from
    ``content['relations']``, since they cannot be remapped.

    Args:
        content: Mapping with an ``'entities'`` list (dicts carrying ``'id'``)
            and a ``'relations'`` list (dicts carrying ``'from_entity_id'``
            and ``'to_entity_id'``).

    Returns:
        ``False`` (leaving ``content`` untouched) when either list is empty,
        ``True`` once the ids have been rewritten.

    Raises:
        KeyError: If a required key is missing from ``content``, an entity or
            a relation.
    """
    entities: list[dict[str, str]] = content['entities']
    relations: list[dict[str, str]] = content['relations']
    if not entities or not relations:
        return False
    # Old id -> new id. Entities sharing an old id also share the new one.
    entity_id_dict = {entity['id']: uuid4().hex for entity in entities}
    for entity in entities:
        entity['id'] = entity_id_dict[entity['id']]
    # Keep only relations whose endpoints are both known entities.
    kept = [
        relation for relation in relations
        if relation['from_entity_id'] in entity_id_dict and relation['to_entity_id'] in entity_id_dict
    ]
    for relation in kept:
        relation['from_entity_id'] = entity_id_dict[relation['from_entity_id']]
        relation['to_entity_id'] = entity_id_dict[relation['to_entity_id']]
    # Write the filtered list back; otherwise the dangling relations would
    # stay in ``content`` carrying stale, un-remapped ids.
    content['relations'] = kept
    return True

In [None]:
# Re-key every model extraction record and copy it to paper_extract1.jsonl.
# A record is written only when set_paper_id() reports success: records it
# rejects (returns False) still carry the model's original ids, which are not
# unique across papers, so they are skipped along with malformed records.
skipped = 0
with open('../temp/data_prompt1.jsonl', 'r', encoding='utf-8') as f:
    with open('../temp/paper_extract1.jsonl', 'w', encoding='utf-8') as of:
        for line in f:
            data = json.loads(line)
            try:
                ok = set_paper_id(data['content'])
            except (KeyError, TypeError):
                # Missing keys or a wrongly shaped 'content' payload.
                ok = False
            if not ok:
                skipped += 1
                continue
            of.write(json.dumps(data, ensure_ascii=False) + '\n')

print(f'skipped records: {skipped}')

从模型提取结果中提取实验结果、结论及其关系

In [None]:
# Collect the Result and Conclusion entities from the re-keyed extraction
# file, link each one to its paper, and dump the four collections as JSONL.
paper_result_relations = []
paper_conclusion_relations = []
result_entities = []
conclusion_entities = []
with open('../temp/paper_extract1.jsonl', 'r', encoding='utf-8') as f:
    for line in f:
        data = json.loads(line)
        entities = data['content']['entities']
        paper_id = data['id']

        # Partition this paper's entities by (case-insensitive) type.
        found_results = []
        found_conclusions = []
        for item in entities:
            kind = item['type'].lower()
            if kind == 'result':
                found_results.append(item)
            elif kind == 'conclusion':
                found_conclusions.append(item)
        result_entities.extend(found_results)
        conclusion_entities.extend(found_conclusions)

        # Paper -[Result]-> Result
        paper_result_relations.extend(
            {
                'type': 'Result',
                'from_entity_id': paper_id,
                'to_entity_id': item['id'],
                'from_entity_type': 'Paper',
                'to_entity_type': 'Result'
            }
            for item in found_results
        )
        # Paper -[Conclude]-> Conclusion
        paper_conclusion_relations.extend(
            {
                'type': 'Conclude',
                'from_entity_id': paper_id,
                'to_entity_id': item['id'],
                'from_entity_type': 'Paper',
                'to_entity_type': 'Conclusion'
            }
            for item in found_conclusions
        )

# (output path, rows, progress-bar label), written in this order.
for out_path, rows, label in (
    ('../output/result_entities.jsonl', result_entities, 'result entities'),
    ('../output/conclusion_entities.jsonl', conclusion_entities, 'conclusion entities'),
    ('../output/paper_result_relations.jsonl', paper_result_relations, 'paper result relations'),
    ('../output/paper_conclusion_relations.jsonl', paper_conclusion_relations, 'paper conclusion relations'),
):
    with open(out_path, 'w', encoding='utf-8') as f:
        for row in tqdm(rows, desc=label):
            f.write(json.dumps(row, ensure_ascii=False) + '\n')

提取论文问题和方法实体

In [None]:
# Collect the Problem and Method entities from the updated extraction file,
# link each one to its paper, and dump the four collections as JSONL.
problem_entities = []
method_entities = []
paper_problem_relations = []
paper_method_relations = []
with open('../temp/data_prompt_updated.jsonl', 'r', encoding='utf-8') as f:
    for line in f:
        data = json.loads(line)
        paper_id = data['id']
        entities = data['content']['entities']

        # Partition this paper's entities by (case-insensitive) type.
        found_problems = []
        found_methods = []
        for item in entities:
            kind = item['type'].lower()
            if kind == 'problem':
                found_problems.append(item)
            elif kind == 'method':
                found_methods.append(item)
        problem_entities.extend(found_problems)
        method_entities.extend(found_methods)

        # Paper -[Solve]-> Problem
        paper_problem_relations.extend(
            {
                'type': 'Solve',
                'from_entity_id': paper_id,
                'to_entity_id': item['id'],
                'from_entity_type': 'Paper',
                'to_entity_type': 'Problem'
            }
            for item in found_problems
        )
        # Paper -[Utilize]-> Method
        paper_method_relations.extend(
            {
                'type': 'Utilize',
                'from_entity_id': paper_id,
                'to_entity_id': item['id'],
                'from_entity_type': 'Paper',
                'to_entity_type': 'Method'
            }
            for item in found_methods
        )

# (output path, rows, progress-bar label), written in this order.
for out_path, rows, label in (
    ('../output/problem_entities.jsonl', problem_entities, 'problem entities'),
    ('../output/paper_problem_relations.jsonl', paper_problem_relations, 'paper problem relations'),
    ('../output/method_entities.jsonl', method_entities, 'method entities'),
    ('../output/paper_method_relations.jsonl', paper_method_relations, 'paper method relations'),
):
    with open(out_path, 'w', encoding='utf-8') as f:
        for row in tqdm(rows, desc=label):
            f.write(json.dumps(row, ensure_ascii=False) + '\n')

提取聚类结果中的学科实体及其关系

In [None]:
# Turn the clustering output into a two-level subject hierarchy: each line's
# 'from_entity_name' is split on ' - ' into a top-level subject (Subject0) and
# a sub-subject (Subject1). Subjects are de-duplicated by name and relations by
# their (from id, to id) pair before everything is written out as JSONL.
subject0_dict = {}  # subject0 name -> Subject0 entity
subject1_dict = {}  # subject1 name -> Subject1 entity
problem_subject1_relations_dict = {}  # (problem id, subject1 id) -> relation
subject1_subject0_relations_dict = {}  # (subject1 id, subject0 id) -> relation
with open('../temp/cluster.jsonl', 'r', encoding='utf-8') as f:
    for line in f:
        data = json.loads(line)
        problem_id = data['to_entity_id']
        subjects = data['from_entity_name'].split(' - ')
        subject0, subject1 = subjects[0], subjects[1]

        # Create each subject entity the first time its name is seen.
        if subject0 not in subject0_dict:
            subject0_dict[subject0] = {
                'id': uuid4().hex,
                'type': 'Subject0',
                'name': subject0
            }
        subject0_id = subject0_dict[subject0]['id']
        if subject1 not in subject1_dict:
            subject1_dict[subject1] = {
                'id': uuid4().hex,
                'type': 'Subject1',
                'name': subject1
            }
        subject1_id = subject1_dict[subject1]['id']

        # setdefault keeps the first relation recorded for a given id pair.
        subject1_subject0_relations_dict.setdefault(
            (subject1_id, subject0_id),
            {
                'type': 'Belongs',
                'from_entity_id': subject1_id,
                'to_entity_id': subject0_id,
                'from_entity_type': 'Subject1',
                'to_entity_type': 'Subject0'
            },
        )
        problem_subject1_relations_dict.setdefault(
            (problem_id, subject1_id),
            {
                'type': 'Belongs',
                'from_entity_id': problem_id,
                'to_entity_id': subject1_id,
                'from_entity_type': 'Problem',
                'to_entity_type': 'Subject1'
            },
        )

# (output path, rows, progress-bar label), written in this order.
for out_path, rows, label in (
    ('../output/subject0_entities.jsonl', subject0_dict.values(), 'subject0 entities'),
    ('../output/subject1_entities.jsonl', subject1_dict.values(), 'subject1 entities'),
    ('../output/problem_subject1_relations.jsonl', problem_subject1_relations_dict.values(), 'problem subject1 relations'),
    ('../output/subject1_subject0_relations.jsonl', subject1_subject0_relations_dict.values(), 'subject1 subject0 relations'),
):
    with open(out_path, 'w', encoding='utf-8') as f:
        f.writelines(
            json.dumps(row, ensure_ascii=False) + '\n'
            for row in tqdm(rows, desc=label)
        )