In [1]:
import json
import csv
import os

```python
    result: {
        'id': 80402, 
        'text': '\ufeff芯驰科技完成近10亿元B+轮融资，将用于加强大规模量产落地和服务能力。据悉，本轮融资将用于持续提升芯驰核心技术，迭代更新车规芯片产品，加强大规模量产落地和服务能力，加速芯驰产品更广泛上车应用。', 
        'entities': [
            {'id': 782, 'label': '公司', 'start_offset': 1, 'end_offset': 5}, 
            {'id': 783, 'label': '公司', 'start_offset': 50, 'end_offset': 52}, 
            {'id': 785, 'label': '技术', 'start_offset': 61, 'end_offset': 65}, 
            {'id': 789, 'label': '作用', 'start_offset': 68, 'end_offset': 82}, 
            {'id': 792, 'label': '公司', 'start_offset': 85, 'end_offset': 87}
        ], 
        'relations': [
            {'id': 531, 'from_id': 783, 'to_id': 785, 'type': '研发_公司_技术'}, 
            {'id': 533, 'from_id': 785, 'to_id': 789, 'type': '实现_技术_作用'}
        ], 
        'Comments': []
    }
```

In [2]:
def findMatch_entity_name(id, entities, text):
    """Return the surface text of the entity whose ``"id"`` equals ``id``.

    ``entities`` is scanned in order. For the first entry whose ``"id"``
    matches, the slice ``text[start_offset:end_offset]`` is returned, i.e.
    the annotated span of ``text`` rather than the entity's ``"label"``
    field. When no entry matches, ``None`` is returned.
    """
    # Lazy generator: evaluation stops at the first matching entity.
    matching_spans = (
        text[entity["start_offset"]:entity["end_offset"]]
        for entity in entities
        if entity["id"] == id
    )
    return next(matching_spans, None)

In [3]:
def extract_triple(file_name, relation, out_dir):
    """Export all triples of one relation category from a JSONL file to CSV.

    Each non-blank line of ``file_name`` is parsed as a JSON object with the
    keys ``"text"``, ``"entities"`` and ``"relations"``. For every relation
    whose ``"type"`` string starts with ``relation`` (e.g. ``"研发_公司_技术"``
    for ``"研发"``), one row ``(subject, relation type, object)`` is collected,
    where subject and object are the text spans of the entities referenced by
    ``from_id`` and ``to_id``.

    Args:
        file_name: path of the ``.jsonl`` annotation file (read as UTF-8).
        relation: relation category; one of ``"合作"``, ``"研发"``, ``"投资"``.
        out_dir: output directory; created if it does not exist. A trailing
            path separator is optional.

    Returns:
        None. The result is written to ``<out_dir>/<relation>.csv`` with the
        header ``subject,relation,object``. The ``relation`` column holds the
        full type string, not just the category prefix.

    Raises:
        AssertionError: if ``relation`` is not one of the supported categories.
    """
    assert relation in ["合作", "研发", "投资"]

    rows = []
    with open(file_name, 'r', encoding="utf-8") as f:
        for json_str in f:
            # Tolerate blank lines (e.g. a stray empty line at the end of the
            # file) instead of letting json.loads fail on them.
            if not json_str.strip():
                continue
            result = json.loads(json_str)
            text = result["text"]
            entities = result["entities"]
            for r in result["relations"]:
                # The category is the leading part of the relation type string.
                if r["type"].startswith(relation):
                    rows.append((
                        findMatch_entity_name(r['from_id'], entities, text),
                        r["type"],
                        findMatch_entity_name(r['to_id'], entities, text),
                    ))

    # exist_ok avoids the check-then-create race; an empty out_dir means the
    # current directory, which needs no creation.
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    # os.path.join keeps the file inside out_dir whether or not out_dir ends
    # with a separator (plain concatenation produced e.g. "./data合作.csv").
    out_path = os.path.join(out_dir, relation + '.csv')
    with open(out_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['subject', 'relation', 'object'])
        writer.writerows(rows)


In [4]:
# Export the "合作" (cooperation) triples to ./data/合作.csv.
# NOTE(review): the input name is spelled "modifieid" - confirm it matches the file on disk.
extract_triple(
    file_name="modifieid_output.jsonl",
    relation="合作",
    out_dir="./data/",
)

In [5]:
# Export the "研发" (research & development) triples to ./data/研发.csv.
extract_triple(
    file_name="modifieid_output.jsonl",
    relation="研发",
    out_dir="./data/",
)

In [6]:
# Export the "投资" (investment) triples to ./data/投资.csv.
extract_triple(
    file_name="modifieid_output.jsonl",
    relation="投资",
    out_dir="./data/",
)