In [1]:
import json
import random

import pandas as pd
import os


def process_json_file_task1(data_path=None, instruction=None):
    """Build instruction samples for task 1 (gene / function change / disease).

    Loads ``task1/train_triad.xlsx`` (columns ``PMID``, ``GENE``, ``FUNCTION``
    and ``DISEASE`` are read) and then every ``*.json`` file found in the
    ``task1`` directory. A JSON file contributes one sample when its
    ``sourceid`` (converted to ``int``) appears in the ``PMID`` column. The
    sample's output lists a ``(gene, FUNCTION, disease)`` tuple for every
    spreadsheet row of that PMID whose ``FUNCTION`` is non-empty and whose
    stripped gene and disease strings both occur verbatim in the file's
    ``text``. A sample is emitted even when no tuple qualifies.

    Args:
        data_path: Root directory of the data set. When ``None`` the
            module-level ``DATA_PATH`` is used.
        instruction: Instruction text stored in every sample. When ``None``
            the module-level ``prompt1`` is used.

    Returns:
        A list of dicts with the keys ``instruction``, ``input`` (the file's
        text), ``output`` (``str()`` of the tuple list) and ``history``
        (always an empty list).
    """
    if data_path is None:
        data_path = DATA_PATH
    if instruction is None:
        instruction = prompt1

    datas = []
    json_files_directory = os.path.join(data_path, 'task1')
    triad_df = pd.read_excel(os.path.join(json_files_directory, 'train_triad.xlsx'))
    triad_df.fillna("", inplace=True)
    # A set gives constant-time membership tests for the per-file PMID check.
    known_pmids = set(triad_df['PMID'].tolist())

    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # sample order (and any seeded shuffle applied later) reproducible.
    for file_name in sorted(os.listdir(json_files_directory)):
        if not file_name.endswith('.json'):
            continue
        try:
            file_path = os.path.join(json_files_directory, file_name)
            with open(file_path, 'r', encoding='utf-8') as file:
                data = json.load(file)
            input_text = data['text']
            pmid = int(data['sourceid'])
            if pmid not in known_pmids:
                continue

            # Look up the triples annotated for this document.
            matching_rows = triad_df[triad_df['PMID'] == pmid]
            output = []
            for _, row in matching_rows.iterrows():
                gene = row['GENE'].strip()
                disease = row['DISEASE'].strip()
                if row['FUNCTION'] == "":
                    continue
                # Keep only triples whose entities literally occur in the text.
                if gene not in input_text or disease not in input_text:
                    continue
                output.append((gene, row['FUNCTION'], disease))

            datas.append({
                "instruction": instruction,
                "input": input_text,
                "output": str(output),
                "history": [],
            })
        except Exception as e:
            # Best effort: a malformed file is reported and skipped so that
            # one bad record does not abort the whole conversion.
            print("error:", file_name, e)
            continue
    return datas


def _names_for_id(name2id, wanted_id):
    """Return every name in *name2id* whose mapped id equals *wanted_id*."""
    return [name for name, entity_id in name2id.items() if entity_id == wanted_id]


def process_json_file_task2(data_path=None, instruction=None):
    """Build instruction samples for task 2 (chemical / disease pairs).

    Reads ``task2/train.json``, a mapping whose values provide ``title``,
    ``abstract``, ``relations`` (items with ``chemical`` and ``disease`` ids)
    and the name-to-id mappings ``chemical2id`` / ``disease2id``. For every
    relation, each name of the chemical is combined with each name of the
    disease. A pair is kept when both names are found in the text, compared
    case-insensitively; the stored strings are the matching spans as they are
    written in the text. Identical pairs are stored once per document.

    The sample input is ``title + abstract`` concatenated without a separator.

    Args:
        data_path: Root directory of the data set. When ``None`` the
            module-level ``DATA_PATH`` is used.
        instruction: Instruction text stored in every sample. When ``None``
            the module-level ``prompt2`` is used.

    Returns:
        A list with one dict per document, with the keys ``instruction``,
        ``input``, ``output`` (``str()`` of the list of
        ``(chemical, disease)`` tuples) and ``history`` (always empty).
    """
    if data_path is None:
        data_path = DATA_PATH
    if instruction is None:
        instruction = prompt2

    # JSON is UTF-8 by specification; do not rely on the platform default.
    with open(os.path.join(data_path, 'task2', 'train.json'), 'r', encoding='utf-8') as file:
        data = json.load(file)

    datas = []
    for value in data.values():
        abstract = value['title'] + value['abstract']
        # Lower-cased once per document and used only to locate the entities.
        abstract_lower = abstract.lower()
        relations = []
        for rel in value['relations']:
            chemical_names = _names_for_id(value['chemical2id'], rel['chemical'])
            disease_names = _names_for_id(value['disease2id'], rel['disease'])

            for disease_name in disease_names:
                for chemical_name in chemical_names:
                    # Both sides are lower-cased, so a name containing capital
                    # letters can still be located in the text.
                    chemical_start_idx = abstract_lower.find(chemical_name.lower())
                    disease_start_idx = abstract_lower.find(disease_name.lower())
                    if chemical_start_idx == -1 or disease_start_idx == -1:
                        continue

                    # Recover the entities with the casing used in the text.
                    chemical_end_idx = chemical_start_idx + len(chemical_name)
                    disease_end_idx = disease_start_idx + len(disease_name)
                    pair = (
                        abstract[chemical_start_idx:chemical_end_idx],
                        abstract[disease_start_idx:disease_end_idx],
                    )
                    # Case variants of one name resolve to the same span;
                    # store each pair only once.
                    if pair not in relations:
                        relations.append(pair)

        datas.append({
            "instruction": instruction,
            "input": abstract,
            "output": str(relations),
            "history": [],
        })
    return datas


def process_json_file_task3(data_path=None, instruction=None):
    """Build instruction samples for task 3 (drug / interaction / target).

    Reads ``task3/train.json``, a mapping whose values provide ``abstract``
    and ``triples`` (items with ``drug``, ``interaction`` and ``target``).
    A triple is kept only when both its drug and its target occur verbatim
    in the abstract. Every document yields one sample, even when no triple
    is kept.

    Args:
        data_path: Root directory of the data set. When ``None`` the
            module-level ``DATA_PATH`` is used.
        instruction: Instruction text stored in every sample. When ``None``
            the module-level ``prompt3`` is used.

    Returns:
        A list with one dict per document, with the keys ``instruction``,
        ``input`` (the abstract), ``output`` (``str()`` of the list of
        ``(drug, interaction, target)`` tuples) and ``history`` (always
        empty).
    """
    if data_path is None:
        data_path = DATA_PATH
    if instruction is None:
        instruction = prompt3

    # JSON is UTF-8 by specification; do not rely on the platform default.
    with open(os.path.join(data_path, 'task3', 'train.json'), 'r', encoding='utf-8') as file:
        data = json.load(file)

    datas = []
    for entry in data.values():
        abstract = entry['abstract']
        triples_text = [
            (triple['drug'], triple['interaction'], triple['target'])
            for triple in entry['triples']
            if triple['drug'] in abstract and triple['target'] in abstract
        ]
        datas.append({
            "instruction": instruction,
            "input": abstract,
            "output": str(triples_text),
            "history": [],
        })
    return datas


if __name__ == '__main__':
    # Script entry point: convert the three task data sets into instruction
    # samples, merge and shuffle them, and write a single JSON file.
    #
    # prompt1/prompt2/prompt3 and DATA_PATH are module-level names that the
    # process_json_file_task* functions read.

    # Earlier wording of prompt1 (parenthesised triples, no expert persona),
    # kept for reference:
    # prompt1 = "In this Gene-Disease relation extraction task, you need to follow 3 steps. You need to extract the (gene, function change, disease) triplet from the text, such as: (SHROOM3, LOF, Neural tube defects). The second element in the triple means the regulation that the gene produces to the disease. Types of regulations are: LOF and GOF, which indicate loss or gain of function; REG, which indicates a general regulatory relationship; COM, which indicates that the functional change between genes and diseases is more complex, and it is difficult to determine whether the functional change is LOF or GOF. Please return all the relations extracted from the text in ternary format [[GENE, FUNCTION, DISEASE]]."
    # NOTE(review): the text announces "3 steps" but does not list them - confirm this is intended.
    prompt1 = "You are a genetic disease expert. In this Gene-Disease relation extraction task, you need to follow 3 steps. You need to extract the [gene, function change, disease] triplet from the text, such as: [SHROOM3, LOF, Neural tube defects]. The second element in the triple means the regulation that the gene produces to the disease. Types of regulations are: LOF and GOF, which indicate loss or gain of function; REG, which indicates a general regulatory relationship; COM, which indicates that the functional change between genes and diseases is more complex, and it is difficult to determine whether the functional change is LOF or GOF. Please return all the relations extracted from the text in ternary format [[GENE, FUNCTION, DISEASE]]."
    prompt2 = "You are a biologist. I'll give you the abstract of literature. Please identify all the [[compound,disease]] relations in the abstract, and just give me a list of all relations you recognized"
    # NOTE(review): this prompt asks for drug-drug interactions, while task 3
    # builds (drug, interaction, target) triples - confirm the wording.
    prompt3 = "You are a medicinal chemist. Now you need to identify all the drug-drug interactions from the text I provide to you, and please only write down all the drug-drug interactions in the format of [[drug, interaction, drug]]. "
    DATA_PATH = 'data/'

    datas1 = process_json_file_task1()
    datas2 = process_json_file_task2()
    datas3 = process_json_file_task3()
    datas = datas1 + datas2 + datas3
    # Number of samples contributed by each task.
    print(len(datas1), len(datas2), len(datas3))

    # Fixed seed so that the shuffled order is the same on every run.
    random.seed(42)
    random.shuffle(datas)

    # The context manager closes the file even if serialisation fails.
    with open(DATA_PATH + "instruction.json", "w", encoding="utf8") as fout:
        json.dump(datas, fout, ensure_ascii=False, indent=2)


208 500 664
