In [1]:
import json

In [2]:
# Relation labels (upper-case, as looked up via relation["label"]) -> lower-case relation names.
mapping = {
    label: label.lower()
    for label in ("CAUSE", "TREAT", "PRESENT", "AGGRAVATE", "PREVENT", "IMPROVE", "AFFECT")
}

# Entity-type labels -> angle-bracketed lower-case tags, e.g. 'DISEASE' -> '<disease>'.
# NOTE(review): not referenced by any other visible cell.
mapping_types = {
    entity_type: f"<{entity_type.lower()}>"
    for entity_type in (
        "DISEASE", "SYMPTOM", "TREATMENT", "RISK-FACTOR",
        "TEST", "GENE", "BIOMARKER", "COMPLICATION",
        "PROGNOSIS", "COMORBIDITY",
        "PROGRESSION", "BODY-PART",
    )
}

In [3]:
# Input: JSONL file read line by line; each line is parsed as one JSON record.
ANNOTATED_ABSTRACTS = "../../Data/annotated_abstracts.jsonl"
# Output: JSONL file that receives one subject/relation/object triplet per line.
EXTRACTED_RELATIONS = "../../Data/extracted_relations.jsonl"

In [4]:
def _span_text(tokens, span):
    """Return the space-joined text of the tokens covered by ``span``.

    ``span['token_end']`` is treated as an inclusive index, hence the ``+ 1``
    on the slice bound.
    """
    covered = tokens[span["token_start"]:span["token_end"] + 1]
    return " ".join(token["text"] for token in covered)


# Flatten every annotated relation into one {"subject", "relation", "object"}
# JSON object per output line. The input is streamed line by line rather than
# loaded into a list, and decoded explicitly as UTF-8 so the result does not
# depend on the platform's default encoding.
with open(ANNOTATED_ABSTRACTS, encoding="utf-8") as json_file, \
        open(EXTRACTED_RELATIONS, "w", encoding="utf-8") as outfile:
    for line in json_file:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        row = json.loads(line)
        # Records without a 'relations' key (or with a null one) contribute nothing.
        for relation in row.get("relations") or []:
            triplet = {
                "subject": _span_text(row["tokens"], relation["head_span"]),
                # An unknown label raises KeyError on purpose: silently dropping
                # or passing it through would hide an annotation problem.
                "relation": mapping[relation["label"]],
                "object": _span_text(row["tokens"], relation["child_span"]),
            }
            json.dump(triplet, outfile)
            outfile.write("\n")

In [5]:
# Destination for the first part of the split (passed as the training file).
ORIGINAL_FILE_TRAIN = "../../Data/training_data.jsonl"
# Destination for the remainder of the split (passed as the validation file).
ORIGINAL_FILE_VALIDATION = "../../Data/validation_data.jsonl"

In [6]:
def split_jsonl_file(input_file, training_file, validation_file, split_index=250):
    """Split a line-oriented file into two files at a fixed line index.

    Args:
        input_file: Path of the file to read; it is read completely before
            anything is written.
        training_file: Path that receives the first ``split_index`` lines.
        validation_file: Path that receives all remaining lines.
        split_index: Slice position separating the two parts (default 250).

    Lines are copied verbatim and in their original order; both output files
    are overwritten.
    """
    with open(input_file, 'r') as source:
        all_lines = source.readlines()

    # Training part first, then validation, each file opened and closed in turn.
    partitions = (
        (training_file, all_lines[:split_index]),
        (validation_file, all_lines[split_index:]),
    )
    for destination, chunk in partitions:
        with open(destination, 'w') as sink:
            sink.writelines(chunk)

In [7]:
# Uses the default split_index (250): the first 250 triplet lines become the
# training file, the rest the validation file.
split_jsonl_file(
    input_file=EXTRACTED_RELATIONS,
    training_file=ORIGINAL_FILE_TRAIN,
    validation_file=ORIGINAL_FILE_VALIDATION,
)

In [8]:
# (user question, assistant answer) templates keyed by relation name. Each
# template is filled with the triplet's ``subject``, ``obj`` and ``relation``.
_CHAT_TEMPLATES = {
    "cause": ("What does {subject} cause?", "{subject} causes {obj}."),
    "aggravate": ("What does {subject} aggravate?", "{subject} aggravates {obj}."),
    "treat": ("How is {obj} treated?", "{obj} is treated with {subject}."),
    "prevent": ("How can {obj} be prevented?", "{obj} can be prevented with {subject}."),
    "present": ("Where is the {subject} present?", "{subject} is present in {obj}."),
    "improve": ("How can {obj} be improved?", "{obj} can be improved with {subject}."),
    # The question asks about the subject's effect so that it matches the
    # answer's direction ("<subject> affects <obj>"), like 'cause'/'aggravate'.
    "affect": ("What does {subject} affect?", "{subject} affects {obj}."),
}

# Fallback used for any relation name without a dedicated template.
_DEFAULT_CHAT_TEMPLATE = (
    "Tell me about {subject} and its relation to {obj}.",
    "{subject} and {obj} are related through {relation}.",
)


def transform_to_chat_format(input_file_path, output_file_path):
    """Convert a JSONL file of triplets into chat-style message records.

    Each non-blank input line must be a JSON object with the keys
    ``subject``, ``relation`` and ``object``. For every triplet one JSON line
    is written containing a ``messages`` list with a fixed system prompt, a
    user question and an assistant answer built from the relation's template.

    Args:
        input_file_path: Path of the triplet JSONL file to read.
        output_file_path: Path of the JSONL file to write (overwritten).

    Raises:
        KeyError: If a triplet lacks ``subject``, ``relation`` or ``object``.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    transformed_data = []

    with open(input_file_path, 'r', encoding='utf-8') as input_file:
        for line in input_file:
            if not line.strip():
                # Skip blank lines instead of failing in json.loads.
                continue
            item = json.loads(line)
            fields = {
                "subject": item['subject'],
                "relation": item['relation'],
                "obj": item['object'],
            }
            user_template, assistant_template = _CHAT_TEMPLATES.get(
                fields["relation"], _DEFAULT_CHAT_TEMPLATE
            )

            transformed_data.append({
                "messages": [
                    {"role": "system", "content": "You are a helpful assistant."},
                    {"role": "user", "content": user_template.format(**fields)},
                    {"role": "assistant", "content": assistant_template.format(**fields)},
                ]
            })

    # Written only after the input has been fully read and closed.
    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        for entry in transformed_data:
            output_file.write(json.dumps(entry) + '\n')

In [9]:
# Output paths for the chat-formatted versions of the training and validation files.
MESSAGE_FILE_TRAIN = "../../Data/chat_completion_format_training_data.jsonl"
MESSAGE_FILE_VALIDATION = "../../Data/chat_completion_format_validation_data.jsonl"

In [10]:
# Convert the training triplets into chat-format records.
transform_to_chat_format(
    input_file_path=ORIGINAL_FILE_TRAIN,
    output_file_path=MESSAGE_FILE_TRAIN,
)

print("Data transformation complete. The transformed data is saved to", MESSAGE_FILE_TRAIN)

Data transformation complete. The transformed data is saved to ../../Data/chat_completion_format_training_data.jsonl


In [11]:
# Convert the validation triplets into chat-format records.
transform_to_chat_format(
    input_file_path=ORIGINAL_FILE_VALIDATION,
    output_file_path=MESSAGE_FILE_VALIDATION,
)

print("Data transformation complete. The transformed data is saved to", MESSAGE_FILE_VALIDATION)

Data transformation complete. The transformed data is saved to ../../Data/chat_completion_format_validation_data.jsonl
