In [5]:
import json
import os

def load_json(file_path):
    """Parse the JSON document stored at ``file_path``.

    Args:
        file_path: Path of the file to read; it is opened as UTF-8 text.

    Returns:
        The Python object produced by ``json.load`` for the file's contents.
    """
    with open(file_path, mode='r', encoding='utf-8') as source:
        parsed = json.load(source)
    return parsed

def save_json(data, file_path):
    """Serialize ``data`` as JSON into the file at ``file_path``.

    The file is opened for writing as UTF-8 text (replacing any existing
    content). Output is indented by four spaces and non-ASCII characters are
    written as-is rather than as ``\\uXXXX`` escapes.

    Args:
        data: JSON-serializable object to write.
        file_path: Destination path.
    """
    with open(file_path, mode='w', encoding='utf-8') as sink:
        json.dump(data, sink, indent=4, ensure_ascii=False)

def process_files(eval_file_path, train_file_path, output_train_file_path):
    """Remove eval-set concepts from the train set and save the result.

    A train entry is dropped when its ``prefLabel`` also appears as a
    ``prefLabel`` in the eval file. For every entry that is kept, a
    ``relationships`` value that is present and not ``None`` is replaced by a
    list containing only the relationships whose ``target`` is not one of the
    eval ``prefLabel`` values.

    Args:
        eval_file_path: Path of the eval JSON file (an iterable of entries).
        train_file_path: Path of the train JSON file (an iterable of entries).
        output_train_file_path: Path the filtered train entries are written to.
    """
    # Both inputs are read up front, eval first.
    eval_entries = load_json(eval_file_path)
    train_entries = load_json(train_file_path)

    # Labels that must not appear anywhere in the filtered train set.
    held_out_labels = set()
    for eval_entry in eval_entries:
        held_out_labels.add(eval_entry['prefLabel'])

    kept_entries = []
    for train_entry in train_entries:
        if train_entry['prefLabel'] not in held_out_labels:
            relationships = train_entry.get('relationships')
            if relationships is not None:
                # The loaded entry is updated in place before being kept.
                train_entry['relationships'] = [
                    link
                    for link in relationships
                    if link['target'] not in held_out_labels
                ]
            kept_entries.append(train_entry)

    # Persist the filtered entries to the requested output path.
    save_json(kept_entries, output_train_file_path)

if __name__ == "__main__":
    # Input: the eval file.
    eval_file_path = 'BAO_eval_set.json'
    # Input: the train file (change this accordingly).
    train_file_path = 'BAO_train_set.json'
    # Output: where the processed train file is saved.
    # NOTE(review): the id-to-prefLabel conversion cell further down writes to
    # this same path, so whichever cell runs last overwrites the other's
    # result - confirm this is intended.
    output_train_file_path = 'BAO_train_set_processed.json'

    process_files(
        eval_file_path=eval_file_path,
        train_file_path=train_file_path,
        output_train_file_path=output_train_file_path,
    )
    print(f"Processed train data saved to {output_train_file_path}")


Processed train data saved to BAO_train_set_processed.json


In [4]:
# Convert relationship target ids to their prefLabel for the BAO eval and train sets

import json

def load_json(file_path):
    """Read a UTF-8 encoded JSON file and return its parsed contents.

    Note: a helper with the same name and signature is also defined in the
    preceding cell.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The Python object produced by ``json.load``.
    """
    with open(file_path, mode='r', encoding='utf-8') as source:
        contents = json.load(source)
    return contents

def save_json(data, file_path):
    """Write ``data`` to ``file_path`` as indented, UTF-8 encoded JSON.

    Note: a helper with the same name and signature is also defined in the
    preceding cell.

    Args:
        data: JSON-serializable object to write.
        file_path: Destination path; existing content is replaced.
    """
    with open(file_path, mode='w', encoding='utf-8') as sink:
        # Four-space indentation; non-ASCII characters are kept unescaped.
        json.dump(data, sink, indent=4, ensure_ascii=False)

def create_id_to_prefLabel_map(eval_data, train_data):
    """Build a lookup from entry ``id`` to entry ``prefLabel``.

    Eval entries are added first and train entries second, so when the same
    ``id`` occurs in both, the train ``prefLabel`` is the one kept.

    Args:
        eval_data: Iterable of entries, each with ``id`` and ``prefLabel`` keys.
        train_data: Iterable of entries, each with ``id`` and ``prefLabel`` keys.

    Returns:
        A dict mapping every ``id`` seen to its ``prefLabel``.
    """
    id_to_prefLabel = {}
    for dataset in (eval_data, train_data):
        for entry in dataset:
            # Look up the id before the label so a malformed entry fails on
            # the same key as a ``{id: label}`` comprehension would.
            entry_id = entry['id']
            entry_label = entry['prefLabel']
            id_to_prefLabel[entry_id] = entry_label
    return id_to_prefLabel

def update_relationships(data, id_to_prefLabel):
    """Replace relationship targets with their ``prefLabel`` where known.

    For every entry with a non-empty ``relationships`` value, each
    relationship whose ``target`` is a key of ``id_to_prefLabel`` has that
    ``target`` overwritten with the mapped value. Targets not found in the
    mapping are left unchanged. The entries are modified in place.

    Args:
        data: Iterable of entries to update.
        id_to_prefLabel: Mapping from target id to the label that replaces it.

    Returns:
        The same ``data`` object that was passed in.
    """
    for record in data:
        # Skip entries with no relationships key or an empty/None value.
        if 'relationships' not in record or not record['relationships']:
            continue
        for link in record['relationships']:
            current_target = link['target']
            if current_target in id_to_prefLabel:
                link['target'] = id_to_prefLabel[current_target]
    return data

def process_files(eval_file_path, train_file_path, output_eval_file_path, output_train_file_path):
    """Rewrite relationship targets as prefLabels in the eval and train files.

    Both files are loaded, one id-to-prefLabel lookup is built from the
    entries of both, the relationship targets of each dataset are updated
    through that lookup, and the two updated datasets are written out.

    Note: an earlier cell defines a three-argument function with this same
    name; in a shared kernel the most recently executed definition is the one
    in effect.

    Args:
        eval_file_path: Path of the eval JSON file to read.
        train_file_path: Path of the train JSON file to read.
        output_eval_file_path: Path the updated eval data is written to.
        output_train_file_path: Path the updated train data is written to.
    """
    eval_entries = load_json(eval_file_path)
    train_entries = load_json(train_file_path)

    # One lookup spanning both datasets.
    label_by_id = create_id_to_prefLabel_map(eval_entries, train_entries)

    # Update both datasets before writing anything, eval first.
    eval_with_labels = update_relationships(eval_entries, label_by_id)
    train_with_labels = update_relationships(train_entries, label_by_id)

    outputs = (
        (eval_with_labels, output_eval_file_path),
        (train_with_labels, output_train_file_path),
    )
    for payload, destination in outputs:
        save_json(payload, destination)

if __name__ == "__main__":
    # Inputs: the eval and train files.
    eval_file_path = 'BAO_eval_set.json'
    train_file_path = 'BAO_train_set.json'
    # Outputs: where the processed eval and train files are saved.
    # NOTE(review): the train/eval filtering cell above also writes
    # 'BAO_train_set_processed.json', so whichever cell runs last overwrites
    # the other's result - confirm this is intended.
    output_eval_file_path = 'BAO_eval_set_processed.json'
    output_train_file_path = 'BAO_train_set_processed.json'

    process_files(
        eval_file_path=eval_file_path,
        train_file_path=train_file_path,
        output_eval_file_path=output_eval_file_path,
        output_train_file_path=output_train_file_path,
    )
    print(f"Processed eval data saved to {output_eval_file_path}")
    print(f"Processed train data saved to {output_train_file_path}")


Processed eval data saved to BAO_eval_set_processed.json
Processed train data saved to BAO_train_set_processed.json
