In [18]:
import json

def clean_json(json_path, is_group3=False):
    with open(json_path, 'r') as f:
        data = json.load(f)

    query_data = data["query"]
    corpus_data = data["corpus"]
    qrels_data = data["qrel"]
    processed_dataset = []

    for query_id, doc_relevance_map in qrels_data.items():
        if query_id not in query_data:
            print(f"Warning: Query ID {query_id} from qrels not found in query_data. Skipping.")
            continue

        query_text = query_data[query_id]

        for code_id, relevance_score in doc_relevance_map.items():
            if code_id not in corpus_data:
                print(f"Warning: Code ID {code_id} for Query ID {query_id} from qrels not found in corpus_data. Skipping.")
                continue

            code_text = corpus_data[code_id]

            if not is_group3 and relevance_score == 0:
                continue

            processed_dataset.append({
                "query_id": query_id,
                "query_text": query_text,
                "code_id": code_id,
                "code_text": code_text,
                "relevance": relevance_score
            })
    # extract the file name from the path
    file_name = json_path.split('/')[-1].split('.')[0]
    # create the output file name in cleaned/
    output_file_name = f"cleaned/{file_name}_cleaned.json"
    # write the processed dataset to a new JSON file
    with open(output_file_name, 'w') as f:
        json.dump(processed_dataset, f, indent=4)

In [19]:
# get all the json files under original/
import os
json_files = [f for f in os.listdir('original/') if f.endswith('.json')]
# create cleaned/ directory if it doesn't exist
if not os.path.exists('cleaned/'):
    os.makedirs('cleaned/')


In [20]:
len(json_files), json_files

(21,
 ['reconstructed_group1_hash_renamed.json',
  'reconstructed_group3_helper_as_part_of_groundtruth_original.json',
  'reconstructed_group3_helper_as_part_of_groundtruth_usual_renamed.json',
  'reconstructed_group3_helper_as_other_candidates_original.json',
  'reconstructed_group1_asm_short.json',
  'reconstructed_group3_asm_short.json',
  'reconstructed_group2_hash_renamed.json',
  'reconstructed_group3_asm_long.json',
  'reconstructed_group2_wasm.json',
  'reconstructed_group2_asm_short.json',
  'reconstructed_group2_asm_long.json',
  'reconstructed_group2_usual_renamed.json',
  'reconstructed_group3_helper_as_part_of_groundtruth_hash_renamed.json',
  'reconstructed_group1_asm_long.json',
  'reconstructed_group1_usual_renamed.json',
  'reconstructed_group1_wasm.json',
  'reconstructed_group3_helper_as_part_of_groundtruth_wasm.json',
  'reconstructed_group3_helper_as_other_candidates_hash_renamed.json',
  'reconstructed_group2_original.json',
  'reconstructed_group3_helper_as_other

In [21]:


for json_file in json_files:
    # check if the file is group3
    if 'group3' in json_file:
        clean_json(f'original/{json_file}', is_group3=True)
    else:
        clean_json(f'original/{json_file}')