In [1]:
import json
import hashlib
from pathlib import Path
import random
import pandas as pd

In [2]:
def calculate_md5(file_path):
    """Return the hexadecimal MD5 digest of the file at ``file_path``.

    The file is opened in binary mode and consumed in 4096-byte reads, so
    its contents are never held in memory all at once.
    """
    digest = hashlib.md5()
    with open(file_path, 'rb') as handle:
        while True:
            block = handle.read(4096)
            if not block:
                # An empty bytes object signals end of file.
                break
            digest.update(block)
    return digest.hexdigest()

# Folder holding the insurance policy PDFs to index.
pdf_folder = Path('/home/cptaswadu/new-rescue/RESCUE-n8n/eval/insurance/dataset/insurance_policy')

# Map each PDF's MD5 digest to its file name (name only, not the full path).
# Only files directly inside pdf_folder that match '*.pdf' are indexed.
#
# The glob results are sorted before iterating: directory listing order
# depends on the file system, and when two PDFs have identical content
# (same digest) the later one overwrites the earlier entry. Sorting makes
# the surviving file name the same on every run and every machine.
md5_to_filename = {}
for pdf_file in sorted(pdf_folder.glob('*.pdf')):
    md5 = calculate_md5(pdf_file)
    md5_to_filename[md5] = pdf_file.name

In [3]:
# Q&A cases to enrich; the merge cell iterates these and reads each entry's 'id'.
qna_data = json.loads(
    Path('/home/cptaswadu/new-rescue/RESCUE-n8n/eval/insurance/dataset/qna_free_text_sample.json').read_text()
)

# Ground-truth answers; the merge cell looks them up by case id.
ground_truth = json.loads(
    Path('/home/cptaswadu/new-rescue/RESCUE-n8n/eval/insurance/dataset/ground_truth.json').read_text()
)


In [4]:

# Enrich every Q&A case with its ground truth (only when one exists for the
# case id) and with the file name of the PDF whose MD5 matches 'expected_md5'.
meta_dataset = []
for qna_case in qna_data:
    case_key = qna_case['id']
    enriched = dict(qna_case)  # shallow copy; the loaded case itself is left untouched

    if case_key in ground_truth:
        enriched['ground_truth'] = ground_truth[case_key]

    # dict.get gives None when the digest is missing from the case or when
    # no indexed PDF has that digest.
    enriched['document_name'] = md5_to_filename.get(qna_case.get('expected_md5'))

    meta_dataset.append(enriched)

# Persist the merged dataset; ensure_ascii=False writes non-ASCII text as-is
# instead of \uXXXX escapes.
with open('/home/cptaswadu/new-rescue/RESCUE-n8n/eval/insurance/dataset/meta_dataset.json', 'w') as out_file:
    json.dump(meta_dataset, out_file, indent=2, ensure_ascii=False)

In [5]:
# Reload the merged dataset from the exact location the merge cell wrote it
# to. A bare 'meta_dataset.json' would be resolved against the kernel's
# working directory, which is not necessarily the dataset folder, so it could
# fail with FileNotFoundError or silently pick up a stale copy.
with open('/home/cptaswadu/new-rescue/RESCUE-n8n/eval/insurance/dataset/meta_dataset.json', 'r') as f:
    meta_dataset = json.load(f)

# Seed the global RNG so the draw is repeatable for a given input list.
# Note: random.sample raises ValueError if fewer than 50 cases are available.
random.seed(42)
sampled_dataset = random.sample(meta_dataset, 50)

# Save the 50-case sample next to the full dataset.
with open('/home/cptaswadu/new-rescue/RESCUE-n8n/eval/insurance/dataset/meta_dataset_sample50_seed42.json', 'w') as f:
    json.dump(sampled_dataset, f, indent=2, ensure_ascii=False)

In [6]:
# Read the sampled cases back from disk.
with open('/home/cptaswadu/new-rescue/RESCUE-n8n/eval/insurance/dataset/meta_dataset_sample50_seed42.json', 'r') as sample_file:
    sampled_dataset = json.load(sample_file)

# Columns copied straight from each case; a missing key becomes None.
base_columns = ('id', 'patient_info', 'expected_md5', 'document_name')

# Build one flat record per case, then create the DataFrame in a single call.
df_list = []
for case in sampled_dataset:
    row = {column: case.get(column) for column in base_columns}

    if 'ground_truth' in case:
        # Flatten each ground-truth field into its own 'gt_'-prefixed column.
        row.update({f'gt_{key}': value for key, value in case['ground_truth'].items()})

    df_list.append(row)

df = pd.DataFrame(df_list)

# 'utf-8-sig' prepends a UTF-8 byte-order mark to the file
# (presumably so spreadsheet tools detect the encoding -- confirm if it matters).
df.to_csv(
    '/home/cptaswadu/new-rescue/RESCUE-n8n/eval/insurance/dataset/meta_dataset_sample50_seed42.csv',
    index=False,
    encoding='utf-8-sig',
)