In [3]:
import csv
import json

def _load_gold_answers(qrels_file):
    """Map each query id in a qrels file to a positively judged document id.

    A usable line has exactly four whitespace-separated fields, read as
    ``query_id <ignored> doc_id relevance``. Lines with any other field
    count (including blank lines) are ignored. A four-field line whose
    relevance is not an integer (for example a header row) is skipped and
    counted instead of aborting the load.

    If a query id has several lines with relevance > 0, the last such line
    in the file wins.

    Args:
        qrels_file (str): Path to the qrels file.

    Returns:
        tuple[dict, int]: ``(answers, malformed)`` where ``answers`` maps
        query id -> document id and ``malformed`` is the number of
        four-field lines skipped because relevance was not an integer.

    Raises:
        OSError: If the file cannot be opened or read (``FileNotFoundError``
            when it does not exist).
    """
    answers = {}
    malformed = 0
    with open(qrels_file, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.split()
            if len(parts) != 4:
                continue
            query_id, _, doc_id, relevance = parts
            try:
                is_relevant = int(relevance) > 0
            except ValueError:
                # Non-numeric relevance: skip the line rather than crash.
                malformed += 1
                continue
            if is_relevant:
                answers[query_id] = doc_id
    return answers, malformed


def convert_browsecomp_to_gaia_jsonl(queries_file, qrels_file, output_file):
    """Convert BrowseComp queries and qrels into GAIA-style JSONL records.

    For every two-column row ``query_id<TAB>question`` in ``queries_file``
    one JSON object is written to ``output_file`` (one object per line) with
    the keys ``task_id``, ``Question``, ``Level``, ``Final answer`` and
    ``file_name``. ``Final answer`` is the document id found for that query
    in ``qrels_file`` (relevance > 0), or the literal string
    ``"Answer Document ID Not Found"`` when there is none. ``Level`` is
    always ``1`` and ``file_name`` is always the empty string.

    Errors are reported on stdout and the function returns early; it does
    not raise for a missing input file or an I/O error during conversion.

    Args:
        queries_file (str): Path to the tab-separated queries file.
        qrels_file (str): Path to the qrels file with the gold documents.
        output_file (str): Path of the JSONL file to create or overwrite.

    Returns:
        None
    """
    # --- Step 1: load the gold document id for each query ---
    try:
        answers, malformed_qrels = _load_gold_answers(qrels_file)
    except FileNotFoundError:
        print(f"错误: 找不到文件 {qrels_file}")
        return

    if malformed_qrels:
        print(f"警告: 跳过了 {malformed_qrels} 行相关性不是整数的 qrels 记录。")
    print(f"成功从 {qrels_file} 加载了 {len(answers)} 个答案条目。")

    # --- Step 2: announce the conversion ---
    print(f"正在处理 {queries_file} 并写入到 {output_file}...")

    # --- Step 3: stream rows from the TSV straight into the JSONL file ---
    count = 0
    skipped = 0
    try:
        # newline='' lets the csv module do its own line-ending handling,
        # as recommended by the csv documentation.
        with open(queries_file, 'r', encoding='utf-8', newline='') as infile, \
             open(output_file, 'w', encoding='utf-8') as outfile:

            reader = csv.reader(infile, delimiter='\t')
            for row in reader:
                if len(row) != 2:
                    # Blank lines come back as [] and are not worth a warning;
                    # anything else is a row we are dropping, so count it.
                    if row:
                        skipped += 1
                    continue

                query_id, question_text = row
                final_answer = answers.get(query_id, "Answer Document ID Not Found")

                gaia_record = {
                    "task_id": query_id,
                    "Question": question_text,
                    "Level": 1,
                    "Final answer": final_answer,
                    "file_name": ""
                }

                # ensure_ascii=False keeps non-ASCII text readable in the output.
                outfile.write(json.dumps(gaia_record, ensure_ascii=False) + '\n')
                count += 1

    except FileNotFoundError as e:
        # Either path can be the missing one (e.g. the output directory does
        # not exist), so report the path the OS actually complained about.
        missing = e.filename if e.filename is not None else queries_file
        print(f"错误: 找不到文件 {missing}")
        return
    except IOError as e:
        print(f"读写文件时发生错误: {e}")
        return

    if skipped:
        print(f"警告: 跳过了 {skipped} 行不是恰好两列的查询记录。")
    print(f"转换成功！共处理并写入了 {count} 条记录到 {output_file}")

In [6]:

# Input and output file locations for this conversion run
QUERIES_TSV_PATH = 'queries.tsv'
QRELS_GOLDS_PATH = 'qrel_golds.txt'
GAIA_OUTPUT_JSON_PATH = 'bc_gaia_formatted_data.jsonl'

# Run the conversion; progress and errors are printed by the function itself
convert_browsecomp_to_gaia_jsonl(
    queries_file=QUERIES_TSV_PATH,
    qrels_file=QRELS_GOLDS_PATH,
    output_file=GAIA_OUTPUT_JSON_PATH,
)

成功从 qrel_golds.txt 加载了 830 个答案条目。
正在处理 queries.tsv 并写入到 bc_gaia_formatted_data.jsonl...
转换成功！共处理并写入了 830 条记录到 bc_gaia_formatted_data.jsonl
