In [30]:
import json
from collections import defaultdict


def read_json_lines(file_path, id_key):
    """读取每行是一个JSON对象的文件"""
    data = {}
    level_dict = defaultdict(list)
    try:
        print(f'Opening file: {file_path}')
        with open(file_path, 'r', encoding='utf-8') as file:
            for line_number, line in enumerate(file, 1):
                try:
                    json_obj = json.loads(line.strip())
                    if id_key not in json_obj:
                        print(f'Warning: {id_key} not found in line {line_number}')
                        continue
                    if json_obj[id_key] in data:
                        print(
                            f'Warning: Duplicate {id_key} found in line {line_number}'
                        )
                        continue
                    data[json_obj[id_key]] = json_obj
                    level_dict[json_obj['Level']].append(json_obj[id_key])

                except json.JSONDecodeError as e:
                    print(f'第 {line_number} 行JSON解析错误: {e}')
                    continue
        print(f'Successfully read {len(data)} records from {file_path}')
        return data, level_dict
    except FileNotFoundError:
        print(f'错误：文件 {file_path} 未找到')
        return None
    except Exception as e:
        print(f'读取文件时发生错误: {e}')
        return None

In [48]:
test_path = 'data/test_metadata.jsonl'
test_metadata, test_level_dict = read_json_lines(test_path, 'task_id')

valid_path = 'data/validation_metadata.jsonl'
valid_metadata, valid_level_dict = read_json_lines(valid_path, 'task_id')

Opening file: data/test_metadata.jsonl
Successfully read 301 records from data/test_metadata.jsonl
Opening file: data/validation_metadata.jsonl
Successfully read 165 records from data/validation_metadata.jsonl


In [49]:
import random

sampled_id = defaultdict(list)
for level, l in test_level_dict.items():
    sampled_num = int(len(l) / len(test_metadata) * 20)
    print(sampled_num, len(l), len(test_metadata))
    sampled_id[f'Level {level}'] = random.sample(l, sampled_num)

6 93 301
10 159 301
3 49 301


In [50]:
sampled_id

defaultdict(list,
            {'Level 1': ['d89733a3-7d86-4ed8-b5a3-bf4831b06e3c',
              '3c5f0280-b1a3-43cf-817e-c3fa0016b1e2',
              '60fbc5a3-2805-4ad4-8eef-b58843b5053b',
              '220a2b08-ffdc-4665-af4e-025670f5408b',
              '6af95c8f-8cbf-4c12-b02c-f9a23cc1ecb9',
              '70e0a9c6-24bf-48ed-afa1-f0d0eaaa0209'],
             'Level 2': ['4cf4a5c1-7c9c-4cce-94cb-57b8be196244',
              '82b89810-1217-4ad8-aa9f-26e7c74ba6e5',
              '900bb2d0-c2ae-43a6-b25b-62f96c3770e3',
              '04893fc3-34fc-4117-8457-a717ad01a6a9',
              'f5d0b1c6-5e15-4c55-b60c-9fc855dda5cf',
              '021a5339-744f-42b7-bd9b-9368b3efda7a',
              'f2fa52f6-fc8a-498c-98d3-17f66c848d1b',
              '4810c253-7b06-447d-8bf6-64558ac5f00f',
              '3cc53dbf-1ab9-4d21-a56a-fc0151c10f89',
              '9b98305b-af16-489e-adbc-41b5c5a0ec2d'],
             'Level 3': ['460ef201-c5f4-41f4-9acd-e4215384e678',
              'c68c0db6-1929-

In [55]:
'e1fc63a2-da7a-432f-be78-7c4a95598703' in valid_metadata.keys()

True

In [54]:
len(valid_metadata), len(test_metadata)

(165, 301)