In [5]:
import json, os
from tqdm import tqdm
import random
import pandas as pd

In [6]:
# Ensure each split has its own output directory (no error if it already exists).
for _split in ("valid", "train", "test"):
    os.makedirs(f"outputs/splits/{_split}", exist_ok=True)

In [7]:
# Step 1: Define paths
original_filtered_path = "outputs/filtered/rubrics_8_15.jsonl"
rubrics_dir = "outputs/rubrics"

# Step 2: Load original filtered data, one JSON object per line
# (2735 items in the recorded run).
# The encoding is explicit so the result does not depend on the platform default.
with open(original_filtered_path, "r", encoding="utf-8") as f:
    all_data = [json.loads(line) for line in f]

# Step 3: Find valid rubrics. Item `idx` is valid when rubrics_{idx}.json exists,
# parses as JSON, and is a list containing at least one non-empty dict
# (so `[]` and `[{}]` are rejected).
valid_data = []       # (original index, example) pairs, in file order
invalid_indices = []  # original indices whose rubrics are missing or unusable

for idx, example in enumerate(tqdm(all_data, desc="🔍 Checking valid rubrics")):
    rubrics_path = f"{rubrics_dir}/rubrics_{idx}.json"
    try:
        with open(rubrics_path, "r", encoding="utf-8") as f:
            rubrics = json.load(f)
    except (OSError, ValueError):
        # OSError: file missing or unreadable. ValueError: malformed JSON
        # (json.JSONDecodeError) or undecodable bytes (UnicodeDecodeError).
        # Anything else, including KeyboardInterrupt, is a real problem and
        # is allowed to propagate instead of being counted as "invalid".
        invalid_indices.append(idx)
        continue

    if isinstance(rubrics, list) and any(r and isinstance(r, dict) for r in rubrics):
        valid_data.append((idx, example))
    else:
        invalid_indices.append(idx)

# Save valid data and indices.
# ensure_ascii=False emits raw non-ASCII characters, so the file must be opened
# as UTF-8 explicitly rather than with the locale's default encoding.
with open("outputs/splits/valid/valid_data.jsonl", "w", encoding="utf-8") as f:
    f.writelines(
        json.dumps(item, ensure_ascii=False) + "\n" for _, item in valid_data
    )

with open("outputs/splits/valid/valid_indices.json", "w", encoding="utf-8") as f:
    json.dump([idx for idx, _ in valid_data], f)




🔍 Checking valid rubrics: 100%|██████████| 2735/2735 [00:00<00:00, 55943.14it/s]


In [9]:
# Shuffle and split
# Shuffle a *copy* so `valid_data` keeps its file order. Shuffling it in place
# made this cell non-idempotent: a second run would reshuffle the
# already-shuffled list and overwrite the split files with a different split.
# With the same seed, the first-run split is identical to the in-place version.
random.seed(42)
shuffled_data = list(valid_data)
random.shuffle(shuffled_data)

# The first `test_size` shuffled items form the test set; the rest is train.
test_size = 500
test_data = shuffled_data[:test_size]
train_data = shuffled_data[test_size:]

# Save each split: the examples as JSONL plus their original indices for tracking.
# ensure_ascii=False emits raw non-ASCII characters, so the files are opened as
# UTF-8 explicitly rather than with the locale's default encoding.
for split_name, split_items in (("train", train_data), ("test", test_data)):
    split_dir = f"outputs/splits/{split_name}"

    with open(f"{split_dir}/{split_name}_data.jsonl", "w", encoding="utf-8") as f:
        f.writelines(
            json.dumps(item, ensure_ascii=False) + "\n" for _, item in split_items
        )

    with open(f"{split_dir}/{split_name}_indices.json", "w", encoding="utf-8") as f:
        json.dump([idx for idx, _ in split_items], f)

# Summary of how many items landed in each bucket.
df_summary = pd.DataFrame({
    "Split": ["Train", "Test", "Valid", "Invalid"],
    "Count": [len(train_data), len(test_data), len(valid_data), len(invalid_indices)]
})

print(df_summary)

     Split  Count
0    Train   1861
1     Test    500
2    Valid   2361
3  Invalid    374
