# Doccano JSONL to JSON
Combine original filenames with Doccano annotations into normalized ground truth JSON.

In [None]:
import json

# Convert a Doccano JSONL export into a single JSON list of
# {"file", "text", "labels"} records. Filenames come from the original JSON;
# text and label spans come from the Doccano export. The two inputs are
# paired purely by position.

# === Set file paths ===
path_doccano_jsonl = "../../../../data/original/doccano/doccano_groundtruth.jsonl"
path_original_json = "../../../../data/original/golden_dataset_with_spans_norm.json"
output_path = "../../../../data/original/ground_truth.json"

# === Load original JSON (with correct filenames)
with open(path_original_json, "r", encoding="utf-8") as f:
    original_data = json.load(f)

# === Load Doccano JSONL file (one JSON object per line)
# Whitespace-only lines (e.g. a trailing blank line) are skipped; passing
# them to json.loads would raise a JSONDecodeError.
with open(path_doccano_jsonl, "r", encoding="utf-8") as f:
    doccano_data = [json.loads(line) for line in f if line.strip()]

# === Match by order and copy filename into new format
# zip() stops at the shorter input, so a count mismatch would silently drop
# entries. Since matching is positional, a mismatch also means filenames may
# be attached to the wrong text, so fail loudly instead of writing bad output.
if len(original_data) != len(doccano_data):
    raise ValueError(
        f"Entry count mismatch: {len(original_data)} original entries vs "
        f"{len(doccano_data)} Doccano entries; cannot match by order."
    )

converted = [
    {
        "file": original_entry["file"],  # Use filename from original
        "text": doc_entry["text"],
        "labels": [
            {"start": start, "end": end, "label": label}
            # `or []` also covers an explicit null "label" value, which
            # .get("label", []) would return as None.
            for start, end, label in (doc_entry.get("label") or [])
        ],
    }
    for original_entry, doc_entry in zip(original_data, doccano_data)
]

# === Write to new JSON file
with open(output_path, "w", encoding="utf-8") as f:
    # ensure_ascii=False keeps non-ASCII characters readable in the output.
    json.dump(converted, f, ensure_ascii=False, indent=2)

print(f"✅ Conversion complete. Output saved to: {output_path}")