# Label Review
Compare original vs reviewed labels; show diffs and overlapping spans.

In [None]:
import json
from collections import namedtuple

# === File paths ===
# Both paths are relative, so they resolve against the current working
# directory of the process running this script/notebook.
# path_original: JSON file loaded below as the "original" label set.
path_original = "../../../../data/original/golden_dataset_with_spans_norm.json"
# path_reviewed: JSON file loaded below as the "reviewed" label set.
path_reviewed = "../../../../data/original/ground_truth.json"

# === Helper to detect overlapping spans in one file ===
def find_overlaps(labels):
    """Return every pair of spans in *labels* that overlap.

    Each label is a mapping with comparable ``"start"`` and ``"end"``
    values. Two spans overlap when the earlier-starting one ends strictly
    after the other one starts, so spans that merely touch
    (``a["end"] == b["start"]``) are not reported.

    Unlike a check restricted to neighbouring spans, this also reports a
    long span that covers several later spans (e.g. ``[0, 10]`` against
    both ``[2, 3]`` and ``[4, 5]``).

    Args:
        labels: Iterable of label mappings; it is not modified.

    Returns:
        A list of ``(a, b)`` tuples of the original label objects, where
        ``a`` sorts at or before ``b`` by ``"start"``. Pairs are ordered by
        ``a`` first, then by ``b``.
    """
    overlaps = []
    # sort by start index
    sorted_lbls = sorted(labels, key=lambda x: x["start"])
    count = len(sorted_lbls)
    for i, a in enumerate(sorted_lbls):
        for j in range(i + 1, count):
            b = sorted_lbls[j]
            # Starts are non-decreasing, so once a span begins at or after
            # a's end, no later span can overlap a either.
            if b["start"] >= a["end"]:
                break
            overlaps.append((a, b))
    return overlaps

# === Load both datasets ===
def _read_json(path):
    """Parse the UTF-8 encoded JSON file at *path* and return its content."""
    with open(path, encoding="utf-8") as handle:
        return json.load(handle)


# The original dataset is read first, then the reviewed one.
orig_data = _read_json(path_original)
rev_data = _read_json(path_reviewed)

# === Compare label sets per file ===
# One record per file whose label list differs between the two datasets.
Diff = namedtuple("Diff", ["file", "orig_labels", "rev_labels"])

# Pair entries by file name instead of list position: a positional zip would
# silently truncate when the datasets differ in length and would compare
# unrelated files when their order differs.
rev_by_file = {entry["file"]: entry for entry in rev_data}
orig_files = {entry["file"] for entry in orig_data}

diffs = []
unmatched_files = []
for o in orig_data:
    r = rev_by_file.get(o["file"])
    if r is None:
        # Present only in the original dataset; nothing to compare against.
        unmatched_files.append(o["file"])
    elif o["labels"] != r["labels"]:
        diffs.append(Diff(o["file"], o["labels"], r["labels"]))

# Files that exist only in the reviewed dataset cannot be diffed either.
unmatched_files.extend(name for name in rev_by_file if name not in orig_files)

if unmatched_files:
    # Surface the mismatch instead of dropping these files without notice.
    print(
        f"WARNING: {len(unmatched_files)} file(s) present in only one dataset "
        f"were not compared: {unmatched_files}"
    )

# === Count overlapping spans in each dataset ===
# Map each file name to the list of overlapping span pairs found in it.
orig_overlaps_per_file = {
    entry["file"]: find_overlaps(entry["labels"])
    for entry in orig_data
}
rev_overlaps_per_file = {
    entry["file"]: find_overlaps(entry["labels"])
    for entry in rev_data
}

# Summary numbers
num_changed_txts = len(diffs)
# Total number of overlapping pairs across all files.
total_orig_overlaps = sum(map(len, orig_overlaps_per_file.values()))
total_rev_overlaps = sum(map(len, rev_overlaps_per_file.values()))
# Number of files that contain at least one overlapping pair.
files_with_orig_overlaps = len(
    [pairs for pairs in orig_overlaps_per_file.values() if pairs]
)
files_with_rev_overlaps = len(
    [pairs for pairs in rev_overlaps_per_file.values() if pairs]
)

# === 1. How many txts are now different? ===
print(f"1) Number of text files with changed labels: {num_changed_txts}\n")

# === 2. List those txts with labels before and after review ===
print("2) Changed files and their label counts:")
# Each Diff record unpacks into (file name, original labels, reviewed labels).
for name, before, after in diffs:
    print(f" - {name}: before={len(before)} labels, after={len(after)} labels")
print()

# === 3. Count of overlapping spans in both versions ===
# A single print call with a newline separator emits the same three lines.
print(
    "3) Overlapping spans summary:",
    f"   • Original JSON: {total_orig_overlaps} overlaps across {files_with_orig_overlaps} files",
    f"   • Reviewed JSON: {total_rev_overlaps} overlaps across {files_with_rev_overlaps} files\n",
    sep="\n",
)

# === 4. Details on changes + overlaps in reviewed ===
print("4) Detail per changed file:")
for d in diffs:
    print(f"\n-- {d.file} --")
    # Reduce every label to a hashable (start, end, label) triple so that set
    # difference reveals which spans were added or removed by the review.
    orig_spans = {(lbl["start"], lbl["end"], lbl["label"]) for lbl in d.orig_labels}
    rev_spans = {(lbl["start"], lbl["end"], lbl["label"]) for lbl in d.rev_labels}
    added = rev_spans - orig_spans
    removed = orig_spans - rev_spans

    if added:
        print("   Added labels/spans:")
        for start, end, name in sorted(added):
            print(f"     + [{start},{end}] “{name}”")
    if removed:
        print("   Removed labels/spans:")
        for start, end, name in sorted(removed):
            print(f"     - [{start},{end}] “{name}”")
    if not added and not removed:
        # The triples already contain the label name, so a renamed label would
        # show up as added/removed above. Reaching this branch means the label
        # lists differ only in order, duplicate entries, or fields outside the
        # (start, end, label) triple.
        print("   (Same spans and label names; only ordering, duplicates, or other fields differ.)")

    # show overlapping spans in the reviewed version
    overlaps = rev_overlaps_per_file[d.file]
    if overlaps:
        print("   Overlaps in reviewed file:")
        for first, second in overlaps:
            print(f"     • {first} overlaps with {second}")
    else:
        print("   No overlaps in reviewed file.")