In [1]:
import re
import json
from collections import defaultdict
from typing import List, Dict


# -----------------------------
# Simple word tokenizer w/ offsets
# -----------------------------
def tokenize_with_offsets(text: str):
    """Split ``text`` on whitespace and record where each token sits.

    Args:
        text: The string to tokenize.

    Returns:
        A ``(tokens, offsets)`` pair of parallel lists. ``tokens`` holds every
        maximal run of non-whitespace characters, in order of appearance;
        ``offsets`` holds the matching ``(start, end)`` character indices into
        ``text``, with ``end`` exclusive.
    """
    # Materialize the matches once so both lists are built from the same scan.
    hits = list(re.finditer(r"\S+", text))
    tokens = [hit.group() for hit in hits]
    offsets = [hit.span() for hit in hits]
    return tokens, offsets


# -----------------------------
# Convert spans to BIO
# -----------------------------
def convert_to_ner_format(span_data, label):
    """Turn character-level span annotations into token-level BIO tags.

    Args:
        span_data: Iterable of dict-like records, each providing a
            ``"document"`` string plus ``"start"`` and ``"end"`` character
            offsets of one annotated span inside that document.
        label: Name of the entity type. Spaces are replaced with underscores
            before it is used in the ``B-``/``I-`` tags.

    Returns:
        A list with one ``{"tokens": [...], "labels": [...]}`` dict per
        distinct document text, in the order the documents were first seen.
        Every token that overlaps a span's ``[start, end)`` range is tagged:
        the first such token gets ``B-<label>``, the following ones
        ``I-<label>``; all other tokens stay ``"O"``. Spans are applied in
        input order, so a later span overwrites tags set by an earlier one.
    """
    # Collect all annotations that refer to the same document text.
    by_document = defaultdict(list)
    for record in span_data:
        by_document[record["document"]].append(record)

    def _tag_document(text, annotations):
        # Produce the tokens and BIO tags for a single document.
        words, bounds = tokenize_with_offsets(text)
        tags = ["O" for _ in words]

        for annotation in annotations:
            lo = annotation["start"]
            hi = annotation["end"]
            tag_suffix = label.replace(" ", "_")

            started = False
            for idx, (word_start, word_end) in enumerate(bounds):
                if word_end <= lo:
                    # Token lies entirely before the span.
                    continue
                if word_start >= hi:
                    # Offsets are in ascending order, so nothing later can overlap.
                    break

                prefix = "I" if started else "B"
                tags[idx] = f"{prefix}-{tag_suffix}"
                started = True

        return {
            "tokens": words,
            "labels": tags
        }

    return [
        _tag_document(text, annotations)
        for text, annotations in by_document.items()
    ]

In [2]:
import random 
import os 

# Location of the synthetic span annotations, relative to the working directory.
synth_path = "../synthetic_sampling/synthetic_samples.json"
# Maps each category key in the JSON file to the directory / file stem used for its output.
categories = {'Unsupported claim': 'unsupported_claim', 'Lacks synthesis': "lacks_synthesis", 'Coherence': "coherence", 'Format': "format"}

TRAIN_RATIO = 0.8
DEV_RATIO = 0.1
EVAL_RATIO = 0.1

# The eval split is "whatever is left" after train and dev are cut, so it only
# matches EVAL_RATIO when the three ratios cover the whole dataset.
# A tolerance is used because 0.8 + 0.1 + 0.1 is not exactly 1.0 in floating point.
assert abs(TRAIN_RATIO + DEV_RATIO + EVAL_RATIO - 1.0) < 1e-9, "split ratios must sum to 1"

# Fixed seed so the shuffles (and therefore the splits) are reproducible.
random.seed(42)

with open(synth_path, "r", encoding="utf-8") as f:
    span_data = json.load(f)

for category, out_dir in categories.items():
    print(f"Processing {category}")

    ner_data = convert_to_ner_format(span_data[category], category)

    # Shuffle before splitting
    random.shuffle(ner_data)

    # int() truncates, so any rounding remainder ends up in the eval split.
    n = len(ner_data)
    n_train = int(n * TRAIN_RATIO)
    n_dev = int(n * DEV_RATIO)

    train_split = ner_data[:n_train]
    dev_split = ner_data[n_train:n_train + n_dev]
    eval_split = ner_data[n_train + n_dev:]

    os.makedirs(out_dir, exist_ok=True)

    # Write each split to <out_dir>/<out_dir>_ner_<split>.json
    for split_name, split_examples in (
        ("train", train_split),
        ("dev", dev_split),
        ("eval", eval_split),
    ):
        with open(f"{out_dir}/{out_dir}_ner_{split_name}.json", "w", encoding="utf-8") as f:
            json.dump(split_examples, f, indent=2, ensure_ascii=False)

# Report where the files actually went (one directory per category).
print(
    f"Saved NER train/dev/eval splits for {len(categories)} categories to: "
    f"{', '.join(categories.values())}"
)

Processing Unsupported claim
Processing Lacks synthesis
Processing Coherence
Processing Format
Saved NER-formatted data to ner_data.json
