In [2]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell execution so edits to imported project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
from constrerl.evaluate import (
    eval_submission_6_3_ternary_tag_RE,
    eval_submission_6_4_ternary_mention_RE,
    eval_submission_6_2_binary_tag_RE,
)
from constrerl.erl_schema import (
    convert_to_output,
    Article,
    BinaryTagBasedRelation,
    TernaryTagBasedRelation,
    TernaryMentionBasedRelation,
)
from constrerl.annotator import load_train
import glob
from pathlib import Path
import json
import pandas as pd
from collections.abc import Callable, Awaitable


In [4]:
# Directory that is scanned for "*.json" result files; built as a Path in one
# step so the name never refers to a plain string.
results_dir = Path("data/results_dev")
# Destination directory for the de-duplicated files. Kept as a plain string;
# it is wrapped in Path at the point where the output file is opened.
deduplicate_dir = "data/dedupe_dev"

In [10]:
# NOTE(review): not used in this cell; presumably filled by a later
# evaluation cell -- confirm or remove.
eval_results: list[dict] = []


def dedupe_article(article: Article) -> Article:
    """Build a new Article holding de-duplicated relations at all three levels.

    Everything is derived from ``article.ternary_mention_based_relations``;
    the input's own binary/ternary tag-based lists are not read.

    * Ternary tag-based and mention-based relations keep the first occurrence
      of each (subject_label, predicate, object_label) triple.
    * Binary tag-based relations keep the first occurrence of each
      (subject_label, object_label) pair, so two triples that differ only in
      predicate no longer emit the same binary relation twice.

    Args:
        article: object exposing ``ternary_mention_based_relations``.

    Returns:
        A fresh Article; the input article is not modified.
    """
    cleaned = Article(
        binary_tag_based_relations=[],
        ternary_tag_based_relations=[],
        ternary_mention_based_relations=[],
    )
    seen_pairs: set[tuple] = set()
    seen_triples: set[tuple] = set()

    for rel in article.ternary_mention_based_relations:
        # NOTE(review): mention-level relations are de-duplicated on the label
        # triple only, so a second mention with different text spans but the
        # same labels is dropped. Kept as in the original -- confirm this is
        # intended for the mention-based output.
        triple = (rel.subject_label, rel.predicate, rel.object_label)
        if triple in seen_triples:
            continue
        seen_triples.add(triple)

        # Binary relations carry no predicate, so they need their own key.
        pair = (rel.subject_label, rel.object_label)
        if pair not in seen_pairs:
            seen_pairs.add(pair)
            cleaned.binary_tag_based_relations.append(
                BinaryTagBasedRelation(
                    subject_label=rel.subject_label,
                    object_label=rel.object_label,
                )
            )

        cleaned.ternary_tag_based_relations.append(
            TernaryTagBasedRelation(
                subject_label=rel.subject_label,
                predicate=rel.predicate,
                object_label=rel.object_label,
            )
        )
        cleaned.ternary_mention_based_relations.append(
            TernaryMentionBasedRelation(
                subject_text_span=rel.subject_text_span,
                subject_label=rel.subject_label,
                predicate=rel.predicate,
                object_text_span=rel.object_text_span,
                object_label=rel.object_label,
            )
        )

    return cleaned


# Create the destination up front so a fresh checkout does not fail on open().
output_dir = Path(deduplicate_dir)
output_dir.mkdir(parents=True, exist_ok=True)

# sorted() makes the processing order deterministic; glob order is unspecified.
for result_file in sorted(results_dir.glob("*.json")):
    data = load_train(result_file)
    cleaned_data: dict[str, Article] = {
        article_id: dedupe_article(article) for article_id, article in data.items()
    }

    # Write under the same file name as the input, inside the output directory.
    with open(output_dir / result_file.name, "w", encoding="utf-8") as f:
        json.dump(
            {
                article_id: article.model_dump()
                for article_id, article in cleaned_data.items()
            },
            f,
            indent=2,
        )