In [24]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [25]:
import sys

sys.path.append("..")

In [26]:
from constrerl.evaluate import (
    eval_submission_6_1_NER,
    eval_submission_6_3_ternary_tag_RE,
    eval_submission_6_4_ternary_mention_RE,
    eval_submission_6_2_binary_tag_RE
)
from constrerl.erl_schema import convert_to_output, Article
import glob
from pathlib import Path
import json
import pandas as pd
from collections.abc import Callable, Awaitable


In [27]:
results_dir = "../data/results_dev"
ground_truth_file = "../data/annotations/dev/dev.json"

results_dir = Path(results_dir)
ner_result_dir = Path("../data/results_ner_dev")
ground_truth_file = Path(ground_truth_file)


with open(ground_truth_file) as f:
    ground_truth = json.load(f)

In [46]:
eval_results: list[dict] = []

import re


def scoring_to_dict(
    f: str | Path, eval_f: Callable[[str | Path, dict], tuple[float]]
) -> dict:
    precision, recall, f1, micro_precision, micro_recall, micro_f1 = eval_f(
        f, ground_truth
    )
    return {
        "$P$": precision,
        "$R$": recall,
        "$F_1$": f1,
        "$P_{micro}$": micro_precision,
        "$R_{micro}$": micro_recall,
        "$F_{1,micro}$": micro_f1,
    }


def scoring_to_df(
    eval_f: Callable[[str | Path, dict], tuple[float]], res_dir=results_dir
) -> pd.DataFrame:
    eval_results: list[dict] = []
    for result_file in res_dir.glob("*.json"):
        result_file = Path(result_file)
        eval_result = scoring_to_dict(result_file, eval_f)
        model_name = (
            " ".join(result_file.name.rstrip(".json").split("-")[:2])
            if "openai" not in result_file.name
            else " ".join(result_file.name.rstrip(".json").split("-")[:3])
        )
        # capitalize the first letter of the name
        model_name = " ".join(
            [
                word.capitalize() if i == 0 else word
                for i, word in enumerate(model_name.split(" "))
            ]
        )
        model_name = re.sub(r"(\d)b", "\\1B", model_name)
        splits = model_name.split(" ")
        if len(splits) > 2:
            model_name = splits[0] + " " + "-".join(splits[1:])

        result_dict = {
            "Model": model_name,
            "RAG": "\checkmark" if "rag" in result_file.name else "$\\times$",
            "LoRA": "\checkmark" if "lora" in result_file.name else "$\\times$",
            "Reorder": "\checkmark" if "reorder" in result_file.name else "$\\times$",
            "Low Tokens": "\checkmark" if "low-tokens" in result_file.name else "$\\times$",
            "Entity Labels": "\checkmark" if "entity-labels" in result_file.name else "$\\times$",
        }
        result_dict.update(eval_result)
        # result_dict.update({f"6_2_2_{k}": v for k, v in ternary_tag_score.items()})
        # result_dict.update({f"6_2_3_{k}": v for k, v in ternary_mention_score.items()})
        eval_results.append(result_dict)
    eval_df = pd.DataFrame(eval_results)
    if "$F_{1,micro}$" in eval_df.columns:
        eval_df = eval_df.sort_values("$F_{1,micro}$")
    return eval_df


task_6_1_1_df = scoring_to_df(eval_submission_6_1_NER, res_dir=ner_result_dir)
task_6_2_1_df = scoring_to_df(eval_submission_6_2_binary_tag_RE)
task_6_2_2_df = scoring_to_df(eval_submission_6_3_ternary_tag_RE)
task_6_2_3_df = scoring_to_df(eval_submission_6_4_ternary_mention_RE)

=== Removed 458 duplicated entities from predictions ===
=== Removed 111 overlapping entities ===
=== Removed 335 duplicated entities from predictions ===
=== Removed 88 overlapping entities ===
=== Removed 458 duplicated entities from predictions ===
=== Removed 111 overlapping entities ===
=== Removed 384 duplicated entities from predictions ===
=== Removed 72 overlapping entities ===
=== Removed 231 duplicated entities from predictions ===
=== Removed 82 overlapping entities ===
=== Removed 231 duplicated entities from predictions ===
=== Removed 82 overlapping entities ===
=== Removed 327 duplicated binary tag-based relations from predictions ===
=== Removed 1059 duplicated binary tag-based relations from predictions ===
=== Removed 1105 duplicated binary tag-based relations from predictions ===
=== Removed 447 duplicated binary tag-based relations from predictions ===
=== Removed 368 duplicated binary tag-based relations from predictions ===
=== Removed 304 duplicated binary tag-b

  "RAG": "\checkmark" if "rag" in result_file.name else "$\\times$",
  "LoRA": "\checkmark" if "lora" in result_file.name else "$\\times$",
  "Reorder": "\checkmark" if "reorder" in result_file.name else "$\\times$",
  "Low Tokens": "\checkmark" if "low-tokens" in result_file.name else "$\\times$",
  "Entity Labels": "\checkmark" if "entity-labels" in result_file.name else "$\\times$",


In [47]:
task_6_2_1_df.to_latex(
    "report/task_6_1_1.tex",
    float_format="%.2f",
    caption="Dev Set Result for Task 6.1.1 (NER) for various models and approaches.",
    label="tab:task:6_1_1:more",
    index=False,
)

In [48]:
task_6_2_1_df.to_latex(
    "report/task_6_2_1.tex",
    float_format="%.2f",
    caption="Further Dev Set Result for Task 6.2.1 for various models and approaches.",
    label="tab:task:6_2_1:more",
    index=False,
)

In [49]:
task_6_2_2_df.to_latex(
    "report/task_6_2_2.tex",
    float_format="%.2f",
    caption="Further Dev Set Result for Task 6.2.2 for various models and approaches.",
    label="tab:task:6_2_2:more",
    index=False,
)

In [50]:
task_6_2_3_df.to_latex(
    "report/task_6_2_3.tex",
    float_format="%.2f",
    caption="Further Dev Set Result for Task 6.2.3 for various models and approaches.",
    label="tab:task:6_2_3:more",
    index=False,
)