In [60]:
# Enable automatic re-import of edited modules before each cell execution.
# %reload_ext loads the extension when it is not loaded yet and reloads it
# otherwise, so re-running this cell no longer prints the
# "already loaded" notice that %load_ext produces.
%reload_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [61]:
import sys

# Put the parent directory on the import path (presumably so the project
# package imported below can be found -- confirm against the repo layout).
# Guarded so that re-running the cell does not pile up duplicate entries.
if ".." not in sys.path:
    sys.path.append("..")

In [None]:
from constrerl.evaluate import (
    eval_submission_6_1_NER,
    eval_submission_6_3_ternary_tag_RE,
    eval_submission_6_4_ternary_mention_RE,
    eval_submission_6_2_binary_tag_RE,
)
from constrerl.erl_schema import convert_to_output, Article
import glob
from pathlib import Path
import json
import pandas as pd
from collections.abc import Callable, Awaitable


In [284]:
# Input/output locations. Each name is bound exactly once, directly as a Path,
# instead of first holding a str and then being rebound to a Path.
results_dir = Path("../data/results_dev")
ner_result_dir = Path("../data/results_ner_dev")
ground_truth_file = Path("../data/annotations/dev/dev.json")
report_dir = Path("report")

# Suffix appended to the LaTeX table labels written by the export cells.
lbl_xtra = ":more"

# Start offset of the row slice exported to LaTeX (0 exports every row).
top_results = 0

# JSON is UTF-8 by specification; do not depend on the platform default encoding.
with open(ground_truth_file, encoding="utf-8") as f:
    ground_truth = json.load(f)

In [285]:
# NOTE(review): this module-level list does not appear to be read by any of the
# visible cells; scoring_to_df collects its rows in a local list of its own.
# Confirm nothing else depends on it before removing.
eval_results: list[dict] = []

import re


def scoring_to_dict(
    f: str | Path, eval_f: Callable[[str | Path, dict], tuple[float]]
) -> dict:
    precision, recall, f1, micro_precision, micro_recall, micro_f1 = eval_f(
        f, ground_truth
    )
    return {
        "$P$": precision,
        "$R$": recall,
        "$F_1$": f1,
        "$P_{micro}$": micro_precision,
        "$R_{micro}$": micro_recall,
        "$F_{1,micro}$": micro_f1,
    }


def scoring_to_df(
    eval_f: Callable[[str | Path, dict], tuple[float]], res_dir=results_dir
) -> pd.DataFrame:
    """Score every ``*.json`` file in ``res_dir`` and tabulate the results.

    The run configuration of each file is derived from its file name:

    * the model name comes from the first two ``-``-separated tokens of the
      name (three when the name contains ``openai``);
    * the substrings ``lora``, ``rag``, ``reorder``, ``low-tokens`` and
      ``entity-labels`` switch the corresponding check-mark columns on;
    * ``union`` / ``intersection`` mark a set-combination file and add a
      ``Set`` column.

    The directory name selects the mode: a path containing ``merge`` keeps
    only rows whose model name contains ``Hermes 3B``; a path containing
    neither ``old`` nor ``merge`` ("further mode") adds the ``Low Tokens`` /
    ``Entity Labels`` columns and keeps only rows with at least one of them set.

    Args:
        eval_f: Evaluator forwarded to ``scoring_to_dict`` for each file.
        res_dir: Directory to scan for ``*.json`` result files.

    Returns:
        A DataFrame indexed by the configuration columns and sorted by that
        index, with one metric column per score. An empty DataFrame is
        returned (after printing a notice) when no result file is found.
    """
    # Raw strings: "\c" is not a valid escape sequence, so the non-raw
    # spellings of these LaTeX macros trigger a SyntaxWarning.
    check, cross = r"\checkmark", r"$\times$"

    def mark(flag: bool) -> str:
        """Render a boolean as the LaTeX check / cross cell text."""
        return check if flag else cross

    res_dir = Path(res_dir)
    merge_mode = "merge" in str(res_dir)
    further_mode = not ("old" in str(res_dir) or merge_mode)
    print(f"Further mode: {further_mode}")

    eval_results: list[dict] = []
    # sorted(): glob order is filesystem dependent; keep row order reproducible.
    for result_file in sorted(res_dir.glob("*.json")):
        file_name = result_file.name
        is_union = "union" in file_name
        set_mode = is_union or "intersection" in file_name

        # removesuffix / an anchored regex drop the literal suffix only.
        # str.rstrip() strips a *set of characters* and could also eat
        # trailing letters that belong to the model name.
        model_name = file_name.removesuffix(".json")
        if set_mode:
            model_name = re.sub(r"[-_](?:intersection|union)$", "", model_name)

        n_tokens = 3 if "openai" in model_name else 2
        words = " ".join(model_name.split("-")[:n_tokens]).split(" ")
        # Capitalize the first word only; the remaining words are kept as-is.
        words[0] = words[0].capitalize()
        # Upper-case the parameter-count suffix, e.g. "3b" -> "3B".
        model_name = re.sub(r"(\d)b", r"\1B", " ".join(words))
        words = model_name.split(" ")
        if len(words) > 2:
            # Keep the first word separate and hyphenate everything after it.
            model_name = words[0] + " " + "-".join(words[1:])

        result_dict = {
            "Model": model_name,
            "LoRA": mark("lora" in file_name),
            "RAG": mark("rag" in file_name),
            "Reorder": mark("reorder" in file_name),
        }
        if further_mode:
            result_dict["Low Tokens"] = mark("low-tokens" in file_name)
            result_dict["Entity Labels"] = mark("entity-labels" in file_name)
        if set_mode:
            result_dict["Set"] = r"$\cup$" if is_union else r"$\cap$"
        result_dict.update(scoring_to_dict(result_file, eval_f))
        eval_results.append(result_dict)

    if not eval_results:
        # Without rows the frame has no columns and the filters below would
        # fail with a KeyError; report the situation instead.
        print(f"No result files (*.json) found in {res_dir}")
        return pd.DataFrame()

    eval_df = pd.DataFrame(eval_results)
    if further_mode:
        eval_df = eval_df[
            (eval_df["Low Tokens"] == check) | (eval_df["Entity Labels"] == check)
        ]
    if merge_mode:
        eval_df = eval_df[eval_df["Model"].str.contains("Hermes 3B")]

    index_cols = [
        col
        for col in (
            "Set",
            "Model",
            "RAG",
            "LoRA",
            "Reorder",
            "Low Tokens",
            "Entity Labels",
        )
        if col in eval_df.columns
    ]
    # Non-inplace: eval_df may be a filtered slice of the original frame.
    return eval_df.set_index(index_cols).sort_index()


# NER predictions live in their own directory, so that call overrides res_dir.
task_6_1_1_df = scoring_to_df(eval_submission_6_1_NER, res_dir=ner_result_dir)

# The three relation-extraction evaluators all read the default results
# directory; they are evaluated in the order listed here.
task_6_2_1_df, task_6_2_2_df, task_6_2_3_df = (
    scoring_to_df(evaluator)
    for evaluator in (
        eval_submission_6_2_binary_tag_RE,
        eval_submission_6_3_ternary_tag_RE,
        eval_submission_6_4_ternary_mention_RE,
    )
)

Further mode: True
=== Removed 458 duplicated entities from predictions ===
=== Removed 111 overlapping entities ===
=== Removed 335 duplicated entities from predictions ===
=== Removed 86 overlapping entities ===
=== Removed 89 duplicated entities from predictions ===
=== Removed 35 overlapping entities ===
=== Removed 458 duplicated entities from predictions ===
=== Removed 111 overlapping entities ===
=== Removed 399 duplicated entities from predictions ===
=== Removed 61 overlapping entities ===
=== Removed 231 duplicated entities from predictions ===
=== Removed 82 overlapping entities ===
=== Removed 101 duplicated entities from predictions ===
=== Removed 48 overlapping entities ===
=== Removed 85 duplicated entities from predictions ===
=== Removed 64 overlapping entities ===
=== Removed 231 duplicated entities from predictions ===
=== Removed 82 overlapping entities ===
=== Removed 92 duplicated entities from predictions ===
=== Removed 61 overlapping entities ===
=== Removed 

  set_op = "$\cup$" if "union" in result_file.name else "$\cap$"
  set_op = "$\cup$" if "union" in result_file.name else "$\cap$"
  "LoRA": "\checkmark" if "lora" in result_file.name else "$\\times$",
  "RAG": "\checkmark" if "rag" in result_file.name else "$\\times$",
  "Reorder": "\checkmark" if "reorder" in result_file.name else "$\\times$",
  result_dict["Low Tokens"] = "\checkmark" if low_tokens else "$\\times$"
  "\checkmark" if entity_labels else "$\\times$"
  (eval_df["Low Tokens"] == "\checkmark")
  | (eval_df["Entity Labels"] == "\checkmark")


In [290]:
# Export the NER scores (rows from position `top_results` onward) as a LaTeX
# table, then show the complete frame as the cell output.
task_6_1_1_df[top_results:].to_latex(
    report_dir.joinpath("task_6_1_1.tex"),
    label=f"tab:task:6_1_1{lbl_xtra}",
    caption="Dev Set Result for Task 6.1.1 (NER) for various models and approaches.",
    float_format="%.2f",
)
task_6_1_1_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,$P$,$R$,$F_1$,$P_{micro}$,$R_{micro}$,"$F_{1,micro}$"
Model,RAG,LoRA,Reorder,Low Tokens,Entity Labels,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Hermes 3B,$\times$,$\times$,$\times$,$\times$,\checkmark,0.048768,0.01877,0.025532,0.055046,0.021486,0.030908
Hermes 3B,$\times$,$\times$,$\times$,\checkmark,$\times$,0.015713,0.017106,0.013724,0.037415,0.009848,0.015592
Hermes 3B,$\times$,$\times$,$\times$,\checkmark,\checkmark,0.045376,0.01666,0.022781,0.057221,0.0188,0.028302
Hermes 3B,$\times$,$\times$,\checkmark,$\times$,\checkmark,0.048768,0.01877,0.025532,0.055046,0.021486,0.030908
Hermes 3B,$\times$,$\times$,\checkmark,\checkmark,\checkmark,0.045376,0.01666,0.022781,0.057221,0.0188,0.028302
Hermes 3B,\checkmark,$\times$,$\times$,$\times$,\checkmark,0.212756,0.081851,0.112518,0.25,0.119964,0.162129
Hermes 3B,\checkmark,$\times$,$\times$,\checkmark,$\times$,0.318396,0.066426,0.099484,0.2925,0.104745,0.154252
Hermes 3B,\checkmark,$\times$,$\times$,\checkmark,\checkmark,0.252185,0.054985,0.083981,0.30897,0.083259,0.131171
Hermes 3B,\checkmark,$\times$,\checkmark,$\times$,\checkmark,0.212756,0.081851,0.112518,0.25,0.119964,0.162129
Hermes 3B,\checkmark,$\times$,\checkmark,\checkmark,\checkmark,0.252185,0.054985,0.083981,0.30897,0.083259,0.131171


In [287]:
# Export the Task 6.2.1 scores (rows from position `top_results` onward) as a
# LaTeX table, then show the complete frame as the cell output.
task_6_2_1_df[top_results:].to_latex(
    report_dir.joinpath("task_6_2_1.tex"),
    label=f"tab:task:6_2_1{lbl_xtra}",
    caption="Dev Set Result for Task 6.2.1 for various models and approaches.",
    float_format="%.2f",
)
task_6_2_1_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,$P$,$R$,$F_1$,$P_{micro}$,$R_{micro}$,"$F_{1,micro}$"
Model,RAG,LoRA,Reorder,Low Tokens,Entity Labels,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Hermes 3B,$\times$,$\times$,$\times$,$\times$,\checkmark,0.064286,0.086162,0.057367,0.153846,0.081818,0.106825
Hermes 3B,$\times$,$\times$,$\times$,\checkmark,$\times$,0.044444,0.010582,0.016667,0.1,0.009091,0.016667
Hermes 3B,$\times$,$\times$,$\times$,\checkmark,\checkmark,0.066049,0.069638,0.047191,0.138614,0.063636,0.087227
Hermes 3B,$\times$,$\times$,\checkmark,$\times$,\checkmark,0.064286,0.086162,0.057367,0.153846,0.081818,0.106825
Hermes 3B,$\times$,$\times$,\checkmark,\checkmark,\checkmark,0.066049,0.069638,0.047191,0.138614,0.063636,0.087227
Hermes 3B,\checkmark,$\times$,$\times$,\checkmark,$\times$,0.240278,0.091983,0.124187,0.654545,0.163636,0.261818
Hermes 3B,\checkmark,$\times$,$\times$,\checkmark,\checkmark,0.158418,0.072784,0.088042,0.465753,0.154545,0.232082
Hermes 3B,\checkmark,$\times$,\checkmark,\checkmark,\checkmark,0.158418,0.072784,0.088042,0.465753,0.154545,0.232082


In [288]:
# Export the Task 6.2.2 scores (rows from position `top_results` onward) as a
# LaTeX table, then show the complete frame as the cell output.
# NOTE(review): this caption starts with "Further", unlike the other export
# cells -- confirm whether the wording is meant to differ.
task_6_2_2_df[top_results:].to_latex(
    report_dir.joinpath("task_6_2_2.tex"),
    label=f"tab:task:6_2_2{lbl_xtra}",
    caption="Further Dev Set Result for Task 6.2.2 for various models and approaches.",
    float_format="%.2f",
)
task_6_2_2_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,$P$,$R$,$F_1$,$P_{micro}$,$R_{micro}$,"$F_{1,micro}$"
Model,RAG,LoRA,Reorder,Low Tokens,Entity Labels,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Hermes 3B,$\times$,$\times$,$\times$,$\times$,\checkmark,0.073313,0.084183,0.057211,0.140351,0.069565,0.093023
Hermes 3B,$\times$,$\times$,$\times$,\checkmark,$\times$,0.041667,0.009921,0.015625,0.105263,0.008696,0.016064
Hermes 3B,$\times$,$\times$,$\times$,\checkmark,\checkmark,0.074074,0.067895,0.046027,0.12,0.052174,0.072727
Hermes 3B,$\times$,$\times$,\checkmark,$\times$,\checkmark,0.073313,0.084183,0.057211,0.140351,0.069565,0.093023
Hermes 3B,$\times$,$\times$,\checkmark,\checkmark,\checkmark,0.074074,0.067895,0.046027,0.12,0.052174,0.072727
Hermes 3B,\checkmark,$\times$,$\times$,\checkmark,$\times$,0.214844,0.082479,0.112799,0.636364,0.152174,0.245614
Hermes 3B,\checkmark,$\times$,$\times$,\checkmark,\checkmark,0.148516,0.069416,0.08423,0.465753,0.147826,0.224422
Hermes 3B,\checkmark,$\times$,\checkmark,\checkmark,\checkmark,0.148516,0.069416,0.08423,0.465753,0.147826,0.224422


In [289]:
# Export the Task 6.2.3 scores (rows from position `top_results` onward) as a
# LaTeX table, then show the complete frame as the cell output.
task_6_2_3_df[top_results:].to_latex(
    report_dir.joinpath("task_6_2_3.tex"),
    label=f"tab:task:6_2_3{lbl_xtra}",
    caption="Dev Set Result for Task 6.2.3 for various models and approaches.",
    float_format="%.2f",
)
task_6_2_3_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,$P$,$R$,$F_1$,$P_{micro}$,$R_{micro}$,"$F_{1,micro}$"
Model,RAG,LoRA,Reorder,Low Tokens,Entity Labels,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Hermes 3B,$\times$,$\times$,$\times$,$\times$,\checkmark,0.000631,0.003472,0.001068,0.002924,0.001786,0.002217
Hermes 3B,$\times$,$\times$,$\times$,\checkmark,$\times$,0.0,0.0,0.0,0.0,0.0,0.0
Hermes 3B,$\times$,$\times$,$\times$,\checkmark,\checkmark,0.001736,0.003472,0.002315,0.004525,0.001786,0.002561
Hermes 3B,$\times$,$\times$,\checkmark,$\times$,\checkmark,0.000631,0.003472,0.001068,0.002924,0.001786,0.002217
Hermes 3B,$\times$,$\times$,\checkmark,\checkmark,\checkmark,0.001736,0.003472,0.002315,0.004525,0.001786,0.002561
Hermes 3B,\checkmark,$\times$,$\times$,\checkmark,$\times$,0.028837,0.00806,0.010926,0.062802,0.023214,0.033898
Hermes 3B,\checkmark,$\times$,$\times$,\checkmark,\checkmark,0.031675,0.011175,0.015035,0.084071,0.033929,0.048346
Hermes 3B,\checkmark,$\times$,\checkmark,\checkmark,\checkmark,0.031675,0.011175,0.015035,0.084071,0.033929,0.048346
