In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys

sys.path.append("..")

In [3]:
from constrerl.evaluate import (
    eval_submission_6_1_NER,
    eval_submission_6_3_ternary_tag_RE,
    eval_submission_6_4_ternary_mention_RE,
    eval_submission_6_2_binary_tag_RE,
)
from constrerl.erl_schema import convert_to_output, Article
import glob
from pathlib import Path
import json
import pandas as pd
from collections.abc import Callable, Awaitable


In [4]:
results_dir = "../data/merge_dev"
ground_truth_file = "../data/annotations/dev/dev.json"
report_dir = Path("report_ontug")
results_dir = Path(results_dir)
ner_result_dir = Path("../data/results_ner_dev")
ground_truth_file = Path(ground_truth_file)
lbl_xtra = ":ontug"


top_results = 0

k_limits = [None]

with open(ground_truth_file) as f:
    ground_truth = json.load(f)

In [20]:
eval_results: list[dict] = []

import re


def scoring_to_dict(
    predictions: dict, eval_f: Callable[[dict, dict], tuple[float]], gt=ground_truth
) -> dict:
    precision, recall, f1, micro_precision, micro_recall, micro_f1 = eval_f(
        predictions, gt
    )
    return {
        "$P$": precision,
        "$R$": recall,
        "$F_1$": f1,
        "$P_{micro}$": micro_precision,
        "$R_{micro}$": micro_recall,
        "$F_{1,micro}$": micro_f1,
    }


def scoring_to_df(
    eval_f: Callable[[str | Path, dict], tuple[float]],
    key: str,
    res_dir=results_dir,
    k_limits_set=None,
    cap_gt=False,
) -> pd.DataFrame:
    eval_results: list[dict] = []
    merge_mode = "merge" in str(res_dir)
    further_mode = not (("old" in str(res_dir)) or ("merge" in str(res_dir)))
    print(f"Further mode: {further_mode}")
    files = res_dir.glob("*.json")
    if not res_dir.is_dir():
        files = [res_dir]
    for result_file in files:
        set_mode = False
        result_file = Path(result_file)
        try:
            with open(result_file, "r", encoding="utf-8") as file:
                predictions: dict = json.load(file)
        except OSError:
            raise OSError(f"Error in opening the specified json file: {result_file}")

        if k_limits_set is None or len(k_limits_set) == 0:
            k_list = [None]
        else:
            k_list = k_limits_set
        for k in k_list:
            predictions_limited = predictions.copy()
            predictions_limited = {
                id: {key: predictions_limited[id][key][:k]}
                for id in predictions_limited.keys()
            }
            k_gt: dict = ground_truth.copy()
            if cap_gt:
                k_gt = {id: {key: k_gt[id][key][:k]} for id in k_gt.keys()}
            if "union" in result_file.name or "intersection" in result_file.name:
                set_mode = True
            set_op = "$\cup$" if "union" in result_file.name else "$\cap$"
            eval_result = scoring_to_dict(predictions_limited, eval_f, gt=k_gt)
            model_name = result_file.name.rstrip(".json")
            if set_mode:
                model_name = model_name.rstrip("_intersection").rstrip("_union")
            model_name = (
                " ".join(model_name.rstrip(".json").split("-")[:2])
                if "openai" not in model_name
                else " ".join(model_name.rstrip(".json").split("-")[:3])
            )
            # capitalize the first letter of the name
            model_name = " ".join(
                [
                    word.capitalize() if i == 0 else word
                    for i, word in enumerate(model_name.split(" "))
                ]
            )
            model_name = re.sub(r"(\d)b", "\\1B", model_name)
            splits = model_name.split(" ")
            if len(splits) > 2:
                model_name = splits[0] + " " + "-".join(splits[1:])
            low_tokens = "low-tokens" in result_file.name
            high_tokens = "high-tokens" in result_file.name
            entity_labels = "entity-labels" in result_file.name
            result_dict = {
                "Model": model_name,
                "RAG": "\checkmark" if "rag" in result_file.name else "$\\times$",
                "Reorder": "\checkmark"
                if "reorder" in result_file.name
                else "$\\times$",
                # "Low Tokens": "\checkmark"
                # if "low-tokens" in result_file.name
                # else "$\\times$",
                # "Entity Labels": "\checkmark"
                # if "entity-labels" in result_file.name
                # else "$\\times$",
            }
            if further_mode:
                result_dict["LoRA+"] = (
                    "\checkmark" if "long" in result_file.name else "$\\times$"
                )
                result_dict["Low $t$"] = "\checkmark" if low_tokens else "$\\times$"
                result_dict["High $t$"] = "\checkmark" if high_tokens else "$\\times$"
                result_dict["Entities"] = "\checkmark" if entity_labels else "$\\times$"
                if k_limits_set is not None and len(k_limits_set) > 0:
                    result_dict["$k$"] = k if k is not None else "$\\times$"
            else:
                result_dict["LoRA"] = (
                    "\checkmark" if "lora" in result_file.name else "$\\times$"
                )
            if set_mode:
                result_dict["Set"] = set_op
            result_dict.update(eval_result)
            # result_dict.update({f"6_2_2_{k}": v for k, v in ternary_tag_score.items()})
            # result_dict.update({f"6_2_3_{k}": v for k, v in ternary_mention_score.items()})
            eval_results.append(result_dict)
    eval_df = pd.DataFrame(eval_results)
    if further_mode and "$k$" not in eval_df.columns:
        eval_df = eval_df[
            (eval_df["Low $t$"] == "\checkmark")
            | (eval_df["Entities"] == "\checkmark")
            | (eval_df["LoRA+"] == "\checkmark")
        ]
    if merge_mode:
        eval_df = eval_df[eval_df["Model"].str.contains("Hermes 3B")]
    valid_cols = [
        c
        for c in [
            "Set",
            "Model",
            "RAG",
            "LoRA",
            "LoRA+",
            "Reorder",
            "Low $t$",
            "High $t$",
            "Entities",
            "$k$",
        ]
        if c in eval_df.columns
    ]
    eval_df.set_index(valid_cols, inplace=True)
    eval_df = eval_df.sort_index()
    # if "$F_{1,micro}$" in eval_df.columns:
    #     eval_df = eval_df.sort_values("$F_{1,micro}$")
    return eval_df


task_6_1_1_df = scoring_to_df(
    eval_submission_6_1_NER, "entities", res_dir=ner_result_dir
)
task_6_2_1_df = scoring_to_df(
    eval_submission_6_2_binary_tag_RE, "binary_tag_based_relations"
)
task_6_2_2_df = scoring_to_df(
    eval_submission_6_3_ternary_tag_RE, "ternary_tag_based_relations"
)
task_6_2_3_df = scoring_to_df(
    eval_submission_6_4_ternary_mention_RE, "ternary_mention_based_relations"
)

Further mode: True
=== Removed 458 duplicated entities from predictions ===
=== Removed 111 overlapping entities ===
=== Removed 335 duplicated entities from predictions ===
=== Removed 86 overlapping entities ===
=== Removed 89 duplicated entities from predictions ===
=== Removed 35 overlapping entities ===
=== Removed 458 duplicated entities from predictions ===
=== Removed 111 overlapping entities ===
=== Removed 399 duplicated entities from predictions ===
=== Removed 61 overlapping entities ===
=== Removed 231 duplicated entities from predictions ===
=== Removed 82 overlapping entities ===
=== Removed 101 duplicated entities from predictions ===
=== Removed 48 overlapping entities ===
=== Removed 85 duplicated entities from predictions ===
=== Removed 64 overlapping entities ===
=== Removed 231 duplicated entities from predictions ===
=== Removed 82 overlapping entities ===
=== Removed 92 duplicated entities from predictions ===
=== Removed 61 overlapping entities ===
=== Removed 

  set_op = "$\cup$" if "union" in result_file.name else "$\cap$"
  set_op = "$\cup$" if "union" in result_file.name else "$\cap$"
  "RAG": "\checkmark" if "rag" in result_file.name else "$\\times$",
  "Reorder": "\checkmark"
  "\checkmark" if "long" in result_file.name else "$\\times$"
  result_dict["Low $t$"] = "\checkmark" if low_tokens else "$\\times$"
  result_dict["High $t$"] = "\checkmark" if high_tokens else "$\\times$"
  result_dict["Entities"] = "\checkmark" if entity_labels else "$\\times$"
  "\checkmark" if "lora" in result_file.name else "$\\times$"
  (eval_df["Low $t$"] == "\checkmark")
  | (eval_df["Entities"] == "\checkmark")
  | (eval_df["LoRA+"] == "\checkmark")


In [15]:
task_6_1_1_df[top_results:].style.highlight_max(axis=0, props="textbf:--rwrap;").format(precision=2).to_latex(
    report_dir / "task_6_1_1.tex",
    # float_format="%.2f",
    caption="Dev Set Result for Task 6.1.1 (NER) for various models and approaches.",
    label=f"tab:task:6_1_1{lbl_xtra}",
    clines="all;data",
    hrules=True
)
task_6_1_1_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,$P$,$R$,$F_1$,$P_{micro}$,$R_{micro}$,"$F_{1,micro}$"
Model,RAG,LoRA+,Reorder,Low $t$,High $t$,Entities,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Hermes 3B,$\times$,$\times$,$\times$,$\times$,$\times$,\checkmark,0.048768,0.01877,0.025532,0.055046,0.021486,0.030908
Hermes 3B,$\times$,$\times$,$\times$,\checkmark,$\times$,$\times$,0.015713,0.017106,0.013724,0.037415,0.009848,0.015592
Hermes 3B,$\times$,$\times$,$\times$,\checkmark,$\times$,\checkmark,0.045376,0.01666,0.022781,0.057221,0.0188,0.028302
Hermes 3B,$\times$,$\times$,\checkmark,$\times$,$\times$,\checkmark,0.048768,0.01877,0.025532,0.055046,0.021486,0.030908
Hermes 3B,$\times$,$\times$,\checkmark,\checkmark,$\times$,\checkmark,0.045376,0.01666,0.022781,0.057221,0.0188,0.028302
Hermes 3B,\checkmark,$\times$,$\times$,$\times$,$\times$,\checkmark,0.212756,0.081851,0.112518,0.25,0.119964,0.162129
Hermes 3B,\checkmark,$\times$,$\times$,\checkmark,$\times$,$\times$,0.318396,0.066426,0.099484,0.2925,0.104745,0.154252
Hermes 3B,\checkmark,$\times$,$\times$,\checkmark,$\times$,\checkmark,0.252185,0.054985,0.083981,0.30897,0.083259,0.131171
Hermes 3B,\checkmark,$\times$,\checkmark,$\times$,$\times$,\checkmark,0.212756,0.081851,0.112518,0.25,0.119964,0.162129
Hermes 3B,\checkmark,$\times$,\checkmark,\checkmark,$\times$,\checkmark,0.252185,0.054985,0.083981,0.30897,0.083259,0.131171


In [16]:
task_6_2_1_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,$P$,$R$,$F_1$,$P_{micro}$,$R_{micro}$,"$F_{1,micro}$"
Set,Model,RAG,LoRA,Reorder,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
$\cap$,Hermes 3B,$\times$,$\times$,$\times$,0.033333,0.011111,0.016667,0.666667,0.009091,0.017937
$\cap$,Hermes 3B,$\times$,$\times$,\checkmark,0.011111,0.003704,0.005556,0.5,0.004545,0.009009
$\cap$,Hermes 3B,$\times$,\checkmark,$\times$,0.177778,0.040509,0.062239,0.961538,0.113636,0.203252
$\cap$,Hermes 3B,$\times$,\checkmark,\checkmark,0.177778,0.040509,0.062239,0.961538,0.113636,0.203252
$\cap$,Hermes 3B,\checkmark,$\times$,$\times$,0.251634,0.07104,0.104302,0.846154,0.15,0.254826
$\cap$,Hermes 3B,\checkmark,$\times$,\checkmark,0.133951,0.050731,0.067101,0.692308,0.122727,0.208494
$\cap$,Hermes 3B,\checkmark,\checkmark,$\times$,0.173016,0.040553,0.061778,0.888889,0.109091,0.194332
$\cap$,Hermes 3B,\checkmark,\checkmark,\checkmark,0.145556,0.040201,0.05813,0.857143,0.109091,0.193548
$\cap$,Hermes 8B,$\times$,$\times$,$\times$,0.055556,0.007326,0.012879,0.625,0.022727,0.04386
$\cap$,Hermes 8B,$\times$,$\times$,\checkmark,0.055556,0.007326,0.012879,0.625,0.022727,0.04386


In [21]:
task_6_2_1_df[top_results:].style.highlight_max(axis=0, props="textbf:--rwrap;").format(precision=2).to_latex(
    report_dir / "task_6_2_1.tex",
    # float_format="%.2f",
    caption="Dev Set Result for Task 6.2.1 for various models and approaches.",
    label=f"tab:task:6_2_1{lbl_xtra}",
    clines="all;data",
    hrules=True
)
task_6_2_1_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,$P$,$R$,$F_1$,$P_{micro}$,$R_{micro}$,"$F_{1,micro}$"
Set,Model,RAG,LoRA,Reorder,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
$\cap$,Hermes 3B,$\times$,$\times$,$\times$,0.033333,0.011111,0.016667,0.666667,0.009091,0.017937
$\cap$,Hermes 3B,$\times$,$\times$,\checkmark,0.011111,0.003704,0.005556,0.5,0.004545,0.009009
$\cap$,Hermes 3B,$\times$,\checkmark,$\times$,0.177778,0.040509,0.062239,0.961538,0.113636,0.203252
$\cap$,Hermes 3B,$\times$,\checkmark,\checkmark,0.177778,0.040509,0.062239,0.961538,0.113636,0.203252
$\cap$,Hermes 3B,\checkmark,$\times$,$\times$,0.251634,0.07104,0.104302,0.846154,0.15,0.254826
$\cap$,Hermes 3B,\checkmark,$\times$,\checkmark,0.133951,0.050731,0.067101,0.692308,0.122727,0.208494
$\cap$,Hermes 3B,\checkmark,\checkmark,$\times$,0.173016,0.040553,0.061778,0.888889,0.109091,0.194332
$\cap$,Hermes 3B,\checkmark,\checkmark,\checkmark,0.145556,0.040201,0.05813,0.857143,0.109091,0.193548
$\cup$,Hermes 3B,$\times$,$\times$,$\times$,0.546896,0.530845,0.510897,0.652381,0.622727,0.637209
$\cup$,Hermes 3B,$\times$,$\times$,\checkmark,0.537235,0.530845,0.508799,0.649289,0.622727,0.635731


In [22]:
task_6_2_2_df[top_results:].style.highlight_max(axis=0, props="textbf:--rwrap;").format(precision=2).to_latex(
    report_dir / "task_6_2_2.tex",
    # float_format="%.2f",
    caption="Further Dev Set Result for Task 6.2.2 for various models and approaches.",
    label=f"tab:task:6_2_2{lbl_xtra}",
    clines="all;data",
    hrules=True
)
task_6_2_2_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,$P$,$R$,$F_1$,$P_{micro}$,$R_{micro}$,"$F_{1,micro}$"
Set,Model,RAG,LoRA,Reorder,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
$\cap$,Hermes 3B,$\times$,$\times$,$\times$,0.03125,0.010417,0.015625,0.666667,0.008696,0.017167
$\cap$,Hermes 3B,$\times$,$\times$,\checkmark,0.010417,0.003472,0.005208,0.5,0.004348,0.008621
$\cap$,Hermes 3B,$\times$,\checkmark,$\times$,0.166667,0.038863,0.059721,0.961538,0.108696,0.195312
$\cap$,Hermes 3B,$\times$,\checkmark,\checkmark,0.166667,0.038863,0.059721,0.961538,0.108696,0.195312
$\cap$,Hermes 3B,\checkmark,$\times$,$\times$,0.235907,0.066895,0.098323,0.846154,0.143478,0.245353
$\cap$,Hermes 3B,\checkmark,$\times$,\checkmark,0.125579,0.047856,0.063403,0.692308,0.117391,0.200743
$\cap$,Hermes 3B,\checkmark,\checkmark,$\times$,0.162202,0.039199,0.059608,0.888889,0.104348,0.18677
$\cap$,Hermes 3B,\checkmark,\checkmark,\checkmark,0.136458,0.038574,0.055868,0.857143,0.104348,0.186047
$\cup$,Hermes 3B,$\times$,$\times$,$\times$,0.54781,0.526916,0.51178,0.657143,0.6,0.627273
$\cup$,Hermes 3B,$\times$,$\times$,\checkmark,0.537133,0.526916,0.50896,0.650943,0.6,0.624434


In [23]:
task_6_2_3_df[top_results:].style.highlight_max(axis=0, props="textbf:--rwrap;").format(precision=2).to_latex(
    report_dir / "task_6_2_3.tex",
    # float_format="%.2f",
    caption="Dev Set Result for Task 6.2.3 for various models and approaches.",
    label=f"tab:task:6_2_3{lbl_xtra}",
    clines="all;data",
    hrules=True
)
task_6_2_3_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,$P$,$R$,$F_1$,$P_{micro}$,$R_{micro}$,"$F_{1,micro}$"
Set,Model,RAG,LoRA,Reorder,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
$\cap$,Hermes 3B,$\times$,$\times$,$\times$,0.001302,0.003472,0.001894,0.023256,0.001786,0.003317
$\cap$,Hermes 3B,$\times$,$\times$,\checkmark,0.0,0.0,0.0,0.0,0.0,0.0
$\cap$,Hermes 3B,$\times$,\checkmark,$\times$,0.056471,0.036594,0.040812,0.339713,0.126786,0.184655
$\cap$,Hermes 3B,$\times$,\checkmark,\checkmark,0.056471,0.036594,0.040812,0.339713,0.126786,0.184655
$\cap$,Hermes 3B,\checkmark,$\times$,$\times$,0.107129,0.051278,0.059708,0.268382,0.130357,0.175481
$\cap$,Hermes 3B,\checkmark,$\times$,\checkmark,0.019151,0.026367,0.02002,0.175758,0.103571,0.130337
$\cap$,Hermes 3B,\checkmark,\checkmark,$\times$,0.054417,0.027528,0.033838,0.306878,0.103571,0.154873
$\cap$,Hermes 3B,\checkmark,\checkmark,\checkmark,0.056013,0.029934,0.03457,0.319588,0.110714,0.164456
$\cup$,Hermes 3B,$\times$,$\times$,$\times$,0.281717,0.287139,0.254697,0.312789,0.3625,0.335815
$\cup$,Hermes 3B,$\times$,$\times$,\checkmark,0.27465,0.287139,0.248833,0.275815,0.3625,0.313272


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import tikzplotlib
import re

# style to sbn
plt.style.use("seaborn-v0_8")


def tikzplotlib_fix_ncols(obj):
    """
    workaround for matplotlib 3.6 renamed legend's _ncol to _ncols, which breaks tikzplotlib
    """
    if hasattr(obj, "_ncols"):
        obj._ncol = obj._ncols
    for child in obj.get_children():
        tikzplotlib_fix_ncols(child)


k = list(range(1, 20))


task_6_2_1_df_f1k = scoring_to_df(
    eval_submission_6_2_binary_tag_RE,
    "binary_tag_based_relations",
    k_limits_set=k,
    cap_gt=True,
    res_dir=results_dir / "hermes-3b-rag.json",
)
k_s=task_6_2_1_df_f1k.reset_index()['$k$'].to_numpy()
f1k=task_6_2_1_df_f1k["$F_{1,micro}$"]
p1k=task_6_2_1_df_f1k["$P_{micro}$"]
r1k=task_6_2_1_df_f1k["$R_{micro}$"]

fig = plt.figure(figsize=(8, 4))
ax = fig.add_subplot(111)
ax.plot(k_s, f1k, label="$F_{1,micro}$", color="blue")
ax.plot(k_s, p1k, label="$P_{micro}$", color="red")
ax.plot(k_s, r1k, label="$R_{micro}$", color="orange")
ax.set_ylabel("Score")
ax.set_xlabel("$k$")
ax.set_title("Scores @  $k$ for Task 6.2.1")
ax.legend()
ax.set_xticklabels(k_s)
tikzplotlib_fix_ncols(fig)
# tikzplotlib.save(
#     "f1_at_k_rerun.tex",
#     figure=fig,
#     axis_width="0.8\\linewidth",
#     axis_height="0.4\\linewidth",
#     # extra_axis_parameters={
#     #     "legend style": r"legend pos=outer north east",
#     #     "legend columns": 1,
#     #     "legend cell align": "left",
#     #     "legend to name": "loss_legend",
#     #     "legend entries": ["loss"],
#     # },
# )

Further mode: False


OSError: Error in opening the specified json file: ../data/merge_dev/hermes-3b-rag.json

In [None]:
task_6_2_1_df_f1k.reset_index()['$k$']

NameError: name 'task_6_2_1_df_f1k' is not defined