In [1]:
# Auto-reload imported modules before each cell runs, so edits to the project
# package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from constrerl.evaluate import (
    eval_submission_6_3_ternary_tag_RE,
    eval_submission_6_4_ternary_mention_RE,
    eval_submission_6_2_binary_tag_RE
)
from constrerl.erl_schema import convert_to_output, Article
import glob
from pathlib import Path
import json
import pandas as pd
from collections.abc import Callable, Awaitable


In [10]:
# Evaluation inputs: the directory whose *.json files are scored, and the
# ground-truth annotation file they are scored against.
results_dir = Path("data/results_dev")
ground_truth_file = Path("data/annotations/dev/dev.json")

# Read explicitly as UTF-8 (the JSON interchange encoding) so that loading does
# not depend on the platform's default locale encoding.
with ground_truth_file.open(encoding="utf-8") as f:
    ground_truth = json.load(f)

In [11]:
# NOTE(review): this module-level list is not read by any code visible in this
# notebook; scoring_to_df collects its rows in a local list of its own. Looks
# like a leftover -- confirm before removing.
eval_results: list[dict] = []


def scoring_to_dict(
    f: str | Path, eval_f: Callable[[str | Path, dict], tuple[float]]
) -> dict:
    precision, recall, f1, micro_precision, micro_recall, micro_f1 = eval_f(
        f, ground_truth
    )
    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "micro_precision": micro_precision,
        "micro_recall": micro_recall,
        "micro_f1": micro_f1,
    }

def scoring_to_df(
    eval_f: Callable[[str | Path, dict], tuple[float, ...]],
    directory: str | Path | None = None,
) -> pd.DataFrame:
    """Score every ``*.json`` file in a results directory into one table.

    Args:
        eval_f: Evaluation function forwarded to ``scoring_to_dict`` for each
            results file.
        directory: Directory to scan for ``*.json`` files. When omitted, the
            notebook-level ``results_dir`` is used.

    Returns:
        A DataFrame with one row per results file: the file name, two boolean
        flags telling whether the name contains "rag" / "reorder", and the
        metrics returned by ``scoring_to_dict``. Rows are sorted by ascending
        ``micro_f1`` when that column exists; with no matching files the frame
        is empty and returned unsorted.
    """
    source_dir = Path(results_dir if directory is None else directory)

    rows: list[dict] = []
    # glob() yields entries in filesystem order; sorting makes the row index
    # (and therefore the displayed table) reproducible across machines.
    for result_file in sorted(source_dir.glob("*.json")):
        row = {
            "file": result_file.name,
            "rag": "rag" in result_file.name,
            "reorder": "reorder" in result_file.name,
        }
        row.update(scoring_to_dict(result_file, eval_f))
        rows.append(row)

    eval_df = pd.DataFrame(rows)
    if "micro_f1" in eval_df.columns:
        # A stable sort keeps files with identical scores in file-name order.
        eval_df = eval_df.sort_values("micro_f1", kind="stable")
    return eval_df

# One score table per evaluator, each built by scoring_to_df over the same
# results directory.
# NOTE(review): the outputs stored with this notebook show micro_recall values
# above 1.0 for some runs; a recall cannot exceed 1, so presumably something is
# double-counted upstream -- confirm in constrerl.evaluate.
task_6_2_1_df = scoring_to_df(eval_f=eval_submission_6_2_binary_tag_RE)
task_6_2_2_df = scoring_to_df(eval_f=eval_submission_6_3_ternary_tag_RE)
task_6_2_3_df = scoring_to_df(eval_f=eval_submission_6_4_ternary_mention_RE)

In [12]:
# Scores from eval_submission_6_2_binary_tag_RE, one row per results file,
# ordered by ascending micro_f1.
task_6_2_1_df

Unnamed: 0,file,rag,reorder,precision,recall,f1,micro_precision,micro_recall,micro_f1
6,openai-4o-mini.json,False,False,0.071532,0.089578,0.069568,0.190722,0.168182,0.178744
14,openai-4o-mini-reorder.json,False,True,0.088782,0.118038,0.084662,0.222222,0.209091,0.215457
10,hermes-3b.json,False,False,0.025792,0.591358,0.048901,0.174901,0.804545,0.287338
15,hermes-3b-reorder.json,False,True,0.025792,0.591358,0.048901,0.174901,0.804545,0.287338
8,openai-4o-mini-rag-reorder.json,True,True,0.112397,0.24436,0.136528,0.360202,0.65,0.463533
0,hermes-8b.json,False,False,0.037835,0.445503,0.069305,0.310507,1.504545,0.514774
9,hermes-8b-reorder.json,False,True,0.037835,0.445503,0.069305,0.310507,1.504545,0.514774
7,openai-4o-mini-rag.json,True,False,0.150639,0.383199,0.194968,0.434316,0.736364,0.546374
11,hermes-8b-rag.json,True,False,0.274049,0.546014,0.292109,0.472701,1.495455,0.718341
1,hermes-3b-lora-rag.json,True,False,0.13473,0.416447,0.168426,0.629032,1.063636,0.790541


In [13]:
# Scores from eval_submission_6_3_ternary_tag_RE, one row per results file,
# ordered by ascending micro_f1.
task_6_2_2_df

Unnamed: 0,file,rag,reorder,precision,recall,f1,micro_precision,micro_recall,micro_f1
6,openai-4o-mini.json,False,False,0.061996,0.068411,0.054581,0.154639,0.130435,0.141509
14,openai-4o-mini-reorder.json,False,True,0.085503,0.108876,0.079173,0.183575,0.165217,0.173913
10,hermes-3b.json,False,False,0.023787,0.552083,0.045173,0.183525,0.765217,0.296047
15,hermes-3b-reorder.json,False,True,0.023787,0.552083,0.045173,0.183525,0.765217,0.296047
9,hermes-8b-reorder.json,False,True,0.047415,0.396825,0.073126,0.238512,0.947826,0.381119
0,hermes-8b.json,False,False,0.047415,0.396825,0.073126,0.238512,0.947826,0.381119
8,openai-4o-mini-rag-reorder.json,True,True,0.124024,0.233758,0.1307,0.347607,0.6,0.440191
7,openai-4o-mini-rag.json,True,False,0.156477,0.367247,0.19519,0.402145,0.652174,0.497512
1,hermes-3b-lora-rag.json,True,False,0.126309,0.201771,0.135066,0.514085,0.634783,0.568093
11,hermes-8b-rag.json,True,False,0.266985,0.537313,0.284005,0.448127,1.352174,0.67316


In [14]:
# Scores from eval_submission_6_4_ternary_mention_RE, one row per results file,
# ordered by ascending micro_f1.
task_6_2_3_df

Unnamed: 0,file,rag,reorder,precision,recall,f1,micro_precision,micro_recall,micro_f1
6,openai-4o-mini.json,False,False,0.0,0.0,0.0,0.0,0.0,0.0
14,openai-4o-mini-reorder.json,False,True,0.0,0.0,0.0,0.0,0.0,0.0
15,hermes-3b-reorder.json,False,True,0.0,0.0,0.0,0.0,0.0,0.0
10,hermes-3b.json,False,False,0.0,0.0,0.0,0.0,0.0,0.0
9,hermes-8b-reorder.json,False,True,0.001894,0.000514,0.000809,0.002188,0.003571,0.002714
0,hermes-8b.json,False,False,0.001894,0.000514,0.000809,0.002188,0.003571,0.002714
8,openai-4o-mini-rag-reorder.json,True,True,0.001644,0.002072,0.001809,0.012594,0.008929,0.010449
7,openai-4o-mini-rag.json,True,False,0.001708,0.002171,0.001879,0.016086,0.010714,0.012862
13,hermes-3b-rag-reorder.json,True,True,0.001852,0.007431,0.002888,0.017804,0.032143,0.022915
1,hermes-3b-lora-rag.json,True,False,0.029263,0.015542,0.018589,0.102113,0.051786,0.06872
