In [1]:
# Enable autoreload so edits to imported project modules are picked up live,
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [8]:
from constrerl.evaluate import eval_submission_6_3_ternary_tag_RE, eval_submission_6_4_ternary_mention_RE
from constrerl.erl_schema import convert_to_output, Article
import glob
from pathlib import Path
import json
import pandas as pd
from collections.abc import Callable, Awaitable


In [19]:
# Directory holding the result files to score, and the ground-truth file they
# are scored against. Both paths are relative to the working directory.
results_dir = Path("data/results")
ground_truth_file = Path("data/annotations/dev/dev.json")

# Decode as UTF-8 explicitly: without `encoding`, open() falls back to the
# platform's locale encoding, which can mis-decode non-ASCII JSON text.
with ground_truth_file.open(encoding="utf-8") as f:
    ground_truth = json.load(f)

In [24]:
# Notebook-level list of result records. `scoring_to_df` builds its own local
# list with the same name, so that function does not append to this one.
eval_results: list[dict] = list()


def scoring_to_dict(
    f: str | Path, eval_f: Callable[[str | Path, dict], tuple[float]]
) -> dict:
    precision, recall, f1, micro_precision, micro_recall, micro_f1 = eval_f(
        f, ground_truth
    )
    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "micro_precision": micro_precision,
        "micro_recall": micro_recall,
        "micro_f1": micro_f1,
    }

def scoring_to_df(eval_f: Callable[[str | Path, dict], tuple[float, ...]]) -> pd.DataFrame:
    """Score every ``*.json`` file in ``results_dir`` and tabulate the scores.

    Each file is scored with ``scoring_to_dict(file, eval_f)``. The row for a
    file holds its name, two flags derived from the name, and the six metrics.

    Args:
        eval_f: Evaluator forwarded unchanged to ``scoring_to_dict``.

    Returns:
        One row per result file, sorted ascending by ``micro_f1``. If the
        directory contains no ``*.json`` files, an empty frame with the same
        columns is returned instead of raising ``KeyError`` on the sort.
    """
    # Column layout of a populated frame; must stay in sync with the keys
    # produced by `scoring_to_dict`. Only used for the empty-directory case.
    columns = [
        "file",
        "rag",
        "reorder",
        "precision",
        "recall",
        "f1",
        "micro_precision",
        "micro_recall",
        "micro_f1",
    ]

    records: list[dict] = []
    # Sort the paths so the row order does not depend on filesystem listing order.
    for result_file in sorted(results_dir.glob("*.json")):
        records.append(
            {
                "file": result_file.name,
                # Plain substring checks on the file name.
                "rag": "rag" in result_file.name,
                "reorder": "reorder" in result_file.name,
                **scoring_to_dict(result_file, eval_f),
            }
        )

    if not records:
        return pd.DataFrame(columns=columns)

    # A stable sort keeps tied scores in file-name order, so the table is
    # identical from run to run.
    return pd.DataFrame(records).sort_values("micro_f1", kind="stable")

# Build one score table per evaluator over all result files.
# NOTE(review): the variable names say 6_2_2 / 6_2_3 while the evaluator names
# say 6_3 / 6_4 — confirm the task numbering is intentional.
task_6_2_2_df = scoring_to_df(eval_submission_6_3_ternary_tag_RE)
task_6_2_3_df = scoring_to_df(eval_submission_6_4_ternary_mention_RE)

In [25]:
# Scores from eval_submission_6_3_ternary_tag_RE; rows are sorted ascending by
# micro_f1, so the highest-scoring file is last.
task_6_2_2_df

Unnamed: 0,file,rag,reorder,precision,recall,f1,micro_precision,micro_recall,micro_f1
0,hermes-3b-.json,False,False,0.0,0.0,0.0,0.0,0.0,0.0
4,hermes-3b--reorder.json,False,True,0.0,0.0,0.0,0.0,0.0,0.0
7,openai-4o-mini-.json,False,False,0.060109,0.074479,0.057142,0.161458,0.134783,0.146919
3,openai-4o-mini--reorder.json,False,True,0.065913,0.083211,0.064466,0.180851,0.147826,0.162679
1,hermes-8b-.json,False,False,0.039283,0.117471,0.04713,0.276657,0.417391,0.332756
8,hermes-8b--reorder.json,False,True,0.039283,0.117471,0.04713,0.276657,0.417391,0.332756
11,openai-4o-mini--rag-reorder.json,True,True,0.067913,0.197386,0.097095,0.29484,0.521739,0.376766
2,openai-4o-mini--rag.json,True,False,0.149836,0.345451,0.182326,0.379679,0.617391,0.470199
5,hermes-8b--rag.json,True,False,0.268969,0.410813,0.263918,0.502513,0.869565,0.636943
10,hermes-3b--rag.json,True,False,0.159979,0.347654,0.157223,0.66185,0.995652,0.795139


In [26]:
# Scores from eval_submission_6_4_ternary_mention_RE; rows are sorted ascending
# by micro_f1, so the highest-scoring file is last.
task_6_2_3_df

Unnamed: 0,file,rag,reorder,precision,recall,f1,micro_precision,micro_recall,micro_f1
0,hermes-3b-.json,False,False,0.0,0.0,0.0,0.0,0.0,0.0
3,openai-4o-mini--reorder.json,False,True,0.0,0.0,0.0,0.0,0.0,0.0
7,openai-4o-mini-.json,False,False,0.0,0.0,0.0,0.0,0.0,0.0
4,hermes-3b--reorder.json,False,True,0.0,0.0,0.0,0.0,0.0,0.0
1,hermes-8b-.json,False,False,0.002451,0.000514,0.00085,0.005764,0.003571,0.00441
8,hermes-8b--reorder.json,False,True,0.002451,0.000514,0.00085,0.005764,0.003571,0.00441
11,openai-4o-mini--rag-reorder.json,True,True,0.0012,0.001267,0.001217,0.007371,0.005357,0.006205
6,hermes-3b--rag-reorder.json,True,True,0.000979,0.002391,0.001389,0.013333,0.0125,0.012903
2,openai-4o-mini--rag.json,True,False,0.002264,0.002634,0.00236,0.018717,0.0125,0.014989
5,hermes-8b--rag.json,True,False,0.053018,0.032605,0.035244,0.115578,0.082143,0.096033
