In [1]:
# Development convenience: with mode 2, IPython re-imports edited modules
# (e.g. the local `constrerl` package) before each cell runs, so changes to
# the evaluation code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from constrerl.evaluate import eval_submission_6_3_ternary_tag_RE, eval_submission_6_4_ternary_mention_RE
from constrerl.erl_schema import convert_to_output, Article
import glob
from pathlib import Path
import json
import pandas as pd
from collections.abc import Callable, Awaitable


In [3]:
# Evaluation inputs: the directory holding one prediction file per run and
# the ground-truth annotation file. Built as Path objects directly so each
# name refers to a single type for the whole notebook.
results_dir = Path("data/results")
ground_truth_file = Path("data/annotations/dev/dev.json")

# Decode explicitly as UTF-8 so loading does not depend on the platform's
# default locale encoding.
with ground_truth_file.open(encoding="utf-8") as f:
    ground_truth = json.load(f)

In [4]:
# Module-level list of result rows. The visible scoring_to_df does not write
# to it (it collects its rows in a function-local list instead).
# NOTE(review): appears unused in the visible cells — confirm before removing.
eval_results: list[dict] = []


def scoring_to_dict(
    f: str | Path, eval_f: Callable[[str | Path, dict], tuple[float]]
) -> dict:
    precision, recall, f1, micro_precision, micro_recall, micro_f1 = eval_f(
        f, ground_truth
    )
    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "micro_precision": micro_precision,
        "micro_recall": micro_recall,
        "micro_f1": micro_f1,
    }

def scoring_to_df(eval_f: Callable[[str | Path, dict], tuple[float]]) -> pd.DataFrame:
    eval_results: list[dict] = []
    for result_file in results_dir.glob("*.json"):
        result_file = Path(result_file)
        eval_result = scoring_to_dict(result_file, eval_f)
        result_dict = {
            "file": result_file.name,
            "rag": "rag" in result_file.name,
            "reorder": "reorder" in result_file.name,
        }
        result_dict.update(eval_result)
        # result_dict.update({f"6_2_2_{k}": v for k, v in ternary_tag_score.items()})
        # result_dict.update({f"6_2_3_{k}": v for k, v in ternary_mention_score.items()})
        eval_results.append(result_dict)
    eval_df = pd.DataFrame(eval_results).sort_values("micro_f1")
    return eval_df

# Build one score table per evaluator over the same set of result files.
# NOTE(review): the variable names say 6_2_2 / 6_2_3 while the evaluators are
# named 6_3 / 6_4 — confirm which task numbering is the intended one.
task_6_2_2_df = scoring_to_df(eval_submission_6_3_ternary_tag_RE)
task_6_2_3_df = scoring_to_df(eval_submission_6_4_ternary_mention_RE)

In [5]:
# Ternary tag RE scores per result file, ascending by micro_f1 (best run last).
# NOTE(review): the saved output shows micro_recall = 1.352174 for
# hermes-8b--rag.json; a recall above 1 is suspicious — check the evaluator
# and that prediction file (cause not visible from this notebook).
task_6_2_2_df

Unnamed: 0,file,rag,reorder,precision,recall,f1,micro_precision,micro_recall,micro_f1
10,openai-4o-mini-.json,False,False,0.069329,0.068055,0.062146,0.169399,0.134783,0.150121
4,openai-4o-mini--reorder.json,False,True,0.09537,0.126696,0.097575,0.178744,0.16087,0.169336
5,hermes-3b--reorder.json,False,True,0.023787,0.552083,0.045173,0.183525,0.765217,0.296047
1,hermes-3b-.json,False,False,0.023787,0.552083,0.045173,0.183525,0.765217,0.296047
11,hermes-8b--reorder.json,False,True,0.047415,0.396825,0.073126,0.238512,0.947826,0.381119
2,hermes-8b-.json,False,False,0.047415,0.396825,0.073126,0.238512,0.947826,0.381119
14,openai-4o-mini--rag-reorder.json,True,True,0.112885,0.236343,0.117626,0.327366,0.556522,0.412238
3,openai-4o-mini--rag.json,True,False,0.201678,0.395558,0.221541,0.407792,0.682609,0.510569
9,hermes-3b-lora--rag.json,True,False,0.126309,0.201771,0.135066,0.514085,0.634783,0.568093
7,hermes-8b--rag.json,True,False,0.266985,0.537313,0.284005,0.448127,1.352174,0.67316


In [6]:
# Ternary mention RE scores per result file, ascending by micro_f1 (best run last).
task_6_2_3_df

Unnamed: 0,file,rag,reorder,precision,recall,f1,micro_precision,micro_recall,micro_f1
1,hermes-3b-.json,False,False,0.0,0.0,0.0,0.0,0.0,0.0
4,openai-4o-mini--reorder.json,False,True,0.0,0.0,0.0,0.0,0.0,0.0
5,hermes-3b--reorder.json,False,True,0.0,0.0,0.0,0.0,0.0,0.0
10,openai-4o-mini-.json,False,False,0.0,0.0,0.0,0.0,0.0,0.0
11,hermes-8b--reorder.json,False,True,0.001894,0.000514,0.000809,0.002188,0.003571,0.002714
2,hermes-8b-.json,False,False,0.001894,0.000514,0.000809,0.002188,0.003571,0.002714
14,openai-4o-mini--rag-reorder.json,True,True,0.006882,0.005203,0.005844,0.012788,0.008929,0.010515
3,openai-4o-mini--rag.json,True,False,0.00217,0.002634,0.002323,0.018182,0.0125,0.014815
8,hermes-3b--rag-reorder.json,True,True,0.001852,0.007431,0.002888,0.017804,0.032143,0.022915
9,hermes-3b-lora--rag.json,True,False,0.029263,0.015542,0.018589,0.102113,0.051786,0.06872
