In [2]:
# Reload imported project modules automatically before each cell executes, so
# edits to the package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
from constrerl.evaluate import (
    eval_submission_6_3_ternary_tag_RE,
    eval_submission_6_4_ternary_mention_RE,
    eval_submission_6_2_binary_tag_RE
)
from constrerl.erl_schema import convert_to_output, Article
import glob
from pathlib import Path
import json
import pandas as pd
from collections.abc import Callable, Awaitable


In [9]:
# Directory holding the prediction files to score, and the dev-set annotation
# file they are scored against. Both are bound as Path objects from the start,
# so each name refers to a single type throughout the notebook.
results_dir = Path("data/results_dev")
ground_truth_file = Path("data/annotations/dev/dev.json")

# JSON is UTF-8 by specification; passing the encoding explicitly keeps the
# load independent of the platform's locale encoding.
with ground_truth_file.open(encoding="utf-8") as f:
    ground_truth = json.load(f)

In [10]:
# NOTE(review): nothing in this cell appends to or reads this module-level
# list; scoring_to_df collects its rows in a local list of its own. Presumably
# a leftover - confirm no later cell uses it before removing.
eval_results: list[dict] = []


def scoring_to_dict(
    f: str | Path, eval_f: Callable[[str | Path, dict], tuple[float]]
) -> dict:
    precision, recall, f1, micro_precision, micro_recall, micro_f1 = eval_f(
        f, ground_truth
    )
    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "micro_precision": micro_precision,
        "micro_recall": micro_recall,
        "micro_f1": micro_f1,
    }

def scoring_to_df(eval_f: Callable[[str | Path, dict], tuple[float]]) -> pd.DataFrame:
    eval_results: list[dict] = []
    for result_file in results_dir.glob("*.json"):
        result_file = Path(result_file)
        eval_result = scoring_to_dict(result_file, eval_f)
        result_dict = {
            "file": result_file.name,
            "rag": "rag" in result_file.name,
            "reorder": "reorder" in result_file.name,
        }
        result_dict.update(eval_result)
        # result_dict.update({f"6_2_2_{k}": v for k, v in ternary_tag_score.items()})
        # result_dict.update({f"6_2_3_{k}": v for k, v in ternary_mention_score.items()})
        eval_results.append(result_dict)
    eval_df = pd.DataFrame(eval_results)
    if "micro_f1" in eval_df.columns:
        eval_df = eval_df.sort_values("micro_f1")
    return eval_df

# Build one score table per evaluation setting, each covering every prediction
# file found by scoring_to_df.
# NOTE(review): the frame names are numbered 6_2_1..6_2_3 while the evaluators
# are named 6_2/6_3/6_4 - confirm this mapping is the intended one.
task_6_2_1_df = scoring_to_df(eval_f=eval_submission_6_2_binary_tag_RE)
task_6_2_2_df = scoring_to_df(eval_f=eval_submission_6_3_ternary_tag_RE)
task_6_2_3_df = scoring_to_df(eval_f=eval_submission_6_4_ternary_mention_RE)

=== Removed 299 duplicated binary tag-based relations from predictions ===
=== Removed 1082 duplicated binary tag-based relations from predictions ===
=== Removed 327 duplicated binary tag-based relations from predictions ===
=== Removed 1059 duplicated binary tag-based relations from predictions ===
=== Removed 744 duplicated binary tag-based relations from predictions ===
=== Removed 1105 duplicated binary tag-based relations from predictions ===
=== Removed 70 duplicated binary tag-based relations from predictions ===
=== Removed 447 duplicated binary tag-based relations from predictions ===
=== Removed 7 duplicated binary tag-based relations from predictions ===
=== Removed 225 duplicated binary tag-based relations from predictions ===
=== Removed 269 duplicated binary tag-based relations from predictions ===
=== Removed 1082 duplicated binary tag-based relations from predictions ===
=== Removed 1403 duplicated binary tag-based relations from predictions ===
=== Removed 630 duplica

In [11]:
# Binary tag-based RE scores, one row per prediction file, ascending by
# micro_f1 (highest micro_f1 last).
task_6_2_1_df

Unnamed: 0,file,rag,reorder,precision,recall,f1,micro_precision,micro_recall,micro_f1
12,hermes-3b.json,False,False,0.035556,0.012522,0.016695,0.142857,0.018182,0.032258
19,hermes-3b-reorder.json,False,True,0.035556,0.012522,0.016695,0.142857,0.018182,0.032258
11,hermes-8b-reorder.json,False,True,0.042593,0.013309,0.018967,0.307692,0.054545,0.092664
1,hermes-8b.json,False,False,0.042593,0.013309,0.018967,0.307692,0.054545,0.092664
14,openai-4-1-reorder.json,False,True,0.043147,0.054826,0.041006,0.186335,0.136364,0.15748
8,openai-4o-mini.json,False,False,0.072238,0.087869,0.068457,0.187166,0.159091,0.17199
6,openai-4-1.json,False,False,0.083042,0.108601,0.072636,0.225,0.163636,0.189474
18,openai-4o-mini-reorder.json,False,True,0.089421,0.114619,0.083147,0.21,0.190909,0.2
2,hermes-3b-lora-rag.json,True,False,0.133492,0.048682,0.065617,0.644444,0.131818,0.218868
16,hermes-3b-rag-reorder.json,True,True,0.111164,0.063552,0.075944,0.425,0.154545,0.226667


In [12]:
# Ternary tag-based RE scores, one row per prediction file, ascending by
# micro_f1 (highest micro_f1 last).
task_6_2_2_df

Unnamed: 0,file,rag,reorder,precision,recall,f1,micro_precision,micro_recall,micro_f1
12,hermes-3b.json,False,False,0.022917,0.009425,0.011863,0.115385,0.013043,0.023437
19,hermes-3b-reorder.json,False,True,0.022917,0.009425,0.011863,0.115385,0.013043,0.023437
11,hermes-8b-reorder.json,False,True,0.046875,0.011866,0.017359,0.242424,0.034783,0.060837
1,hermes-8b.json,False,False,0.046875,0.011866,0.017359,0.242424,0.034783,0.060837
14,openai-4-1-reorder.json,False,True,0.038196,0.049485,0.036115,0.151515,0.108696,0.126582
8,openai-4o-mini.json,False,False,0.061996,0.068411,0.054581,0.157068,0.130435,0.142518
6,openai-4-1.json,False,False,0.082771,0.103962,0.069528,0.202532,0.13913,0.164948
18,openai-4o-mini-reorder.json,False,True,0.085503,0.108876,0.079173,0.183575,0.165217,0.173913
2,hermes-3b-lora-rag.json,True,False,0.125149,0.042781,0.057305,0.627907,0.117391,0.197802
16,hermes-3b-rag-reorder.json,True,True,0.080407,0.054739,0.063039,0.397436,0.134783,0.201299


In [13]:
# Ternary mention-based RE scores, one row per prediction file, ascending by
# micro_f1 (highest micro_f1 last).
task_6_2_3_df

Unnamed: 0,file,rag,reorder,precision,recall,f1,micro_precision,micro_recall,micro_f1
6,openai-4-1.json,False,False,0.0,0.0,0.0,0.0,0.0,0.0
8,openai-4o-mini.json,False,False,0.0,0.0,0.0,0.0,0.0,0.0
12,hermes-3b.json,False,False,0.0,0.0,0.0,0.0,0.0,0.0
19,hermes-3b-reorder.json,False,True,0.0,0.0,0.0,0.0,0.0,0.0
18,openai-4o-mini-reorder.json,False,True,0.0,0.0,0.0,0.0,0.0,0.0
1,hermes-8b.json,False,False,0.000992,0.000257,0.000408,0.002066,0.001786,0.001916
11,hermes-8b-reorder.json,False,True,0.000992,0.000257,0.000408,0.002066,0.001786,0.001916
0,openai-4-1-rag-reorder.json,True,True,0.001121,0.000804,0.000852,0.004484,0.003571,0.003976
14,openai-4-1-reorder.json,False,True,0.001068,0.008333,0.001894,0.009091,0.003571,0.005128
17,openai-4-1-rag.json,True,False,0.000508,0.001025,0.000679,0.007732,0.005357,0.006329
