In [2]:
import shutil
import os

In [3]:
from zipfile import ZipFile
from pathlib import Path
import glob
import json
from constrerl.annotator import load_test, load_train, Article

In [18]:
# Merge the per-strategy ontug result files into a single JSON file per
# strategy ("union" / "intersection"), written next to the inputs.
onto_path = Path("data/onto")
result_file_types = ["union", "intersection"]
for rft in result_file_types:
    # The merge below is first-wins per field, so process the files in a
    # stable (sorted) order to keep the output independent of file-system order.
    ontug_files = sorted(onto_path.glob(f"ontug-{rft}*.json"))
    combined_articles: dict[str, Article] = {}
    for ontug_file in ontug_files:
        articles = load_train(ontug_file)
        # `article_id` instead of `id`: avoids shadowing the builtin for the
        # remainder of the kernel session.
        for article_id, article in articles.items():
            if article_id not in combined_articles:
                combined_articles[article_id] = article
                continue
            # Article already seen in an earlier file: keep its values and
            # only fill in fields that are still missing or None.
            existing_article = combined_articles[article_id].model_dump()
            for key, value in article.model_dump().items():
                if existing_article.get(key) is None:
                    existing_article[key] = value
            combined_articles[article_id] = Article.model_validate(existing_article)

    with open(onto_path / f"ontug_test_{rft}_results.json", "w") as f:
        json.dump(
            {
                article_id: article.model_dump()
                for article_id, article in combined_articles.items()
            },
            f,
            indent=2,
        )

In [20]:
# Collect every merged result file matching the pattern as a Path object.
results_path = "data/onto/ontug_test_*.json"
results = list(map(Path, glob.glob(results_path)))

In [21]:
# Display the result files discovered by the glob above.
results

[PosixPath('data/onto/ontug_test_union_results.json'),
 PosixPath('data/onto/ontug_test_intersection_results.json')]

In [22]:
# Submission metadata. `task_ids` maps each task ID to the key under which
# that task's relations are stored in every article of a result file.
task_ids = dict(
    T621="binary_tag_based_relations",
    T622="ternary_tag_based_relations",
    T623="ternary_mention_based_relations",
)
team_id = "ONTUG"
system_id = "ElectraCLEANR"
# The run ID is the third "_"-separated field of each result file name.
run_ids = [result_file.name.split("_")[2] for result_file in results]
run_ids

['union', 'intersection']

In [23]:
import chevron
import json

In [24]:
# Build one submission folder per (task, run) pair under the staging directory.
# Each folder receives a rendered description (`.meta`) and a JSON file that
# contains only the relations belonging to that task.
staging_dir = Path("./staging_ontug")

# The description template is identical for every task/run, so read it once
# instead of re-opening the file on each iteration.
with open("description_ontug.md", "r") as f:
    desc_data = f.read()

for task_id, task_key in task_ids.items():
    for run_id, result_path in zip(run_ids, results):
        run_id_simples = run_id.replace("-", "")

        identifier = f"{team_id}_{task_id}_{run_id_simples}_{system_id}"
        identifier_dir = staging_dir / identifier
        # parents=True also creates the staging directory itself on a fresh
        # checkout; exist_ok=True keeps re-runs of this cell idempotent.
        identifier_dir.mkdir(parents=True, exist_ok=True)

        # Base flags are always listed in the description; the checks below
        # append extra labels when the run ID names a technique.
        # NOTE(review): for run IDs such as "union"/"intersection" none of the
        # checks fire, so only the base flags are rendered - confirm intended.
        flags = ["lora", "rag", "reorder"]
        if "rag" in run_id:
            flags.append("RAG")
        if "reorder" in run_id:
            flags.append("Reordered")
        if "lora" in run_id:
            flags.append("Finetuned using LoRA")
        rendered_desc = chevron.render(
            desc_data,
            {
                "task_id": task_id,
                "run_id": run_id,
                "system_id": system_id,
                "team_id": team_id,
                "flags": flags,
            },
        )
        desc_file = identifier_dir / f"{identifier}.meta"
        out_file = identifier_dir / f"{identifier}.json"
        with open(desc_file, "w") as f:
            f.write(rendered_desc)

        with open(result_path, "r") as rf:
            run_data = json.load(rf)
        # Keep only the current task's relations for every article.
        stratified_res = {k: {task_key: res[task_key]} for k, res in run_data.items()}
        with open(out_file, "w") as rf:
            json.dump(stratified_res, rf)

        # zip_path = staging_dir / f"{identifier}.zip"
        # zf = ZipFile(zip_path, "w")
        # zf.write(desc_file, f"{identifier}.md")
        # zf.write(out_file, f"{identifier}.json")

In [25]:
# Inspect the description rendered in the final iteration of the staging loop
# (i.e. the last task/run pair processed).
rendered_desc

'# GutBrain IE Challenge @ CLEF 2025: ElectraCLEANR\n\n* Team ID: ONTUG\n* TaskID: T623\n* RunID: intersection\n* Run Flags\n  - lora\n  - rag\n  - reorder\n* GitHub: https://github.com/Dakantz/CLEANR\n## Our appraoch\n* Use a RAG approach to prompt a LM to return the relations\n  - fetch similar articles from VectorDB to give good examples (if the run ID contains `rag`)\n  - reorder the RAG data to improve the handling of the model, i.e. put Gold annotations before Silver (if the run ID contains `reorder`)\n  - finetune the Hermes model on the train data combinations, with text+annotation pairs (if the run ID contains `lora`)\n* We also use different models:\n  - `NousResearch/Hermes-3-Llama-3.2-3B` + a finetuned LoRA-version\n  - `NousResearch/Hermes-3-Llama-3.1-8B`\n  - `gpt-4o-mini-2024-07-18`\n* Merged with the Graphwise team, strategy based on run ID (either intersection or union):\n  - Type of training applied. Finetuning `microsoft/BiomedNLP-BiomedELECTRA-base-uncased-abstract`