In [1]:
%load_ext autoreload
%autoreload 2

In [3]:
from constrerl.annotator import (
    Annotator,
    load_train,
    Article
)
import json

In [4]:
import pathlib as pl

In [5]:
# Directories holding the annotated training and development collections.
train_data_dir = pl.Path("data/annotations/train")
test_data_dir = pl.Path("data/annotations/dev")
# glob() makes no promise about ordering; sorting keeps the file order -- and
# with it the merge order of train_data and of everything exported from it --
# stable across runs and file systems.
train_data_files = sorted(train_data_dir.glob("*.json"))
test_data_files = sorted(test_data_dir.glob("*.json"))

# Merge every training file into one mapping of articles. If two files share
# a key, the file that comes later in sorted order wins.
train_data: dict[str, Article] = {}
for file in train_data_files:
    train_data.update(load_train(file))

In [6]:
# Number of entries in train_data after merging all training files.
len(train_data)

1567

In [7]:
# Annotator instance; export_train_date reads its example_messages when it
# writes the JSONL file.
annotator = Annotator()
# Every article whose chat could not be built, paired with the exception that
# was raised. Filled by export_train_date and inspected by later cells.
failed_articles: list[tuple[Article, Exception]] = []

def export_train_date(train_data: dict[str, Article], path: pl.Path):
    """Build a chat for every article and write the results to disk.

    Two files are produced:

    * ``path`` -- a single JSON array; each element holds the dumped article
      under ``"article"`` and its chat under ``"messages"``.
    * ``path`` with its suffix replaced by ``.jsonl`` -- one JSON object per
      line containing only ``"messages"``, with ``annotator.example_messages``
      prepended to the chat.

    Articles for which the chat (or the article dump) raises are skipped: the
    exception is printed and the ``(article, exception)`` pair is appended to
    the module-level ``failed_articles`` list.

    Args:
        train_data: Articles to export; only the values are used.
        path: Destination of the JSON file; the JSONL path is derived from it.
    """
    records = []
    for doc in train_data.values():
        try:
            messages = Annotator.prompt_and_respone(doc)
            records.append({"article": doc.model_dump(), "messages": messages})
        except Exception as exc:
            # Best effort: report the failure, remember it, keep going.
            print(exc)
            failed_articles.append((doc, exc))

    # Full dump: article plus chat.
    with open(path, "w") as json_file:
        json.dump(records, json_file)

    # Line-delimited dump: chat only. The original comment describes this as
    # "with system prompt", i.e. the annotator's example messages come first.
    jsonl_path = path.with_suffix(".jsonl")
    with open(jsonl_path, "w") as jsonl_file:
        for record in records:
            record.pop("article")
            record["messages"] = annotator.example_messages + record["messages"]
            jsonl_file.write(json.dumps(record) + "\n")


# Export every collection file on its own, then the merged training set.
# NOTE: the training files are processed twice (once per file here, once merged
# below), so a failing training article is appended to failed_articles once
# per pass.
for train_data_file in train_data_files + test_data_files:
    # "<prefix>_<collection>.<ext>" -> "<collection>"
    base_name = train_data_file.name.split(".")[0]
    collection_name = base_name.rsplit("_", 1)[-1]
    articles_in_file = load_train(train_data_file)
    # Drop the last three path components to obtain the output location.
    output_dir_parts = train_data_file.parts[:-3]
    output_path = pl.Path(
        *output_dir_parts, f"train_data_chats_{collection_name}.json"
    )
    export_train_date(articles_in_file, output_path)

export_train_date(train_data, pl.Path("data/annotations/train_data_chats.json"))

In [None]:
# Group the recorded failures by the rejected subject->predicate->object triple
# and remember the titles of the articles in which each triple occurred.
# Keys have the form "subject->predicate->object"; a part that no error
# reported is rendered as "?" so every key keeps exactly three parts.
SPO_FIELDS = ("subject_label", "predicate", "object_label")

missing_relations_all: dict[str, list[str]] = {}
for article, err in failed_articles:
    # failed_articles stores any Exception; only errors that expose a
    # per-field breakdown through .errors() can be analysed here.
    if not callable(getattr(err, "errors", None)):
        continue

    # relation position inside the article -> the offending field values
    missing_relations: dict[object, dict[str, object]] = {}
    for e in err.errors():
        loc = e["loc"]
        # Need at least (container, index, ...) and a trailing field name that
        # belongs to the triple; anything else cannot be mapped to a relation.
        if len(loc) < 2 or loc[-1] not in SPO_FIELDS:
            continue
        index = loc[1]
        spo_id = loc[-1]
        fields = missing_relations.setdefault(index, dict.fromkeys(SPO_FIELDS))
        fields[spo_id] = e["input"]

    for index, missing_relation in missing_relations.items():
        # Inputs are not guaranteed to be strings and may be absent (None).
        spo = "->".join(
            "?" if missing_relation[field] is None else str(missing_relation[field])
            for field in SPO_FIELDS
        )
        missing_relations_all.setdefault(spo, []).append(article.metadata.title)

In [None]:

# Turn every "subject->predicate->object" key into a relation description with
# single-element heads / tails / predicate lists and print the result as JSON.
relations_missing = [
    {
        "heads": [head],
        "tails": [tail],
        "predicate": [predicate],
    }
    for head, predicate, tail in (
        relation_key.split("->") for relation_key in missing_relations_all
    )
]
print(json.dumps(relations_missing, indent=2))

[]


In [12]:
# Write a plain-text report: one paragraph per missing relation, followed by
# the tab-indented titles of the articles it was found in.
# Titles are free text, so the encoding is fixed to UTF-8 instead of relying
# on the platform default (which can fail on non-ASCII characters).
with open("missing_relations.txt", "w", encoding="utf-8") as f:
    for spo, titles in missing_relations_all.items():
        f.write(f"{spo}:\n")
        f.writelines(f"\t{title}\n" for title in titles)
        f.write("\n")

In [13]:
# Dump every failed article (as indented JSON) together with the exception it
# raised. The article dump and the error text may contain non-ASCII
# characters, so the encoding is fixed to UTF-8 instead of the platform default.
with open("err.log", "w", encoding="utf-8") as f:
    for article, e in failed_articles:
        f.write(f"{article.model_dump_json(indent=2)}\n {e}\n")