In [None]:
import json
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support

# Read the NER predictions table and replace missing entity cells with ""
# so that the "people" / "locations" columns never contain NaN.
df_preds = pd.read_csv(r"C:\Users\bauke\flair_ner_results.csv")
for entity_column in ("people", "locations"):
    df_preds[entity_column] = df_preds[entity_column].fillna("")

# Read the annotated tasks (parsed JSON) that serve as the gold standard.
gold_json_path = r"C:\Users\bauke\OneDrive - KU Leuven\Documents\Documenten\5 digital humanities\stage\Marcel150_clean.json"
with open(gold_json_path, encoding="utf-8") as gold_file:
    annotated = json.load(gold_file)

# Map each annotated filename to its gold entity texts, grouped by label:
#   gold_entities[filename] == {"PER": {...}, "LOC": {...}}
# Only the first annotation of each task is read; spans with any label other
# than PER / LOC are ignored. A filename that occurs twice keeps the sets of
# its last occurrence.
gold_entities = {}
for task in annotated:
    task_filename = task["data"]["filename"]
    spans_by_label = {"PER": set(), "LOC": set()}
    for span in task["annotations"][0]["result"]:
        span_label = span["value"]["labels"][0]
        # Surrounding whitespace is stripped before the text is stored.
        span_text = span["value"]["text"].strip()
        if span_label in spans_by_label:
            spans_by_label[span_label].add(span_text)
    gold_entities[task_filename] = spans_by_label

# Build two parallel label lists for entity-level scoring:
#   * a predicted entity adds (pred=LABEL, true=LABEL) when its text is in the
#     gold set of the same file and label, otherwise (pred=LABEL, true="O");
#   * a gold entity that was not predicted adds (pred="O", true=LABEL).
y_true = []
y_pred = []

# Merge the predictions per filename first. Gold entities are keyed by
# filename, so several rows for one file must be scored once against that
# file's gold sets; scoring every row separately would count the same hits
# and misses repeatedly.
predictions = {}
for _, row in df_preds.iterrows():
    merged = predictions.setdefault(row["filename"], {"PER": set(), "LOC": set()})
    # Entity cells are comma-separated strings; empty fragments are dropped.
    merged["PER"].update(x.strip() for x in str(row["people"]).split(",") if x.strip())
    merged["LOC"].update(x.strip() for x in str(row["locations"]).split(",") if x.strip())

# Files present on only one side cannot be scored meaningfully. Treating a
# file without gold annotations as "gold is empty" would turn every one of
# its predictions into a false positive, so such files are skipped and
# reported instead.
unannotated = [name for name in predictions if name not in gold_entities]
unpredicted = [name for name in gold_entities if name not in predictions]

if predictions and len(unannotated) == len(predictions):
    # Nothing overlaps at all: the two sources use different naming schemes,
    # and any metric computed from them would be meaningless.
    raise ValueError(
        "No prediction filename matches a gold filename. "
        f"Prediction filenames look like {unannotated[:3]}; "
        f"gold filenames look like {list(gold_entities)[:3]}. "
        "Align the filenames before evaluating."
    )
if unannotated:
    print(f"Skipping {len(unannotated)} predicted file(s) without gold annotations, e.g. {unannotated[:3]}")
if unpredicted:
    # These files contribute nothing to the score (same as before); the
    # message only makes the gap visible.
    print(f"{len(unpredicted)} gold file(s) have no prediction row and are not scored, e.g. {unpredicted[:3]}")

for filename, predicted in predictions.items():
    gold = gold_entities.get(filename)
    if gold is None:
        continue  # no gold annotations for this file (reported above)
    for entity_label in ("PER", "LOC"):
        pred_set = predicted[entity_label]
        gold_set = gold[entity_label]
        # Predicted entities: true positive if the text is in gold, else false positive.
        for ent in pred_set:
            y_pred.append(entity_label)
            y_true.append(entity_label if ent in gold_set else "O")
        # Gold entities that were never predicted: false negatives.
        for ent in gold_set:
            if ent not in pred_set:
                y_pred.append("O")
                y_true.append(entity_label)

# Score PER and LOC only; the "O" placeholder is excluded through `labels`.
# zero_division=0 makes an undefined ratio count as 0.
labels = ["PER", "LOC"]
precision, recall, f1, support = precision_recall_fscore_support(
    y_true,
    y_pred,
    labels=labels,
    zero_division=0,
)

# Print one four-line summary per label, in the order given by `labels`.
print("\nEntity-Level Evaluation Report:")
for label, label_precision, label_recall, label_f1, label_support in zip(
    labels, precision, recall, f1, support
):
    print(f"{label}:")
    print(f"  Precision: {label_precision:.4f}")
    print(f"  Recall:    {label_recall:.4f}")
    print(f"  F1 Score:  {label_f1:.4f}")
    print(f"  Support:   {label_support}")


In [None]:
# Inspect the first five filenames in the predictions table.
first_filenames = df_preds["filename"].head()
print(first_filenames)

0    BE-KBR00_12126493_19340520_00_00_00_0_01_0001_...
1    BE-KBR00_12126493_19340520_00_00_00_0_01_0001_...
2    BE-KBR00_12126493_19340708_00_00_00_0_01_0024_...
3    BE-KBR00_12126493_19340909_00_00_00_0_01_0002_...
4    BE-KBR00_12126493_19341223_00_00_00_0_01_0001_...
Name: filename, dtype: object
['file838.txt', 'file839.txt', 'file840.txt', 'file841.txt', 'file842.txt']
