
# Triple Extraction Evaluation — Monument Ontology

This notebook evaluates **predicted triples** against **ground-truth triples** for the *Monument* ontology.
- **Triples-only** scoring (no mentions/spans).
- Exact match after light normalization (lowercase; all whitespace and underscores removed).
- Reports **per-example** metrics and **aggregate** (macro & micro) precision/recall/F1.
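- No relation filtering and no "selected IDs" logic: every example whose ID appears in both files is scored, and IDs found in only one file are reported separately.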


In [None]:

# Imports
import json
import re
from pathlib import Path
from typing import List, Dict, Tuple, Set
from collections import defaultdict, Counter


In [None]:

# File configuration — adjust as needed
# Input files. NOTE(review): ONTOLOGY_PATH is not read by any of the cells
# visible here — confirm whether it is still needed.
ONTOLOGY_PATH = Path('/mnt/data/12_monument_ontology.json')
# System predictions, one JSON record per line (read with read_jsonl).
SYSTEM_OUTPUT_PATH = Path('/mnt/data/ont_12_monument_output.jsonl')
# Ground-truth annotations, one JSON record per line (read with read_jsonl).
GROUND_TRUTH_PATH = Path('/mnt/data/ont_12_monument_ground_truth.jsonl')

# Output paths
# Per-example metrics are written as JSON Lines; aggregate metrics as one JSON file.
PER_EXAMPLE_OUT = Path('/mnt/data/eval_monument_per_example.jsonl')
AGGREGATE_OUT = Path('/mnt/data/eval_monument_aggregate.json')


In [None]:

# I/O helpers
def read_jsonl(path: Path) -> List[Dict]:
    """Load a JSON Lines file into a list of parsed records.

    Lines that are empty or contain only whitespace are skipped; every other
    line is parsed with ``json.loads`` in file order.
    """
    with path.open('r', encoding='utf-8') as handle:
        return [json.loads(raw) for raw in handle if raw.strip()]

def write_jsonl(items: List[Dict], path: Path) -> None:
    """Write *items* to *path* as JSON Lines (one JSON document per line).

    The file is opened in write mode with UTF-8 encoding, so existing content
    is replaced. Non-ASCII characters are written as-is rather than escaped.
    """
    with path.open('w', encoding='utf-8') as sink:
        sink.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in items
        )

def write_json(obj: Dict, path: Path) -> None:
    """Write *obj* to *path* as a single pretty-printed JSON document.

    The output is UTF-8 encoded, indented by two spaces, and keeps non-ASCII
    characters unescaped. Existing file content is replaced.
    """
    with path.open(mode='w', encoding='utf-8') as sink:
        json.dump(obj, fp=sink, indent=2, ensure_ascii=False)


In [None]:

# Normalization utilities
_WS_UNDERSCORE = re.compile(r'(_|\s+)')

def norm_value(text: str) -> str:
    """Return a comparison key for *text*.

    ``None`` maps to the empty string. Any other value is converted with
    ``str()``, stripped of every underscore and whitespace run, and lowercased.
    """
    if text is not None:
        compact = _WS_UNDERSCORE.sub('', str(text))
        return compact.lower()
    return ''

def norm_triple(sub: str, rel: str, obj: str) -> Tuple[str, str, str]:
    """Normalize subject, relation and object with ``norm_value``.

    Returns the three normalized strings as a ``(sub, rel, obj)`` tuple.
    """
    subject_key = norm_value(sub)
    relation_key = norm_value(rel)
    object_key = norm_value(obj)
    return subject_key, relation_key, object_key


In [None]:

# Metrics helpers
from dataclasses import dataclass

@dataclass
class PRF:
    """Precision/recall/F1 scores together with the raw counts behind them.

    Instances are built positionally by ``prf_from_sets``, so the field order
    below is part of the interface.
    """
    precision: float  # tp / (tp + fp), with edge-case defaults chosen by the caller
    recall: float     # tp / (tp + fn), with edge-case defaults chosen by the caller
    f1: float         # harmonic mean of precision and recall (0.0 when both are 0)
    tp: int           # triples present in both the gold and the predicted set
    fp: int           # predicted triples absent from the gold set
    fn: int           # gold triples absent from the predicted set

def prf_from_sets(gold: Set[Tuple[str,str,str]], pred: Set[Tuple[str,str,str]]) -> PRF:
    """Score a predicted triple set against a gold triple set.

    Edge cases:
    - No predictions: precision is 1.0 if the gold set is empty too, else 0.0.
    - Empty gold set: recall is 1.0.
    - Precision and recall both 0: F1 is 0.0.

    Returns a ``PRF`` holding the three scores and the tp/fp/fn counts.
    """
    hits = len(gold & pred)
    spurious = len(pred - gold)
    missed = len(gold - pred)

    predicted_total = hits + spurious
    if predicted_total > 0:
        precision = hits / predicted_total
    elif len(gold) == 0:
        # Nothing predicted and nothing expected counts as a perfect result.
        precision = 1.0
    else:
        precision = 0.0

    gold_total = hits + missed
    recall = hits / gold_total if gold_total > 0 else 1.0

    pr_sum = precision + recall
    f1 = 2 * precision * recall / pr_sum if pr_sum > 0 else 0.0

    return PRF(
        precision=precision,
        recall=recall,
        f1=f1,
        tp=hits,
        fp=spurious,
        fn=missed,
    )


In [None]:

# Extract triples from system & gold records

def extract_system_triples(rec: Dict) -> List[Tuple[str,str,str]]:
    """Return the raw ``(sub, rel, obj)`` triples found in a system record.

    Triples are read from ``rec['response']['json']['triples']``; each entry
    is expected to be a mapping whose ``'triple'`` value is a 3-item list or
    tuple. Every component is converted with ``str()``.

    Malformed input is skipped instead of raising: a missing or null
    ``response``/``json``/``triples`` yields an empty list, and individual
    entries are dropped when they are not mappings, when ``'triple'`` is not
    a 3-item sequence, or when any of subject, relation or object is null.
    """
    resp = rec.get('response')
    if not isinstance(resp, dict):
        # Covers both a missing key and an explicit JSON null.
        return []
    payload = resp.get('json')
    if not isinstance(payload, dict):
        return []
    entries = payload.get('triples')
    if not isinstance(entries, (list, tuple)):
        return []

    out = []
    for entry in entries:
        if not isinstance(entry, dict):
            continue
        triple = entry.get('triple')
        if not isinstance(triple, (list, tuple)) or len(triple) != 3:
            continue
        s, r, o = triple
        if s is None or r is None or o is None:
            continue
        out.append((str(s), str(r), str(o)))
    return out

def extract_gold_triples(rec: Dict) -> List[Tuple[str,str,str]]:
    """Return the raw ``(sub, rel, obj)`` triples found in a gold record.

    Triples are read from ``rec['triples']``; each entry is expected to be a
    mapping with ``'sub'``, ``'rel'`` and ``'obj'`` keys. Every component is
    converted with ``str()``.

    Malformed input is skipped instead of raising: a missing or null
    ``triples`` value yields an empty list, and entries that are not mappings
    or that have a null/missing subject, relation or object are dropped.
    """
    entries = rec.get('triples')
    if not isinstance(entries, (list, tuple)):
        # Covers both a missing key and an explicit JSON null.
        return []

    out = []
    for entry in entries:
        if not isinstance(entry, dict):
            continue
        s = entry.get('sub')
        r = entry.get('rel')
        o = entry.get('obj')
        if s is None or r is None or o is None:
            continue
        out.append((str(s), str(r), str(o)))
    return out


In [None]:

# Evaluation core
def _sorted_ids(ids) -> List:
    """Sort example ids, tolerating ids of mixed or unorderable types."""
    try:
        return sorted(ids)
    except TypeError:
        # Happens e.g. when a record lacks 'id' (None) alongside string ids,
        # or when int and str ids are mixed. Group by type name, then repr,
        # so the order is still deterministic.
        return sorted(ids, key=lambda v: (type(v).__name__, repr(v)))


def evaluate(system_recs: List[Dict], gold_recs: List[Dict]) -> Dict:
    """Score system triples against gold triples, example by example.

    Records are matched on their ``'id'`` value; only ids present in both
    inputs are scored. If the same id occurs more than once within one input,
    the last record with that id is used. Triples are normalized with
    ``norm_triple`` and de-duplicated (set semantics) before comparison.

    Returns a dict with two keys:
    - ``"per_items"``: one dict per scored id (in sorted id order) with
      ``pred_count``, ``gold_count``, ``precision``, ``recall``, ``f1``,
      ``tp``, ``fp`` and ``fn``.
    - ``"aggregate"``: ``macro`` averages over scored examples (0.0 when
      nothing was scored), ``micro`` scores from summed tp/fp/fn, ``counts``
      of evaluated and unmatched examples, and ``missing_ids`` listing the
      ids found in only one of the inputs.
    """
    sys_by_id = {r.get('id'): r for r in system_recs}
    gold_by_id = {r.get('id'): r for r in gold_recs}

    sys_ids = set(sys_by_id)
    gold_ids = set(gold_by_id)
    ids = _sorted_ids(sys_ids & gold_ids)
    missing_sys = _sorted_ids(gold_ids - sys_ids)
    missing_gold = _sorted_ids(sys_ids - gold_ids)

    per_items = []
    micro_tp = micro_fp = micro_fn = 0

    for sid in ids:
        sys_tr_raw = extract_system_triples(sys_by_id[sid])
        gt_tr_raw = extract_gold_triples(gold_by_id[sid])

        # Sets: duplicate triples within one example count once.
        sys_tr = {norm_triple(*t) for t in sys_tr_raw}
        gt_tr = {norm_triple(*t) for t in gt_tr_raw}

        prf = prf_from_sets(gt_tr, sys_tr)
        micro_tp += prf.tp
        micro_fp += prf.fp
        micro_fn += prf.fn

        per_items.append({
            "id": sid,
            "pred_count": len(sys_tr),
            "gold_count": len(gt_tr),
            "precision": prf.precision,
            "recall": prf.recall,
            "f1": prf.f1,
            "tp": prf.tp,
            "fp": prf.fp,
            "fn": prf.fn
        })

    n = len(per_items)
    macro = {
        "precision": sum(d["precision"] for d in per_items)/n if n else 0.0,
        "recall":    sum(d["recall"]    for d in per_items)/n if n else 0.0,
        "f1":        sum(d["f1"]        for d in per_items)/n if n else 0.0,
    }

    denom_p = micro_tp + micro_fp
    denom_r = micro_tp + micro_fn
    if denom_p > 0:
        micro_precision = micro_tp / denom_p
    else:
        # No predictions at all: perfect only if nothing was expected either.
        micro_precision = 1.0 if denom_r == 0 else 0.0
    micro_recall = micro_tp / denom_r if denom_r > 0 else 1.0
    pr_sum = micro_precision + micro_recall
    micro_f1 = (2*micro_precision*micro_recall/pr_sum) if pr_sum > 0 else 0.0

    aggregate = {
        "macro": macro,
        "micro": {
            "precision": micro_precision,
            "recall": micro_recall,
            "f1": micro_f1,
            "tp": micro_tp,
            "fp": micro_fp,
            "fn": micro_fn
        },
        "counts": {
            "evaluated_examples": n,
            "missing_in_system": len(missing_sys),
            "missing_in_gold": len(missing_gold)
        },
        "missing_ids": {
            "in_system_only": missing_gold,
            "in_gold_only": missing_sys
        }
    }
    return {"per_items": per_items, "aggregate": aggregate}


In [None]:

# Run evaluation
# Load both JSONL inputs fully into memory, then score them.
system_recs = read_jsonl(SYSTEM_OUTPUT_PATH)
gold_recs = read_jsonl(GROUND_TRUTH_PATH)

# `results` has two keys: "per_items" (list of per-example dicts) and
# "aggregate" (macro/micro scores, counts and unmatched ids).
results = evaluate(system_recs, gold_recs)

# Save outputs
# Per-example metrics go to a JSON Lines file, aggregate metrics to one JSON file;
# both files are overwritten if they already exist.
write_jsonl(results["per_items"], PER_EXAMPLE_OUT)
write_json(results["aggregate"], AGGREGATE_OUT)

print("Saved:")
print(" - Per-example:", PER_EXAMPLE_OUT)
print(" - Aggregate  :", AGGREGATE_OUT)
# Bare expression: shown as the cell output when run in a notebook.
results["aggregate"]


In [None]:

# Top-5 hardest examples by F1 (ascending)
# sorted() is stable, so examples with equal F1 keep their order from
# results["per_items"]; fewer than five items are returned if fewer were scored.
hard = sorted(results["per_items"], key=lambda d: d["f1"])[:5]
# Bare expression: shown as the cell output when run in a notebook.
hard
