In [103]:
# =========================================
# Cell 1 — Imports & setup
# =========================================
import os
import json
import re
from typing import List, Dict, Set, Tuple

# NLTK for tokenization/stemming (hallucination metrics)
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# word_tokenize needs the Punkt sentence-tokenizer data. Recent NLTK releases
# (3.8.2 and later) load it from the 'punkt_tab' resource, while older releases
# use the pickled 'punkt' models, so check for both and fetch whichever is missing.
for _punkt_resource in ("punkt", "punkt_tab"):
    try:
        nltk.data.find(f"tokenizers/{_punkt_resource}")
    except LookupError:
        nltk.download(_punkt_resource)


In [114]:
# =========================================
# Cell 2 — File configuration (EDIT THESE PATHS)
# =========================================
from pathlib import Path

# EXAMPLE (replace with your own). Edit these four values to point the notebook
# at a different data root, dataset, prompt run, or ontology; every path below
# is derived from them.
DATA_ROOT = Path('/upb/users/b/balram/profiles/unix/cs/promptKG/data')
DATASET = 'dbpedia'
PROMPT_NAME = 'prompt1'
ONTOLOGY_NAME = '13_food'

# Input paths
ONTOLOGY_PATH = DATA_ROOT / 'input' / DATASET / 'input_ontology' / f'{ONTOLOGY_NAME}_ontology.json'
SYSTEM_OUTPUT_PATH = DATA_ROOT / 'output' / DATASET / PROMPT_NAME / f'ont_{ONTOLOGY_NAME}_output.jsonl'
GROUND_TRUTH_PATH = DATA_ROOT / 'input' / DATASET / 'ground_truth' / f'ont_{ONTOLOGY_NAME}_ground_truth.jsonl'

# Output paths
EVALUATION_DIR = DATA_ROOT / 'output' / DATASET / PROMPT_NAME / 'evaluations'
PER_EXAMPLE_OUT = EVALUATION_DIR / 'per_example_eval' / f'eval_{ONTOLOGY_NAME}_per_example.jsonl'
AGGREGATE_OUT   = EVALUATION_DIR / 'aggregate_eval' / f'eval_{ONTOLOGY_NAME}_aggregate.json'


In [115]:
# =========================================
# Cell 3 — I/O helpers
# =========================================
def read_jsonl(jsonl_path: Path, is_json: bool = True) -> List:
    """
    Read a line-delimited file into a list, one entry per line.

    Args:
        jsonl_path: file to read; opened as UTF-8 text.
        is_json: when True (default) every non-blank line is parsed with
            ``json.loads``; when False every line is returned as a stripped
            string (blank lines become empty strings).

    Returns:
        The parsed objects (or stripped strings) in file order.

    Raises:
        json.JSONDecodeError: if ``is_json`` is True and a non-blank line is
            not valid JSON.
    """
    data = []
    with jsonl_path.open('r', encoding='utf-8') as in_file:
        for line in in_file:
            if not is_json:
                data.append(line.strip())
            elif line.strip():
                # Whitespace-only lines (e.g. a trailing empty line at EOF) are
                # not records; json.loads would raise on them.
                data.append(json.loads(line))
    return data

def save_jsonl(data: List, jsonl_path: Path) -> None:
    """
    Write every item of `data` to `jsonl_path` as one JSON document per line.

    Missing parent directories are created first. The file is opened in write
    mode, so existing content is replaced. Non-ASCII characters are written
    as-is (UTF-8) rather than escaped.
    """
    target_dir = jsonl_path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    with jsonl_path.open("w", encoding="utf-8") as sink:
        sink.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in data
        )

def append_jsonl(data: Dict, jsonl_path: Path) -> None:
    """
    Append `data` to `jsonl_path` as a single JSON line.

    Missing parent directories are created first. Existing file content is
    kept; the new line is added at the end. Non-ASCII characters are written
    as-is (UTF-8) rather than escaped.
    """
    jsonl_path.parent.mkdir(parents=True, exist_ok=True)
    with jsonl_path.open("a+", encoding="utf-8") as sink:
        serialized = json.dumps(data, ensure_ascii=False)
        sink.write(serialized + "\n")

def read_json(json_path: Path) -> Dict:
    """
    Load the single JSON document stored in `json_path` (read as UTF-8).

    Returns whatever the document decodes to; callers in this notebook use it
    for the ontology file.
    """
    raw_text = json_path.read_text(encoding='utf-8')
    return json.loads(raw_text)

def convert_to_dict(data: List[Dict], id_name: str = "id") -> Dict:
    """
    Index a list of records by the value stored under `id_name`.

    If several records share the same id, the last one in `data` wins.
    Raises KeyError if a record has no `id_name` key.
    """
    indexed = {}
    for record in data:
        indexed[record[id_name]] = record
    return indexed


In [116]:
# =========================================
# Cell 4 — Benchmark-normalization & metrics
# =========================================
def calculate_precision_recall_f1(gold: Set, pred: Set) -> Tuple[float, float, float]:
    """
    Compute precision, recall and F1 of a predicted set against a gold set.

    Benchmark behavior:
      P = |gold ∩ pred| / |pred|
      R = |gold ∩ pred| / |gold|
      F1 = 2PR/(P+R)

    Args:
        gold: reference items.
        pred: predicted items.

    Returns:
        (precision, recall, f1). All three are 0.0 when `pred` is empty.
        When `gold` is empty but `pred` is not, recall is reported as 0.0
        (it is mathematically undefined), which also makes F1 0.0.
    """
    if len(pred) == 0:
        return 0.0, 0.0, 0.0
    correct = len(gold.intersection(pred))
    p = correct / len(pred)
    # Guard the recall denominator: an empty gold set must not raise ZeroDivisionError.
    r = correct / len(gold) if len(gold) > 0 else 0.0
    if p + r > 0:
        f1 = 2 * ((p * r) / (p + r))
    else:
        f1 = 0.0
    return p, r, f1

def normalize_triple(sub_label: str, rel_label: str, obj_label: str) -> str:
    """
    Build the comparison key for a (subject, relation, object) triple.

    Benchmark behavior: each label has underscores and whitespace removed and
    is lowercased, then the three results are concatenated without a
    separator. Different triples can therefore map to the same key; this is
    kept for parity with the benchmark.
    """
    labels = (sub_label, rel_label, obj_label)
    return "".join(re.sub(r"(_|\s+)", '', label).lower() for label in labels)

def clean_entity_string(ps: PorterStemmer, entity: str) -> str:
    """
    Normalize an entity string for the hallucination substring checks
    (benchmark behavior).

    The entity is tokenized, each token is stemmed with `ps`, the stems are
    concatenated, underscores/whitespace are removed and the result is
    lowercased. Any occurrence of "01januari" is then dropped.
    """
    stems = []
    for token in word_tokenize(entity):
        stems.append(ps.stem(token))
    compact = re.sub(r"(_|\s+)", '', "".join(stems)).lower()
    # NOTE(review): "01januari" looks like a stemmed default-date artifact
    # inherited from the benchmark - confirm before changing.
    return compact.replace("01januari", "")


In [117]:
# =========================================
# Cell 5 — Hallucination & ontology conformance (benchmark)
# =========================================
def get_subject_object_hallucinations(ps: PorterStemmer,
                                      ontology: Dict,
                                      test_sentence: str,
                                      triples: List[List[str]]) -> Tuple[float, float]:
    """
    Fraction of triples whose subject / object cannot be found in the context.

    Benchmark behavior:
      - Context = sentence + the labels of all ontology concepts
      - Context and entities are stemmed, stripped of spaces/underscores and
        lowercased before comparison
      - A subject or object counts as hallucinated when its normalized form
        is not a substring of the normalized context

    Returns:
        (subject_hallucination_rate, object_hallucination_rate), each divided
        by the number of triples; (0.0, 0.0) when `triples` is empty.
    """
    if len(triples) == 0:
        return 0.0, 0.0

    concept_labels = [c["label"] for c in ontology.get('concepts', [])]
    context_text = (test_sentence or "") + " " + " ".join(concept_labels)
    context_stems = [ps.stem(word) for word in word_tokenize(context_text)]
    context_key = re.sub(r"(_|\s+)", '', "".join(context_stems)).lower()

    missing_subjects = 0
    missing_objects = 0
    for triple in triples:
        subject_key = clean_entity_string(ps, str(triple[0]))
        object_key = clean_entity_string(ps, str(triple[2]))
        # An entity that normalizes to "" is a substring of any context and is
        # therefore never counted as hallucinated.
        if subject_key not in context_key:
            missing_subjects += 1
        if object_key not in context_key:
            missing_objects += 1

    triple_count = len(triples)
    return missing_subjects / triple_count, missing_objects / triple_count

def get_ontology_conformance(ontology: Dict, triples: List[List[str]]) -> Tuple[float, float]:
    """
    Measure how many predicted relations are defined in the ontology.

    Benchmark behavior:
      - Ontology relation labels are compared with spaces replaced by "_"
      - Conformance = (# triples whose relation is in the ontology) / (# triples)
      - Relation hallucination = 1 - conformance

    Returns:
        (conformance, relation_hallucination); (1.0, 0.0) when `triples` is empty.
    """
    if len(triples) == 0:
        return 1.0, 0.0

    allowed_relations = []
    for relation in ontology.get('relations', []):
        allowed_relations.append(relation['label'].replace(" ", "_"))

    conformant_count = sum(1 for triple in triples if triple[1] in allowed_relations)
    conformance = conformant_count / len(triples)
    return conformance, 1.0 - conformance


In [118]:
# =========================================
# Cell 6 — Core evaluation for a single ontology (benchmark logic)
# =========================================
def _extract_system_triples(sys_rec: Dict):
    """
    Return the predicted triples of one system-output record, or None.

    Looks for a top-level 'triples' value first (benchmark format). If that is
    absent or null, falls back to the generation format
    response.json.triples[].triple, keeping only 3-element lists/tuples and
    converting their items to strings. Returns None when neither format is
    present.
    """
    system_triples = sys_rec.get('triples')
    if system_triples is not None:
        return system_triples

    # 'response' may be missing, null, or a raw string; only a dict can hold 'json'.
    response = sys_rec.get('response')
    payload = response.get('json') if isinstance(response, dict) else None
    if not isinstance(payload, dict):
        return None

    extracted = []
    # `or []` covers an explicit null under 'triples'.
    for tr in payload.get('triples') or []:
        tpl = tr.get('triple') if isinstance(tr, dict) else None
        if isinstance(tpl, (list, tuple)) and len(tpl) == 3:
            extracted.append([str(tpl[0]), str(tpl[1]), str(tpl[2])])
    return extracted


def evaluate_single_ontology(ONTOLOGY_PATH: Path,
                             SYSTEM_OUTPUT_PATH: Path,
                             GROUND_TRUTH_PATH: Path,
                             PER_EXAMPLE_OUT: Path,
                             AGGREGATE_OUT: Path) -> None:
    """
    Evaluate one ontology's system output against its ground truth.

    Same as benchmark, but single-ontology and file-path driven.
    Keeps:
      - relation filtering to GT relations before P/R/F1
      - hallucination metrics
      - ontology conformance
      - outputs as strings with 2 decimals

    Args:
        ONTOLOGY_PATH: ontology JSON file ('concepts' / 'relations' lists).
        SYSTEM_OUTPUT_PATH: JSONL file with system records keyed by 'id'.
        GROUND_TRUTH_PATH: JSONL file with ground-truth records keyed by 'id',
            each holding 'triples' (dicts with 'sub', 'rel', 'obj') and
            optionally 'sent'.
        PER_EXAMPLE_OUT: JSONL file that receives one metrics record per
            scored ground-truth id.
        AGGREGATE_OUT: JSON file that receives the averaged metrics.

    Averages are taken over ALL ground-truth ids: ids with no system record
    (or no triples in it) are not written to the per-example file and add
    zero to every sum, but still count in the denominator. With an empty
    ground-truth file every average is reported as "0.00".
    """
    # Load
    ontology = read_json(ONTOLOGY_PATH)
    system_output = convert_to_dict(read_jsonl(SYSTEM_OUTPUT_PATH))
    ground_truth  = convert_to_dict(read_jsonl(GROUND_TRUTH_PATH))
    ps = PorterStemmer()

    eval_metrics_list: List[Dict] = []
    t_p = t_r = t_f1 = 0.0
    t_onto_conf = t_rel_halluc = t_sub_halluc = t_obj_halluc = 0.0

    # Iterate GT ids
    for sent_id, gt_rec in ground_truth.items():
        gt_triples = [[tr['sub'], tr['rel'], tr['obj']] for tr in gt_rec['triples']]
        sentence   = gt_rec.get("sent", "")

        if sent_id not in system_output:
            continue

        system_triples = _extract_system_triples(system_output[sent_id])
        if system_triples is None:
            # nothing to score for this id
            continue

        # GT relations (spaces -> underscore) for comparability with system triples
        gt_relations = {tr[1].replace(" ", "_") for tr in gt_triples}

        # FILTER predicted triples to only GT relations (benchmark behavior)
        filtered_system_triples = [tr for tr in system_triples if tr[1] in gt_relations]

        # Normalize for P/R/F1
        normalized_system_triples = {normalize_triple(tr[0], tr[1], tr[2]) for tr in filtered_system_triples}
        normalized_gt_triples     = {normalize_triple(tr[0], tr[1], tr[2]) for tr in gt_triples}

        # P/R/F1
        precision, recall, f1 = calculate_precision_recall_f1(normalized_gt_triples, normalized_system_triples)

        # Ontology conformance & relation hallucination on ALL predicted triples
        ont_conformance, rel_hallucination = get_ontology_conformance(ontology, system_triples)

        # Subject/Object hallucinations on ALL predicted triples
        subj_hallucination, obj_hallucination = get_subject_object_hallucinations(ps, ontology, sentence, system_triples)

        # Diagnostic: show items with imperfect F1 although no subject/object
        # hallucination was flagged.
        if  f1 < 1 and len(filtered_system_triples) > 0 and subj_hallucination == 0 and obj_hallucination == 0:
            print(f"sent: {sentence}\nf1: {f1}\nsys:{filtered_system_triples}\nground:{gt_triples}\n")

        # Per-item record (strings with 2 decimals, as in benchmark)
        eval_metrics = {
            "id": sent_id,
            "precision": f"{precision:.2f}",
            "recall":    f"{recall:.2f}",
            "f1":        f"{f1:.2f}",
            "onto_conf": f"{ont_conformance:.2f}",
            "rel_halluc":f"{rel_hallucination:.2f}",
            "sub_halluc":f"{subj_hallucination:.2f}",
            "obj_halluc":f"{obj_hallucination:.2f}",
            "llm_triples": system_triples,
            "filtered_llm_triples": filtered_system_triples,
            "gt_triples": gt_triples,
            "sent": sentence
        }
        eval_metrics_list.append(eval_metrics)

        # Aggregate sums (for averages later)
        t_p += precision
        t_r += recall
        t_f1 += f1
        t_onto_conf += ont_conformance
        t_rel_halluc += rel_hallucination
        t_sub_halluc += subj_hallucination
        t_obj_halluc += obj_hallucination

    # Save per-example metrics
    save_jsonl(eval_metrics_list, PER_EXAMPLE_OUT)

    # Averages over all GT test cases (benchmark macro averaging over items)
    total_test_cases = len(ground_truth)

    def _average(total: float) -> str:
        # An empty ground-truth file must not raise ZeroDivisionError.
        mean = total / total_test_cases if total_test_cases else 0.0
        return f"{mean:.2f}"

    average_metrics = {
        "onto": ONTOLOGY_PATH.stem,
        "type": "all_test_cases",
        "avg_precision": _average(t_p),
        "avg_recall":    _average(t_r),
        "avg_f1":        _average(t_f1),
        "avg_onto_conf": _average(t_onto_conf),
        "avg_sub_halluc":_average(t_sub_halluc),
        "avg_rel_halluc":_average(t_rel_halluc),
        "avg_obj_halluc":_average(t_obj_halluc),
    }

    # Save aggregate (single JSON object)
    AGGREGATE_OUT.parent.mkdir(parents=True, exist_ok=True)
    with AGGREGATE_OUT.open('w', encoding='utf-8') as f:
        json.dump(average_metrics, f, ensure_ascii=False, indent=2)

    print("Saved:")
    print(" - Per-example:", PER_EXAMPLE_OUT)
    print(" - Aggregate  :", AGGREGATE_OUT)


In [119]:
# =========================================
# Cell 7 — Run it
# =========================================
# Arguments are passed positionally, in the order of the function signature:
# ontology, system output, ground truth, per-example output, aggregate output.
evaluate_single_ontology(
    ONTOLOGY_PATH,
    SYSTEM_OUTPUT_PATH,
    GROUND_TRUTH_PATH,
    PER_EXAMPLE_OUT,
    AGGREGATE_OUT,
)


sent: Tomatoes, guanciale, cheese, olive oil are the main ingredients of the traditional Italian Amatriciana sauce.
f1: 0.0
sys:[['Amatriciana', 'mainIngredient', 'Tomatoes'], ['Amatriciana', 'mainIngredient', 'guanciale'], ['Amatriciana', 'mainIngredient', 'cheese'], ['Amatriciana', 'mainIngredient', 'olive oil']]
ground:[['Amatriciana_sauce', 'country', 'Italy'], ['Amatriciana_sauce', 'ingredient', 'Guanciale'], ['Amatriciana_sauce', 'mainIngredient', '"Tomatoes, guanciale, cheese, olive oil"']]

sent: Tomatoes, guanciale, cheese, olive oil are the main ingredients of Amatriciana sauce, which can be found in Italy.
f1: 0.25
sys:[['Amatriciana sauce', 'mainIngredient', 'Tomatoes'], ['Amatriciana sauce', 'mainIngredient', 'guanciale'], ['Amatriciana sauce', 'mainIngredient', 'cheese'], ['Amatriciana sauce', 'mainIngredient', 'olive oil'], ['Amatriciana sauce', 'country', 'Italy']]
ground:[['Amatriciana_sauce', 'country', 'Italy'], ['Amatriciana_sauce', 'ingredient', 'Olive_oil'], ['Ama