In [63]:
import json
import re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk
from typing import Dict, List

# Fetch the NLTK resources this module relies on; runs on every import/cell
# execution (NLTK reports "already up-to-date" when the package is present).
# NOTE(review): presumably 'punkt' backs word_tokenize and 'wordnet' backs
# WordNetLemmatizer; newer NLTK releases may additionally require 'punkt_tab'
# for word_tokenize — confirm against the installed version.
nltk.download('punkt')
nltk.download('wordnet')

# === Utility Normalization Methods === #

def lemmatize_and_normalize(text, lemmatizer):
    """Collapse *text* into a single lowercase, lemmatized key without spaces.

    Steps, in order: lowercase, turn underscores into spaces, drop every
    character that is not ``a-z``, ``0-9`` or a space, trim the ends,
    tokenize with ``word_tokenize``, lemmatize each token and concatenate
    the lemmas with no separator.

    Args:
        text: The raw label or sentence to normalize.
        lemmatizer: Object exposing ``lemmatize(token)``.

    Returns:
        The concatenated lemmas as one string (empty if nothing survives
        the cleaning step).
    """
    # Underscores must become spaces *before* the character filter,
    # otherwise they would be deleted and adjacent words would fuse early.
    cleaned = re.sub(r"[^a-z0-9 ]", "", text.lower().replace("_", " ")).strip()
    lemmas = [lemmatizer.lemmatize(piece) for piece in word_tokenize(cleaned)]
    return "".join(lemmas)

def normalize_triple(sub_label, rel_label, obj_label, lemmatizer, stem_rel=False):
    """Build one comparable string key from a (subject, relation, object) triple.

    Subject and object go through ``lemmatize_and_normalize``. The relation
    is lowercased, stripped of everything but ``a-z``, ``0-9`` and spaces,
    and only lemmatized word-by-word when ``stem_rel`` is true; finally all
    whitespace is removed from it.

    Args:
        sub_label: Subject label.
        rel_label: Relation label.
        obj_label: Object label.
        lemmatizer: Object exposing ``lemmatize(token)``.
        stem_rel: When true, lemmatize each word of the relation as well.

    Returns:
        The concatenation ``subject + relation + object`` of the normalized
        parts, with no separators.
    """
    subject_key = lemmatize_and_normalize(sub_label, lemmatizer)
    object_key = lemmatize_and_normalize(obj_label, lemmatizer)

    relation_key = re.sub(r"[^a-z0-9 ]", "", rel_label.lower().replace("_", " ")).strip()
    if stem_rel:
        words = word_tokenize(relation_key)
        relation_key = " ".join(lemmatizer.lemmatize(word) for word in words)
    # Drop the remaining whitespace so the relation matches the space-free
    # form used for subject and object.
    relation_key = re.sub(r"\s+", "", relation_key)

    return subject_key + relation_key + object_key

# === Core Evaluation Metrics === #

def calculate_precision_recall_f1(gold_set, pred_set):
    """Compute set-based precision, recall and F1.

    Args:
        gold_set: Set of reference items.
        pred_set: Set of predicted items.

    Returns:
        A ``(precision, recall, f1)`` tuple of floats. All three are ``0.0``
        when either set is empty (nothing can be matched, and the
        corresponding ratio would otherwise divide by zero) or when the
        sets do not overlap.
    """
    # Guard both denominators: an empty prediction set makes precision
    # undefined, an empty gold set makes recall undefined.
    if not pred_set or not gold_set:
        return 0.0, 0.0, 0.0

    overlap = len(gold_set.intersection(pred_set))
    p = overlap / len(pred_set)
    r = overlap / len(gold_set)
    f1 = 2 * p * r / (p + r) if (p + r) > 0 else 0.0
    return p, r, f1

def get_subject_object_hallucinations(lemmatizer, ontology, sentence, triples):
    """Measure how often triple subjects/objects are absent from the source text.

    The reference text is the sentence followed by every ontology concept
    label, normalized with ``lemmatize_and_normalize``. A subject or object
    counts as hallucinated when its normalized form is not a substring of
    that normalized reference text.

    Args:
        lemmatizer: Object exposing ``lemmatize(token)``.
        ontology: Mapping with a ``'concepts'`` list whose items carry a
            ``"label"``.
        sentence: The source sentence.
        triples: Iterable of ``(subject, relation, object)`` triples.

    Returns:
        ``(subject_rate, object_rate)``: the hallucinated fraction of
        subjects and of objects; ``(0, 0)`` when there are no triples.
    """
    if not triples:
        return 0, 0

    concept_labels = [concept["label"] for concept in ontology['concepts']]
    reference_text = lemmatize_and_normalize(
        sentence + " " + " ".join(concept_labels), lemmatizer
    )

    missing_subjects = 0
    missing_objects = 0
    for sub, _rel, obj in triples:
        subject_key = lemmatize_and_normalize(sub, lemmatizer)
        object_key = lemmatize_and_normalize(obj, lemmatizer)
        # Substring test on the space-free normalized strings.
        missing_subjects += subject_key not in reference_text
        missing_objects += object_key not in reference_text

    total = len(triples)
    return missing_subjects / total, missing_objects / total

def get_ontology_conformance(ontology, triples):
    """Return the share of triples whose relation appears in the ontology.

    Relations are compared after ``lemmatize_and_normalize`` on both the
    ontology relation labels and the triple's relation (index 1).

    Args:
        ontology: Mapping with a ``'relations'`` list whose items carry a
            ``'label'``.
        triples: Sequence of triples; element 1 is the relation.

    Returns:
        ``(conformance, 1 - conformance)``; ``(1, 0)`` when there are no
        triples.
    """
    if not triples:
        return 1, 0

    lemmatizer = WordNetLemmatizer()
    known_relations = set()
    for relation in ontology['relations']:
        known_relations.add(lemmatize_and_normalize(relation['label'], lemmatizer))

    conformant = [
        triple for triple in triples
        if lemmatize_and_normalize(triple[1], lemmatizer) in known_relations
    ]
    conformance = len(conformant) / len(triples)
    return conformance, 1 - conformance

# === Main Evaluation Pipeline === #

def evaluate_and_save_results(ground_truth_data, ontology, model_data, output_file):
    """Score model triples against ground truth and write one JSON line per entry.

    For every ground-truth entry that has triples, the model triples are
    first restricted to relations that occur in the ground truth, then
    precision/recall/F1 are computed on normalized triple keys. Ontology
    conformance and subject/object hallucination rates are computed on the
    unfiltered model triples. Metrics are stored as strings with two
    decimals.

    Args:
        ground_truth_data: Sequence of dicts with ``'id'``, ``'sent'`` and
            ``'triples'`` (items with ``'sub'``, ``'rel'``, ``'obj'``).
        ontology: Ontology mapping passed to the conformance and
            hallucination helpers.
        model_data: Sequence of dicts with ``'triples'`` in the same item
            format.
        output_file: Path of the JSONL file to (over)write.

    NOTE(review): entries are paired purely by position via ``zip``; ids are
    not compared and surplus entries on either side are ignored —
    presumably both inputs list the same sentences in the same order;
    verify against the callers.
    NOTE(review): the relation filter lemmatizes relations, while
    ``normalize_triple`` is called with its default ``stem_rel=False`` —
    confirm this asymmetry is intended.
    """
    lemmatizer = WordNetLemmatizer()

    def as_lists(raw_triples):
        # Convert triple dicts into [subject, relation, object] lists.
        return [[item['sub'], item['rel'], item['obj']] for item in raw_triples]

    def relation_key(triple):
        return lemmatize_and_normalize(triple[1], lemmatizer)

    def triple_keys(triple_lists):
        return {normalize_triple(s, r, o, lemmatizer) for s, r, o in triple_lists}

    records = []
    for gold, predicted in zip(ground_truth_data, model_data):
        if not gold.get('triples'):
            continue

        gold_triples = as_lists(gold['triples'])
        predicted_triples = as_lists(predicted['triples'])

        # Keep only predictions whose (normalized) relation occurs in the gold triples.
        gold_relation_keys = {relation_key(triple) for triple in gold_triples}
        kept_triples = [
            triple for triple in predicted_triples
            if relation_key(triple) in gold_relation_keys
        ]

        precision, recall, f1 = calculate_precision_recall_f1(
            triple_keys(gold_triples), triple_keys(kept_triples)
        )
        # Conformance and hallucination use the unfiltered predictions.
        ont_conformance, rel_hallucination = get_ontology_conformance(
            ontology, predicted_triples
        )
        subj_hallucination, obj_hallucination = get_subject_object_hallucinations(
            lemmatizer, ontology, gold['sent'], predicted_triples
        )

        records.append({
            "id": gold['id'],
            "precision": f"{precision:.2f}",
            "recall": f"{recall:.2f}",
            "f1": f"{f1:.2f}",
            "onto_conf": f"{ont_conformance:.2f}",
            "rel_halluc": f"{rel_hallucination:.2f}",
            "sub_halluc": f"{subj_hallucination:.2f}",
            "obj_halluc": f"{obj_hallucination:.2f}",
            "llm_triples": predicted_triples,
            "filtered_llm_triples": kept_triples,
            "gt_triples": gold_triples,
            "sent": gold['sent'],
        })

    # The file is only opened once every entry has been scored, so a failure
    # during scoring leaves any existing output untouched.
    with open(output_file, "w") as handle:
        for record in records:
            handle.write(json.dumps(record) + "\n")

# === File Reading Utilities === #

def read_jsonl(file_path, required_keys=None):
    """Load a JSON Lines file into a list of parsed objects.

    Blank or whitespace-only lines (for example a trailing empty line) are
    skipped instead of being handed to the JSON parser. The file is read as
    UTF-8 regardless of the platform's default encoding.

    Args:
        file_path: Path to the ``.jsonl`` file.
        required_keys: Optional iterable of keys every entry must contain.

    Returns:
        A list with one parsed entry per non-blank line, in file order.

    Raises:
        ValueError: If ``required_keys`` is given and an entry lacks one of
            them (``json.JSONDecodeError``, raised for malformed lines, is
            a ``ValueError`` subclass as well).
    """
    with open(file_path, "r", encoding="utf-8") as file:
        data = [json.loads(line) for line in file if line.strip()]

    if required_keys:
        for entry in data:
            if not all(key in entry for key in required_keys):
                raise ValueError(f"Missing keys in entry: {entry}")
    return data

def read_ontology_json(json_path):
    """Load and return the parsed content of the JSON file at *json_path*.

    The file is decoded as UTF-8 explicitly, so the result does not depend
    on the platform's default text encoding.

    Args:
        json_path: Path to the JSON file.

    Returns:
        The parsed JSON content.
    """
    with open(json_path, "r", encoding="utf-8") as file:
        return json.load(file)

[nltk_data] Downloading package punkt to
[nltk_data]     /upb/users/b/balram/profiles/unix/cs/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /upb/users/b/balram/profiles/unix/cs/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# NOTE(review): run_evaluations_for_all_categories is not defined in the code
# visible here — presumably it comes from another cell or module; confirm it
# exists before running this cell, otherwise the call raises NameError.
if __name__ == "__main__":
    run_evaluations_for_all_categories()


In [None]:
# Wikidata: evaluation runner for the ten Wikidata ontology categories

In [65]:
def run_evaluations_for_all_categories_wikidata():
    """Evaluate the model responses for each Wikidata category in turn.

    Categories are numbered from 1 in the order listed; the number and name
    are substituted into the ground-truth, ontology, model-response and
    output paths. Any exception raised while loading or evaluating one
    category is reported on stdout and the loop moves on to the next
    category.
    """
    category_names = (
        "movie", "music", "sport", "book", "military",
        "computer", "space", "politics", "nature", "culture",
    )

    for i, category in enumerate(category_names, 1):
        print(f"\n=== Running Evaluation for Category: {i} - {category} ===")

        gt_path = f"../data/wikidata/ground_truth/ont_{i}_{category}_ground_truth.jsonl"
        ontology_path = f"/upb/users/b/balram/profiles/unix/cs/Text2KG/withont/data/wikidata/ontology/{i}_{category}_ontology.json"
        response_path = f"/upb/users/b/balram/profiles/unix/cs/Text2KG/withont/data/wikidata/response_run2/Llama3/cot_response_without_quant_batch/ont_{i}_{category}_llm_response_improved.jsonl"
        stats_path = f"../data/wikidata/improvised_evaluation_statistics/Llama/without_missing_GT/run/improved_evaluation/ont_{i}_{category}_llm_stats_improved.jsonl"

        try:
            # Arguments are evaluated left to right: ground truth, ontology,
            # then model responses (which must carry 'id' and 'triples').
            evaluate_and_save_results(
                read_jsonl(gt_path),
                read_ontology_json(ontology_path),
                read_jsonl(response_path, required_keys=['id', 'triples']),
                stats_path,
            )
            print(f"✅ Successfully evaluated and saved results for {category}")
        except Exception as e:
            # Per-category boundary: report the failure and continue.
            print(f"❌ Error processing category '{category}': {e}")


In [66]:
# Entry point: run the Wikidata evaluation for every category when this
# code is executed as the main module (e.g. as a notebook cell or script).
if __name__ == "__main__":
    run_evaluations_for_all_categories_wikidata()



=== Running Evaluation for Category: 1 - movie ===
✅ Successfully evaluated and saved results for movie

=== Running Evaluation for Category: 2 - music ===
✅ Successfully evaluated and saved results for music

=== Running Evaluation for Category: 3 - sport ===
✅ Successfully evaluated and saved results for sport

=== Running Evaluation for Category: 4 - book ===
✅ Successfully evaluated and saved results for book

=== Running Evaluation for Category: 5 - military ===
✅ Successfully evaluated and saved results for military

=== Running Evaluation for Category: 6 - computer ===
✅ Successfully evaluated and saved results for computer

=== Running Evaluation for Category: 7 - space ===
✅ Successfully evaluated and saved results for space

=== Running Evaluation for Category: 8 - politics ===
✅ Successfully evaluated and saved results for politics

=== Running Evaluation for Category: 9 - nature ===
✅ Successfully evaluated and saved results for nature

=== Running Evaluation for Category: