Implementing evaluation mertics on a simple example.

In [3]:
# !pip install openai
# !pip install langchain
# !pip install pandas
# !pip install -U sentence-transformers
# !pip install rdflib
# !pip install dotenv

In [4]:
import os
import pandas as pd
import json
from openai import OpenAI
from langchain import PromptTemplate
from sentence_transformers import SentenceTransformer, util
import re
from rdflib import Graph

  from .autonotebook import tqdm as notebook_tqdm


In [38]:
# # Define the folder path
# folder_path = "/Users/rastegar-a/Documents/GitHub/i-adopt-llm-based-service/RDF-modelling-examples/Annotated_variables/"
# # Loop through all files in the folder
# for filename in os.listdir(folder_path):
#     if filename.endswith(".csv"):
#         csv_path = os.path.join(folder_path, filename)
        
#         # Read the CSV file
#         df = pd.read_csv(csv_path)
        
#         # Define JSON output path
#         json_filename = filename.replace(".csv", ".json")
#         json_path = os.path.join("/Users/rastegar-a/Documents/GitHub/i-adopt-llm-based-service/benchmarking_example/", json_filename)
        
#         # Convert to JSON
#         df.to_json(json_path, orient="records", lines=True)
        
#         print(f"Converted {filename} to {json_filename}")


In [4]:
from dotenv import load_dotenv
import os
from huggingface_hub import login

# Load from .env file in current directory
load_dotenv()

# # Get the token from the environment
# hf_token = os.getenv("hugging_face_api_key")

# # Optional: check if token is loaded
# if not hf_token:
#     raise ValueError("HUGGINGFACE_HUB_TOKEN not found in .env")

# # Login to Hugging Face Hub
# login(token=hf_token)


True

In [22]:
# 1) Initialize OpenRouter client (replace <OPENROUTER_API_KEY> with your actual API key).
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=os.getenv("OPENROUTER_API_KEY"),    
)

# client = OpenAI(
#     api_key=os.getenv("OPENAI_API_KEY"),    
# )

# 2) Folder containing JSON files (each file has one ground-truth variable record)
json_folder = "data/"
# json_folder = "/Users/rastegar-a/Documents/GitHub/i-adopt-llm-based-service/benchmarking_example/data/one_variable/"

# 3) Models you want to compare on OpenRouter or OpenAI
# model_names = ["deepseek/deepseek-v3-base:free", "google/gemini-2.5-pro-exp-03-25:free"] # OpenRouter models
# model_names = ["deepseek/deepseek-v3-base:free"] # OpenRouter models
# model_names = ["gpt-4o-mini", "gpt-4o"]
model_names = ["meta-llama/llama-4-scout:free", "qwen/qwq-32b:free"]


# 4) Prompt template for asking the model to decompose the variable
PROMPT_PATH = "prompts/prompt_with_examples.txt"
with open(PROMPT_PATH,"r") as file:
    template = file.read()

prompt_template = PromptTemplate(
    template=template,
    input_variables=["variable", "description"]
)

# 5) LLM call helper using OpenRouter's chat endpoint
def call_model_openrouter(model_name, user_prompt):
    response = client.chat.completions.create(
        model=model_name,
        temperature=0,
        messages=[
            {"role": "user", "content": user_prompt}
        ]
    )
    return response.choices[0].message.content

# 6) Embedding model (SentenceTransformer) for checking similarity
embed_model = SentenceTransformer("all-MiniLM-L6-v2")

def embedding_similarity(text1, text2):
    """Compute cosine similarity between two pieces of text."""
    emb1 = embed_model.encode(text1, convert_to_tensor=True)
    emb2 = embed_model.encode(text2, convert_to_tensor=True)
    return util.cos_sim(emb1, emb2).item()


In [16]:
# 7) We'll evaluate each of these keys with a threshold for correctness
# ONTO_KEYS = ["hasObjectOfInterest", "hasProperty",  "hasMatrix", "hasConstraint", "hasContext"]
ONTO_KEYS = ["hasObjectOfInterest", "objectOfInterestURI", "hasProperty", "hasPropertyURI", "hasMatrix", "MatrixURI", "hasConstraint", "ConstraintURI", "constrain1", "hasContext", "ContextURI"]
THRESHOLD = 0.90

In [17]:

def compute_confusion_for_field(gt_val, pred_val, threshold=0.90):
    """
    Correct Logic:
      True Positive (TP):   GT not empty, pred not empty, similarity >= threshold
      False Positive (FP):  GT empty, pred not empty
      False Negative (FN):  GT not empty and (pred empty OR similarity < threshold)
      True Negative (TN):   GT empty, pred empty
    """
    # Strip leading/trailing whitespace
    gt_val = gt_val.strip()
    pred_val = pred_val.strip()
    
    # If ground truth is non-empty => label is "present".
    if gt_val:
        # Prediction non-empty => check similarity
        if pred_val:
            sim = embedding_similarity(gt_val, pred_val)
            if sim >= threshold:
                return (1, 0, 0, 0)  # TP
            else:
                return (0, 0, 1, 0)  # FN (prediction too dissimilar)
        else:
            # Prediction empty => definitely FN
            return (0, 0, 1, 0)
    
    # If ground truth is empty => label is "absent".
    else:
        if pred_val:
            # Predicted something when nothing was needed => FP
            return (0, 1, 0, 0)
        else:
            # Both empty => TN
            return (0, 0, 0, 1)

    

# Helper to compute precision, recall, f1 from confusion matrix totals
def precision_recall_f1(tp, fp, fn, tn):
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    if precision + recall == 0:
        f1 = 0.0
    else:
        f1 = 2 * precision * recall / (precision + recall)
    return precision, recall, f1

In [18]:
def extract_component(graph, component):
    assert component in ["hasObjectOfInterest","hasProperty","hasMatrix","hasConstraint","hasContext"]

    q = '''
        PREFIX iadopt: <https://w3id.org/iadopt/ont/>
        
        SELECT ?name
        WHERE {
            ?p rdf:type iadopt:Variable .
        
            ?p iadopt:{component} ?name .
        }
    '''
    q = q.replace("{component}",component)
    
    output = None
    for r in g.query(q):
        ## We should admit multiple hasMatrix, hasContraint, and hasContext. 
        #if component in ["hasObjectOfInterest","hasProperty"]:
        if component in ["hasObjectOfInterest","hasProperty","hasMatrix","hasConstraint","hasContext"]:
            output = r["name"].rsplit('/')[-1]
            output = output.rsplit('#')[-1]
            break
        else:
            if output is None:
                output = []
            output.append(r["name"].replace("https://w3id.org/iadopt/ont/",""))
    return output

In [23]:
# 8) Main loop over JSON files
all_rows = []  # We'll store row-based results to build a DF

for file_name in os.listdir(json_folder):
    if file_name.endswith(".json"):
        with open(os.path.join(json_folder, file_name), "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                data = json.loads(line)

                variable_text = data.get("Variable", "")
                description_text = data.get("description", "")
                ground_truth = {k: data.get(k, "") for k in ONTO_KEYS}
                prompt_text = prompt_template.format(
                    variable=variable_text,
                    description=description_text
                )

                # For each model, get predictions and compute confusion matrix
                for model_name in model_names:
                    llm_output = call_model_openrouter(model_name, prompt_text)

                    pattern = r"```(\w*)\n(.*?)\n```"
                    matches = re.findall(pattern, llm_output, re.DOTALL)
                    for match in matches:
                        language = match[0]
                        context = match[1]
                        break

                    with open("output.ttl","w") as file:
                        if "@prefix : <https://w3id.org/iadopt/ont/> ." not in context:
                            context = "@prefix : <https://w3id.org/iadopt/ont/> .\n" + context
                        file.write(context)
                    
                    g = Graph()
                    g.parse("output.ttl",format="turtle") # Maybe if the rdf file cannot be parse retry
                    
                    predicted_json = {"hasObjectOfInterest":None,"hasProperty":None, "hasMatrix": None, "hasConstraint":None, "hasContext": None}
                    
                    for elem in predicted_json:
                        predicted_json[elem]=extract_component(g,elem)

                    # Accumulate confusion counts across all keys
                    total_tp = total_fp = total_fn = total_tn = 0
                    for key in ONTO_KEYS:
                        gt_val = ground_truth.get(key, "") or ""
                        pred_val = predicted_json.get(key, "") or ""
                        tp, fp, fn, tn = compute_confusion_for_field(gt_val, pred_val)
                        total_tp += tp
                        total_fp += fp
                        total_fn += fn
                        total_tn += tn

                    prec, rec, f1 = precision_recall_f1(total_tp, total_fp, total_fn, total_tn)

                    # Store everything in all_rows, including ground truth & the predicted JSON
                    row_dict = {
                        "File": file_name,
                        "Variable": variable_text,
                        "Model": model_name,
                        "TP": total_tp,
                        "FP": total_fp,
                        "FN": total_fn,
                        "TN": total_tn,
                        "Precision": round(prec, 3),
                        "Recall": round(rec, 3),
                        "F1": round(f1, 3),
                        # Store ground truth & predicted as strings for easy reference
                        "GroundTruth": json.dumps(ground_truth),
                        "LLMOutput": json.dumps(predicted_json)
                    }
                    all_rows.append(row_dict)

In [24]:
# 9) Create a DataFrame with aggregated results
df_results = pd.DataFrame(all_rows)
print("\n=== Final Results DataFrame ===\n")
print(df_results)

# Group by [File, Model] to see average metrics if multiple lines in one file
summary = df_results.groupby(["File", "Model"]).agg({
    "Precision": "mean",
    "Recall": "mean",
    "F1": "mean"
}).reset_index()
summary = summary.round(3)

print("\n=== Summary (Grouped by File, Model) ===\n")
print(summary)


=== Final Results DataFrame ===

          File                                           Variable  \
0    var3.json                                        Cloud cover   
1    var3.json                                        Cloud cover   
2    var1.json                 Electron density in the solar wind   
3    var1.json                 Electron density in the solar wind   
4    var5.json  Atmosphere_optical_thickness_due_to_particulat...   
5    var5.json  Atmosphere_optical_thickness_due_to_particulat...   
6   var17.json  Docosahexaenoic acid content per dry weight (D...   
7   var17.json  Docosahexaenoic acid content per dry weight (D...   
8    var4.json                 Atmospheric boundary layer heights   
9    var4.json                 Atmospheric boundary layer heights   
10   var2.json                      Air daily maximum temperature   
11   var2.json                      Air daily maximum temperature   

                            Model  TP  FP  FN  TN  Precision  Recall

In [25]:
# 10) Finally, show ground truth and LLM outputs after the summary
print("\n=== Ground Truth vs. LLM Output Details ===\n")
for idx, row in df_results.iterrows():
    print("File:", row["File"])
    print("Model:", row["Model"])
    print("Variable:", row["Variable"])
    print("Ground Truth:", row["GroundTruth"])
    print("LLM Output:", row["LLMOutput"])
    print("-" * 50)


=== Ground Truth vs. LLM Output Details ===

File: var3.json
Model: meta-llama/llama-4-scout:free
Variable: Cloud cover
Ground Truth: {"hasObjectOfInterest": "sky", "objectOfInterestURI": "https://example.org/ex/sky", "hasProperty": "cloudiness", "hasPropertyURI": "https://example.org/cloudiness", "hasMatrix": "study site", "MatrixURI": "http://purl.bioontology.org/ontology/LNC/MTHU054795", "hasConstraint": null, "ConstraintURI": null, "constrain1": null, "hasContext": null, "ContextURI": null}
LLM Output: {"hasObjectOfInterest": "clouds", "hasProperty": "cloud_cover", "hasMatrix": null, "hasConstraint": null, "hasContext": null}
--------------------------------------------------
File: var3.json
Model: qwen/qwq-32b:free
Variable: Cloud cover
Ground Truth: {"hasObjectOfInterest": "sky", "objectOfInterestURI": "https://example.org/ex/sky", "hasProperty": "cloudiness", "hasPropertyURI": "https://example.org/cloudiness", "hasMatrix": "study site", "MatrixURI": "http://purl.bioontology.org