Implementing evaluation mertics on a simple example.

In [7]:
# !pip install openai
# !pip install langchain
# !pip install pandas
# !pip install -U sentence-transformers
# !pip install rapidfuzz
# !pip install openpyxl 

In [30]:
import os
import pandas as pd
import json
from openai import OpenAI
from langchain import PromptTemplate
from sentence_transformers import SentenceTransformer, util
import re
from rapidfuzz import fuzz 
from datetime import datetime
import pathlib
import openpyxl  
from openpyxl import load_workbook

In [9]:
from dotenv import load_dotenv
import os
from huggingface_hub import login

# Load from .env file in current directory
load_dotenv()

# Get the token from the environment
hf_token = os.getenv("hugging_face_api_key")

# Optional: check if token is loaded
if not hf_token:
    raise ValueError("HUGGINGFACE_HUB_TOKEN not found in .env")

# Login to Hugging Face Hub
login(token=hf_token)


Variable 1, 4 and 10 are given as examples. 

In [10]:
# 4) Prompt template for asking the model to decompose the variable
prompt_template = PromptTemplate(
    template="""
You are a knowledge engineer and a climate change scientist. I need you to model a climate change variable using the I-ADOPT framework. A variable is usually the result of a sensor measurement or of a laboratory analysis on a sample. The task is to generate Json code with a description of the variable according to the I-ADOPT ontology. See the following examples: 
Example 1:
{{
  "Provided Variable Name": "Electron density in the solar wind",
  "Definition": "Density (particle per cm3) of electrons measured in the Solar Wind.",
  "Property (Label)": "volumetric number density",
  "ObjectOfInterest (Label)": "electron",
  "Matrix (Label)": "solar wind",
  "Applicable units of measure": "cm-3"
}}
Example 2:
{{
  "Provided Variable Name": "Atmospheric boundary layer height defined by temperature inversion",
  "Definition": "Atmospheric boundary layer height defined by temperature inversion.",
  "Property (Label)": "height",
  "ObjectOfInterest (Label)": "atmospheric boundary layer",
  "Matrix (Label)": "atmosphere",
  "constraint": [
    "defined by temperature inversion",
    "above the valley floor"
  ],
  "Constraint component": [
    "atmospheric boundary layer",
    "height"
  ],
  "Applicable units of measure": "m"
}}
Example 3:
{{
  "Provided Variable Name": "Peak ground acceleration",
  "Definition": "Peak acceleration measured on the earth surface when facing seismic events, like earthquakes.",
  "Statistical Modifier": "maximum",
  "Property (Label)": "acceleration",
  "ObjectOfInterest (Label)": "ground",
  "constraint": [
    "surface",
    "during seismic events"
  ],
  "Constraint component": [
    "ground",
    "acceleration"
  ],
  "Applicable units of measure": "m s-2"
}}
Given the example above. model the following variable:
Variable: {variable}
Description: {description}
Decompose it into:
- Statistical Modifier (Label) --OPTIONAL--
- Property (Label)
- ObjectOfInterest (Label)
- Matrix (Label) --OPTIONAL--
- ContextObject (Label) --OPTIONAL--
- constraint --OPTIONAL--
- Constraint component --OPTIONAL--
- Applicable units of measure --OPTIONAL--
Return a JSON in this exact format:
{{
  "Provided Variable Name": "...",
  "Definition": "...",
  "Statistical Modifier (Label)": "...",
  "Property (Label)": "...", 
  "ObjectOfInterest (Label)": "...",
  "Matrix (Label)": "...",
  "ContextObject (Label)": "...",
  "constraint": "..."
  "Constraint component": "...",
  "Applicable units of measure": "..."
}}
""",
    input_variables=["variable", "description"]
)


In [11]:
def extract_json_from_response(response_text):
    """
    Extract the first valid JSON object from the response text.
    Cleans up common formatting like ```json or plain ``` fences.
    """
    # Remove code block markers like ```json or ```
    cleaned_text = re.sub(r"```(?:json)?", "", response_text).strip()

    # Attempt to extract a JSON-like structure using regex
    match = re.search(r"\{.*\}", cleaned_text, re.DOTALL)
    if match:
        json_str = match.group(0)
        try:
            return json.loads(json_str)
        except json.JSONDecodeError as e:
            print("JSON decoding failed:", e)
            return {}
    else:
        print("No JSON found in response.")
        return {}


In [12]:
# 2) Folder containing JSON files (each file has one ground-truth variable record)
json_folder = "/Users/rastegar-a/Documents/GitHub/i-adopt-llm-based-service/benchmarking_example/data/"
json_folder = "/Users/rastegar-a/Documents/GitHub/i-adopt-llm-based-service/benchmarking_example/data/five_variables/"


In [None]:
# 3) Models you want to compare on OpenRouter or OpenAI
# model_names = ["deepseek/deepseek-v3-base:free", "google/gemini-2.5-pro-exp-03-25:free"] # OpenRouter models
# model_names = ["deepseek/deepseek-v3-base:free"] # OpenRouter models
####### ----
model_names = [
    "mistralai/mistral-7b-instruct",
    "mistralai/mistral-small-24b-instruct-2501",
    "meta-llama/llama-3.2-11b-vision-instruct",
    "meta-llama/llama-3.3-70b-instruct",
    "openai/gpt-4o-mini",
    "openai/gpt-4.1-mini"
]
### Initialize OpenRouter client (replace <OPENROUTER_API_KEY> with your actual API key).
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=os.getenv("OPENROUTER_API_KEY"),    
)

############ -----
# model_names = ["gpt-4o-mini", "gpt-4o"]
# client = OpenAI(
#     api_key=os.getenv("OPENAI_API_KEY"),    
# )

In [14]:
# 5) LLM call helper using OpenRouter's chat endpoint
def call_model_openrouter(model_name, user_prompt):
    response = client.chat.completions.create(
        extra_headers={
            "HTTP-Referer": "<YOUR_SITE_URL>",  # optional
            "X-Title": "<YOUR_SITE_NAME>"       # optional
        },
        model=model_name,
        temperature=0,
        messages=[
            {"role": "user", "content": user_prompt}
        ]
    )
    # print(f"Model: {model_name}, Response: {response}")
    return response.choices[0].message.content

# 6) Embedding model (SentenceTransformer) for checking similarity
embed_model = SentenceTransformer("all-MiniLM-L6-v2")

def embedding_similarity(text1, text2):
    """Compute cosine similarity between two pieces of text."""
    emb1 = embed_model.encode(text1, convert_to_tensor=True)
    emb2 = embed_model.encode(text2, convert_to_tensor=True)
    return util.cos_sim(emb1, emb2).item()


In [None]:
EXACT_THRESHOLD = 0.90          # tweak if you want it stricter/looser

def exact_similarity(s1: str, s2: str) -> float:
    """CaseInsensitive Levenshtein ratio in [0,1]."""
    return fuzz.ratio(s1.lower().strip(), s2.lower().strip()) / 100.0


def compute_confusion_for_field_exact(gt_val, pred_val, threshold=EXACT_THRESHOLD):
    """
    Same logic as your embed version, but uses exact_similarity.
    """
    # list → comma‑separated str
    if isinstance(gt_val, list):
        gt_val = ", ".join(str(v).strip() for v in gt_val if v)
    if isinstance(pred_val, list):
        pred_val = ", ".join(str(v).strip() for v in pred_val if v)

    gt_val  = gt_val.strip()  if isinstance(gt_val,  str) else ""
    pred_val = pred_val.strip() if isinstance(pred_val, str) else ""

    if gt_val:                                          # ground‑truth present
        if pred_val:                                    # prediction present
            sim = exact_similarity(gt_val, pred_val)
            return (1,0,0,0) if sim >= threshold else (0,0,1,0)   # TP or FN
        else:
            return (0,0,1,0)                            # FN
    else:                                               # ground‑truth absent
        return (0,1,0,0) if pred_val else (0,0,0,1)     # FP or TN


In [16]:
# 7) We'll evaluate each of these keys with a threshold for correctness
# ONTO_KEYS = ["hasObjectOfInterest", "hasProperty", "hasMatrix", "hasConstraint", "constrain1", "hasContext"]
ONTO_KEYS =  ["Statistical Modifier (Label)", "Property (Label)", "ObjectOfInterest (Label)", "Matrix (Label)", "ContextObject (Label)", "constraint", "Constraint component", "Applicable units of measure"]
THRESHOLD = 0.90

In [17]:
def compute_confusion_for_field(gt_val, pred_val, threshold=0.90):
    """
    Correct Logic:
      True Positive (TP):   GT not empty, pred not empty, similarity >= threshold
      False Positive (FP):  GT empty, pred not empty
      False Negative (FN):  GT not empty and (pred empty OR similarity < threshold)
      True Negative (TN):   GT empty, pred empty
    """

    # Convert lists to string representations
    if isinstance(gt_val, list):
        gt_val = ", ".join(str(v).strip() for v in gt_val if v)
    if isinstance(pred_val, list):
        pred_val = ", ".join(str(v).strip() for v in pred_val if v)

    # Strip leading/trailing whitespace
    gt_val = gt_val.strip() if isinstance(gt_val, str) else ""
    pred_val = pred_val.strip() if isinstance(pred_val, str) else ""

    # If ground truth is non-empty => label is "present".
    if gt_val:
        # Prediction non-empty => check similarity
        if pred_val:
            sim = embedding_similarity(gt_val, pred_val)
            if sim >= threshold:
                return (1, 0, 0, 0)  # TP
            else:
                return (0, 0, 1, 0)  # FN (prediction too dissimilar)
        else:
            # Prediction empty => definitely FN
            return (0, 0, 1, 0)
    
    # If ground truth is empty => label is "absent".
    else:
        if pred_val:
            # Predicted something when nothing was needed => FP
            return (0, 1, 0, 0)
        else:
            # Both empty => TN
            return (0, 0, 0, 1)



# Helper to compute precision, recall, f1 from confusion matrix totals
def precision_recall_f1(tp, fp, fn, tn):
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    if precision + recall == 0:
        f1 = 0.0
    else:
        f1 = 2 * precision * recall / (precision + recall)
    return precision, recall, f1

In [18]:
# 8) Main loop over JSON files
all_rows = []  # We'll store row-based results to build a DF

for file_name in os.listdir(json_folder):
    if file_name.endswith(".json"):
        with open(os.path.join(json_folder, file_name), "r", encoding="utf-8") as f:
            try:
                data = json.load(f)
            except json.JSONDecodeError as e:
                print(f"Failed to parse {file_name}: {e}")
                continue

            variable_text = data.get("Provided Variable Name", "")
            description_text = data.get("Definition", "")
            ground_truth = {k: data.get(k, "") for k in ONTO_KEYS}
            prompt_text = prompt_template.format(
                variable=variable_text,
                description=description_text
            )

            # For each model, get predictions and compute confusion matrix
            for model_name in model_names:
                llm_output = call_model_openrouter(model_name, prompt_text)
                # --- Remove code fences if present ---
                # This will remove any ``` or ```json lines
                # cleaned_output = re.sub(r"```(\w+)?", "", llm_output).strip()

                # Attempt to parse the cleaned string as JSON
                predicted_json = extract_json_from_response(llm_output)

                # Accumulate confusion counts across all keys
                total_tp_embed = total_fp_embed = total_fn_embed = total_tn_embed = 0
                total_tp_exact = total_fp_exact = total_fn_exact = total_tn_exact = 0   
                for key in ONTO_KEYS:
                    gt_val = ground_truth.get(key, "") or ""
                    pred_val = predicted_json.get(key, "") or ""
                    # print(f"GT: {gt_val}, Pred: {pred_val}")
                    # --- Embedding version ---
                    tp_e, fp_e, fn_e, tn_e = compute_confusion_for_field(gt_val, pred_val)
                    total_tp_embed += tp_e
                    total_fp_embed += fp_e
                    total_fn_embed += fn_e
                    total_tn_embed += tn_e

                    # --- Exact‑match version ---
                    tp_x, fp_x, fn_x, tn_x = compute_confusion_for_field_exact(gt_val, pred_val)
                    total_tp_exact += tp_x
                    total_fp_exact += fp_x
                    total_fn_exact += fn_x
                    total_tn_exact += tn_x

                prec_embed, rec_embed, f1_embed = precision_recall_f1(total_tp_embed, total_fp_embed, total_fn_embed, total_tn_embed)
                prec_exact, rec_exact, f1_exact = precision_recall_f1(total_tp_exact, total_fp_exact, total_fn_exact, total_tn_exact)
                filtered_output = {k: v for k, v in predicted_json.items() if k not in ["Provided Variable Name", "Definition"]}
                # Store everything in all_rows, including ground truth & the predicted JSON
                row_dict = {
                "File": file_name,
                "Variable": variable_text,
                "Model": model_name,

                # --- embedding metrics ---
                "TP_embed": total_tp_embed, "FP_embed": total_fp_embed,
                "FN_embed": total_fn_embed, "TN_embed": total_tn_embed,
                "Precision_embed": round(prec_embed, 3),
                "Recall_embed":    round(rec_embed, 3),
                "F1_embed":        round(f1_embed, 3),

                # --- exact‑match metrics ---
                "TP_exact": total_tp_exact, "FP_exact": total_fp_exact,
                "FN_exact": total_fn_exact, "TN_exact": total_tn_exact,
                "Precision_exact": round(prec_exact, 3),
                "Recall_exact":    round(rec_exact, 3),
                "F1_exact":        round(f1_exact, 3),

                # diagnostic text
                "GroundTruth": json.dumps(ground_truth),
                "LLMOutput":   json.dumps(filtered_output)
                }
                all_rows.append(row_dict)

JSON decoding failed: Extra data: line 19 column 1 (char 628)


In [19]:
# 9) Create a DataFrame with aggregated results
df_results = pd.DataFrame(all_rows)
# print("\n=== Final Results DataFrame ===\n")
# print(df_results)

# 10) Compute average metrics for each model (across all files)
# Embedding
summary_embed = df_results.groupby(["Model"]).agg({
    "Precision_embed":"mean",
    "Recall_embed":"mean",
    "F1_embed":"mean"
}).reset_index().round(3).sort_values("F1_embed", ascending=False)

# Exact match
summary_exact = df_results.groupby(["Model"]).agg({
    "Precision_exact":"mean",
    "Recall_exact":"mean",
    "F1_exact":"mean"
}).reset_index().round(3).sort_values("F1_exact", ascending=False)

print("\n=== PerModel Averages EMBEDDING SIMILARITY ===")
print(summary_embed)

print("\n=== PerModel Averages EXACT MATCH (case/typoInsensitive) ===")
print(summary_exact)



=== PerModel Averages EMBEDDING SIMILARITY ===
                                       Model  Precision_embed  Recall_embed  \
1   meta-llama/llama-3.2-11b-vision-instruct            0.353         0.263   
4  mistralai/mistral-small-24b-instruct-2501            0.310         0.257   
5                        openai/gpt-4.1-mini            0.337         0.223   
0      deepseek/deepseek-r1-distill-qwen-14b            0.327         0.223   
3              mistralai/mistral-7b-instruct            0.277         0.223   
6                         openai/gpt-4o-mini            0.440         0.190   
2          meta-llama/llama-3.3-70b-instruct            0.240         0.230   

   F1_embed  
1     0.293  
4     0.278  
5     0.259  
0     0.253  
3     0.242  
6     0.240  
2     0.233  

=== PerModel Averages EXACT MATCH (case/typoInsensitive) ===
                                       Model  Precision_exact  Recall_exact  \
4  mistralai/mistral-small-24b-instruct-2501            0.310     

In [20]:
# 10) Finally, show ground truth and LLM outputs after the summary
print("\n=== Ground Truth vs. LLM Output Details ===\n")
for idx, row in df_results.iterrows():
    print("File:", row["File"])
    print("Model:", row["Model"])
    print("Variable:", row["Variable"])
    print("Ground Truth:", row["GroundTruth"])
    print("LLM Output:", row["LLMOutput"])
    print("-" * 50)


=== Ground Truth vs. LLM Output Details ===

File: variable_21.0.json
Model: mistralai/mistral-7b-instruct
Variable: Resting systolic blood pressure
Ground Truth: {"Statistical Modifier (Label)": "", "Property (Label)": "pressure", "ObjectOfInterest (Label)": "blood", "Matrix (Label)": "human", "ContextObject (Label)": "", "constraint": ["systolic", "resting"], "Constraint component": ["pressure", "human"], "Applicable units of measure": ""}
LLM Output: {"Statistical Modifier (Label)": "systolic", "Property (Label)": "blood pressure", "ObjectOfInterest (Label)": "blood", "Matrix (Label)": "body", "ContextObject (Label)": "resting", "constraint": ["during rest"], "Constraint component": ["blood pressure", "systolic"], "Applicable units of measure": "mmHg"}
--------------------------------------------------
File: variable_21.0.json
Model: mistralai/mistral-small-24b-instruct-2501
Variable: Resting systolic blood pressure
Ground Truth: {"Statistical Modifier (Label)": "", "Property (Labe

In [34]:
# ---------- run‑specific metadata ----------
run_id         = datetime.now().strftime("%Y%m%d_%H%M%S")
experiment_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
variables_ran   = sorted({f for f in df_results["File"]})
models_ran      = model_names                        # already defined

df_meta = pd.DataFrame(
    {
        "run_id":      [run_id],
        "timestamp":   [experiment_time],
        "variables":   [", ".join(variables_ran)],
        "models":      [", ".join(models_ran)],
    }
)

# Create output dir if it doesn't exist
out_dir = pathlib.Path("benchmarking_outputs")
out_dir.mkdir(exist_ok=True)
history_file = out_dir / "i_adopt_benchmark_history.xlsx"

summary_combined = pd.merge(
    summary_embed, summary_exact,
    on="Model",
    suffixes=("_embed", "_exact")
)

# 1. Write or append to the workbook
if not history_file.exists():
    # First run: create file and write everything from scratch
    with pd.ExcelWriter(history_file, engine="openpyxl", mode="w") as writer:
        # Create a persistent meta sheet with headers
        df_meta.to_excel(writer, sheet_name="meta", index=False)

        # First summary sheet (combined metrics)
        summary_combined.to_excel(writer, sheet_name=f"summary_{run_id}", index=False)

        # Full results
        df_results.to_excel(writer, sheet_name=f"rows_{run_id}", index=False)

else:
    # File exists: append one row to meta, add new result sheets
    with pd.ExcelWriter(history_file, engine="openpyxl", mode="a", if_sheet_exists="overlay") as writer:
        # Add new sheets
        summary_combined.to_excel(writer, sheet_name=f"summary_{run_id}", index=False)
        df_results.to_excel(writer, sheet_name=f"rows_{run_id}", index=False)

    # Now append meta manually using openpyxl
    wb = load_workbook(history_file)
    ws = wb["meta"]
    next_row = ws.max_row + 1
    for col_idx, val in enumerate(df_meta.iloc[0], start=1):
        ws.cell(row=next_row, column=col_idx, value=val)
    wb.save(history_file)
    wb.close()

print(f"✔︎ Appended results and metadata to {history_file.resolve()}")

✔︎ Appended results and metadata to /Users/rastegar-a/Documents/GitHub/i-adopt-llm-based-service/benchmarking_example/benchmarking_outputs/i_adopt_benchmark_history.xlsx
