Implementing evaluation mertics on a simple example.

In [19]:
# !pip install openai
# !pip install langchain
# !pip install pandas
# !pip install -U sentence-transformers


In [20]:
import os
import pandas as pd
import json
from openai import OpenAI
from langchain import PromptTemplate
from sentence_transformers import SentenceTransformer, util
import re

In [21]:
# Path to your CSV file
csv_path = "/Users/rastegar-a/Documents/GitHub/i-adopt-llm-based-service/benchmarking_example/data/Challenge variable descriptions version 2 - agreed solutions 2.csv"

# Output folder
output_folder = "/Users/rastegar-a/Documents/GitHub/i-adopt-llm-based-service/benchmarking_example/data"
os.makedirs(output_folder, exist_ok=True)

# Read the CSV
df = pd.read_csv(csv_path)

# Forward-fill the first column to group related rows
df.iloc[:, 0] = df.iloc[:, 0].fillna(method="ffill")

# Get column names
columns = df.columns.tolist()

# Group by first column (variable ID or index)
grouped = df.groupby(columns[0])

for var_id, group in grouped:
    # Choose the variable name from the first non-null "Variable Name" entry
    var_name = str(group.iloc[0].get("Variable Name", f"variable_{var_id}")).strip()
    var_name_clean = var_name.replace(" ", "_").replace("/", "_") or f"variable_{var_id}"

    json_filename = f"{var_name_clean}.json"
    json_path = os.path.join(output_folder, json_filename)

    # Create a dictionary where each key is a column name
    # and each value is:
    # - a single string if only one unique non-null value
    # - a list if multiple unique non-null values
    variable_dict = {}
    for col in columns[1:]:  # Skip the index/grouping column
        values = group[col].dropna().unique().tolist()
        if not values:
            continue
        variable_dict[col] = values[0] if len(values) == 1 else values

    # Save JSON
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(variable_dict, f, indent=2, ensure_ascii=False)

print(f"Created {len(grouped)} JSON files in {output_folder}")


Created 24 JSON files in /Users/rastegar-a/Documents/GitHub/i-adopt-llm-based-service/benchmarking_example/data


  df.iloc[:, 0] = df.iloc[:, 0].fillna(method="ffill")


In [4]:
# # Define the folder path
# folder_path = "/Users/rastegar-a/Documents/GitHub/i-adopt-llm-based-service/RDF-modelling-examples/Annotated_variables/"
# # Loop through all files in the folder
# for filename in os.listdir(folder_path):
#     if filename.endswith(".csv"):
#         csv_path = os.path.join(folder_path, filename)
        
#         # Read the CSV file
#         df = pd.read_csv(csv_path)
        
#         # Define JSON output path
#         json_filename = filename.replace(".csv", ".json")
#         json_path = os.path.join("/Users/rastegar-a/Documents/GitHub/i-adopt-llm-based-service/benchmarking_example/", json_filename)
        
#         # Convert to JSON
#         df.to_json(json_path, orient="records", lines=True)
        
#         print(f"Converted {filename} to {json_filename}")


In [5]:
from dotenv import load_dotenv
import os
from huggingface_hub import login

# Load from .env file in current directory
load_dotenv()

# Get the token from the environment
hf_token = os.getenv("hugging_face_api_key")

# Optional: check if token is loaded
if not hf_token:
    raise ValueError("HUGGINGFACE_HUB_TOKEN not found in .env")

# Login to Hugging Face Hub
login(token=hf_token)


Variable 1, 4 and 10 are given as examples. 

In [None]:
# 4) Prompt template for asking the model to decompose the variable
prompt_template = PromptTemplate(
    template="""
You are a knowledge engineer and a climate change scientist. I need you to model a climate change variable using the I-ADOPT framework. A variable is usually the result of a sensor measurement or of a laboratory analysis on a sample. The task is to generate Json code with a description of the variable according to the I-ADOPT ontology. See the following examples: 
Example 1:
{{
  "Provided Variable Name": "Electron density in the solar wind",
  "Definition": "Density (particle per cm3) of electrons measured in the Solar Wind.",
  "Property (Label)": "volumetric number density",
  "ObjectOfInterest (Label)": "electron",
  "Matrix (Label)": "solar wind",
  "Applicable units of measure": "cm-3"
}}
Example 2:
{{
  "Provided Variable Name": "Atmospheric boundary layer height defined by temperature inversion",
  "Definition": "Atmospheric boundary layer height defined by temperature inversion.",
  "Property (Label)": "height",
  "ObjectOfInterest (Label)": "atmospheric boundary layer",
  "Matrix (Label)": "atmosphere",
  "constraint": [
    "defined by temperature inversion",
    "above the valley floor"
  ],
  "Constraint component": [
    "atmospheric boundary layer",
    "height"
  ],
  "Applicable units of measure": "m"
}}
Example 3:
{{
  "Provided Variable Name": "Peak ground acceleration",
  "Definition": "Peak acceleration measured on the earth surface when facing seismic events, like earthquakes.",
  "Statistical Modifier": "maximum",
  "Property (Label)": "acceleration",
  "ObjectOfInterest (Label)": "ground",
  "constraint": [
    "surface",
    "during seismic events"
  ],
  "Constraint component": [
    "ground",
    "acceleration"
  ],
  "Applicable units of measure": "m s-2"
}}
Given the example above. model the following variable:
Variable: {variable}
Description: {description}
Decompose it into:
- Statistical Modifier (Label) --OPTIONAL--
- Property (Label)
- ObjectOfInterest (Label)
- Matrix (Label) --OPTIONAL--
- ContextObject (Label) --OPTIONAL--
- constraint --OPTIONAL--
- Constraint component --OPTIONAL--
- Applicable units of measure --OPTIONAL--
Return a JSON in this exact format:
{{
  "Provided Variable Name": "...",
  "Definition": "...",
  "Statistical Modifier (Label)": "...",
  "Property (Label)": "...", 
  "ObjectOfInterest (Label)": "...",
  "Matrix (Label)": "...",
  "ContextObject (Label)": "...",
  "constraint": "..."
  "Constraint component": "...",
  "Applicable units of measure": "..."
}}
""",
    input_variables=["variable", "description"]
)


In [7]:
def extract_json_from_response(response_text):
    """
    Extract the first valid JSON object from the response text.
    Cleans up common formatting like ```json or plain ``` fences.
    """
    # Remove code block markers like ```json or ```
    cleaned_text = re.sub(r"```(?:json)?", "", response_text).strip()

    # Attempt to extract a JSON-like structure using regex
    match = re.search(r"\{.*\}", cleaned_text, re.DOTALL)
    if match:
        json_str = match.group(0)
        try:
            return json.loads(json_str)
        except json.JSONDecodeError as e:
            print("JSON decoding failed:", e)
            return {}
    else:
        print("No JSON found in response.")
        return {}


In [8]:
# 2) Folder containing JSON files (each file has one ground-truth variable record)
json_folder = "/Users/rastegar-a/Documents/GitHub/i-adopt-llm-based-service/benchmarking_example/data/"
json_folder = "/Users/rastegar-a/Documents/GitHub/i-adopt-llm-based-service/benchmarking_example/data/five_variables/"


In [9]:
# 3) Models you want to compare on OpenRouter or OpenAI
# model_names = ["deepseek/deepseek-v3-base:free", "google/gemini-2.5-pro-exp-03-25:free"] # OpenRouter models
# model_names = ["deepseek/deepseek-v3-base:free"] # OpenRouter models
####### ----
model_names = [
    "mistralai/mistral-7b-instruct",
    "mistralai/mistral-small-24b-instruct-2501",
    "meta-llama/llama-3.2-11b-vision-instruct",
    "meta-llama/llama-3.3-70b-instruct",
    "deepseek/deepseek-r1-distill-qwen-14b",
    "openai/gpt-4o-mini",
    "openai/gpt-4.1-mini"
]
### Initialize OpenRouter client (replace <OPENROUTER_API_KEY> with your actual API key).
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=os.getenv("OPENROUTER_API_KEY"),    
)

############ -----
# model_names = ["gpt-4o-mini", "gpt-4o"]
# client = OpenAI(
#     api_key=os.getenv("OPENAI_API_KEY"),    
# )

In [10]:


# 5) LLM call helper using OpenRouter's chat endpoint
def call_model_openrouter(model_name, user_prompt):
    response = client.chat.completions.create(
        extra_headers={
            "HTTP-Referer": "<YOUR_SITE_URL>",  # optional
            "X-Title": "<YOUR_SITE_NAME>"       # optional
        },
        model=model_name,
        temperature=0,
        messages=[
            {"role": "user", "content": user_prompt}
        ]
    )
    # print(f"Model: {model_name}, Response: {response}")
    return response.choices[0].message.content

# 6) Embedding model (SentenceTransformer) for checking similarity
embed_model = SentenceTransformer("all-MiniLM-L6-v2")

def embedding_similarity(text1, text2):
    """Compute cosine similarity between two pieces of text."""
    emb1 = embed_model.encode(text1, convert_to_tensor=True)
    emb2 = embed_model.encode(text2, convert_to_tensor=True)
    return util.cos_sim(emb1, emb2).item()


In [11]:
# 7) We'll evaluate each of these keys with a threshold for correctness
# ONTO_KEYS = ["hasObjectOfInterest", "hasProperty", "hasMatrix", "hasConstraint", "constrain1", "hasContext"]
ONTO_KEYS =  ["Statistical Modifier (Label)", "Property (Label)", "ObjectOfInterest (Label)", "Matrix (Label)", "Matrix (System)", "ContextObject (Label)", "constraint", "Constraint component", "Applicable units of measure"]
THRESHOLD = 0.90

In [12]:
def compute_confusion_for_field(gt_val, pred_val, threshold=0.90):
    """
    Correct Logic:
      True Positive (TP):   GT not empty, pred not empty, similarity >= threshold
      False Positive (FP):  GT empty, pred not empty
      False Negative (FN):  GT not empty and (pred empty OR similarity < threshold)
      True Negative (TN):   GT empty, pred empty
    """

    # Convert lists to string representations
    if isinstance(gt_val, list):
        gt_val = ", ".join(str(v).strip() for v in gt_val if v)
    if isinstance(pred_val, list):
        pred_val = ", ".join(str(v).strip() for v in pred_val if v)

    # Strip leading/trailing whitespace
    gt_val = gt_val.strip() if isinstance(gt_val, str) else ""
    pred_val = pred_val.strip() if isinstance(pred_val, str) else ""

    # If ground truth is non-empty => label is "present".
    if gt_val:
        # Prediction non-empty => check similarity
        if pred_val:
            sim = embedding_similarity(gt_val, pred_val)
            if sim >= threshold:
                return (1, 0, 0, 0)  # TP
            else:
                return (0, 0, 1, 0)  # FN (prediction too dissimilar)
        else:
            # Prediction empty => definitely FN
            return (0, 0, 1, 0)
    
    # If ground truth is empty => label is "absent".
    else:
        if pred_val:
            # Predicted something when nothing was needed => FP
            return (0, 1, 0, 0)
        else:
            # Both empty => TN
            return (0, 0, 0, 1)



# Helper to compute precision, recall, f1 from confusion matrix totals
def precision_recall_f1(tp, fp, fn, tn):
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    if precision + recall == 0:
        f1 = 0.0
    else:
        f1 = 2 * precision * recall / (precision + recall)
    return precision, recall, f1

In [13]:
# 8) Main loop over JSON files
all_rows = []  # We'll store row-based results to build a DF

for file_name in os.listdir(json_folder):
    if file_name.endswith(".json"):
        with open(os.path.join(json_folder, file_name), "r", encoding="utf-8") as f:
            try:
                data = json.load(f)
            except json.JSONDecodeError as e:
                print(f"Failed to parse {file_name}: {e}")
                continue

            variable_text = data.get("Provided Variable Name", "")
            description_text = data.get("Definition", "")
            ground_truth = {k: data.get(k, "") for k in ONTO_KEYS}
            prompt_text = prompt_template.format(
                variable=variable_text,
                description=description_text
            )

            # For each model, get predictions and compute confusion matrix
            for model_name in model_names:
                llm_output = call_model_openrouter(model_name, prompt_text)
                # --- Remove code fences if present ---
                # This will remove any ``` or ```json lines
                # cleaned_output = re.sub(r"```(\w+)?", "", llm_output).strip()

                # Attempt to parse the cleaned string as JSON
                predicted_json = extract_json_from_response(llm_output)

                # Accumulate confusion counts across all keys
                total_tp = total_fp = total_fn = total_tn = 0
                for key in ONTO_KEYS:
                    gt_val = ground_truth.get(key, "") or ""
                    pred_val = predicted_json.get(key, "") or ""
                    # print(f"GT: {gt_val}, Pred: {pred_val}")
                    tp, fp, fn, tn = compute_confusion_for_field(gt_val, pred_val)
                    total_tp += tp
                    total_fp += fp
                    total_fn += fn
                    total_tn += tn

                prec, rec, f1 = precision_recall_f1(total_tp, total_fp, total_fn, total_tn)

                # Store everything in all_rows, including ground truth & the predicted JSON
                row_dict = {
                    "File": file_name,
                    "Variable": variable_text,
                    "Model": model_name,
                    "TP": total_tp,
                    "FP": total_fp,
                    "FN": total_fn,
                    "TN": total_tn,
                    "Precision": round(prec, 3),
                    "Recall": round(rec, 3),
                    "F1": round(f1, 3),
                    # Store ground truth & predicted as strings for easy reference
                    "GroundTruth": json.dumps(ground_truth),
                    "LLMOutput": json.dumps(predicted_json)
                }
                all_rows.append(row_dict)

JSON decoding failed: Extra data: line 10 column 1 (char 364)
JSON decoding failed: Extra data: line 12 column 1 (char 331)


In [14]:
# 9) Create a DataFrame with aggregated results
df_results = pd.DataFrame(all_rows)
print("\n=== Final Results DataFrame ===\n")
print(df_results)

# Group by [File, Model] to see average metrics per file-model combo
summary = df_results.groupby(["File", "Model"]).agg({
    "Precision": "mean",
    "Recall": "mean",
    "F1": "mean"
}).reset_index()
summary = summary.round(3)

print("\n=== Summary (Grouped by File, Model) ===\n")
print(summary)

# 10) Compute average metrics for each model (across all files)
model_summary = df_results.groupby("Model").agg({
    "Precision": "mean",
    "Recall": "mean",
    "F1": "mean"
}).reset_index()
model_summary = model_summary.round(3)

print("\n=== Per-Model Averages (Overall Performance) ===\n")
print(model_summary)



=== Final Results DataFrame ===

                  File                                           Variable  \
0   variable_15.0.json                       Foraminifera, planktic, size   
1   variable_15.0.json                       Foraminifera, planktic, size   
2   variable_15.0.json                       Foraminifera, planktic, size   
3   variable_15.0.json                       Foraminifera, planktic, size   
4   variable_15.0.json                       Foraminifera, planktic, size   
5   variable_15.0.json                       Foraminifera, planktic, size   
6   variable_15.0.json                       Foraminifera, planktic, size   
7    variable_2.0.json                      Air daily maximum temperature   
8    variable_2.0.json                      Air daily maximum temperature   
9    variable_2.0.json                      Air daily maximum temperature   
10   variable_2.0.json                      Air daily maximum temperature   
11   variable_2.0.json                    

In [15]:
# 10) Finally, show ground truth and LLM outputs after the summary
print("\n=== Ground Truth vs. LLM Output Details ===\n")
for idx, row in df_results.iterrows():
    print("File:", row["File"])
    print("Model:", row["Model"])
    print("Variable:", row["Variable"])
    print("Ground Truth:", row["GroundTruth"])
    print("LLM Output:", row["LLMOutput"])
    print("-" * 50)


=== Ground Truth vs. LLM Output Details ===

File: variable_15.0.json
Model: mistralai/mistral-7b-instruct
Variable: Foraminifera, planktic, size
Ground Truth: {"Statistical Modifier (Label)": "", "Property (Label)": "size", "ObjectOfInterest (Label)": "foraminfers", "Matrix (Label)": "sediment", "Matrix (System)": "", "ContextObject (Label)": "", "constraint": ["planctonic", "surface"], "Constraint component": ["foraminfers", "sediment"], "Applicable units of measure": "\u00b5m"}
LLM Output: {"Provided Variable Name": "Size of planktic foraminifers in surface sediments", "Definition": "Size of planktic foraminifers found in surface sediments, as reported in the study at https://doi.pangaea.de/10.1594/PANGAEA.126730", "Statistical Modifier (Label)": "dimension", "Property (Label)": "size", "ObjectOfInterest (Label)": "foraminifers", "Matrix (Label)": "surface sediments", "Matrix (System)": "marine", "ContextObject (Label)": "sediment core", "constraint": "planktic", "Constraint compon

         Model  Precision  Recall     F1
0       gpt-4o      0.667   0.425  0.481
1  gpt-4o-mini      0.443   0.364  0.369


                                      Model  Precision  Recall     F1
0     deepseek/deepseek-r1-distill-qwen-14b      0.457   0.223  0.274
1  meta-llama/llama-3.2-11b-vision-instruct      0.450   0.230  0.289
2             mistralai/mistral-7b-instruct      0.367   0.297  0.321




=== Per-Model Averages (Overall Performance) ===

                                       Model  Precision  Recall     F1
0      deepseek/deepseek-r1-distill-qwen-14b      0.600   0.372  0.403
1   meta-llama/llama-3.2-11b-vision-instruct      0.220   0.170  0.187
2          meta-llama/llama-3.3-70b-instruct      0.444   0.313  0.312
3              mistralai/mistral-7b-instruct      0.433   0.325  0.361
4  mistralai/mistral-small-24b-instruct-2501      0.339   0.370  0.349
5                        openai/gpt-4.1-mini      0.700   0.354  0.433
6                         openai/gpt-4o-mini      0.460   0.275  0.303

          File        Model  Precision  Recall     F1
0    var1.json       gpt-4o      1.000   0.500  0.667
1    var1.json  gpt-4o-mini      0.333   0.167  0.222
2   var17.json       gpt-4o      0.500   0.167  0.250
3   var17.json  gpt-4o-mini      0.000   0.000  0.000
4    var2.json       gpt-4o      0.600   0.333  0.429
5    var2.json  gpt-4o-mini      0.500   0.222  0.308
6    var3.json       gpt-4o      0.000   0.000  0.000
7    var3.json  gpt-4o-mini      1.000   0.500  0.667
8    var4.json       gpt-4o      0.000   0.000  0.000
9    var4.json  gpt-4o-mini      0.167   0.167  0.167
10   var5.json       gpt-4o      1.000   0.125  0.222
11   var5.json  gpt-4o-mini      0.250   0.125  0.167



          File        Model  Precision  Recall     F1
0    var1.json       gpt-4o      1.000   1.000  1.000
1    var1.json  gpt-4o-mini      1.000   1.000  1.000
2   var17.json       gpt-4o      0.250   0.333  0.286
3   var17.json  gpt-4o-mini      0.000   0.000  0.000
4    var2.json       gpt-4o      0.750   0.600  0.667
5    var2.json  gpt-4o-mini      0.667   0.400  0.500
6    var3.json       gpt-4o      0.000   0.000  0.000
7    var3.json  gpt-4o-mini      0.667   0.667  0.667
8    var4.json       gpt-4o      0.000   0.000  0.000
9    var4.json  gpt-4o-mini      0.500   0.333  0.400
10   var5.json       gpt-4o      0.500   0.500  0.500
11   var5.json  gpt-4o-mini      1.000   0.500  0.667


