In [1]:
# %%
import logging
import os
import sys
from collections import defaultdict
import torch
import yaml
import json
import os
import re
import logging
import pandas as pd
from dotenv import load_dotenv
from dotenv import load_dotenv
from functools import partial
from rdflib import Graph, Literal, URIRef
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import dycomutils as common_utils
from typing import List, Dict, Any, Optional, Set, Tuple, DefaultDict
import openai
import ollama
import random

# Make the project's `src` package importable for the `from src...` imports below.
# NOTE(review): this is a hard-coded absolute path to one user's checkout; the notebook
# will not import `src` on another machine unless the path is adjusted.
sys.path.append("/home/desild/work/research/chatbs/v2")

from src.utils.helpers import setup_logger
from src.utils.parser import (
    graph_query_to_sexpr,
    is_inv_rel,
    get_inv_rel,
    graph_query_to_sparql,
)
from src.utils.kg import (
    get_readable_relation,
    get_readable_class,
    get_non_literals,
    get_nodes_by_class,
    get_reverse_relation,
    get_reverse_readable_relation,
    prune_graph_query,
    legal_class,
    legal_relation,
)
from src.utils.arguments import Arguments
from src.utils.sparql import (
    SPARQLUtil,
    get_freebase_label,
    get_freebase_literals_by_cls_rel,
    get_freebase_entid_lbl_by_cls,
)
from src.utils.maps import literal_map

from transformers import set_seed
from tqdm import tqdm
from sentence_transformers.util import semantic_search

# Configure root logging once for the notebook: timestamped INFO-level records.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO,
)
# Logger used by the helper functions defined later in this notebook.
log = logging.getLogger(__name__)


from src.explorer_updates import Explorer, ExecutableProgram
from src.utils.graph_manager import GraphManager, regex_add_strings


12/01/2025 10:30:37 - INFO - src.explorer_updates -   Loading .env file from: /home/desild/work/research/chatbs/v2/.env
12/01/2025 10:30:37 - INFO - src.explorer_updates -   Loading config: /home/desild/work/research/chatbs/v2/prov.config.yaml
12/01/2025 10:30:37 - INFO - src.explorer_updates -   Loading metadata: /home/desild/work/research/chatbs/v2/data/workflow/10_sample_graph/chatbs_sample_metadata.json
12/01/2025 10:30:37 - INFO - src.explorer_updates -   Initializing GraphManager...
12/01/2025 10:30:37 - INFO - src.explorer_updates -   Graph loaded with 24073 triples.


In [2]:
# Project root and the v2 working directory that holds the run logs.
# NOTE(review): absolute path tied to one user's machine.
ROOT_DIR = os.path.abspath("/home/desild/work/research/chatbs")
V2_DIR = os.path.join(ROOT_DIR, "v2")

run_info = {}
_temp = {}
# Ground-truth file, resolved relative to the notebook's current working directory.
# NOTE(review): the folder is spelled "evalutions" here, while the cell that saves the
# results CSV writes to "evaluations" -- confirm which spelling is intended.
gt = common_utils.serialization.load_json(os.path.join("evalutions", "GT", "questions_results.json"))

# Re-key the ground truth by its "question" field so that it can be indexed with the
# same keys as run_info (the evaluation loop does gt[k] for every run_info key).
for k,v in gt.items():
    _temp[v["question"]] = v
    
gt = _temp
# Load every entry of logs/v3; the key is the file name with ".json" removed.
# NOTE(review): every directory entry is passed to load_json, so a non-JSON file or
# sub-directory in logs/v3 would presumably fail here -- verify the folder contents.
for q in os.listdir(os.path.join(V2_DIR, "logs/v3")):
    run_info[q.replace(".json", "")] = common_utils.serialization.load_json(os.path.join(V2_DIR, "logs/v3", q))

In [3]:
# Sanity check: run-log keys that have no ground-truth entry (expected to be empty,
# otherwise the gt[k] lookup in the evaluation loop raises KeyError).
set(run_info.keys()) - set(gt.keys())

set()

In [4]:
def extract_json_from_markdown(text: str) -> str | None:
    """
    Extracts JSON from a markdown code block.
    R: extract_json_from_markdown_stringr (simulated)
    """
    match = re.search(r"```json\s*([\s\S]*?)\s*```", text)
    if match:
        return match.group(1)
    return None

def return_json_formatted(model_response: str):
    """
    Decode a model response as JSON, falling back to a ```json markdown block.

    The raw text is parsed first. If that raises ``json.JSONDecodeError`` the
    text is searched for a ```json fenced block and that block is parsed
    instead. When the fallback also fails (no block found, or the block is not
    valid JSON) the error is logged and an empty list is returned.
    """
    # Layer 1: the response is already plain JSON.
    try:
        parsed = json.loads(model_response)
    except json.JSONDecodeError as first_err:
        log.warning(
            f"Error parsing JSON (layer 1): {first_err}. Trying to extract from markdown."
        )
    else:
        return parsed

    # Layer 2: the JSON is wrapped in a markdown code fence.
    try:
        fenced = extract_json_from_markdown(model_response)
        if not fenced:
            raise ValueError("No JSON markdown content extracted.")
        return json.loads(fenced)
    except Exception as second_err:
        log.error(f"Error in parsing JSON (layer 2): {second_err}")
        # Callers receive an empty list when nothing could be parsed.
        return []

def llm_chat(
    system_prompt: str,
    user_prompt: str,
    model_version: str,
    structured_output: bool = False,
) -> str:
    """
    Send a system + user message pair to a chat-completions endpoint.

    Models whose name starts with "gpt-" or "o1-" go to the OpenAI API and
    require ``OPENAI_API_KEY`` in the environment (``ValueError`` otherwise).
    Any other model name is sent to the fixed OpenAI-compatible server, using
    ``LOCAL_LLM_API_KEY`` when it is set.

    When ``structured_output`` is true the request asks for a JSON-object
    response format. Returns the content of the first choice; if the request
    raises, the error is logged and the string ``"Error: <exception>"`` is
    returned instead of propagating.
    """
    if model_version.startswith(("gpt-", "o1-")):
        openai_key = os.getenv("OPENAI_API_KEY")
        if not openai_key:
            raise ValueError("OPENAI_API_KEY not set in .env file")
        client = openai.OpenAI(api_key=openai_key)
    else:
        # Self-hosted endpoint; the key is optional there.
        client = openai.OpenAI(
            base_url="http://idea-llm-01.idea.rpi.edu:5000/v1/",
            api_key=os.getenv("LOCAL_LLM_API_KEY", "no-key-needed"),
        )

    request = {
        "model": model_version,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
    }

    if structured_output:
        log.info("Requesting structured (JSON) output from LLM.")
        request["response_format"] = {"type": "json_object"}

    try:
        completion = client.chat.completions.create(**request)
        return completion.choices[0].message.content
    except Exception as err:
        log.error(f"Error in LLM chat: {err}")
        return f"Error: {err}"


def get_plan(ques_info) -> Dict[str, Any]:
    """Return ``ques_info["planning_agent"]["sub_question"]["final_response"]``."""
    return ques_info["planning_agent"]["sub_question"]["final_response"]

def collect_entities_retrieved(ques_info) -> Dict[str, Set[str]]:
    """
    Collect, per executor step, the set of entities that step retrieved.

    Only keys of ``ques_info["sparql_executor_agent"]`` that contain "step"
    are considered. For each such step the ``"_results"`` value is read:

    * a string yields an empty set;
    * a dict with ``"linked_entities"`` yields that list as a set;
    * otherwise a dict with ``"important_entities"`` yields that list as a set;
    * any other dict is treated as ``{subject: [row, ...]}``: every subject key
      is collected together with each row's ``"o"`` value, or its ``"po"``
      value when ``"o"`` is absent.

    Returns a mapping ``step name -> set of entities``.
    """
    entities_retrieved: Dict[str, Set[str]] = {}
    agent = ques_info["sparql_executor_agent"]

    for step, step_info in agent.items():
        if "step" not in step:
            continue

        step_results = step_info["_results"]

        if isinstance(step_results, str):
            entities_retrieved[step] = set()
        elif "linked_entities" in step_results:
            entities_retrieved[step] = set(step_results["linked_entities"])
        elif "important_entities" in step_results:
            entities_retrieved[step] = set(step_results["important_entities"])
        else:
            # Build the set once per step. Re-creating it for every subject (as the
            # previous version did) kept only the last subject's entities, and left
            # the step without any entry when the result dict was empty.
            found: Set[str] = set()
            for subject, rows in step_results.items():
                found.add(subject)
                for row in rows:
                    found.add(row["o"] if "o" in row else row["po"])
            entities_retrieved[step] = found

    return entities_retrieved

def get_gt_info(gt_item) -> Dict[str, Any]:
    """
    Build the ground-truth payload for one question.

    Reads the ``"answer"``, ``"results"`` and ``"entity"`` fields of
    ``gt_item`` and returns ``{"entity": <entity>, "answer": <text>}`` where
    the text lists the answer, the results and the entities in one block.
    """
    answer, results, entity = gt_item["answer"], gt_item["results"], gt_item["entity"]

    # Same text as the indented multi-line template, spelled out line by line.
    rendered = (
        "\n"
        f"    {answer}\n"
        "    \n"
        "    The results obtained are: \n"
        f"    {results}\n"
        "    \n"
        "    Entities involved are:\n"
        f"    {entity}\n"
        "    "
    )

    return {"entity": entity, "answer": rendered}

def llm_evaluation(llm_answer, gt_answer, model_version: str = "gpt-4o") -> Dict[str, float]:
    """
    Ask an LLM judge to grade ``llm_answer`` against ``gt_answer``.

    Args:
        llm_answer: The answer text produced by the system under evaluation.
        gt_answer: The ground-truth answer text to compare against.
        model_version: Judge model passed to ``llm_chat`` (defaults to "gpt-4o").

    Returns:
        A dict with the keys "completeness", "accuracy" and "relevance". The
        prompt asks for scores between 0 and 1. If the judge's reply cannot be
        parsed into a JSON object, every metric is NaN so the caller still
        gets a dict with the expected keys.
    """
    # The JSON template and the "Scoring" section must state the same scale; the
    # template previously said "1 to 5" while the scoring rule said "0 to 1".
    system_prompt = """
    You are an evaluation agent that compares the output of a language model against a ground truth answer. 
    Your task is to assess the quality of the language model's answer.
    
    Output Format:
    Provide your evaluation in the following JSON format:
    {
        "completeness": <score from 0 to 1>,
        "accuracy": <score from 0 to 1>,
        "relevance": <score from 0 to 1>
    }
    
    Guidelines for Evaluation:
    1. Completeness: Check if the LLM's answer covers all aspects of the ground truth answer.
    2. Accuracy: Verify that the information provided in the LLM's answer is correct and aligns with the ground truth.
    3. Relevance: Ensure that the LLM's answer is pertinent to the question asked and does not include extraneous information.
    
    Scoring:
     - give a score from 0 to 1 for each of the following metrics: completeness, accuracy, relevance.
    """

    user_prompt = f"""
    Ground Truth Answer:
    
    {gt_answer}
    
    LLM Answer:
    
    {llm_answer}
    """

    raw_response = llm_chat(system_prompt, user_prompt, model_version)
    parsed = return_json_formatted(raw_response)

    # return_json_formatted yields [] on failure (and llm_chat yields an "Error: ..."
    # string that fails to parse); normalise anything that is not a JSON object.
    if not isinstance(parsed, dict):
        log.warning("LLM evaluation did not return a JSON object; scores set to NaN.")
        return {
            "completeness": float("nan"),
            "accuracy": float("nan"),
            "relevance": float("nan"),
        }
    return parsed

def prec_recall(llm_answer, gt_answer) -> Dict[str, float]:
    """
    Compute entity recall of the retrieved entities against the ground truth.

    Args:
        llm_answer: Mapping ``"step<N>" -> iterable of entities`` for each step.
        gt_answer: Dict whose ``"entity"`` value lists the ground-truth entities.

    Returns:
        A dict with:
        * ``recall_final`` - share of ground-truth entities found in the step
          with the highest number;
        * ``recall_total`` - share of ground-truth entities found in any step;
        * ``total_retrived_entities`` - number of retrieved entities summed over
          all steps (an entity returned by several steps is counted each time).

        Both recalls are 0.0 when there are no ground-truth entities, and all
        values are zero when ``llm_answer`` has no steps.
    """
    gt_ent = set(gt_answer["entity"])

    if llm_answer:
        # Pick the key itself by its numeric suffix so that "step10" outranks "step2"
        # and the lookup does not depend on re-formatting the number.
        final_step = max(llm_answer, key=lambda name: int(name.replace("step", "")))
        final_step_ent = set(llm_answer[final_step])
    else:
        # No steps collected: previously max() of an empty list raised ValueError.
        final_step_ent = set()

    total_retrived_entities = [x for y in llm_answer.values() for x in y]
    all_retrieved = set(total_retrived_entities)

    print(f"GT Entities: {gt_ent}")
    print(f"Final Step Entities: {final_step_ent}")
    print(f"Total Retrieved Entities: {all_retrieved}")

    # true positives + false negatives is always the size of the ground-truth set.
    gt_size = len(gt_ent)
    recall_final = len(final_step_ent & gt_ent) / gt_size if gt_size > 0 else 0.0
    recall_total = len(all_retrieved & gt_ent) / gt_size if gt_size > 0 else 0.0

    return {"recall_final": recall_final, "recall_total": recall_total, "total_retrived_entities": len(total_retrived_entities)}

In [5]:
# Evaluate every logged run against its ground truth and collect one result row per question.
# Each iteration calls llm_evaluation, which issues one request to the gpt-4o judge.
full_res = []
for k,v in run_info.items():
    # Debug filter / prints kept from development (disabled):
    # if not "what is the program that takes the system prompt created by the program system_prompt_generator as an input ?" == k:
    #     continue
    #print(k)
    #print(v["final_answer"])
    
    results = {}
    
    # NOTE(review): plan_ext is not used below; the call only checks that the plan exists in the log.
    plan_ext = get_plan(v)
    # Entities retrieved by each executor step of this run.
    collected_entities = collect_entities_retrieved(v)
    
    # Ground-truth entry with the same key as the run log.
    gt_info = get_gt_info(gt[k])
    #print(gt_info["entity"])
    #print(collected_entities)
    
    results["question"] = v["question"]
    results["tags"] = gt[k].get("tags", [])
    # LLM-judged scores for the final answer, and entity recall from the retrieved entities.
    llm_eval = llm_evaluation(v["final_answer"], gt_info["answer"])
    stats_eval = prec_recall(collected_entities, gt_info)
    
    print(llm_eval)
    print(stats_eval)
    
    # Merge both metric dicts into the row for this question.
    results.update(llm_eval)
    results.update(stats_eval)
    
    full_res.append(results)

12/01/2025 10:30:39 - INFO - httpx -   HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


GT Entities: {'http://testwebsite/testProgram#AI_Task-information_extractor-Input-text'}
Final Step Entities: {"Extract all the main ingredients without any subsidiary; return in singular noun form and First letter capital. Extract mean and dairy product in its' base form. (e.g., from 'egg white' extract 'Egg')@en^^<xsd:string>", 'http://testwebsite/testProgram#AI_Task-information_extractor-Input-text'}
Total Retrieved Entities: {'http://testwebsite/testProgram#AI_Task-Pipeline', "Extract all the main ingredients without any subsidiary; return in singular noun form and First letter capital. Extract mean and dairy product in its' base form. (e.g., from 'egg white' extract 'Egg')@en^^<xsd:string>", 'http://testwebsite/testProgram#AI_Task-llm_chat', 'http://testwebsite/testProgram#AI_Task-information_extractor-Input-text', 'http://testwebsite/testProgram#AI_Task-information_extractor-Input-builder_llm', 'http://testwebsite/testProgram#AI_Task-system_prompt_generator', 'http://testwebsite/

12/01/2025 10:30:40 - INFO - httpx -   HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


GT Entities: {'http://testwebsite/testProgram#AI_Task-system_prompt_generator-Input-builder_llm'}
Final Step Entities: {'gpt-4o@en^^<xsd:string>', 'http://testwebsite/testProgram#AI_Task-system_prompt_generator-Input-builder_llm'}
Total Retrieved Entities: {'gpt-4o@en^^<xsd:string>', 'http://testwebsite/testProgram#AI_Task-Pipeline', 'http://testwebsite/testProgram#AI_Task-llm_chat', 'http://testwebsite/testProgram#AI_Task-system_prompt_generator', 'http://testwebsite/testProgram#AI_Task-batch_sparql_query_extractor', 'http://testwebsite/testProgram#AI_Task-information_extractor', 'http://testwebsite/testProgram#AI_Task-sparql_query_extractor', 'http://testwebsite/testProgram#AI_Task-query_result_post_processor', 'http://testwebsite/testProgram#AI_Task-DF_combine', 'http://testwebsite/testProgram#AI_Task-system_prompt_generator-Input-builder_llm'}
{'completeness': 0.5, 'accuracy': 0.5, 'relevance': 0.3}
{'recall_final': 1.0, 'recall_total': 1.0, 'total_retrived_entities': 12}


12/01/2025 10:30:42 - INFO - httpx -   HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


GT Entities: {'http://testwebsite/testProgram#Data-id_20251125040052_369-kg_info', 'http://testwebsite/testProgram#Data-id_20251125040052_494-kg_info', 'http://testwebsite/testProgram#Data-id_20251125040053_856-kg_info', 'http://testwebsite/testProgram#Data-id_20251125040053_606-kg_info', 'http://testwebsite/testProgram#Data-id_20251125040053_718-kg_info', 'http://testwebsite/testProgram#Data-id_20251125040053_793-kg_info', 'http://testwebsite/testProgram#Data-id_20251125040053_493-kg_info', 'http://testwebsite/testProgram#Data-id_20251125040053_508-kg_info', 'http://testwebsite/testProgram#Data-id_20251125040053_594-kg_info', 'http://testwebsite/testProgram#Data-id_20251125040053_455-kg_info', 'http://testwebsite/testProgram#Data-id_20251125040053_188-kg_info'}
Final Step Entities: {'@en^^<xsd:string>', 'http://testwebsite/testProgram#Collection-id_20251125040052_388-kg_info', 'NA@en^^<xsd:string>', 'http://purl.org/provone#Data'}
Total Retrieved Entities: {'http://testwebsite/testPro

12/01/2025 10:30:43 - INFO - httpx -   HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


GT Entities: {'http://testwebsite/testProgram#AI_Task-information_extractor-Input-builder_llm'}
Final Step Entities: {'http://testwebsite/testProgram#AI_Task-information_extractor-Input-builder_llm', 'gpt-4o@en^^<xsd:string>'}
Total Retrieved Entities: {'gpt-4o@en^^<xsd:string>', 'http://testwebsite/testProgram#AI_Task-Pipeline', 'http://testwebsite/testProgram#AI_Task-llm_chat', 'http://testwebsite/testProgram#AI_Task-information_extractor-Input-builder_llm', 'http://testwebsite/testProgram#AI_Task-system_prompt_generator', 'http://testwebsite/testProgram#AI_Task-batch_sparql_query_extractor', 'http://testwebsite/testProgram#AI_Task-information_extractor', 'http://testwebsite/testProgram#AI_Task-sparql_query_extractor', 'http://testwebsite/testProgram#AI_Task-query_result_post_processor', 'http://testwebsite/testProgram#AI_Task-DF_combine'}
{'completeness': 1, 'accuracy': 1, 'relevance': 1}
{'recall_final': 1.0, 'recall_total': 1.0, 'total_retrived_entities': 11}


12/01/2025 10:30:43 - INFO - httpx -   HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


GT Entities: {'http://testwebsite/testProgram#AI_Task-query_result_post_processor-Input-formatting_instruction', 'http://testwebsite/testProgram#AI_Task-query_result_post_processor-Input-processing_instruction'}
Final Step Entities: {"If there are duplicate ingredients then select row with most information on `sugarG` and `ingredientCat` columns. Fill all the missing values with '0' in `sugarG` column and with '-' in 'ingredientCat` column.@en^^<xsd:string>", 'http://testwebsite/testProgram#AI_Task-query_result_post_processor-Input-processing_instruction'}
Total Retrieved Entities: {"If there are duplicate ingredients then select row with most information on `sugarG` and `ingredientCat` columns. Fill all the missing values with '0' in `sugarG` column and with '-' in 'ingredientCat` column.@en^^<xsd:string>", 'http://testwebsite/testProgram#AI_Task-Pipeline', 'http://testwebsite/testProgram#AI_Task-llm_chat', 'http://testwebsite/testProgram#AI_Task-query_result_post_processor-Input-proc

12/01/2025 10:30:45 - INFO - httpx -   HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


GT Entities: {'http://testwebsite/testProgram#llm_chat'}
Final Step Entities: set()
Total Retrieved Entities: set()
{'completeness': 1, 'accuracy': 1, 'relevance': 1}
{'recall_final': 0.0, 'recall_total': 0.0, 'total_retrived_entities': 0}


12/01/2025 10:30:46 - INFO - httpx -   HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


GT Entities: {'http://testwebsite/testProgram#AI_Task-Pipeline', 'http://testwebsite/testProgram#AI_Task-llm_chat', 'http://testwebsite/testProgram#AI_Task-system_prompt_generator', 'http://testwebsite/testProgram#AI_Task-batch_sparql_query_extractor', 'http://testwebsite/testProgram#AI_Task-information_extractor', 'http://testwebsite/testProgram#AI_Task-sparql_query_extractor', 'http://testwebsite/testProgram#AI_Task-query_result_post_processor', 'http://testwebsite/testProgram#AI_Task-DF_combine'}
Final Step Entities: {'http://testwebsite/testProgram#AI_Task-system_prompt_generator-Input-base_sys_prompt', 'http://testwebsite/testProgram#system_prompt_generator', 'https://purl.org/heals/eo#AITask', 'http://testwebsite/testProgram#AI_Task-system_prompt_generator', 'http://testwebsite/testProgram#AI_Task-system_prompt_generator-Input-builder_llm'}
Total Retrieved Entities: {'http://testwebsite/testProgram#AI_Task-system_prompt_generator-Input-base_sys_prompt', 'http://testwebsite/testPr

12/01/2025 10:30:48 - INFO - httpx -   HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


GT Entities: {'http://testwebsite/testProgram#Data-id_20251125040052_801-extracts', 'http://testwebsite/testProgram#Data-id_20251125040052_440-extracts', 'http://testwebsite/testProgram#Data-id_20251125040052_762-extracts', 'http://testwebsite/testProgram#Data-id_20251125040052_146-extracts', 'http://testwebsite/testProgram#Data-id_20251125040052_740-extracts', 'http://testwebsite/testProgram#Data-id_20251125040052_802-extracts', 'http://testwebsite/testProgram#Data-id_20251125040052_631-extracts', 'http://testwebsite/testProgram#Data-id_20251125040052_394-extracts'}
Final Step Entities: {'Pepper@en^^<xsd:string>', 'http://purl.org/provone#Data', 'Feta cheese@en^^<xsd:string>', 'Egg@en^^<xsd:string>', 'Salt@en^^<xsd:string>', 'Spinach@en^^<xsd:string>', 'http://testwebsite/testProgram#Collection-id_20251125040052_285-extracts', 'Turkey bacon@en^^<xsd:string>', 'Parsley@en^^<xsd:string>', 'Olive oil@en^^<xsd:string>'}
Total Retrieved Entities: {'http://testwebsite/testProgram#Collection

12/01/2025 10:30:50 - INFO - httpx -   HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


GT Entities: set()
Final Step Entities: {'http://testwebsite/testProgram#AI_Task-sparql_query_extractor-Input-sparql_query_template', 'PREFIX dbo: <http://dbpedia.org/ontology/>\nPREFIX dbp: <http://dbpedia.org/property/>\nPREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>\nPREFIX dcterms: <http://purl.org/dc/terms/>\nPREFIX dbt: <http://dbpedia.org/resource/Template:>\nPREFIX skos: <http://www.w3.org/2004/02/skos/core#>\n\nSELECT (STR(?ingredientLabel) AS ?ingredientName) \n       (STR(?sugar) AS ?sugarG) \n       (STR(?subjectLabel) AS ?ingredientCat) \nWHERE {\n  ?ingredient rdfs:label ?ingredientLabel .\n  FILTER (?ingredientLabel = "%s"@en) .\n\n  OPTIONAL { ?ingredient dbp:sugars ?sugar . }\n\n  ?ingredient dcterms:subject ?subject .\n  OPTIONAL { ?subject rdfs:label ?subjectLabel . }\n\n  ?subject dbp:wikiPageUsesTemplate ?template0 .\n  FILTER (?template0 IN (dbt:CatAutoTOC, dbt:Cookbook))\n  ?subject skos:broader ?broader .\n\n  OPTIONAL {\n    ?broader dbp:wikiPageUsesTempl

12/01/2025 10:30:51 - INFO - httpx -   HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


GT Entities: {'http://testwebsite/testProgram#Data-id_20251125040037_520-system_prompt'}
Final Step Entities: {'http://testwebsite/testProgram#id_20251125040037_323', "You are a culinary assistant that recommends one personalized dish per request, complete with ingredients and step-by-step instructions. Focus on dietary needs, allergy safety, and easy, enjoyable cooking. You will consider the user's details: 30, Male, diagnosis: Diabetic, height: 182.3, weight: 95.2, BMI: 28.6, food preferences: Carnivore, and allergies: None.@en^^<xsd:string>", 'http://testwebsite/testProgram#Data-id_20251125040037_520-system_prompt', 'http://purl.org/provone#Data'}
Total Retrieved Entities: {'http://testwebsite/testProgram#id_20251125040053_658', 'http://testwebsite/testProgram#id_20251125040053_85', 'http://purl.org/provone#Data', 'http://testwebsite/testProgram#id_20251125040037_323', 'http://testwebsite/testProgram#id_20251125040050_988', 'http://testwebsite/testProgram#id_20251125040053_196', 'ht

12/01/2025 10:30:52 - INFO - httpx -   HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


GT Entities: {'http://testwebsite/testProgram#system_prompt_generator', 'Batch SPARQL Query Extractor@en^^<xsd:string>', 'SPARQL Query Builder@en^^<xsd:string>', 'http://testwebsite/testProgram#query_result_post_processor', 'Query Result Post-Processor@en^^<xsd:string>', 'http://testwebsite/testProgram#information_extractor', 'http://testwebsite/testProgram#sparql_query_extractor', 'Information Extractor@en^^<xsd:string>', 'http://testwebsite/testProgram#DF_combine', 'System Prompt Template Generator@en^^<xsd:string>', 'Data Frame Combiner@en^^<xsd:string>', 'http://testwebsite/testProgram#batch_sparql_query_extractor'}
Final Step Entities: {'https://purl.org/heals/eo#SystemRecommendation', 'http://testwebsite/testProgram#sparql_query_extractor-entity', 'SPARQL Query Builder@en^^<xsd:string>', '3@en^^<xsd:string>', 'http://testwebsite/testProgram#sparql_query_extractor', '2025-11-09 19:30:53@en^^<xsd:string>', 'Builds SPARQL queries to extract information about the entities from the kn

12/01/2025 10:30:53 - INFO - httpx -   HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


GT Entities: {'http://testwebsite/testProgram#information_extractor'}
Final Step Entities: set()
Total Retrieved Entities: set()
{'completeness': 0, 'accuracy': 0, 'relevance': 0}
{'recall_final': 0.0, 'recall_total': 0.0, 'total_retrived_entities': 0}


12/01/2025 10:30:54 - INFO - httpx -   HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


GT Entities: {'http://testwebsite/testProgram#AI_Task-sparql_query_extractor-Input-sparql_query_template'}
Final Step Entities: {'http://testwebsite/testProgram#AI_Task-sparql_query_extractor-Input-sparql_query_template', 'PREFIX dbo: <http://dbpedia.org/ontology/>\nPREFIX dbp: <http://dbpedia.org/property/>\nPREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>\nPREFIX dcterms: <http://purl.org/dc/terms/>\nPREFIX dbt: <http://dbpedia.org/resource/Template:>\nPREFIX skos: <http://www.w3.org/2004/02/skos/core#>\n\nSELECT (STR(?ingredientLabel) AS ?ingredientName) \n       (STR(?sugar) AS ?sugarG) \n       (STR(?subjectLabel) AS ?ingredientCat) \nWHERE {\n  ?ingredient rdfs:label ?ingredientLabel .\n  FILTER (?ingredientLabel = "%s"@en) .\n\n  OPTIONAL { ?ingredient dbp:sugars ?sugar . }\n\n  ?ingredient dcterms:subject ?subject .\n  OPTIONAL { ?subject rdfs:label ?subjectLabel . }\n\n  ?subject dbp:wikiPageUsesTemplate ?template0 .\n  FILTER (?template0 IN (dbt:CatAutoTOC, dbt:Cookbook))

In [6]:
# One row per evaluated question: question text, tags, judge scores and recall metrics.
full_res_df = pd.DataFrame(full_res)

In [7]:
# Display the per-question results table.
full_res_df

Unnamed: 0,question,tags,completeness,accuracy,relevance,recall_final,recall_total,total_retrived_entities
0,what are the instructions used as an object re...,[structural],0.8,0.9,0.7,1.0,1.0,12
1,what was the builder llm used as an object rec...,[structural],0.5,0.5,0.3,1.0,1.0,12
2,what was the sparql results generated by the e...,[row-level],0.5,0.5,0.5,0.0,0.0,32
3,what was the builder llm used as an object rec...,[structural],1.0,1.0,1.0,1.0,1.0,11
4,what are the instructions used as an object re...,[structural],1.0,1.0,1.0,0.5,1.0,12
5,what is the program that takes the system prom...,[structural],1.0,1.0,1.0,0.0,0.0,0
6,What are the AI Tasks that generated programs?,[structural],0.5,0.5,0.5,0.125,1.0,21
7,what were the extracted ingredients generated ...,[row-level],0.9,0.9,1.0,0.0,0.0,32
8,what was the builder llm used as an object rec...,[structural],0.5,0.5,0.5,0.0,0.0,12
9,what was the system prompt generated by the ex...,[row-level],0.9,0.9,1.0,1.0,1.0,18


In [15]:
# Mean scores restricted to questions whose final step retrieved at least one ground-truth entity.
# The recall_total > 0 clause is implied by recall_final > 0, because prec_recall counts the
# final step's entities as part of the total set.
# NOTE(review): this cell shows execution count 15, i.e. it was run out of order relative to its position.
full_res_df.loc[(full_res_df["recall_final"] > 0) & (full_res_df["recall_total"] > 0), ["completeness", "accuracy", "relevance", "recall_final", "recall_total"]].mean()

completeness    0.762500
accuracy        0.775000
relevance       0.725000
recall_final    0.723958
recall_total    0.947917
dtype: float64

In [8]:
# Persist the per-question results as CSV, relative to the notebook's working directory.
# NOTE(review): this writes to "evaluations", while the ground truth is loaded from a folder
# spelled "evalutions" -- confirm whether both directories are intended.
os.makedirs("evaluations", exist_ok=True)
full_res_df.to_csv(os.path.join("evaluations", "v3_evaluation.csv"), index=False)

In [9]:
print("To determine the order of execution for the specified programs in the system, we refer to the identifiers and timestamps provided for each program. Based on the attributes and available data, here's how the order of programs appears:\n\n1. **System Prompt Template Generator**: \n   - **Label**: \"System Prompt Template Generator\"\n   - **Description**: Generates a system prompt based on the user input and dataset structure.\n   - **Identifier**: 1\n   - **Timestamp**: 2025-11-09 19:29:12\n\n   This program initiates the sequence by generating prompts that will be leveraged by the subsequent components, indicating a foundational role in setting up necessary prompt structures that will guide the system behavior.\n\n2. **Information Extractor**: \n   - **Label**: \"Information Extractor\"\n   - **Description**: Extracts entities or relevant information from the generated text.\n   - **Identifier**: 2\n   - **Timestamp**: 2025-11-09 19:30:27\n\n   This program operates soon after the prompt template is generated, identifying and extracting key information based on interactions or generated text from the system.\n\n3. **Batch SPARQL Query Extractor**: \n   - **Label**: \"Batch SPARQL Query Extractor\"\n   - **Description**: Extracts knowledge from the graph in batch mode.\n   - **Identifier**: 3\n   - **Timestamp**: 2025-11-09 19:30:53\n\n   Alongside extracting data, the Batch SPARQL Query Extractor also involves in the initial processing phase, identifying key data from knowledge graphs in batches.\n\n   - **SPARQL Query Builder** is part of this phase as well:\n     - It is identified by the same identifier and performs SPARQL query building as a subtask of batch extraction.\n\n4. 
**Data Frame Combiner**: \n   - **Label**: \"Data Frame Combiner\"\n   - **Description**: Combines multiple data frames resulting from SPARQL queries.\n   - **Identifier**: 3\n\n   While operated simultaneously with the batch SPARQL extractor (given identical identifiers), this can also indicate that it processes after SPARQL queries have extracted the necessary data.\n\n5. **LLM Chat**: \n   - **Label**: \"LLM Chat\"\n   - **Description**: Used for communication with the language model (LLM).\n   - **Timestamp**: (not explicitly provided in the context, typically following extraction and combination phases to utilize processed data or prompt structure)\n\n6. **Query Result Post-Processor**: \n   - **Label**: \"Query Result Post-Processor\"\n   - **Description**: Finalizes the processing of the query results.\n   - **Identifier**: 4\n   - **Timestamp**: 2025-11-09 19:32:26\n\n   This program is the final component, refining the data set results into usable outputs.\n\nCollectively, this sequence constitutes a pipeline that begins with generating necessary prompts, analyzing and extracting relevant data, and then processing and refining the results before the interaction with an LLM model.\n\n### Important Facts:\n1. **Pipeline Steps**:\n   - \"System Prompt Template Generator\" initiated as step 1. - [Link](http://testwebsite/testProgram#system_prompt_generator)\n   - Subsequent processing involves \"Information Extractor\", \"Batch SPARQL Query Extractor\", and \"Data Frame Combiner\". - [Link](http://testwebsite/testProgram#information_extractor), [Link](http://testwebsite/testProgram#batch_sparql_query_extractor), [Link](http://testwebsite/testProgram#DF_combine)\n   - These are followed by \"LLM Chat\" and culminate with \"Query Result Post-Processor\". - [Link](http://testwebsite/testProgram#query_result_post_processor), [Link](http://testwebsite/testProgram#llm_chat)\n\n2. 
**Identifier and Timestamps**: \n   - Identifiers (1 to 4) suggest order and execution priority based on timestamps when available. - [Link](http://testwebsite/testProgram#sparql_query_extractor)\n\nThrough these analyses, the organization of execution steps becomes evident, forming a coherent flow for data and information processing within the system pipeline.")

To determine the order of execution for the specified programs in the system, we refer to the identifiers and timestamps provided for each program. Based on the attributes and available data, here's how the order of programs appears:

1. **System Prompt Template Generator**: 
   - **Label**: "System Prompt Template Generator"
   - **Description**: Generates a system prompt based on the user input and dataset structure.
   - **Identifier**: 1
   - **Timestamp**: 2025-11-09 19:29:12

   This program initiates the sequence by generating prompts that will be leveraged by the subsequent components, indicating a foundational role in setting up necessary prompt structures that will guide the system behavior.

2. **Information Extractor**: 
   - **Label**: "Information Extractor"
   - **Description**: Extracts entities or relevant information from the generated text.
   - **Identifier**: 2
   - **Timestamp**: 2025-11-09 19:30:27

   This program operates soon after the prompt template is gener

In [10]:
# Collapse the tag list into a single question type ("row-level" if that tag is present,
# otherwise "structural") and drop the non-numeric columns so the frame can be averaged.
# Guarded on the "tags" column so that re-running this cell after the columns have already
# been dropped is a no-op instead of raising KeyError.
if "tags" in full_res_df.columns:
    full_res_df = full_res_df.assign(
        ques_type=full_res_df["tags"].apply(lambda x: "row-level" if "row-level" in x else "structural")
    ).drop(columns=["tags", "question"])

In [11]:
# Average every remaining metric column per question type.
full_res_df.groupby("ques_type").mean()

Unnamed: 0_level_0,completeness,accuracy,relevance,recall_final,recall_total,total_retrived_entities
ques_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
row-level,0.8,0.8,0.825,0.5,0.5,23.5
structural,0.644444,0.655556,0.611111,0.421296,0.62037,10.888889
