In [1]:
import logging
import os
import sys
from collections import defaultdict
import yaml
import json
import os
import re
import logging
import pandas as pd
import numpy as np
from dotenv import load_dotenv
from functools import partial
from rdflib import Graph, Literal, URIRef
from rdflib.namespace import RDF, RDFS
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import dycomutils as common_utils
from typing import List, Dict, Any, Optional, Set, Tuple, DefaultDict
import openai

sys.path.append("/home/desild/work/research/chatbs/v2")

from src.utils.helpers import setup_logger
from src.utils.parser import graph_query_to_sexpr, is_inv_rel, get_inv_rel, graph_query_to_sparql
from src.utils.kg import get_readable_relation, get_readable_class, get_non_literals, get_nodes_by_class, \
    get_reverse_relation, get_reverse_readable_relation, prune_graph_query, legal_class, legal_relation
from src.utils.arguments import Arguments
from src.utils.sparql import SPARQLUtil, get_freebase_label, get_freebase_literals_by_cls_rel, \
    get_freebase_entid_lbl_by_cls
from src.utils.maps import literal_map

from transformers import set_seed
from tqdm import tqdm

logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s', datefmt='%m/%d/%Y %H:%M:%S',
                    level=logging.INFO)
logger = logging.getLogger(__name__)

from src.explorer_updates import Explorer, ExecutableProgram
from src.utils.graph_manager import GraphManager, regex_add_strings

11/25/2025 14:36:19 - INFO - src.explorer_updates -   Loading .env file from: /home/desild/work/research/chatbs/v2/.env
11/25/2025 14:36:19 - INFO - src.explorer_updates -   Loading config: /home/desild/work/research/chatbs/v2/prov.config.yaml
11/25/2025 14:36:19 - INFO - src.explorer_updates -   Loading metadata: /home/desild/work/research/chatbs/v2/data/workflow/10_sample_graph/chatbs_sample_metadata.json
11/25/2025 14:36:19 - INFO - src.explorer_updates -   Initializing GraphManager...
11/25/2025 14:36:20 - INFO - src.explorer_updates -   Graph loaded with 24073 triples.


In [2]:
def llm_chat(system_prompt: str, user_prompt: str, model_version: str, structured_output: bool = False) -> str:
    """
    Sends one system + user chat request to an OpenAI-compatible API.
    R: llm_chat

    Model names starting with "gpt-" or "o1-" are sent to OpenAI using the
    OPENAI_API_KEY environment variable. Every other model name is sent to the
    local endpoint at http://idea-llm-01.idea.rpi.edu:5000/v1/, using
    LOCAL_LLM_API_KEY if it is set and the placeholder "no-key-needed" otherwise.

    Args:
        system_prompt: Content of the "system" message.
        user_prompt: Content of the "user" message.
        model_version: Model name; also decides which endpoint is used.
        structured_output: When true, adds
            response_format={"type": "json_object"} to the request.

    Returns:
        The message content of the first choice. If the request itself fails,
        the failure is logged and the string "Error: <exception>" is returned
        instead of raising.

    Raises:
        ValueError: If an OpenAI model is requested and OPENAI_API_KEY is unset.
    """
    # Resolve the logger here instead of using the notebook-level `log`: that
    # name is created in a later cell, so relying on it made this function fail
    # with NameError unless that cell had already been executed.
    chat_log = logging.getLogger(__name__)

    # R: if ((startsWith(model_version, "gpt-")) || (startsWith(model_version, "o1-")))
    if model_version.startswith(("gpt-", "o1-")):
        api_key = os.getenv("OPENAI_API_KEY")
        if not api_key:
            raise ValueError("OPENAI_API_KEY not set in .env file")
        client = openai.OpenAI(api_key=api_key)
    else:
        # R: base_url = "http://idea-llm-01.idea.rpi.edu:5000/v1/"
        client = openai.OpenAI(
            base_url="http://idea-llm-01.idea.rpi.edu:5000/v1/",
            api_key=os.getenv("LOCAL_LLM_API_KEY", "no-key-needed")  # Add your local key to .env if needed
        )

    chat_params = {
        "model": model_version,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
    }

    # R: if (!is.null(structured_output))
    if structured_output:
        chat_log.info("Requesting structured (JSON) output from LLM.")
        # This is the modern way to request JSON from OpenAI
        chat_params["response_format"] = {"type": "json_object"}

    try:
        response = client.chat.completions.create(**chat_params)
        return response.choices[0].message.content
    except Exception as e:
        # Deliberate best-effort: callers receive the error as text.
        chat_log.error(f"Error in LLM chat: {e}")
        return f"Error: {e}"

def update_answer(system_prompt: str, user_prompt: str, generated_answer: str, error_message: str, model_version: str) -> str:
    """
    Re-asks the LLM after a failed response, giving it the failure as context.
    R: update_answer

    The follow-up user message contains the original user prompt, the rejected
    answer, the error message, and an instruction to analyze them and produce
    a revised answer. It is sent through llm_chat with the same system prompt
    and model, without requesting structured output.

    Returns:
        Whatever llm_chat returns for the follow-up request.
    """
    # The prompt keeps a leading blank line and a 4-space indent on every line,
    # and ends with an indented empty line.
    prompt_lines = [
        "",
        f"    User prompt : {user_prompt}",
        f"    Incorrect generated answer : {generated_answer}",
        f"    Error message : {error_message}",
        "    Analyze the original user prompt, the incorrect answer, and the error message. Identify where the generated response failed to meet the prompt’s intent. Then, provide a revised answer.",
        "    ",
    ]
    return llm_chat(system_prompt, "\n".join(prompt_lines), model_version)


def create_timestamp_id(prefix:str):
    """
    Builds an identifier of the form "<prefix>_<YYYYMMDDHHMMSS>".
    R: create_timestamp_id

    The timestamp is the local wall-clock time with one-second resolution, so
    two calls with the same prefix within the same second return the same
    string.
    """
    from datetime import datetime
    # The format spec after ":" is handed to strftime by datetime.__format__.
    return f"{prefix}_{datetime.now():%Y%m%d%H%M%S}"

In [3]:
# --- 1. Setup & Configuration ---
# Absolute, machine-specific project root. V2_DIR is derived from it and is the
# base for the .env, config, metadata, graph, schema and index paths used in
# the cells below, so this is the line to change on another checkout.
ROOT_DIR = os.path.abspath("/home/desild/work/research/chatbs")
V2_DIR = os.path.join(ROOT_DIR, "v2")
# Relative path (not joined with V2_DIR here) of the explored-programs pickle.
EXPLORED_PROGRAMS_PICKLE = "data/workflow/explored_programs.pkl"

# Setup basic logging
# NOTE(review): logging.basicConfig was already called in the import cell, and
# basicConfig does nothing once the root logger has handlers (unless force=True
# is passed). The recorded output of the next cell still shows the first
# cell's "%(name)s" format, so this call appears to have no effect — confirm
# and either drop it or pass force=True.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Same logger object as `logger` from the import cell (both come from
# logging.getLogger(__name__)); llm_chat and the loading cell use this name.
log = logging.getLogger(__name__)

In [4]:
# Load .env file from the specified path
# R: load_dot_env("../ChatBS-NexGen/.env")
env_path = os.path.join(V2_DIR, ".env")
log.info(f"Loading .env file from: {env_path}")
# load_dotenv returns False when it found no variables to set (for example,
# the file is missing or empty). Report that here rather than letting it
# surface later as an unexplained "OPENAI_API_KEY not set" error.
if not load_dotenv(env_path):
    log.warning(f"No environment variables were loaded from: {env_path}")

# Load YAML config
# R: config <- yaml::read_yaml(...)
config_path = os.path.join(V2_DIR, "prov.config.yaml")
log.info(f"Loading config: {config_path}")
# Explicit encoding so the result does not depend on the machine's locale.
with open(config_path, 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

# Load JSON metadata
# R: ttl_metadata <- readLines("QGraph_metadata.json")
metadata_path = os.path.join(V2_DIR, "data/workflow/chatbs_sample_metadata.json")
log.info(f"Loading metadata: {metadata_path}")
with open(metadata_path, 'r', encoding='utf-8') as f:
    ttl_metadata = json.load(f)

11/25/2025 14:36:20 - INFO - __main__ -   Loading .env file from: /home/desild/work/research/chatbs/v2/.env
11/25/2025 14:36:20 - INFO - __main__ -   Loading config: /home/desild/work/research/chatbs/v2/prov.config.yaml
11/25/2025 14:36:20 - INFO - __main__ -   Loading metadata: /home/desild/work/research/chatbs/v2/data/workflow/chatbs_sample_metadata.json


In [5]:
# Graph manager over the FnO Turtle export of the explored programs
# (the recorded run logged 6686 triples for this file).
graph_manager = GraphManager(
    config,
    os.path.join(V2_DIR, "data/workflow/explored_programs_fno.ttl"),
)

# Schema JSON; only its 'classes' and 'relations' entries are read here.
schema = common_utils.serialization.load_json(
    os.path.join(V2_DIR, "data/workflow/schema.json")
)

# Class definitions are passed through as-is; each relation is reduced to its
# "description" field.
definitions = dict(
    class_definitions=schema['classes'],
    relation_definitions={
        rel_name: rel_spec["description"]
        for rel_name, rel_spec in schema['relations'].items()
    },
)

11/25/2025 14:36:20 - INFO - src.utils.graph_manager -   Initializing GraphManager...
11/25/2025 14:36:20 - INFO - src.utils.graph_manager -   Graph loaded with 6686 triples.


In [6]:
import ollama


In [7]:
# Query template returning, for one fno:Mapping, the question label, function
# name ("paths"), example label, and parameter/return predicates with labels.
# The literal "{map_id}" placeholder is filled in by regex_add_strings(...,
# map_id=...) before the query is run.
# Fix: the fnoi prefix IRI previously started with "hhttps://".
sparql_function_details = """PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        PREFIX owl: <http://www.w3.org/2002/07/owl#>
        PREFIX ep: <http://linkedu.eu/dedalo/explanationPattern.owl#>
        PREFIX eo: <https://purl.org/heals/eo#>
        PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
        PREFIX dc: <http://purl.org/dc/elements/1.1/>
        PREFIX food: <http://purl.org/heals/food/>
        PREFIX prov: <http://www.w3.org/ns/prov#>
        PREFIX provone: <http://purl.org/provone#>
        PREFIX sio:<http://semanticscience.org/resource/>
        PREFIX cwfo: <http://cwf.tw.rpi.edu/vocab#>
        PREFIX dcterms: <http://purl.org/dc/terms#>
        PREFIX user: <http://testwebsite/testUser#>
        PREFIX DFColumn: <http://testwebsite/testDFColumn#>
        PREFIX fnom: <https://w3id.org/function/vocabulary/mapping#>
        PREFIX fnoi: <https://w3id.org/function/vocabulary/implementation#>
        PREFIX fnoc: <https://w3id.org/function/vocabulary/composition/0.1.0/>
        PREFIX dbo: <http://dbpedia.org/ontology/>
        PREFIX dbp: <http://dbpedia.org/property/>
        PREFIX dbt: <http://dbpedia.org/resource/Template:>
        PREFIX ques: <http://atomic_questions.org/>
        PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
        PREFIX fno: <https://w3id.org/function/vocabulary/core#>

        SELECT ?question_lbl ?paths ?example ?param_map ?param_desc ?return_map ?return_desc
        WHERE {
            <{map_id}> fno:implementation/rdfs:label ?imp ;
                       fno:function ?function ;
                       fno:parameterMapping/fnom:functionParameter/fno:predicate ?param_map ;
                       fno:parameterMapping/fnom:functionParameter/rdfs:label ?param_desc ;
                       fno:returnMapping/fnom:functionOutput/fno:predicate ?return_map ;
                       fno:returnMapping/fnom:functionOutput/rdfs:label ?return_desc .
            ?function fno:executes/rdfs:label ?example.
            ?function fno:solves/fno:name ?question_lbl .
            ?function fno:name ?paths .

        }"""
                                        
# Query listing every resource typed fno:Mapping (result variable: ?mapping).
# Fix: the fnoi prefix IRI previously started with "hhttps://".
sparql_get_mapping_obj = """PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX ep: <http://linkedu.eu/dedalo/explanationPattern.owl#>
PREFIX eo: <https://purl.org/heals/eo#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX dc: <http://purl.org/dc/elements/1.1/>
PREFIX food: <http://purl.org/heals/food/>
PREFIX prov: <http://www.w3.org/ns/prov#>
PREFIX provone: <http://purl.org/provone#>
PREFIX sio:<http://semanticscience.org/resource/>
PREFIX cwfo: <http://cwf.tw.rpi.edu/vocab#>
PREFIX dcterms: <http://purl.org/dc/terms#>
PREFIX user: <http://testwebsite/testUser#>
PREFIX DFColumn: <http://testwebsite/testDFColumn#>
PREFIX fnom: <https://w3id.org/function/vocabulary/mapping#>
PREFIX fnoi: <https://w3id.org/function/vocabulary/implementation#>
PREFIX fnoc: <https://w3id.org/function/vocabulary/composition/0.1.0/>
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbp: <http://dbpedia.org/property/>
PREFIX dbt: <http://dbpedia.org/resource/Template:>
PREFIX ques: <http://atomic_questions.org/>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX fno: <https://w3id.org/function/vocabulary/core#>

SELECT ?mapping
WHERE {
    ?mapping a fno:Mapping .
}
"""

In [8]:
# Parallel lists: vec_repr[i] is the embedding of the text stored in obj_id[i].
vec_repr = []
# NOTE(review): despite its name, this list holds the embedded text for each
# mapping, not the mapping IRI. Left unchanged because the saved index may be
# consumed in this shape elsewhere — confirm which one is wanted.
obj_id = []

# All fno:Mapping resources in the graph.
objs = graph_manager.query(sparql_get_mapping_obj)["mapping"].to_list()


for o in tqdm(objs):
    obj_info = graph_manager.query(
        regex_add_strings(sparql_function_details,
                            map_id=o)
    )

    # A mapping without function details previously raised IndexError on the
    # first-row lookup and aborted the whole run; skip it and say so instead.
    if len(obj_info) == 0:
        log.warning(f"No function details found for mapping {o}; skipping.")
        continue

    # Only the first result row is used. Fields are read by SELECT variable
    # name rather than by position in to_records(), where slot 0 is the index
    # and every column is silently shifted by one.
    obj_info = obj_info.iloc[0]
    
    _template = f"""
    Function Answers : {obj_info['question_lbl']}
    Function Paths: {obj_info['paths']}
    
    SPARQL
    {obj_info['example']}
    
    """
    
    embed = ollama.embeddings(
        model='nomic-embed-text', 
        prompt=_template
        )
    
    vec_repr.append(embed["embedding"])
    obj_id.append(_template)

  0%|          | 0/82 [00:00<?, ?it/s]11/25/2025 14:36:20 - INFO - httpx -   HTTP Request: POST http://127.0.0.1:11434/api/embeddings "HTTP/1.1 200 OK"
11/25/2025 14:36:20 - INFO - httpx -   HTTP Request: POST http://127.0.0.1:11434/api/embeddings "HTTP/1.1 200 OK"
11/25/2025 14:36:20 - INFO - httpx -   HTTP Request: POST http://127.0.0.1:11434/api/embeddings "HTTP/1.1 200 OK"
11/25/2025 14:36:20 - INFO - httpx -   HTTP Request: POST http://127.0.0.1:11434/api/embeddings "HTTP/1.1 200 OK"
11/25/2025 14:36:20 - INFO - httpx -   HTTP Request: POST http://127.0.0.1:11434/api/embeddings "HTTP/1.1 200 OK"
  6%|▌         | 5/82 [00:00<00:01, 45.04it/s]11/25/2025 14:36:20 - INFO - httpx -   HTTP Request: POST http://127.0.0.1:11434/api/embeddings "HTTP/1.1 200 OK"
11/25/2025 14:36:20 - INFO - httpx -   HTTP Request: POST http://127.0.0.1:11434/api/embeddings "HTTP/1.1 200 OK"
11/25/2025 14:36:20 - INFO - httpx -   HTTP Request: POST http://127.0.0.1:11434/api/embeddings "HTTP/1.1 200 OK"
11/2

In [9]:
# Stack the per-mapping embedding lists into a single array; the recorded run
# printed (82, 768) for the shape.
vec_repr = np.array(vec_repr)
print(vec_repr.shape)

# Persist the texts and their vectors together under the keys 'obj_id' and
# 'vec_repr'.
common_utils.serialization.save_pickle(
    dict(obj_id=obj_id, vec_repr=vec_repr),
    os.path.join(V2_DIR, "data/workflow/function_vector_index.pkl"),
)

(82, 768)
