In [None]:
import app.agent as agent

In [None]:
graph = agent.build_agent_with_router()

In [None]:
user_text = "Удали таблицу fact_sales из базы данных"

msg = {"messages": [{"role": "user", "content": user_text}]}
config = {"configurable": {"thread_id": str(1)}}
print(config)
response = graph.invoke(msg, config)
last_msg = response["messages"][-1]

In [None]:
last_msg.content

In [None]:
user_text = "Покажи минимальную и максимальную дату продаж"

msg = {"messages": [{"role": "user", "content": user_text}]}
config = {"configurable": {"thread_id": str(1)}}
print(config)
response = graph.invoke(msg, config)
last_msg = response["messages"][-1]

In [None]:
response

In [None]:
[m for in response["messages"] if m.role == ""]

In [None]:
type(response["messages"][-1])

### Tests

In [None]:
import uuid
import importlib
import sys
from httpx import request
import pytest
from typing import Dict, List, Any, Tuple
from pydantic import BaseModel, Field
from langchain.chat_models import init_chat_model

from langsmith import testing as t

from langgraph.checkpoint.memory import MemorySaver
from langgraph.store.memory import InMemoryStore
from langgraph.types import Command
import os
from dotenv import load_dotenv

load_dotenv(".env", override=True)


sys.path.append("/Users/aziz/Documents/repos/shai-hackathon")
import tests.evaluation_obs_sql
import app.agent as agent
import app.prompts as prompts


graph = agent.build_agent_with_router()

In [None]:
RESPONSE_CRITERIA_SYSTEM_PROMPT = """
You are an evaluator of SQL generation for a natural language to SQL agent.

I will give you:
1. The user request.
2. A list of calls made by the agent to tools.
3. The golden (reference) SQL query.

Your task:
- Evaluate two metrics:
  1. **Exec accuracy**: Does the candidate query produce the same result as the golden query, even if the syntax or formatting differs? (Yes/No)
  2. **Exact match**: [Ignore aliases] Is the candidate query logically similar to the golden query. Ignore whitespace and capitalization differences? Ignore differences in column aliases? (Yes/No)

Return STRICT JSON in this format:
{
  "exec_accuracy": "<Yes/No>",
  "exact_match": "<Yes/No>",
  "explanation": "<short explanation why>"
}

Example:

User request: "How many stores are in the database?"
List of calls: "SELECT COUNT(*) AS total_stores FROM dict_store;"
Golden SQL: "SELECT COUNT(*) FROM dict_store;"

Output:
{
  "exec_accuracy": "Yes",
  "exact_match": "No",
  "explanation": "The candidate query executes correctly and returns the same result, but the alias 'AS total_stores' is missing."
}
"""


In [3]:
import tests.evaluation_obs_sql as e_sql

In [4]:
test_cases = []
for req, criteria in zip(e_sql.sql_reqsuests, e_sql.sql_answers):
    test_cases.append((req, criteria))

In [5]:
evaluator_model = os.getenv("EVALUATOR_MODEL")
evaluator_api_base = os.getenv("EVALUATOR_API_BASE")
evaluator_api_key = os.getenv("EVALUATOR_API_KEY")


class SQLEvaluation(BaseModel):
    """Evaluation of candidate SQL against golden SQL across two criteria: execution accuracy and exact match."""

    exec_accuracy: bool = Field(
        description="Does the candidate SQL produce the same result as the golden SQL (ignoring syntax/formatting differences)?"
    )
    exec_accuracy_justification: str = Field(
        description="Explain why the candidate SQL does or does not produce the same result."
    )

    exact_match: bool = Field(
        description="Is the candidate SQL textually identical to the golden SQL (ignoring whitespace and capitalization)?"
    )
    exact_match_justification: str = Field(
        description="Explain why the candidate SQL matches or differs from the golden SQL text."
    )


criteria_eval_llm = init_chat_model(
    evaluator_model,
    openai_api_base=evaluator_api_base,
    openai_api_key=evaluator_api_key,
)
criteria_eval_structured_llm = criteria_eval_llm.with_structured_output(SQLEvaluation)

In [6]:
test_cases[0][0]

def format_messages_string(messages: List[Any]) -> str:
    """Format messages into a single string for analysis."""
    return "\n".join(message.pretty_repr() for message in messages)

In [7]:
req = test_cases[0][0]

msg = {"messages": [{"role": "user", "content": req}]}
config = {"configurable": {"thread_id": str(req)}}

result = graph.invoke(msg, config)
all_messages_str = format_messages_string(result["messages"])

 - TOOL CALL: list_tables()
 - TOOL CALL: describe_table(dict_store)
 - TOOL CALL: execute_query(SELECT COUNT(*) as total_stores FROM dict_store)


In [8]:
golden_sql = test_cases[0][1]

eval_result = criteria_eval_structured_llm.invoke(
        [
            {"role": "system", "content": RESPONSE_CRITERIA_SYSTEM_PROMPT},
            {
                "role": "user",
                "content": f"""\n\n Request: {req}
                            Tool calls: \n\n {all_messages_str} 
                            \n\n Golden SQL: {golden_sql} \n\n 
                Evaluate whether the assistant's response meets the criteria and provide justification for your evaluation.""",
            },
        ]
    )

In [9]:
eval_result.exec_accuracy, eval_result.exact_match

(True, False)

In [19]:
eval_result.exact_match_justification

"The candidate SQL uses a column alias 'AS total_stores', whereas the golden SQL does not. Ignoring whitespace and capitalization, the queries are not identical due to this alias."

In [10]:
eval_result.exact_match_justification

"The only difference is the use of 'as total_stores' alias in the candidate, which is not present in the golden SQL. Ignoring capitalization and whitespace, the addition of the alias makes the queries logically equivalent but not an exact textual (structure) match as required by the criteria."

In [None]:
test_cases = []
for req, criteria in zip(e.reqsuests, e.criterias):
    test_cases.append((req, criteria))
print(f"Created {len(test_cases)} test cases")
return test_cases