In [None]:
from openai import OpenAI
import pandas as pd
import json
from pydantic import BaseModel, Field
from IPython.display import Markdown
import os
from dotenv import load_dotenv

load_dotenv()



True

In [None]:
import phoenix as px
import os
from phoenix.otel import register
from openinference.instrumentation.openai import OpenAIInstrumentor
from openinference.semconv.trace import SpanAttributes
from opentelemetry.trace import Status, StatusCode
from openinference.instrumentation import TracerProvider

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import phoenix as px

session = px.launch_app()


  next(self.gen)
  next(self.gen)


üåç To view the Phoenix app in your browser, visit http://localhost:6006/
üìñ For more information on how to use Phoenix, check out https://arize.com/docs/phoenix


In [None]:

from phoenix.otel import register

tracer_provider = register(
  project_name="sql-agent",
  endpoint="http://localhost:6006/v1/traces",
  protocol="grpc",
  
)
OpenAIInstrumentor().instrument(tracer_provider = tracer_provider)

OpenTelemetry Tracing Details
|  Phoenix Project: sql-agent
|  Span Processor: SimpleSpanProcessor
|  Collector Endpoint: http://localhost:6006/v1/traces
|  Transport: HTTP + protobuf
|  Transport Headers: {}
|  
|  Using a default SpanProcessor. `add_span_processor` will overwrite this default.
|  
|  
|  `register` has set this TracerProvider as the global OpenTelemetry default.
|  To disable this behavior, call `register` with `set_global_tracer_provider=False`.



In [None]:
tracer = tracer_provider.get_tracer(__name__)


In [None]:
# prompt template for step 2 of tool 1
SQL_GENERATION_PROMPT = """
Generate an SQL query based on a prompt"," Do not reply with anything besides the SQL query","
The prompt is: {prompt}

The SQL query should be valid and executable on a PostgreSQL database","
The database schema is as follows:
{schema}
Only Select statements are allowed","

"""

In [7]:

client = OpenAI()

MODEL = "gpt-4o-mini"

In [None]:
# code for step 2 of tool 1
@tracer.chain()
def generate_sql_query(prompt: str, schema: str) -> str:
    """Generate an SQL query based on a prompt"""
    formatted_prompt = SQL_GENERATION_PROMPT.format(prompt=prompt, 
                                                    schema=schema)

    response = client.chat.completions.create(
        model=MODEL,
        messages=[{"role": "user", "content": formatted_prompt}],
    )
    
    return response.choices[0].message.content

In [None]:
# code for tool 1
@tracer.tool()
def lookup_sales_data(prompt: str) -> str:
    """Implementation of data lookup from postgres db using SQL"""
    try:
        import requests
        with tracer.start_as_current_span("prompt for retrieving schema context",openinference_span_kind="chain") as span:
            span.set_input(prompt)
            # Step 1: Get schema context from the context endpoint
            context_response = requests.post(
                "http://localhost:8000/context",
                json={"query": prompt}  # Fixed: prompt is now the direct value"," not inside another dict
            )
            
            if context_response.status_code != 200:
                return f"Error retrieving schema context: {context_response.text}"
            
            context_data = context_response.json()
            # Extract text from contexts to use as schema
            schema = context_data.get("contexts", [])
            span.set_output(value=schema)
        # Step 2: Generate SQL query based on prompt and schema
        sql_query = generate_sql_query(prompt, schema)
        print(f"Generated SQL Query: {sql_query}")  # Debugging output
        # Clean the response to make sure it only includes the SQL code
        sql_query = sql_query.strip()
        sql_query = sql_query.replace("```sql", "").replace("```", "")
        
        # Step 3: Execute the SQL query using the data endpoint
        data_response = requests.post(
            "http://localhost:8000/data",
            json={"query": sql_query}
        )
        
        if data_response.status_code != 200:
            return f"Error executing query: {data_response.text}"
        
        # Convert the response to a pandas DataFrame for formatting
        result_data = data_response.json()
        if not result_data["records"]:
            return "No results found for your query."
            
        result = pd.DataFrame(result_data["records"])
        
        return result.to_string()
    except Exception as e:
        return f"Error accessing data: {str(e)}"

In [10]:
# example_data = lookup_sales_data("Ti·ªÅn ph·∫°t c·ªßa t·ªânh AGG nƒÉm 2024 theo t·ª´ng th√°ng l√† bao nhi√™u")
# print(example_data)

# Router

In [None]:
# Define tools/functions that can be called by the model
tools = [
    {
        "type": "function",
        "function": {
            "name": "lookup_sales_data",
            "description": "Look up data from postgres database using SQL",
            "parameters": {
                "type": "object",
                "properties": {
                    "prompt": {"type": "string", "description": "The unchanged prompt that the user provided",""}
                },
                "required": ["prompt"]
            }
        }
    }
]

# Dictionary mapping function names to their implementations
tool_implementations = {
    "lookup_sales_data": lookup_sales_data
}

In [None]:
# code for executing the tools returned in the model's response
@tracer.chain()
def handle_tool_calls(tool_calls, messages):
    
    for tool_call in tool_calls:   
        function = tool_implementations[tool_call.function.name]
        function_args = json.loads(tool_call.function.arguments)
        result = function(**function_args)
        messages.append({"role": "tool", "content": result, "tool_call_id": tool_call.id})
        
    return messages

In [None]:
SYSTEM_PROMPT = """
You are a helpful assistant that can answer questions about data and generate visualizations using tools","
"""

In [None]:
def run_agent(messages):
    print("Running agent with messages:", messages)
    if isinstance(messages, str):
        messages = [{"role": "user", "content": messages}]
    if not any(
            isinstance(message, dict) and message.get("role") == "system" for message in messages
        ):
            system_prompt = {"role": "system", "content": SYSTEM_PROMPT}
            messages.append(system_prompt)

    while True:
        # Router Span
        print("Starting router call span")
        with tracer.start_as_current_span(
            "router_call", openinference_span_kind="chain",
        ) as span:
            span.set_input(value=messages)
            
            response = client.chat.completions.create(
                model=MODEL,
                messages=messages,
                tools=tools,
            )
            messages.append(response.choices[0].message.model_dump())
            tool_calls = response.choices[0].message.tool_calls
            print("Received response with tool calls:", bool(tool_calls))
            span.set_status(StatusCode.OK)
    
            if tool_calls:
                print("Starting tool calls span")
                messages = handle_tool_calls(tool_calls, messages)
                span.set_output(value=tool_calls)
            else:
                print("No tool calls, returning final response")
                span.set_output(value=response.choices[0].message.content)
                return response.choices[0].message.content

In [None]:
def start_main_span(messages):
    print("Starting main span with messages:", messages)
    
    with tracer.start_as_current_span(
        "AgentRun", openinference_span_kind="agent"
    ) as span:
        span.set_input(value=messages)
        ret = run_agent(messages)
        print("Main span completed with return value:", ret)
        span.set_output(value=ret)
        span.set_status(StatusCode.OK)
        return ret

In [17]:
result = start_main_span([{"role": "user", 
                           "content": "Ti·ªÅn ph·∫°t c·ªßa t·ªânh AGG nƒÉm 2024 theo t·ª´ng th√°ng l√† bao nhi√™u"}])

Starting main span with messages: [{'role': 'user', 'content': 'Ti·ªÅn ph·∫°t c·ªßa t·ªânh AGG nƒÉm 2024 theo t·ª´ng th√°ng l√† bao nhi√™u'}]
Running agent with messages: [{'role': 'user', 'content': 'Ti·ªÅn ph·∫°t c·ªßa t·ªânh AGG nƒÉm 2024 theo t·ª´ng th√°ng l√† bao nhi√™u'}]
Starting router call span
Received response with tool calls: True
Starting tool calls span
Generated SQL Query: ```sql
SELECT
    thang,
    SUM(tong_phat) AS tong_tien_phat
FROM
    public."vcc_vhkt.vhkt_import_dt_tp_vtt_2"
WHERE
    nam = 2024
    AND ma_tinh = 'AGG'
GROUP BY
    thang
ORDER BY
    thang;
```
Starting router call span
Received response with tool calls: False
No tool calls, returning final response
Main span completed with return value: D∆∞·ªõi ƒë√¢y l√† ti·ªÅn ph·∫°t c·ªßa t·ªânh AGG nƒÉm 2024 theo t·ª´ng th√°ng:

| Th√°ng | T·ªïng Ti·ªÅn Ph·∫°t (VNƒê)    |
|-------|-------------------------|
| 1     | 30,166,750,000          |
| 2     | 8,531,815,000           |
| 3     | 4,670,423,000       

In [None]:
import nest_asyncio
nest_asyncio.apply()
from tqdm import tqdm
agent_questions = [
    "T·ªïng ti·ªÅn ph·∫°t nƒÉm 2024 l√† bao nhi√™u",
    "T·ªïng ti·ªÅn ph·∫°t c·ªßa t·ªânh AGG nƒÉm 2024 l√† bao nhi√™u",
    "Ti·ªÅn ph·∫°t c·ªßa t·ªânh AGG theo t·ª´ng th√°ng nƒÉm 2024",
    "Ti·ªÅn ph·∫°t KPI duy tr√¨ c·ªßa AGG nƒÉm 2024 l√† bao nhi√™u",
    "Ti·ªÅn ph·∫°t KPI duy tr√¨ c·ªßa AGG theo t·ª´ng th√°ng nƒÉm 2024",
    "Ti·ªÅn ph·∫°t KPI tri·ªÉn khai m·ªõi c·ªßa AGG nƒÉm 2024 l√† bao nhi√™u",
    "Ti·ªÅn ph·∫°t KPI tri·ªÉn khai m·ªõi c·ªßa AGG theo t·ª´ng th√°ng nƒÉm 2024",
    "Ti·ªÅn ph·∫°t l·ªói √Ω th·ª©c, th√°i ƒë·ªô c·ªßa AGG nƒÉm 2024 l√† bao nhi√™u",
    "Ti·ªÅn ph·∫°t l·ªói √Ω th·ª©c th√°i ƒë·ªô c·ªßa AGG theo t·ª´ng th√°ng nƒÉm 2024",
    "Ti·ªÅn ph·∫°t r·ªùi m·∫°ng c·ªßa AGG nƒÉm 2024 l√† bao nhi√™u",
    "Ti·ªÅn ph·∫°t r·ªùi m·∫°ng c·ªßa AGG theo t·ª´ng th√°ng nƒÉm 2024",
    "T·ª∑ l·ªá PAKH to√†n qu·ªëc trong nƒÉm 2024 l√† bao nhi√™u",
    "T·ª∑ l·ªá PAKH to√†n qu·ªëc trong nƒÉm 2024 theo t·ª´ng th√°ng l√† bao nhi√™u",
  "T·ª∑ l·ªá s·ª± c·ªë l·∫∑p l·∫°i to√†n qu·ªëc trong nƒÉm 2024 l√† bao nhi√™u",
    "T·ª∑ l·ªá s·ª± c·ªë l·∫∑p l·∫°i to√†n qu·ªëc trong nƒÉm 2024 theo t·ª´ng th√°ng l√† bao nhi√™u",
    "T·ª∑ l·ªá s·ª± c·ªë l·∫∑p l·∫°i to√†n qu·ªëc trong nƒÉm 2024 l√† bao nhi√™u",
    "T·ª∑ l·ªá s·ª± c·ªë l·∫∑p l·∫°i to√†n qu·ªëc trong nƒÉm 2024 theo t·ª´ng th√°ng l√† bao nhi√™u",
    "T·ª∑ l·ªá s·ª± c·ªë l·∫∑p l·∫°i c·ªßa AGG nƒÉm 2024 l√† bao nhi√™u",
    "T·ª∑ l·ªá s·ª± c·ªë l·∫∑p l·∫°i c·ªßa AGG nƒÉm 2024 theo t·ª´ng th√°ng l√† bao nhi√™u?",
    "S·ªë PAKH ph√°t sinh nƒÉm 2024 theo t·ª´ng th√°ng l√† bao nhi√™u?",
    "T·ª∑ l·ªá x·ª≠ l√Ω s·ª± c·ªë trong 3h nƒÉm 2024 c·ªßa To√†n qu·ªëc theo t·ª´ng th√°ng l√† bao nhi√™u?",
    "S·ªë l∆∞·ª£ng s·ª± c·ªë ph√°t sinh c·ªßa AGG nƒÉm 2024 theo t·ª´ng th√°ng l√† bao nhi√™u?",
    # "T·ª∑ l·ªá x·ª≠ l√Ω s·ª± c·ªë trong 10h nƒÉm 2024 c·ªßa To√†n qu·ªëc l√† bao nhi√™u?",

]

for question in tqdm(agent_questions, desc="Processing questions"):

    try:
        ret= lookup_sales_data(question)
        # print(f"Result for question '{question}':")
    except Exception as e:
        print(f"Error processing question: {question}")
        print(e)
        continue

Processing questions:   0%|          | 0/22 [00:00<?, ?it/s]

Generated SQL Query: ```sql
SELECT 
    thang, 
    AVG(ty_le_pakh_10000tb_ngay_dich_vu_ftth) AS ty_le_pakh_ftth, 
    AVG(ty_le_pakh_10000tb_ngay_dich_vu_khac) AS ty_le_pakh_khac
FROM 
    public."vcc_vhkt.vhkt_import_kpi_duy_tri_cdbr_tinh"
WHERE 
    nam = 2024
GROUP BY 
    thang
ORDER BY 
    thang;
```
Error processing question: T·ª∑ l·ªá PAKH to√†n qu·ªëc trong nƒÉm 2024 theo t·ª´ng th√°ng l√† bao nhi√™u
object str can't be used in 'await' expression
Generated SQL Query: ```sql
SELECT 
    SUM(tong_su_co_phat_sinh_co_hen) AS tong_su_co_phat_sinh_co_hen,
    SUM(tong_su_co_duoc_xu_ly_dung_hen) AS tong_su_co_duoc_xu_ly_dung_hen,
    (SUM(tong_su_co_duoc_xu_ly_dung_hen) * 100.0 / NULLIF(SUM(tong_su_co_phat_sinh_co_hen), 0)) AS ty_le_su_co_duoc_xu_ly_dung_hen
FROM 
    public."vcc_vhkt.vhkt_import_kpi_duy_tri_cdbr_tinh"
WHERE 
    nam = 2024;
```
Error processing question: T·ª∑ l·ªá s·ª± c·ªë l·∫∑p l·∫°i to√†n qu·ªëc trong nƒÉm 2024 l√† bao nhi√™u
object str can't be used in 'await' 

In [None]:
from phoenix.evals import (
    TOOL_CALLING_PROMPT_TEMPLATE, 
    llm_classify,
    OpenAIModel
)
from phoenix.trace import SpanEvaluations
from phoenix.trace.dsl import SpanQuery
from openinference.instrumentation import suppress_tracing

In [None]:
query = SpanQuery().where(
    "name =='generate_sql_query'"
).select(
    sql_gen="output.value",
    context="input.value",
    
)
prompt = SpanQuery().where(
    "span_kind=='AGENT'"
).select(
    prompt="input.value"
)

# The Phoenix Client can take this query and return the dataframe.
sql_df = px.Client().query_spans(query, 
                                 project_name="sql-agent",
                                 timeout=None)

prompt_df = px.Client().query_spans(prompt,
                                    project_name="sql-agent",
                                    timeout=None)
sql_df['prompt'] = prompt_df['prompt'].values
sql_df.head()

Unnamed: 0_level_0,sql_gen,context,prompt
context.span_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
715fd7cdad76baed,```sql\nSELECT SUM(tong_phat) AS tong_tien_pha...,"{""prompt"": ""T·ªïng ti·ªÅn ph·∫°t nƒÉm 2024"", ""schema""...","[{""role"": ""user"", ""content"": ""T·ªïng ti·ªÅn ph·∫°t n..."
ca196ca3dd66c0db,```sql\nSELECT SUM(tong_phat) AS tong_tien_pha...,"{""prompt"": ""T·ªïng ti·ªÅn ph·∫°t c·ªßa t·ªânh AGG nƒÉm 20...","[{""role"": ""user"", ""content"": ""T·ªïng ti·ªÅn ph·∫°t c..."
08394cc649b3b987,"```sql\nSELECT thang, SUM(tong_phat) AS tong_t...","{""prompt"": ""Ti·ªÅn ph·∫°t c·ªßa t·ªânh AGG theo t·ª´ng t...","[{""role"": ""user"", ""content"": ""Ti·ªÅn ph·∫°t c·ªßa t·ªâ..."
4aeb95b07ed2be60,```sql\nSELECT \n SUM(phat_kpi_duy_tri_cdbr...,"{""prompt"": ""Ti·ªÅn ph·∫°t KPI duy tr√¨ c·ªßa AGG nƒÉm ...","[{""role"": ""user"", ""content"": ""Ti·ªÅn ph·∫°t KPI du..."
e8cad356d2362fe5,"```sql\nSELECT thang, SUM(phat_kpi_duy_tri_cdb...","{""prompt"": ""Ti·ªÅn ph·∫°t KPI duy tr√¨ c·ªßa AGG theo...","[{""role"": ""user"", ""content"": ""Ti·ªÅn ph·∫°t KPI du..."


In [None]:
# save sql_df to a file
sql_df.to_csv("sql_queries.csv", index=False)

In [None]:
#save to excel 
sql_df.to_excel("sql_queries2.xlsx", index=False)