## RAGAS

In [None]:
# Install the evaluation library (Ragas) into the kernel's environment.
%pip install ragas
# Install the agent stack used by this notebook (LangChain, LangGraph, OpenAI client).
# NOTE(review): no versions are pinned here — consider pinning for reproducibility.
%pip install langchain langchain-openai langchain-core langgraph openai

**Python Version**

La versión de Python debe ser 3.10 o superior.

**Librerías**

In [86]:
import os
from typing import Annotated, Literal, TypedDict
from langchain_openai import ChatOpenAI
from langchain_core.messages import HumanMessage, SystemMessage, ToolMessage, BaseMessage, AIMessage
from langchain_core.tools import Tool
from langgraph.graph import StateGraph, MessagesState
from langgraph.prebuilt import ToolNode
from langgraph.checkpoint.memory import MemorySaver
from langchain.callbacks.tracers import LangChainTracer
from agent.states import StatusMessagesState
from langchain.schema import HumanMessage, SystemMessage, AIMessage
# For LangChain Community (newer versions):
from langchain_core.messages import HumanMessage, SystemMessage, AIMessage

# For OpenAI-style messages:
from openai.types.chat import ChatCompletionMessage
from agent.prompts import SYSTEM_MESSAGE
from agent.agent import build_graph
import pandas as pd

**LangGraph**

In [105]:
# Credentials and tracing configuration.
#
# API keys are read from the process environment; do not paste real keys into
# the notebook. Export OPENAI_API_KEY (required) and LANGCHAIN_API_KEY
# (optional but recommended, used for LangSmith tracing) before starting the
# kernel.
#
# setdefault keeps a key that is already exported instead of overwriting it
# with an empty string; a key that is missing still ends up as "".
for _credential in ("OPENAI_API_KEY", "LANGCHAIN_API_KEY"):
    os.environ.setdefault(_credential, "")

if not os.environ["OPENAI_API_KEY"]:
    print("WARNING: OPENAI_API_KEY is empty; calls to the OpenAI API will fail.")

# LangSmith tracing settings.
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_PROJECT"] = "arithmetic"

## Dataset ##

In [98]:
# ToolCall is the Ragas message type used below for the reference tool calls.
# It is imported in this cell so the cell also runs on a fresh kernel
# (Restart & Run All) instead of relying on an import made by a later cell.
from ragas.messages import ToolCall

# Single evaluation example: the user question, the reference answer, and the
# ordered list of tool calls the agent is expected to make.
results_json = {
    'question': 'Cual es el resultado de (1645+1223)/344',
    'result': 'El resultado de la operacion es el siguiente: 8.34',
    'tool_calls': [
        ToolCall(name="sum", args={"__arg1": 1645, "__arg2": 1223}),
        ToolCall(name="divide", args={"__arg1": 2868, "__arg2": 344})
    ]
}

# One-row DataFrame holding the example; the bare `df` displays it.
df = pd.DataFrame([results_json])
df

Unnamed: 0,question,result,tool_calls
0,Cual es el resultado de (1645+1223)/344,El resultado de la operacion es el siguiente: ...,"[name='sum' args={'__arg1': 1645, '__arg2': 12..."


**Call Agents**

In [89]:
import uuid

# Run parameters: a freshly generated UUID used as the thread id, a fixed user
# name, and the first question of the dataset as the user message.
session_id = str(uuid.uuid4())
user_str = "default_user"
message = df['question'][0]
app = build_graph()

# Invocation config: thread id plus a LangChain tracer registered as callback.
tracer = LangChainTracer()
thread = {
    "configurable": {"thread_id": session_id},
    "callbacks": [tracer],
    "tracing_v2_enabled": True,
}

# The conversation starts with just the current user message.
messages = [HumanMessage(content=message)]
tool_call_message = None  # placeholder; not read anywhere in this cell

# Initial graph state: the message list and the user name.
initial_state = StatusMessagesState(messages=messages, user=user_str)

result = app.invoke(initial_state, config=thread)
print(f"📋 Final result has {len(result['messages'])} messages")

# Collect every AI message from the final state, newest first.
total_result = [
    msg for msg in reversed(result["messages"]) if isinstance(msg, AIMessage)
]

print(total_result)


🤖 Chatbot node: Processing messages...
System message content: You are a helpful assistant that can perform basic arithmetic operations.

Key capabilities:
- Perform multiplication
- Perform division
- Perform addition
- Perform subtraction

The tool you should use is:
- multiply: Multiplies two numbers. The arguments should be passed as 'a' and 'b'. Must not be like arg1: 2422*6454656
- divide: Divides two numbers. The arguments should be passed as 'a' and 'b'. Must not be like arg1: 2422/6454656
- sum: Sums two numbers. The arguments should be passed as 'a' and 'b'. Must not be like arg1: 2422+6454656
- sub: Subtracts two numbers. The arguments should be passed as 'a' and 'b'. Must not be like arg1: 2422-6454656

Consideration
- You must follow the rules of the mathematics for calling the tool sequentially. If you have to perform a division, you must first perform obtian the 
values in the numerator and denominator. and then perform the division.

When users ask vague questions, ask 

In [90]:
# Display the collected AI messages (they were gathered newest first).
total_result


[AIMessage(content='El resultado es 8.34.', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 9, 'prompt_tokens': 683, 'total_tokens': 692, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4.1-2025-04-14', 'system_fingerprint': 'fp_3502f4eb73', 'finish_reason': 'stop', 'logprobs': None}, id='run--fb7c79fe-1f54-449c-840c-e431da5f6129-0', usage_metadata={'input_tokens': 683, 'output_tokens': 9, 'total_tokens': 692, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}}),
 AIMessage(content='', additional_kwargs={'tool_calls': [{'id': 'call_mPtajg8l2OtnIas3LbXqJzDN', 'function': {'arguments': '{"__arg1":"2868","__arg2":"344"}', 'name': 'divide'}, 'type': 'function'}], 'refusal': None}, response_metadata={'token_usage': {'

In [91]:
# Build the conversation in chronological order: the user's question first,
# followed by the AI messages flipped back from newest-first to oldest-first.
total_result_reversed = [HumanMessage(content=message), *reversed(total_result)]
total_result_reversed

[HumanMessage(content='Cual es el resultado de (1645+1223)/344', additional_kwargs={}, response_metadata={}),
 AIMessage(content='', additional_kwargs={'tool_calls': [{'id': 'call_kSSObQgYyoVpWUlsPutoH6Tw', 'function': {'arguments': '{"__arg1":"1645","__arg2":"1223"}', 'name': 'sum'}, 'type': 'function'}], 'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 23, 'prompt_tokens': 613, 'total_tokens': 636, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4.1-2025-04-14', 'system_fingerprint': 'fp_3502f4eb73', 'finish_reason': 'tool_calls', 'logprobs': None}, id='run--ee634d49-0799-4e26-a154-d3d2f25ef491-0', tool_calls=[{'name': 'sum', 'args': {'__arg1': '1645', '__arg2': '1223'}, 'id': 'call_kSSObQgYyoVpWUlsPutoH6Tw', 'type': 'tool_call'}], usage_metadata={'input_tokens': 613, 'output_tokens': 

## Métricas de RAGAS

RAGAS es una librería de Python para evaluar chatbots, tanto de RAG como agénticos. Esta librería ofrece distintas métricas para evaluar el desempeño de los chatbots, usando un LLM como evaluador.

El link de esta libreria se encuentra en https://docs.ragas.io/en/stable/

Los chatbots con agentes se pueden evaluar con dos métricas:

1. Tool Call Accuracy: precisión en la ejecución de las herramientas (tools). El modelo debería llamar a la herramienta cuando sea necesario.
2. Agent Goal Accuracy: precisión en el cumplimiento del objetivo del agente. El agente debería cumplir con el objetivo. Esta métrica es análoga a la accuracy de cualquier modelo de ML; también se estudiará la métrica Answer Accuracy de NVIDIA.

**Tool Accuracy**

In [92]:
from ragas.metrics import ToolCallAccuracy
from ragas.dataset_schema import MultiTurnSample
from ragas.integrations.langgraph import convert_to_ragas_messages
import ragas.messages as r

# Convert the LangChain conversation into the list of Ragas messages that the
# Ragas metrics take as `user_input`.
ragas_trace = convert_to_ragas_messages(messages=total_result_reversed)


In [93]:
# Display the converted Ragas messages.
ragas_trace

[HumanMessage(content='Cual es el resultado de (1645+1223)/344', metadata=None, type='human'),
 AIMessage(content='', metadata=None, type='ai', tool_calls=[ToolCall(name='sum', args={'__arg1': '1645', '__arg2': '1223'})]),
 AIMessage(content='', metadata=None, type='ai', tool_calls=[ToolCall(name='divide', args={'__arg1': '2868', '__arg2': '344'})]),
 AIMessage(content='El resultado es 8.34.', metadata=None, type='ai', tool_calls=[])]

In [99]:
from ragas.metrics import ToolCallAccuracy
from ragas.dataset_schema import  MultiTurnSample
from ragas.messages import HumanMessage,AIMessage,ToolMessage,ToolCall

sample = MultiTurnSample(
    user_input=ragas_trace,
    reference_tool_calls=df['tool_calls'][0]
)

scorer = ToolCallAccuracy()
await scorer.multi_turn_ascore(sample)

1.0

**Agent Goal accuracy**

In [101]:
# Show the reference answer stored in the dataset.
print(f"La respuesta es: {df['result'][0]}")

La respuesta es: El resultado de la operacion es el siguiente: 8.34


In [102]:
from langchain_openai import ChatOpenAI
from ragas.dataset_schema import MultiTurnSample
from ragas.messages import HumanMessage, AIMessage, ToolMessage, ToolCall
from ragas.metrics import AgentGoalAccuracyWithReference
from ragas.llms import LangchainLLMWrapper

# 1️⃣ Define the evaluator LLM
evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4.1-mini"))

# 2️⃣ Create your MultiTurnSample
sample = MultiTurnSample(
    user_input=ragas_trace,
    reference=df['result'][0]
)

# 3️⃣ Create the scorer with the evaluator_llm
scorer = AgentGoalAccuracyWithReference(llm=evaluator_llm)

# 4️⃣ Run the evaluation
result = await scorer.multi_turn_ascore(sample)
print(result)


1.0


Está correcto, pero ¿qué pasa si dejamos la referencia con más decimales? El resultado cambia.

In [None]:
# Same reference sentence as in the dataset, but with the number written with
# more decimal places (8.33720 instead of 8.34).
reference_redefined = 'El resultado de la operacion es el siguiente: 8.33720'

In [104]:
# 1️⃣ Define the evaluator LLM
evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4.1-mini"))

# 2️⃣ Create your MultiTurnSample
sample = MultiTurnSample(
    user_input=ragas_trace,
    reference=reference_redefined
)

# 3️⃣ Create the scorer with the evaluator_llm
scorer = AgentGoalAccuracyWithReference(llm=evaluator_llm)

# 4️⃣ Run the evaluation
result = await scorer.multi_turn_ascore(sample)
print(result)


0.0


**Answer Accuracy (NVIDIA metrics)**

In [68]:
# Text content of the last message in the trace (the agent's final answer).
ragas_trace[-1].content

'El resultado es 8.34.'

In [76]:
from ragas.dataset_schema import SingleTurnSample
from ragas.metrics import AnswerAccuracy

sample = SingleTurnSample(
    user_input=message,
    response=ragas_trace[-1].content,
    reference="El resultado de de la operacion es 8.337"
)
scorer = AnswerAccuracy(llm=evaluator_llm) # evaluator_llm wrapped with ragas LLM Wrapper
score = await scorer.single_turn_ascore(sample)
print(score)

0.5
