In [2]:
import pandas as pd

In [4]:
# Load the benchmark workbook.
# NOTE(review): this is a machine-specific absolute path; the notebook will not
# run elsewhere until it is replaced with a configurable/relative location.
benchmark = pd.read_excel(r"C:\Users\sreya.kumar\Downloads\gamer_benchmark_6_16.xlsx")

# The evaluation code refers to the reference answer as "target_answer".
benchmark = benchmark.rename(columns={"output_answer": "target_answer"})

# Placeholder columns that the evaluation loop fills in row by row. Assigning
# an empty Series aligns on the frame's index, so every row starts as missing.
# NOTE(review): the requested dtype may not survive that alignment (e.g. a
# plain int column cannot hold missing values) -- confirm if dtypes matter.
benchmark["predicted_answer"] = pd.Series(dtype="str")
benchmark["data_source"] = pd.Series(dtype="str")
benchmark["generation_time"] = pd.Series(dtype="float")
benchmark["response_evaluation"] = pd.Series(dtype="str")
benchmark["response_score"] = pd.Series(dtype="int")
benchmark["predicted_python"] = pd.Series(dtype="str")
benchmark["predicted_mongodb_query"] = pd.Series(dtype="str")

# Drop the "schema_docs" questions from this run. An explicit copy is taken
# because the evaluation loop writes into this frame; writing into a filtered
# slice of `benchmark` is what pandas flags as chained assignment.
test_df = benchmark[benchmark["query_type"] != "schema_docs"].copy()

In [5]:
from gamer_x.agent import main
import time

from typing import Annotated, Literal, TypedDict

from langchain import hub

from gamer_x.utils.llms import (
    SONNET_4_LLM
)


In [6]:
class Evaluator(TypedDict):
    """Verdict from comparing a predicted response against the target response."""

    # NOTE(review): when this schema is bound with `with_structured_output`,
    # the docstring above and the description below are presumably forwarded
    # to the model as the schema description, so their wording affects grading.
    # The labels in the description are spelled exactly like the Literal values.
    score: Annotated[
        Literal["CORRECT", "INCORRECT", "ERROR"],
        ...,
        (
            "'CORRECT' if the predicted response matches the target response, "
            "'INCORRECT' if it does not, "
            "'ERROR' if the predicted response is an error message."
        ),
    ]


# Bind the Evaluator schema to the Sonnet model so its replies are returned as
# structured output (a mapping with a "score" key, as read by the eval loop).
evaluator = SONNET_4_LLM.with_structured_output(Evaluator)
# Fetch the grading prompt published as "eden19/evaluator".
# NOTE(review): presumably needs network access to the LangChain Hub -- confirm.
evaluator_prompt = hub.pull("eden19/evaluator")
# Pipe the prompt into the structured-output model; the eval loop invokes this
# chain with the keys "query", "target" and "predicted".
evaluator_chain = evaluator_prompt | evaluator


In [7]:

async def eval(benchmark=None, agent=None, grader=None):
    """Run every benchmark question through the agent and grade the answer.

    For each row the agent is awaited with ``row["input_question"]``, its
    ``"generation"`` is stored as the predicted answer, and the grader is asked
    to compare it with ``row["target_answer"]``. Results are written back into
    the frame in place.

    NOTE: the name shadows the builtin ``eval``; it is kept because the
    notebook calls it by this name.

    Args:
        benchmark: DataFrame with ``input_question`` and ``target_answer``
            columns. Defaults to the notebook-level ``test_df``. The frame is
            modified in place.
        agent: Async callable taking the question and returning a mapping with
            a ``"generation"`` key (optionally ``"mongodb_query"`` and
            ``"python_code"``). Defaults to the notebook-level ``main``.
        grader: Object with an async ``ainvoke(payload)`` returning a mapping
            with a ``"score"`` key. Defaults to the notebook-level
            ``evaluator_chain``.

    Returns:
        The same DataFrame, with ``predicted_answer``, ``generation_time``,
        ``predicted_mongodb_query``, ``predicted_python``,
        ``response_evaluation`` and ``response_score`` filled in per row.
    """
    import logging

    logger = logging.getLogger(__name__)

    # Resolve defaults at call time so a plain `eval()` still uses the
    # notebook globals, while callers can inject their own frame/agent/grader.
    if benchmark is None:
        benchmark = test_df
    if agent is None:
        agent = main
    if grader is None:
        grader = evaluator_chain

    for index, row in benchmark.iterrows():
        query = row["input_question"]
        target_response = row["target_answer"]

        # Per-row defaults. `answer` is reset on every iteration so a failed
        # agent call can neither hit an unbound name nor reuse the previous
        # row's query/code.
        answer = {}
        response = "Error occurred"
        time_taken = -1
        response_evaluation = "ERROR"
        response_score = 0

        try:
            start = time.perf_counter()
            result = await agent(query)
            time_taken = time.perf_counter() - start
            # Keep the raw result only if it supports `.get`, which is how the
            # optional query/code fields are read below.
            if hasattr(result, "get"):
                answer = result
            response = result["generation"]
        except Exception as e:
            # Best effort: record the failure as the predicted answer and let
            # the grader classify it.
            response = f"Error: {e}"

        benchmark.at[index, "predicted_answer"] = response
        benchmark.at[index, "generation_time"] = time_taken
        # Stringified so that non-string payloads (e.g. a pipeline given as a
        # list or dict) fit in a single cell.
        benchmark.at[index, "predicted_mongodb_query"] = str(
            answer.get("mongodb_query", "NA")
        )
        benchmark.at[index, "predicted_python"] = str(
            answer.get("python_code", "NA")
        )

        try:
            verdict = await grader.ainvoke(
                {
                    "query": query,
                    "target": target_response,
                    "predicted": response,
                }
            )
            response_evaluation = verdict["score"]
            response_score = 1 if response_evaluation == "CORRECT" else 0
        except Exception as e:
            # Keep the score column numeric: a grader failure counts as
            # "ERROR" with score 0, and the detail goes to the log instead of
            # being written into the score column.
            logger.warning("Evaluation failed for row %s: %s", index, e)
            response_evaluation = "ERROR"
            response_score = 0

        benchmark.at[index, "response_evaluation"] = response_evaluation
        benchmark.at[index, "response_score"] = response_score

    return benchmark

In [8]:
# Run the evaluation loop over the benchmark questions. Top-level `await` is
# valid here because the cell runs inside the notebook's async-aware shell.
results = await eval()

# Persist the per-question results to CSV, without the DataFrame index.
results.to_csv("gamer_2.0_evals_part_1.csv", index=False)

content=[{'type': 'text', 'text': 'I need to identify assets where `acquisition.experimenter_full_name` exists as strings instead of a list of strings. Based on the analysis, I\'ll use an aggregation pipeline to find documents where this field has type "string" and count them.'}, {'type': 'tool_use', 'name': 'aggregation_retrieval', 'input': {'agg_pipeline': [{'$match': {'acquisition.experimenter_full_name': {'$exists': True, '$type': 'string'}}}, {'$count': 'assets_with_string_experimenter_name'}]}, 'id': 'tooluse_XfLJxtozT46Do4HgkT20Nw'}] additional_kwargs={} response_metadata={'ResponseMetadata': {'RequestId': 'e2f0c4e7-6bf8-4d00-914e-4fa7339c17ac', 'HTTPStatusCode': 200, 'HTTPHeaders': {'date': 'Wed, 06 Aug 2025 22:14:53 GMT', 'content-type': 'application/json', 'content-length': '794', 'connection': 'keep-alive', 'x-amzn-requestid': 'e2f0c4e7-6bf8-4d00-914e-4fa7339c17ac'}, 'RetryAttempts': 0}, 'stopReason': 'tool_use', 'metrics': {'latencyMs': [3395]}, 'model_name': 'us.anthropic.

Python REPL can execute arbitrary code. Use with caution.


content=[{'type': 'text', 'text': "I'll execute the provided Python code to retrieve session information for mouse 721291 from the MongoDB database."}, {'type': 'tool_use', 'name': 'python_executor', 'input': {'python_code': 'import json\nfrom datetime import datetime\n\nfrom aind_data_access_api.document_db import MetadataDbClient\n\nAPI_GATEWAY_HOST = "api.allenneuraldynamics-test.org"\nDATABASE = "metadata_vector_index"\nCOLLECTION = "static_eval_data_assets_3_14"\n\ndocdb_api_client = MetadataDbClient(\n    host=API_GATEWAY_HOST,\n    database=DATABASE,\n    collection=COLLECTION,\n)\n\n# Filter for mouse 721291 and project session information\nfilter = {\n    "session.subject_id": "721291"\n}\n\nprojection = {\n    "session.session_start_time": 1,\n    "session.session_type": 1\n}\n\ntry:\n    records = docdb_api_client.retrieve_docdb_records(\n        filter_query=filter,\n        projection=projection,\n    )\n    \n    # Create table with sessions, date, and session_type\n    p

CancelledError: 