In [1]:
import pandas as pd

In [2]:
# Load the benchmark workbook.
# NOTE(review): this is a machine-specific absolute path; point it at your own
# copy of the workbook before running on another machine.
benchmark = pd.read_excel(r"C:\Users\sreya.kumar\Downloads\gamer_benchmark_6_16.xlsx")

# The rest of the notebook refers to the reference answer as "target_answer".
benchmark = benchmark.rename(
    columns={
        "output_answer": "target_answer",
    }
)

# Placeholder result columns, filled in row by row by the evaluation loop.
# Assigning an empty Series aligns on the index, so every cell starts as NaN
# (which also means the "int" placeholder is stored as float until populated).
benchmark["predicted_answer"] = pd.Series(dtype="str")
benchmark["data_source"] = pd.Series(dtype="str")
benchmark["generation_time"] = pd.Series(dtype="float")
benchmark["response_evaluation"] = pd.Series(dtype="str")
benchmark["response_score"] = pd.Series(dtype="int")
benchmark["predicted_python"] = pd.Series(dtype="str")
benchmark["predicted_mongodb_query"] = pd.Series(dtype="str")

# Drop the schema-documentation questions. The explicit .copy() makes test_df
# an independent frame, so the later per-cell writes do not trigger
# SettingWithCopyWarning on a slice of `benchmark`.
test_df = benchmark[benchmark["query_type"] != "schema_docs"].copy()

In [4]:
# Number of benchmark questions per query type, most frequent first.
test_df.loc[:, "query_type"].value_counts()

query_type
database    36
analysis    23
field       23
project     21
asset       16
Name: count, dtype: int64

In [None]:
from gamer_x.agent import main
import time

from typing import Annotated, Literal, TypedDict

from langchain import hub

from gamer_x.utils.llms import (
    SONNET_4_LLM
)


In [None]:
# Structured-output schema for the grader model. The docstring and the field
# description are presumably forwarded to the LLM as part of the schema
# (confirm against the structured-output docs), so both are kept short and
# use the exact label spellings allowed by the Literal.
class Evaluator(TypedDict):
    """Grade of a predicted response compared against the target response."""

    score: Annotated[
        Literal["CORRECT", "INCORRECT", "ERROR"],
        ...,
        (
            "'CORRECT' if the predicted response matches the target response, "
            "'INCORRECT' if it does not, "
            "'ERROR' if the predicted response is an error message."
        ),
    ]


# Grader pipeline: the prompt pulled from the hub is piped into the Sonnet
# model, whose output is constrained to the Evaluator schema.
evaluator_prompt = hub.pull("eden19/evaluator")
evaluator = SONNET_4_LLM.with_structured_output(Evaluator)
evaluator_chain = evaluator_prompt | evaluator


In [None]:

async def eval(benchmark=None):
    """Run the agent on every benchmark question and grade each answer.

    For each row the question is sent to ``main``; the generated answer is
    then passed, together with the target answer, to ``evaluator_chain``.
    Results are written back into the frame row by row.

    NOTE: the name shadows the built-in ``eval``; it is kept unchanged so
    existing callers keep working.

    Parameters
    ----------
    benchmark : pandas.DataFrame, optional
        Frame with ``input_question`` and ``target_answer`` columns. Defaults
        to the notebook-level ``test_df``. The frame is updated in place.

    Returns
    -------
    pandas.DataFrame
        The same frame, with ``predicted_answer``, ``generation_time``,
        ``predicted_mongodb_query``, ``predicted_python``,
        ``response_evaluation`` and ``response_score`` filled in.
        ``generation_time`` is -1 when the agent call failed, and
        ``response_score`` is 1 for a "CORRECT" grade and 0 otherwise.
    """
    if benchmark is None:
        benchmark = test_df

    for index, row in benchmark.iterrows():
        query = row["input_question"]
        target_response = row["target_answer"]

        # Per-row defaults. `answer` is reset every iteration so that a failed
        # agent call can never reuse the previous row's output.
        response = "Error occurred"
        time_taken = -1
        response_evaluation = "ERROR"
        response_score = 0
        answer = {}

        # Step 1: ask the agent. A failure is recorded as the predicted answer
        # and still sent to the grader, which has an "ERROR" label for it.
        try:
            start = time.time()
            answer = await main(query)
            end = time.time()
            time_taken = end - start
            response = answer["generation"]
        except Exception as e:
            response = f"Error: {e}"

        # Guard against an agent result that is not mapping-like.
        extras = answer if hasattr(answer, "get") else {}
        mongodb_response = extras.get("mongodb_query", "NA")
        python_response = extras.get("python_code", "NA")

        benchmark.at[index, "predicted_answer"] = response
        benchmark.at[index, "generation_time"] = time_taken
        # Stored as text so non-scalar values (e.g. a query dict) fit in one cell.
        benchmark.at[index, "predicted_mongodb_query"] = str(mongodb_response)
        benchmark.at[index, "predicted_python"] = str(python_response)

        # Step 2: grade the answer. If the grader itself fails, the row keeps
        # the "ERROR" / 0 defaults so the score column stays numeric; the
        # failure is reported on stdout instead of being written into it.
        try:
            response_result = await evaluator_chain.ainvoke(
                {
                    "query": query,
                    "target": target_response,
                    "predicted": response,
                }
            )
            response_evaluation = response_result["score"]
            if response_evaluation == "CORRECT":
                response_score = 1
        except Exception as e:
            response_evaluation = "ERROR"
            response_score = 0
            print(f"Evaluation failed for row {index}: {e}")

        benchmark.at[index, "response_evaluation"] = response_evaluation
        benchmark.at[index, "response_score"] = response_score

    return benchmark

In [None]:
results = await eval()

results.to_csv("gamer_2.0_evals_part_1.csv", index=False)