In [1]:
import deepeval
from deepeval import evaluate
from deepeval.dataset import EvaluationDataset
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCase, LLMTestCaseParams

In [2]:
# load_dotenv comes from the `dotenv` package and is used here to pick up
# settings from a local .env file instead of hardcoding them in the notebook.
from dotenv import load_dotenv

# Load environment variables from the .env file (if present) so that the
# os.getenv('CONFIDENT_API_KEY') lookup in the next cell can find the key.
# The call is the last expression of the cell, so its return value is what
# the notebook displays as the cell output.
load_dotenv()

True

In [3]:
import os

# Read the Confident AI key from the environment (populated from .env by the
# load_dotenv() cell above) rather than hardcoding a secret in the notebook.
CONFIDENT_API_KEY = os.getenv('CONFIDENT_API_KEY')

# Fail fast with an actionable message: os.getenv returns None when the
# variable is unset, and an empty string is equally unusable as a key.
if not CONFIDENT_API_KEY:
    raise RuntimeError(
        "CONFIDENT_API_KEY is not set. Add it to your environment or .env file "
        "before running this notebook."
    )

deepeval.login_with_confident_api_key(CONFIDENT_API_KEY)

In [4]:
from react_agent.graph import graph


async def run_agent(user_message):
    """Run the agent graph on one user message and return its final reply.

    The message is wrapped as a single ("user", user_message) entry under the
    "messages" key and passed to ``graph.ainvoke``. The returned state's
    "messages" sequence is then read, and the ``content`` attribute of its
    last element is returned.
    """
    initial_state = {"messages": [("user", user_message)]}
    final_state = await graph.ainvoke(initial_state)
    last_message = final_state["messages"][-1]
    return last_message.content

In [5]:
# Pull the dataset with this name from Confident AI; its `goldens` are
# iterated by the next cell to build test cases.
dataset_name = "Strategy Intelligence Report"
dataset = EvaluationDataset()
dataset.pull(dataset_name)

Output()

In [6]:
# Populate test cases
for golden in dataset.goldens:
    output = await run_agent(golden.input)
    test_case = LLMTestCase(
        input=golden.input,
        # Generate an LLM output by replacing this with
        # the output your LLM app generated for this `golden.input`
        actual_output= output
    )
    dataset.add_test_case(test_case)


In [21]:
# Three LLM-judged (GEval) metrics. Each one is given the test case's input
# and the agent's actual output to evaluate against its criteria.

# Structural check for the three required report sections. The recorded run
# reports this strict-mode metric with threshold 1.0.
completeness_metric = GEval(
    name="Completeness",
    criteria=(
        "Determine if the 'actual output' contains all needed sections: "
        "1. Market trends. 2. Competitor analysis 3. strategic recommendations"
    ),
    evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT],
    strict_mode=True,
)

# Quality check on the insights and recommendations; also strict.
usefulness = GEval(
    name="Usefulness",
    criteria=(
        "Determine if the 'actual output' contains useful relevant insights "
        "and actionable recommendations."
    ),
    evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT],
    strict_mode=True,
)

# Presentation check, not strict: passes at a score of 0.3 or above.
# NOTE(review): the name `format` shadows Python's builtin format(); it is kept
# because the evaluate() cell refers to it by this name. The spelling
# "representaion" in the criteria is likewise kept verbatim so the prompt sent
# to the judge model is unchanged.
format = GEval(
    name="Format",
    criteria=(
        "Determine if the 'actual output' format can be presented to leadership "
        "and contains visual representaion of the insights"
    ),
    evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT],
    threshold=0.3,
)

In [22]:
# Agent configuration class from the project under evaluation.
from react_agent.configuration import Configuration

# Built with no arguments, so whatever defaults Configuration defines apply
# (they are not visible in this notebook). The evaluate() cell below reads its
# `model`, `system_prompt` and `max_search_results` attributes to record them
# as hyperparameters of the run.
configuration = Configuration()

In [23]:
# Settings describing the agent variant under test; passed to evaluate() as
# the run's hyperparameters.
run_hyperparameters = {
    "model": configuration.model,
    "prompt template": configuration.system_prompt,
    "agent": "baseline--react-agent--search",
    "max_search_results": configuration.max_search_results,
}

# Score the dataset's test cases with all three GEval metrics. The call is
# kept as the cell's last expression so the returned result is displayed.
evaluate(
    dataset,
    [completeness_metric, usefulness, format],
    hyperparameters=run_hyperparameters,
)

Evaluating 6 test case(s) in parallel: |██████████|100% (6/6) [Time Taken: 00:21,  3.61s/test case]




Metrics Summary

  - ❌ Completeness (GEval) (score: 0.0, threshold: 1.0, strict: True, evaluation model: gpt-4o, reason: The Actual Output lacks a dedicated 'Competitor analysis' section and does not explicitly title the 'Market trends' section, affecting completeness and alignment with the Input., error: None)
  - ✅ Usefulness (GEval) (score: 1.0, threshold: 1.0, strict: True, evaluation model: gpt-4o, reason: The actual output thoroughly addresses the input by providing insights into the electric vehicle market size, growth, key players, trends, challenges, opportunities, and strategic recommendations for both established players and new entrants. The recommendations are actionable, practical, and aligned with the concerns and goals outlined in the input., error: None)
  - ✅ Format (GEval) (score: 0.30642092374869484, threshold: 0.3, strict: False, evaluation model: gpt-4o, reason: The output does not include visual elements like charts, graphs, or tables which are crucial fo

EvaluationResult(test_results=[TestResult(name='test_case_0', success=False, metrics_data=[MetricData(name='Completeness (GEval)', threshold=1.0, success=False, score=0.0, reason="The Actual Output lacks a dedicated 'Competitor analysis' section and does not explicitly title the 'Market trends' section, affecting completeness and alignment with the Input.", strict_mode=True, evaluation_model='gpt-4o', error=None, evaluation_cost=0.00523, verbose_logs='Criteria:\nDetermine if the \'actual output\' contains all needed sections: 1. Market trends. 2. Competitor analysis 3. strategic recommendations \n \nEvaluation Steps:\n[\n    "1. Verify if the \'Actual Output\' includes a section titled or related to \'Market trends\' and ensure its content aligns with the \'Input\'.",\n    "2. Confirm the presence of a \'Competitor analysis\' section in the \'Actual Output\' and evaluate its completeness against the \'Input\'.",\n    "3. Check for a \'Strategic recommendations\' section within the \'Ac