In [1]:
import deepeval
from deepeval import evaluate
from deepeval.dataset import EvaluationDataset
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCase, LLMTestCaseParams

In [2]:
# Import the necessary module
from dotenv import load_dotenv

# Load environment variables from the .env file (if present)
load_dotenv()

True

In [3]:
import os

CONFIDENT_API_KEY = os.getenv('CONFIDENT_API_KEY')
deepeval.login_with_confident_api_key(CONFIDENT_API_KEY)

In [13]:
import re

from react_agent.graph import graph


def extract_file_content(input_string, filename):
    """Extract Final_report from the last message."""
    # Pattern to match code blocks like ```filename ... ```
    pattern = rf'```{re.escape(filename)}(.*?)```'
    match = re.search(pattern, input_string, re.DOTALL)
    
    if match:
        return match.group(1).strip()  # Extract and clean the content
    else:
        return None  # Or raise an error if needed



async def run_agent(user_message):
    """Call agent and return last message.

    A function to run the agent and return the latest message.
    """
    messages = await graph.ainvoke({"messages": [("user", user_message)]}, {"recursion_limit": 40})
    # Example usage
    filename = "Final_report.md"
    content = extract_file_content(messages["messages"][-1].content, filename)
    return content

In [14]:
# output = await run_agent("Generate a strategy intelligence report for the electric vehicle market and its key players.")
# def write_markdown_to_file(markdown_string: str, file_path: str) -> None:
#     """Write a Markdown-formatted string to a file.
    
#     Args:
#         markdown_string (str): The Markdown content to write
#         file_path (str): Path to the output file (including .md extension)
    
#     Returns:
#         None
    
#     Raises:
#         IOError: If there's a problem writing the file
#     """
#     try:
#         with open(file_path, 'w', encoding='utf-8') as f:
#             f.write(markdown_string)
#         print(f"Successfully wrote Markdown to {file_path}")
#     except IOError as e:
#         print(f"Error writing Markdown file: {e}")
#         raise

# write_markdown_to_file(output, "../../samples/final_report_sample-claude3.7-search_code_002.md")

In [15]:
# Pull from Confident AI
dataset = EvaluationDataset()
dataset.pull("Strategy Intelligence Report")

Output()

In [16]:
# Populate test cases
for golden in dataset.goldens:
    output = await run_agent(golden.input)
    test_case = LLMTestCase(
        input=golden.input,
        # Generate an LLM output by replacing this with
        # the output your LLM app generated for this `golden.input`
        actual_output= output
    )
    dataset.add_test_case(test_case)


In [17]:
completeness_metric = GEval(
        name="Completeness",
        criteria="Determine if the 'actual output' contains all needed sections: 1. Market trends. 2. Competitor analysis 3. strategic recommendations",
        evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT],
        threshold=0.5
    )

usefulness = GEval(
    name= "Usefulness",
    criteria="Determine if the 'actual output' contains useful relevant insights and actionable recommendations.",
    evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT],
    threshold=0.5
)

format = GEval(
    name= "Format",
    criteria="Determine if the 'actual output' format can be presented to leadership and contains visual representaion of the insights",
    evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT],
    threshold=0.3
)

In [18]:
from react_agent.configuration import Configuration

configuration = Configuration()

In [19]:
evaluate(dataset, [completeness_metric, usefulness, format], hyperparameters = {"model":configuration.model,
                                                               "prompt template": configuration.system_prompt,
                                                               "agent":"003--react-agent--search-code",
                                                               "max_search_results": configuration.max_search_results})

Evaluating 6 test case(s) in parallel: |█▋        | 17% (1/6) [Time Taken: 00:26, 26.92s/test case]ERROR:root:OpenAI Error: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-emClbMMyd9XcilFQuLclNFwb on tokens per min (TPM): Limit 30000, Used 27137, Requested 3893. Please try again in 2.06s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}} Retrying: 1 time(s)...
ERROR:root:OpenAI Error: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-emClbMMyd9XcilFQuLclNFwb on tokens per min (TPM): Limit 30000, Used 27167, Requested 3856. Please try again in 2.046s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}} Retrying: 1 time(s)...
Evaluating 6 test case(s) in parallel: |███▎      | 33% (2/6) [Time Taken: 00:32, 14.57s/test case]ERROR:root:Open



Metrics Summary

  - ✅ Completeness (GEval) (score: 0.9858846303970591, threshold: 0.5, strict: False, evaluation model: gpt-4o, reason: The actual output includes a 'Market Insights' section covering market trends, a detailed 'Competitor Analysis' section, and comprehensive 'Strategic Recommendations,' aligning well with the input requirements for a strategy intelligence report for the electric vehicle market., error: None)
  - ✅ Usefulness (GEval) (score: 0.9835483537103438, threshold: 0.5, strict: False, evaluation model: gpt-4o, reason: The report provides detailed insights directly related to the input by covering market trends and key players in the electric vehicle market. It addresses key points such as market size, regional analysis, growth drivers, and challenges. The report is useful as it introduces new perspectives on competitive dynamics and emerging trends, along with actionable recommendations for manufacturers, investors, and policymakers, aligning with the provided 

EvaluationResult(test_results=[TestResult(name='test_case_0', success=True, metrics_data=[MetricData(name='Completeness (GEval)', threshold=0.5, success=True, score=0.9858846303970591, reason="The actual output includes a 'Market Insights' section covering market trends, a detailed 'Competitor Analysis' section, and comprehensive 'Strategic Recommendations,' aligning well with the input requirements for a strategy intelligence report for the electric vehicle market.", strict_mode=False, evaluation_model='gpt-4o', error=None, evaluation_cost=0.0084525, verbose_logs='Criteria:\nDetermine if the \'actual output\' contains all needed sections: 1. Market trends. 2. Competitor analysis 3. strategic recommendations \n \nEvaluation Steps:\n[\n    "Check if the \'actual output\' includes a \'Market trends\' section and compare it to the \'input\' requirements.",\n    "Verify that a \'Competitor analysis\' section is present in the \'actual output\' and evaluate its alignment with the \'input\' 