In [0]:
%pip install -U -qqq mlflow langgraph==0.3.4 databricks-langchain databricks-agents uv 
# NOTE: only `langgraph` is pinned above; the other packages are upgraded to
# whatever version is latest at install time (`-U`), so results may drift.
# Restart the Python process so that later cells import the freshly installed
# package versions. Any Python state defined before this call is discarded.
dbutils.library.restartPython()


## Generate synthetic data

To synthesise evaluations for an agent that uses document retrieval, use the `generate_evals_df` function from the `databricks-agents` Python package. This step automatically creates a set of evaluation questions and answers based on your unstructured guidance documents.

- **Input**: A table of documents (`content`, `doc_uri`) and evaluation guidelines
- **Output**: A generated set of eval questions stored as a Delta table
- **Purpose**: To test and refine retrieval performance using realistic, autogenerated examples


In [0]:
import pandas as pd

# Derive a per-user catalog name from the e-mail of the user running the
# notebook: take the part before "@" and strip the dots
# (e.g. "jane.doe@example.com" -> "janedoe").
user_email = (
    dbutils.notebook.entry_point.getDbutils()
    .notebook()
    .getContext()
    .userName()
    .get()
)
email = str(user_email).split("@")
catalog = email[0].replace(".", "")

# Schema that holds the workshop tables and the registered agent model.
schema = "agent_workshop"

# Number of synthetic evaluation rows to generate.
num_evals = 30
# Version of the registered agent model to evaluate.
model_version = 1

# Fully qualified (catalog.schema.object) names used by the cells below.
table_name = "{}.{}.guidance_unstructure".format(catalog, schema)
eval_table = "{}.{}.finOps_synthetic_evals".format(catalog, schema)
model_name = "models:/{}.{}.financial_assistance_agent/{}".format(
    catalog, schema, model_version
)

# Free-text guidelines for synthetic question generation: they describe the
# agent's task, the user personas, example questions and style constraints.
# This string is passed as `question_guidelines` to `generate_evals_df`.
# NOTE: the f-string prefix is not needed here (the text contains no
# placeholders); the text itself is runtime input and is left untouched.
guidelines = f"""
# Task Description
The agent is a financial portfolio advisor and a fraud assistant agent. The agents task is to provide guidance on difference scenarios of rebalancing , portfolio assessment , fraud analysis guidance , procedures of next steps if fraud happens etc.s

# User personas
- A customer who wants to assess and understand their portfolio and seek guidance on management.
- A customer who wants to understand the steps to take if fraud happens.
- A customer who wants to get guidance on portfolio rebalancing.

# Example questions
- What is the standard procedure for rebalancing my portfolio and what should i consider?
- How much time does it take for a fraud incident to resolve ?

# Additional Guidelines
- Questions should be succinct, and human-like
"""

In [0]:
# Use the synthetic eval generation API to get some evals
from databricks.agents.evals import generate_evals_df

# Load the guidance documents, rename their columns to the names the
# generator expects (`doc_uri`, `content`) and collect them into pandas.
parsed_docs_df = (
    spark.table(table_name)
    .withColumnRenamed("file_name", "doc_uri")
    .withColumnRenamed("file_content", "content")
    .toPandas()
)

# Generate `num_evals` synthetic evaluation rows from the documents,
# steered by the free-text `guidelines` defined earlier.
evals = generate_evals_df(
    docs=parsed_docs_df,
    num_evals=num_evals,
    question_guidelines=guidelines,
)
display(evals)

# Persist the synthetic evaluation set as a Delta table, replacing any
# previous contents of `eval_table`.
(
    spark.createDataFrame(evals)
    .write.format("delta")
    .mode("overwrite")
    .saveAsTable(eval_table)
)


# Offline evaluation via Mosaic AI Agent Evaluation

- Built-in LLM Judges
- Global Guidelines

In [0]:
import mlflow 
# Iteration counter; it is embedded in the MLflow run name so that successive
# evaluation runs can be told apart.
itern = 1

with mlflow.start_run(run_name=f"finOps_assistant_offline_eval_{itern}") as run:
    # Evaluate the registered agent (`model_name`) against the synthetic
    # evaluation set (`evals`) using the "databricks-agent" evaluator.
    eval_results = mlflow.evaluate(
        data=evals,  # The latest evaluation dataset
        model=model_name,
        model_type="databricks-agent",
        evaluator_config={
            "databricks-agent": {
                # Judges/metrics requested for this run.
                "metrics": [
                    "relevance_to_query",
                    "safety",
                    "groundedness",
                    "correctness",
                    "chunk_relevance",
                    "guideline_adherence",
                ],
                # Guidelines supplied for the `guideline_adherence` metric.
                "global_guidelines": [
                    "The response must be on the domain of Financial Assistant or Fraud analysis and suggest  financial guidance",
                    "The response should not contain inconsistant answer to the question",
                    "The response should not generate hypothetical answer",
                    "The response should not contain harmful or discriminatory content",
                ],
            }
        },
    )

# Advance the counter for the next evaluation run. This must be `itern`:
# `iter` is the Python built-in, and `iter += 1` raises a TypeError.
itern += 1