# Local Evaluation - Groundedness

After you have set up and configured the prompt flow, it's time to evaluate its performance. Here we can use the prompt flow SDK to test different questions and see how the prompt flow performs using the evaluation prompt flows provided.

In [6]:
from promptflow import PFClient
# Client from the base `promptflow` package; the local `pf_client.test(...)` cells in this
# notebook all go through this instance.
pf_client = PFClient()

In [7]:
# Sample question and customer used to exercise the base prompt flow.
question = "Can you tell me about your jackets?"
customerId = "4"

# Inputs passed to the flow; the chat history starts out empty for this single-turn test.
flow_inputs = {
    "chat_history": [],
    "question": question,
    "customerId": customerId,
}

# "../" is the path to the flow directory.
output = pf_client.test(flow="../", inputs=flow_inputs)

# Collapse the answer into one string (presumably it arrives as an iterable of
# text chunks -- confirm against the flow's output type). str.join consumes any
# iterable of strings directly.
output["answer"] = "".join(output["answer"])



2023-12-01 14:10:54 -0600   25660 execution.flow     INFO     Start to run 5 nodes with concurrency level 16.
2023-12-01 14:10:54 -0600   25660 execution.flow     INFO     Executing node question_embedding. node run id: d7be5c33-87e1-49f9-aa89-11e807fead1f_question_embedding_0
2023-12-01 14:10:54 -0600   25660 execution.flow     INFO     Executing node customer_lookup. node run id: d7be5c33-87e1-49f9-aa89-11e807fead1f_customer_lookup_0
2023-12-01 14:10:55 -0600   25660 execution.flow     INFO     Node question_embedding completes.
2023-12-01 14:10:55 -0600   25660 execution.flow     INFO     Executing node retrieve_documentation. node run id: d7be5c33-87e1-49f9-aa89-11e807fead1f_retrieve_documentation_0
2023-12-01 14:10:56 -0600   25660 execution.flow     INFO     Node retrieve_documentation completes.
2023-12-01 14:10:56 -0600   25660 execution.flow     INFO     Node customer_lookup completes.
2023-12-01 14:10:56 -0600   25660 execution.flow     INFO     Executing node customer_prompt

In [8]:
# Show the flow result: the joined answer text plus the context documents it returned.
output

{'answer': "Hi Sarah Lee! 😊 We offer two hiking jackets that might interest you: the Summit Breeze Jacket and the RainGuard Hiking Jacket. The Summit Breeze Jacket (#3) is a perfect lightweight option for outdoor adventures, while the RainGuard Hiking Jacket (#17) offers ultimate weatherproof comfort. Both jackets are made with breathable fabric and have adjustable cuffs for maximum comfort. If you're looking to complete your hiking kit, our TrailMaster X4 Tent and CozyNights Sleeping Bag would go well with either of the jackets. 🏕️ Happy adventuring!",
 'context': [{'id': '3',
   'title': 'Summit Breeze Jacket',
   'content': "Discover the joy of hiking with MountainStyle's Summit Breeze Jacket. This lightweight jacket is your perfect companion for outdoor adventures. Sporting a trail-ready, windproof design and a water-resistant fabric, it's ready to withstand any weather. The breathable polyester material and adjustable cuffs keep you comfortable, whether you're ascending a mountain

Test the groundedness of the prompt flow with the answer from the above question.

In [9]:
# Inputs for the groundedness evaluation flow. The context is passed as its
# string representation rather than as the original list of documents.
groundedness_inputs = {
    "question": question,
    "context": str(output["context"]),
    "answer": output["answer"],
}

# Score the answer produced by the base flow with the "groundedness" evaluation flow.
test = pf_client.test(flow="groundedness", inputs=groundedness_inputs)

2023-12-01 14:11:01 -0600   25660 execution.flow     INFO     Start to run 2 nodes with concurrency level 16.
2023-12-01 14:11:01 -0600   25660 execution.flow     INFO     Executing node groundedness_score. node run id: 56c91bc8-e4ad-47c1-9a16-499ff2accd24_groundedness_score_0
2023-12-01 14:11:02 -0600   25660 execution.flow     INFO     Node groundedness_score completes.
2023-12-01 14:11:02 -0600   25660 execution.flow     INFO     Executing node concat_scores. node run id: 56c91bc8-e4ad-47c1-9a16-499ff2accd24_concat_scores_0
2023-12-01 14:11:02 -0600   25660 execution.flow     INFO     Node concat_scores completes.
2023-12-01 14:11:02 -0600   25660 execution.flow     INFO     Start to run 1 nodes with concurrency level 16.
2023-12-01 14:11:02 -0600   25660 execution.flow     INFO     Executing node aggregate_variants_results. node run id: d72dfa8a-8e06-4503-ba06-ceee66b25c8d_aggregate_variants_results_reduce
2023-12-01 14:11:02 -0600   25660 execution.flow     INFO     Node aggregate

In [10]:
# Show the groundedness evaluation result (a dict keyed by metric name).
test

{'gpt_groundedness': 5.0}

# Local Evaluation - Multiple Metrics 

Now use the same prompt flow and test it against the Multi Evaluation flow for groundedness, coherence, fluency, and relevance.

In [11]:
# Inputs for the multi-metric evaluation flow: the same question, stringified
# context and answer that the base flow produced.
multi_inputs = {
    "question": question,
    "context": str(output["context"]),
    "answer": output["answer"],
}

# Run the "multi_flow" evaluation flow against the base flow's answer.
test_multi = pf_client.test(flow="multi_flow", inputs=multi_inputs)

2023-12-01 14:11:06 -0600   25660 execution.flow     INFO     Start to run 8 nodes with concurrency level 16.
2023-12-01 14:11:06 -0600   25660 execution.flow     INFO     Executing node coherence_score. node run id: 33e95ae2-a6fd-44e7-bbfa-60ccd739f364_coherence_score_0
2023-12-01 14:11:06 -0600   25660 execution.flow     INFO     Executing node fluency_score. node run id: 33e95ae2-a6fd-44e7-bbfa-60ccd739f364_fluency_score_0
2023-12-01 14:11:06 -0600   25660 execution.flow     INFO     Executing node groundedness_score. node run id: 33e95ae2-a6fd-44e7-bbfa-60ccd739f364_groundedness_score_0
2023-12-01 14:11:06 -0600   25660 execution.flow     INFO     Executing node relevance_score. node run id: 33e95ae2-a6fd-44e7-bbfa-60ccd739f364_relevance_score_0
2023-12-01 14:11:08 -0600   25660 execution.flow     INFO     Node groundedness_score completes.
2023-12-01 14:11:08 -0600   25660 execution.flow     INFO     Node relevance_score completes.
2023-12-01 14:11:08 -0600   25660 execution.flow 

In [12]:
# Show the multi-metric evaluation result (one score per metric).
test_multi

{'gpt_coherence': 5.0,
 'gpt_fluency': 5.0,
 'gpt_groundedness': 5.0,
 'gpt_relevance': 5.0}

# Azure AI Studio batch run on an evaluation JSON dataset

Now in order to test these more thoroughly, we can use the Azure AI Studio to run batches of test data with the evaluation prompt flow on a larger dataset.

In [13]:
import json
# Import required libraries
from promptflow.azure import PFClient

# Import required libraries
from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential

In [14]:
# Pick an Azure credential: try the default credential chain first and fall
# back to an interactive browser login if it cannot produce a token.
try:
    credential = DefaultAzureCredential()
    # Check that the credential can actually obtain a management-plane token.
    credential.get_token("https://management.azure.com/.default")
except Exception:
    # Deliberate best-effort: on any failure of the default credential, fall
    # back to InteractiveBrowserCredential instead of aborting the notebook.
    credential = InteractiveBrowserCredential()

Populate the `config.json` file with the subscription_id, resource_group, and workspace_name.

In [15]:
# Path to the config file holding subscription_id, resource_group and workspace_name.
config_path = "../config.json"
# NOTE: `PFClient` here is the class from `promptflow.azure`; that import rebinds the
# name first imported from `promptflow` for the local client. If cells are run out of
# order, re-run the `promptflow.azure` import cell before this one.
pf_azure_client = PFClient.from_config(credential=credential, path=config_path)

Found the config file in: ..\config.json


Add the runtime from the AI Studio that will be used for the cloud batch runs.

In [16]:
# Update the runtime to the name of the runtime you created previously.
# It must match an existing runtime in AI Studio; otherwise submitting a run
# fails with a 404 "Flow runtime ... not found" error.
runtime = "contoso-retail-runtime"
# Path to the flow directory used for the cloud batch run
flow = "../"
# Path to the JSONL test data file passed to the batch runs
data = "../data/testdata.jsonl"

In [18]:
# Build a run name from the current local time so each submission gets a distinct name.
import datetime

now = datetime.datetime.now()
# Format: YYYY_MM_DD_HHMMSS
timestamp = now.strftime("%Y_%m_%d_%H%M%S")
run_name = f"{timestamp}_base_run"
print(run_name)

2023_12_01_141144_base_run


Create a base run to use as the variant for the evaluation runs. 

_NOTE: If you get the error "An existing connection was forcibly closed by the remote host", run the cell again._

In [19]:
# Submit the base flow as a batch run in Azure AI Studio.
# Each flow input is bound to the same-named column of the test data file.
base_column_mapping = {
    "customerId": "${data.customerId}",
    "question": "${data.question}",
}

base_run = pf_azure_client.run(
    flow=flow,
    data=data,
    column_mapping=base_column_mapping,
    runtime=runtime,
    # The timestamped run_name is used both as the run name and as its display name.
    name=run_name,
    display_name=run_name,
)
print(base_run)

[32mUploading contoso-chat (0.19 MBs): 100%|##########| 193234/193234 [00:04<00:00, 43111.52it/s]
[39m



FlowRequestException: Calling submit_bulk_run failed with request id: 6252f36b-b3c7-4aea-a43a-03d718f3473f 
Status code: 404 
Reason: Flow runtime contoso-retail-runtime not found 
Error message: (UserError) Flow runtime contoso-retail-runtime not found
Code: UserError
Message: Flow runtime contoso-retail-runtime not found 


In [None]:
# Stream the base run from Azure AI Studio into this cell
# (presumably its logs/status until the run finishes -- confirm against the promptflow docs).
pf_azure_client.stream(base_run)

In [None]:
# Fetch the base run's details and preview the first 10 entries.
# (presumably a pandas DataFrame of per-row inputs and outputs -- TODO confirm)
details = pf_azure_client.get_details(base_run)
details.head(10)

# Cloud Eval run on Json Data

In [None]:
# Evaluate the base run's outputs in the cloud with the multi-metric evaluation flow.
eval_flow = "multi_flow/"
data = "../data/testdata.jsonl"
run_name = f"{timestamp}_eval_run"
print(run_name)

eval_column_mapping = {
    # Columns taken from the test data file
    "customerId": "${data.customerId}",
    "question": "${data.question}",
    # Outputs produced by the base run
    "context": "${run.outputs.context}",
    "answer": "${run.outputs.answer}",
}

eval_run_variant = pf_azure_client.run(
    flow=eval_flow,
    data=data,  # path to the data file
    run=base_run,  # the base run whose outputs are being evaluated
    column_mapping=eval_column_mapping,
    runtime=runtime,
    display_name=run_name,
    name=run_name,
)

In [None]:
# Stream the evaluation run from Azure AI Studio into this cell
# (presumably its logs/status until the run finishes -- confirm against the promptflow docs).
pf_azure_client.stream(eval_run_variant)

In [None]:
# Fetch the evaluation run's details and preview the first 10 entries.
# NOTE: this rebinds `details`, which previously held the base run's details.
details = pf_azure_client.get_details(eval_run_variant)
details.head(10)

In [None]:

# Retrieve the evaluation run's metrics and pretty-print them as indented JSON.
metrics = pf_azure_client.get_metrics(eval_run_variant)
print(json.dumps(metrics, indent=4))

In [None]:
# Visualize the base run and its evaluation run together.
pf_azure_client.visualize([base_run, eval_run_variant])