In [None]:
# Install the Azure AI Evaluation SDK into the kernel's environment (no version is pinned here).
%pip install azure-ai-evaluation

In [None]:
import json
import os

import pandas as pd
# Import the Relevance and Groundedness evaluators
from azure.ai.evaluation import RelevanceEvaluator, GroundednessEvaluator, evaluate

from contoso_chat.chat_request import get_response

In [None]:
def load_data(data_path="./evaluators/data.jsonl"):
    """Load the evaluation test set from a JSON Lines file.

    Args:
        data_path: Path to a JSONL file with one JSON record per line.
            Defaults to the test set location used by this notebook.

    Returns:
        A pandas DataFrame with one row per record in the file.
    """
    # A bare `df.head()` inside a function displays nothing (only the last
    # expression of a *cell* is rendered), so the frame is returned directly;
    # call `.head()` on the result in a cell to preview it.
    return pd.read_json(data_path, lines=True)

In [None]:
def create_response_data(df, output_path='result.jsonl'):
    """Generate a chat response for every test question and save the results.

    For each row of ``df`` the contoso-chat ``get_response`` flow is called
    with the row's ``customerId`` and ``question`` and an empty chat history.
    The question, together with the ``context`` and ``answer`` entries of the
    response, is collected and written to ``output_path`` as JSON Lines.

    Args:
        df: DataFrame with ``customerId`` and ``question`` columns.
        output_path: File the results are written to, one JSON object per
            line. Defaults to ``'result.jsonl'``. The file is overwritten.

    Returns:
        A list of dicts with the keys ``question``, ``context`` and ``answer``,
        in row order. Empty when ``df`` has no rows.
    """
    results = []

    # Only the row contents are needed; the index is ignored.
    for _, row in df.iterrows():
        customerId = row['customerId']
        question = row['question']

        # Run contoso-chat/chat_request flow to get response
        response = get_response(customerId=customerId, question=question, chat_history=[])
        print(response)

        results.append({
            'question': question,
            'context': response["context"],
            'answer': response["answer"],
        })

    # Save results to a JSONL file. The encoding is explicit so the output
    # does not depend on the platform's default locale.
    with open(output_path, 'w', encoding='utf-8') as file:
        file.writelines(json.dumps(result) + '\n' for result in results)
    return results

In [None]:
# Model configuration for the AI-assisted evaluators.
# Each value is read from an environment variable so the endpoint and API key
# do not have to be hardcoded in the notebook:
#   AZURE_OPENAI_ENDPOINT        -> "azure_endpoint"
#   AZURE_OPENAI_API_KEY         -> "api_key"
#   AZURE_OPENAI_EVAL_DEPLOYMENT -> "azure_deployment"
# When a variable is not set, the previous placeholder/default value is used.
model_config = {
    "azure_endpoint": os.environ.get("AZURE_OPENAI_ENDPOINT", "<your azure endoint>"),
    "api_key": os.environ.get("AZURE_OPENAI_API_KEY", "<your deployment API key>"),
    "azure_deployment": os.environ.get("AZURE_OPENAI_EVAL_DEPLOYMENT", "gpt-4-evals"),
}

In [None]:
# Initialize the Relevance and Groundedness evaluators.
# Both are constructed from the same model_config dictionary.
relevance_eval = RelevanceEvaluator(model_config)
groundedness_eval = GroundednessEvaluator(model_config)

In [None]:
# Set the path to the data to be evaluated.
# This matches the file name that create_response_data() writes its results to.
data_path = "result.jsonl"

In [None]:
def get_evaluation():
    """Run the relevance and groundedness evaluators over the response data.

    Uses the module-level ``data_path``, ``relevance_eval`` and
    ``groundedness_eval`` and maps the ``question``/``context``/``answer``
    columns of the data file onto the evaluator inputs. A metric summary is
    also written to ``./myevalresults.json``.

    Returns:
        Whatever ``evaluate`` returns, so the caller can inspect the results
        (previously the result was discarded and ``None`` was returned).
    """
    return evaluate(
        data=data_path,  # provide your data here
        evaluators={
            "relevance": relevance_eval,
            "groundedness": groundedness_eval,
        },
        # column mapping
        # NOTE(review): newer azure-ai-evaluation releases appear to expect these
        # mappings nested under a "column_mapping" key for each evaluator --
        # confirm against the installed SDK version.
        evaluator_config={
            # A config is needed for groundedness, which requires response and context
            # Only the values data or target are allowed below
            "groundedness": {
                "response": "${data.answer}",
                "context": "${data.context}"
            },
            # A config is needed for relevance, which requires response, context, and query
            # Only the values data or target are allowed below
            "relevance": {
                "response": "${data.answer}",
                "context": "${data.context}",
                "query": "${data.question}"
            }
        },
        # Optionally provide an output path to dump a json of metric summary, row level data and metric and studio URL
        # I've only got this to output the results to a file once, otherwise I view the results in Trace view
        # A link to Trace View will appear after you run the next cell
        # For an example of an output for the RelevanceEvaluator, view the file builtin-evals-results-sample.json
        output_path="./myevalresults.json"
    )

In [None]:
# If you run this and get a KeyError, close the Codespace and restart. Then come back and run the evaluation again.
# The run takes around 4 minutes or so with two evaluators, possibly longer depending on retries for quota limits.
# If a markdown version of the results is preferred, the logic to produce it still needs to be added.
# For now, only the metric results are collected, just to validate that this works.
if __name__ == "__main__":

   # Pipeline: load the test questions, generate a chat response for each one
   # (written to result.jsonl), then run the evaluators over that file.
   test_data_df = load_data()
   response_results = create_response_data(test_data_df)
   result_evaluated = get_evaluation()