## Testing

### Define variables we will need

In [None]:
import os
from dotenv import load_dotenv

# User provided values
load_dotenv(".env")

# Variables passed by previous notebooks
load_dotenv(".env.state")

# Let's capture the initial working directory because the evaluate function will change it.
# NOTE: `dir` shadows the builtin of the same name; the name is kept so that any cell
# already reading it keeps working.
dir = os.getcwd()

# Fail fast when the dataset name is missing: otherwise every path below would silently
# contain the literal text "None" and only fail much later with a confusing file error.
experiment_name = os.getenv("DATASET_NAME")
if not experiment_name:
    raise ValueError(
        "DATASET_NAME is not set; define it in .env or .env.state before running this notebook"
    )
experiment_dir = f"{dir}/dataset/{experiment_name}-files"

# Dataset generated by the gen notebook that we will evaluate the baseline and finetuned models on
dataset_path_hf_eval = f"{experiment_dir}/{experiment_name}-hf.eval.jsonl"

# Evaluated answer files
dataset_path_hf_eval_answer = f"{experiment_dir}/{experiment_name}-hf.eval.answer.jsonl"
dataset_path_hf_eval_answer_baseline = f"{experiment_dir}/{experiment_name}-hf.eval.answer.baseline.jsonl"

# Formatted answer evaluation files
dataset_path_eval_answer_finetuned = f"{experiment_dir}/{experiment_name}-eval.answer.finetuned.jsonl"
dataset_path_eval_answer_baseline = f"{experiment_dir}/{experiment_name}-eval.answer.baseline.jsonl"

# Scored answer files
dataset_path_eval_answer_score_finetuned = f"{experiment_dir}/{experiment_name}-eval.answer.score.finetuned.jsonl"
dataset_path_eval_answer_score_baseline = f"{experiment_dir}/{experiment_name}-eval.answer.score.baseline.jsonl"

# Deployment names are only echoed below; they may print as None when unset
BASELINE_OPENAI_DEPLOYMENT = os.getenv("BASELINE_OPENAI_DEPLOYMENT")
FINETUNED_OPENAI_DEPLOYMENT = os.getenv("FINETUNED_OPENAI_DEPLOYMENT")
FINETUNED_MODEL_FORMAT = os.getenv("FINETUNED_MODEL_FORMAT")

print(f"Evaluating the finetuned {FINETUNED_MODEL_FORMAT} model {FINETUNED_OPENAI_DEPLOYMENT} against the baseline model {BASELINE_OPENAI_DEPLOYMENT}")

## Let's review the formatted files

### Finetuned model answers

In [2]:
import pandas as pd

In [None]:
# Load the formatted finetuned-model answers (JSON Lines file) and preview the first two rows
df_finetuned = pd.read_json(dataset_path_eval_answer_finetuned, lines=True)
df_finetuned.head(2)

### Baseline model answers

In [None]:
# Load the formatted baseline-model answers (JSON Lines file) and preview the first two rows
df_baseline = pd.read_json(dataset_path_eval_answer_baseline, lines=True)
df_baseline.head(2)

In [None]:
# Pair each baseline row with the finetuned row for the same question; columns present in
# both frames get a "_baseline" / "_finetuned" suffix so they can be compared side by side
df_merged = df_baseline.merge(
    df_finetuned,
    on="question",
    suffixes=("_baseline", "_finetuned"),
)
df_merged.head(2)

In [None]:
from random import randint
# Pick one merged row at random (randint includes both bounds, hence the "- 1").
# No seed is set, so every run shows a different example.
sample_idx = randint(0, len(df_merged) - 1)
sample = df_merged.iloc[sample_idx]
sample

In [None]:
from IPython.display import display, Markdown

def format_tags_md(text):
    """Return ``text`` with each known markup tag wrapped in back-ticks.

    The tags handled are ``<ANSWER>``, ``<DOCUMENT>``, ``</DOCUMENT>``,
    ``##begin_quote##`` and ``##end_quote##``; every occurrence is replaced
    and all other text is left untouched.
    """
    tags = ("<ANSWER>", "<DOCUMENT>", "</DOCUMENT>", "##begin_quote##", "##end_quote##")
    md = text
    for tag in tags:
        md = md.replace(tag, f"`{tag}`")
    return md

# Wrap the markup tags of each text field in back-ticks before rendering as Markdown
answer_baseline_md = format_tags_md(sample.answer_baseline)
answer_finetuned_md = format_tags_md(sample.answer_finetuned)
context_md = format_tags_md(sample.context_finetuned)

# Render the context, the question and both models' answers as a single Markdown document.
# The question and the finetuned final answer are inserted without tag formatting.
display(Markdown(f"""
## Context
{context_md}

## Question
{sample.question}

## Baseline Answer
{answer_baseline_md}

## Finetuned CoT Answer
{answer_finetuned_md}

## Finetuned Answer
{sample.final_answer_finetuned}
"""))

#### Quality Evaluator

In [None]:
import os
from promptflow.core import AzureOpenAIModelConfiguration

# Connection settings for the judge model, read from the environment (None when unset)
azure_endpoint = os.getenv("JUDGE_AZURE_OPENAI_ENDPOINT")
azure_deployment = os.getenv("JUDGE_AZURE_OPENAI_DEPLOYMENT")
api_key = os.getenv("JUDGE_AZURE_OPENAI_API_KEY")
api_version = os.getenv("JUDGE_OPENAI_API_VERSION")

# Echo the settings, one per line; the API key is not printed
print(
    f"azure_endpoint={azure_endpoint}",
    f"azure_deployment={azure_deployment}",
    f"api_version={api_version}",
    sep="\n",
)

# Initialize Azure OpenAI Connection
model_config = AzureOpenAIModelConfiguration(
    api_key=api_key,
    api_version=api_version,
    azure_deployment=azure_deployment,
    azure_endpoint=azure_endpoint,
)

In [26]:
from promptflow.evals.evaluators import RelevanceEvaluator, SimilarityEvaluator, GroundednessEvaluator

# Initializing evaluators; both are constructed from the judge model configuration (model_config)
similarity = SimilarityEvaluator(model_config)
groundedness = GroundednessEvaluator(model_config)

In [None]:
# Reload the finetuned answers and take the second row (position 1) as a test input for the evaluators.
# NOTE(review): this rebinds `sample`, which an earlier cell set to a row of df_merged; re-running
# that earlier display cell after this one would see a row without the suffixed columns.
df = pd.read_json(dataset_path_eval_answer_finetuned, lines=True)
sample = df.iloc[1]
sample

In [None]:
def compute_sample_metrics(sample):
    """Score one row with the similarity and groundedness evaluators.

    Args:
        sample: Mapping-like row providing the ``question``, ``final_answer``,
            ``context`` and ``gold_final_answer`` entries.

    Returns:
        The similarity result merged with the groundedness result; on a key
        clash the groundedness value wins.
    """
    similarity_inputs = {
        "question": sample["question"],
        "answer": sample["final_answer"],
        "context": sample["context"],
        "ground_truth": sample["gold_final_answer"],
    }
    similarity_score = similarity(**similarity_inputs)

    groundedness_inputs = {
        "answer": sample["final_answer"],
        "context": sample["context"],
    }
    groundedness_score = groundedness(**groundedness_inputs)

    combined = similarity_score | groundedness_score
    return combined

# Try both evaluators on the single row selected above before scoring whole datasets
compute_sample_metrics(sample)

### Using the Evaluate API to calculate the metrics

In the previous section, we used built-in evaluators to score a single row. Now, we will use the same evaluators with the `evaluate` API to assess an entire dataset.

### Running the metrics

Now, we will invoke the `evaluate` API using the evaluators that we have already initialized.

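Additionally, we use a column mapping to connect the dataset columns to the inputs the evaluators accept: `final_answer` is mapped to `answer` and `gold_final_answer` is mapped to `ground_truth`, while `question` and `context` keep their names.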

In [41]:
from promptflow.evals.evaluate import evaluate

def score_dataset(dataset, output_path=None):
    """Run the similarity and groundedness evaluators over a dataset.

    Args:
        dataset: Passed through to ``evaluate`` as ``data``; the callers in
            this notebook pass the path of a JSON Lines file.
        output_path: Optional destination. When truthy, the per-row results
            (``result["rows"]``) are written there as JSON Lines.

    Returns:
        The result returned by ``evaluate``, unchanged.
    """
    # Column mapping: which dataset column feeds each evaluator input
    column_mapping = {
        "similarity": {
            "question": "${data.question}",
            "answer": "${data.final_answer}",
            "ground_truth": "${data.gold_final_answer}",
            "context": "${data.context}",
        },
        "groundedness": {
            "answer": "${data.final_answer}",
            "context": "${data.context}",
        },
    }
    scorers = {"similarity": similarity, "groundedness": groundedness}

    result = evaluate(
        data=dataset,
        evaluators=scorers,
        evaluator_config=column_mapping,
    )

    # Nothing to persist: hand the result straight back
    if not output_path:
        return result

    scored_rows = pd.DataFrame.from_dict(result["rows"])
    scored_rows.to_json(output_path, orient="records", lines=True)
    return result

#### Baseline model evaluation metrics

In [None]:
# Preview the first two rows of the baseline answers file that is scored next
pd.read_json(dataset_path_eval_answer_baseline, lines=True).head(2)

In [None]:
from IPython.display import display, JSON

# Score the baseline answers, write the per-row scores to the baseline score file,
# then show the aggregate metrics
baseline_result = score_dataset(
    dataset=dataset_path_eval_answer_baseline,
    output_path=dataset_path_eval_answer_score_baseline,
)

display(JSON(baseline_result["metrics"]))

In [None]:
# Check the results using Azure AI Studio UI
# Point at AI Studio when a studio URL was returned, otherwise at the local address
print(
    f"Results uploaded to AI Studio {baseline_result['studio_url']}"
    if baseline_result["studio_url"]
    else "Results available at http://127.0.0.1:23333"
)

#### Finetuned model evaluation metrics

In [None]:
# Preview the first two rows of the finetuned answers file that is scored next
pd.read_json(dataset_path_eval_answer_finetuned, lines=True).head(2)

In [None]:
from IPython.display import display, JSON

# Score the finetuned answers, write the per-row scores to the finetuned score file,
# then show the aggregate metrics
finetune_result = score_dataset(
    dataset=dataset_path_eval_answer_finetuned,
    output_path=dataset_path_eval_answer_score_finetuned,
)

display(JSON(finetune_result["metrics"]))


Finally, let's check the results produced by the evaluate API.

In [None]:
# Check the results using Azure AI Studio UI
# Point at AI Studio when a studio URL was returned, otherwise at the local address
print(
    f"Results uploaded to AI Studio {finetune_result['studio_url']}"
    if finetune_result["studio_url"]
    else "Results available at http://127.0.0.1:23333"
)

## Let's look at examples

In [None]:
# Re-read both formatted answer files and pair their rows by question; columns present in
# both frames get a "_baseline" / "_finetuned" suffix
df_baseline = pd.read_json(dataset_path_eval_answer_baseline, lines=True)
df_finetuned = pd.read_json(dataset_path_eval_answer_finetuned, lines=True)
df_merged = df_baseline.merge(
    df_finetuned,
    on="question",
    suffixes=("_baseline", "_finetuned"),
)

## Compare the metrics of the fine-tuned model against the baseline

In [None]:
# One row per metric and one column per model, plus the finetuned / baseline ratio
# (a ratio above 1 means the finetuned model scored higher on that metric)
metrics = pd.DataFrame.from_dict(
    {
        "baseline": baseline_result["metrics"],
        "finetuned": finetune_result["metrics"],
    }
).assign(improvement=lambda scores: scores["finetuned"] / scores["baseline"])
metrics

In [None]:
# Bar chart of the raw scores only; the ratio column is left out of the plot
metrics.drop(columns="improvement").plot.bar(rot=0)