diff --git a/sdk/evaluation/azure-ai-evaluation/assets.json b/sdk/evaluation/azure-ai-evaluation/assets.json
index 66bedf48ccec..738a7ad86c00 100644
--- a/sdk/evaluation/azure-ai-evaluation/assets.json
+++ b/sdk/evaluation/azure-ai-evaluation/assets.json
@@ -2,5 +2,5 @@
   "AssetsRepo": "Azure/azure-sdk-assets",
   "AssetsRepoPrefixPath": "python",
   "TagPrefix": "python/evaluation/azure-ai-evaluation",
-  "Tag": "python/evaluation/azure-ai-evaluation_a63b4a27cf"
+  "Tag": "python/evaluation/azure-ai-evaluation_d414254496"
 }
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_similarity/_similarity.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_similarity/_similarity.py
index a93d24ece854..7ad067db627e 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_similarity/_similarity.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_similarity/_similarity.py
@@ -80,7 +80,7 @@ async def __call__(self, *, query: str, response: str, ground_truth: str, **kwar
 
 class SimilarityEvaluator:
     """
-    Evaluates similarity score for a given query, response, and ground truth or a multi-turn conversation.
+    Evaluates similarity score for a given query, response, and ground truth.
 
     The similarity measure evaluates the likeness between a ground truth sentence (or document) and the
     AI model's generated prediction. This calculation involves creating sentence-level embeddings for both
diff --git a/sdk/evaluation/azure-ai-evaluation/tests/conftest.py b/sdk/evaluation/azure-ai-evaluation/tests/conftest.py
index 50ae75cfa456..da609a0bb333 100644
--- a/sdk/evaluation/azure-ai-evaluation/tests/conftest.py
+++ b/sdk/evaluation/azure-ai-evaluation/tests/conftest.py
@@ -1,6 +1,7 @@
 from .__openai_patcher import TestProxyConfig, TestProxyHttpxClientBase  # isort: split
 from . import __pf_service_isolation  # isort: split  # noqa: F401
 
+import os
 import json
 import multiprocessing
 import time
diff --git a/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_mass_evaluate.py b/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_mass_evaluate.py
new file mode 100644
index 000000000000..c8e8b1c4c17d
--- /dev/null
+++ b/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_mass_evaluate.py
@@ -0,0 +1,372 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+import os
+import pathlib
+import pandas as pd
+import pytest
+
+
+from azure.ai.evaluation import (
+    F1ScoreEvaluator,
+    # GleuScoreEvaluator,
+    BleuScoreEvaluator,
+    RougeScoreEvaluator,
+    MeteorScoreEvaluator,
+    CoherenceEvaluator,
+    FluencyEvaluator,
+    RelevanceEvaluator,
+    # SimilarityEvaluator,
+    GroundednessEvaluator,
+    # QAEvaluator,
+    ContentSafetyEvaluator,
+    GroundednessProEvaluator,
+    ProtectedMaterialEvaluator,
+    IndirectAttackEvaluator,
+    RetrievalEvaluator,
+    # ContentSafetyMultimodalEvaluator,
+    ProtectedMaterialMultimodalEvaluator,
+    RougeType,
+    evaluate,
+)
+from azure.ai.evaluation._evaluators._eci._eci import ECIEvaluator
+
+
+@pytest.fixture
+def data_file():
+    data_path = os.path.join(pathlib.Path(__file__).parent.resolve(), "data")
+    return os.path.join(data_path, "evaluate_test_data.jsonl")
+
+
+@pytest.fixture
+def data_convo_file():
+    data_path = os.path.join(pathlib.Path(__file__).parent.resolve(), "data")
+    return os.path.join(data_path, "evaluate_test_data_conversation.jsonl")
+
+
+# Rather than resorting to package manipulation commands or the third-party
+# lazy_fixture decorator, this selector fixture serves as a stand-in for
+# 'parameterized' fixtures.
+@pytest.fixture
+def multimodal_input_selector():
+    def selector(selection: str):
+        if selection == "imageurls":
+            data_path = os.path.join(pathlib.Path(__file__).parent.resolve(), "data")
+            return os.path.join(data_path, "dataset_messages_image_urls.jsonl")
+        if selection == "imageurls_with_target":
+            data_path = os.path.join(pathlib.Path(__file__).parent.resolve(), "data")
+            return os.path.join(data_path, "dataset_messages_image_urls_target.jsonl")
+        if selection == "b64_images":
+            data_path = os.path.join(pathlib.Path(__file__).parent.resolve(), "data")
+            return os.path.join(data_path, "dataset_messages_b64_images.jsonl")
+
+    return selector
+
+
+@pytest.mark.usefixtures("recording_injection", "recorded_test")
+class TestMassEvaluate:
+    """
+    Test file for exercising evaluators within the actual `evaluate` wrapper function. Tests are run
+    in large groups to speed up the testing process via parallelism. There are 3 groupings of tests:
+    - Singleton inputs: named inputs are sent directly to evaluators (ex: query, response).
+    - Conversation inputs: a conversation is supplied and the relevant inputs are extracted from it.
+    - Multi-modal inputs: parameterized over the different types of multi-modal inputs.
+    """
+
+    def test_evaluate_singleton_inputs(self, model_config, azure_cred, project_scope, data_file):
+        # qa and similarity are disabled because URL sanitization problems make them playback-unfriendly.
+        # gleu is disabled due to being unfriendly to CI playback for some reason.
+        # content safety was disabled temporarily to test CI PF teardown race condition (re-enabled below)
+        evaluators = {
+            "f1_score": F1ScoreEvaluator(),
+            # "gleu": GleuScoreEvaluator(),
+            "bleu": BleuScoreEvaluator(),
+            "rouge": RougeScoreEvaluator(RougeType.ROUGE_L),
+            "meteor": MeteorScoreEvaluator(),
+            "grounded": GroundednessEvaluator(model_config),
+            "coherence": CoherenceEvaluator(model_config),
+            "fluency": FluencyEvaluator(model_config),
+            "relevance": RelevanceEvaluator(model_config),
+            # "similarity": SimilarityEvaluator(model_config),
+            # "qa" : QAEvaluator(model_config),
+            "grounded_pro": GroundednessProEvaluator(azure_cred, project_scope),
+            "protected_material": ProtectedMaterialEvaluator(azure_cred, project_scope),
+            "indirect_attack": IndirectAttackEvaluator(azure_cred, project_scope),
+            "eci": ECIEvaluator(azure_cred, project_scope),
+            "content_safety": ContentSafetyEvaluator(azure_cred, project_scope),
+        }
+
+        # run the evaluation
+        result = evaluate(
+            data=data_file,
+            evaluators=evaluators,
+        )
+
+        row_result_df = pd.DataFrame(result["rows"])
+        metrics = result["metrics"]
+
+        assert len(row_result_df.keys()) == 45  # 63 with gleu, qa/similarity
+        assert len(row_result_df["inputs.query"]) == 3
+        assert len(row_result_df["inputs.context"]) == 3
+        assert len(row_result_df["inputs.response"]) == 3
+        assert len(row_result_df["inputs.ground_truth"]) == 3
+        assert len(row_result_df["outputs.f1_score.f1_score"]) == 3
+        # assert len(row_result_df["outputs.gleu.gleu_score"]) == 3
+        assert len(row_result_df["outputs.bleu.bleu_score"]) == 3
+        assert len(row_result_df["outputs.rouge.rouge_precision"]) == 3
+        assert len(row_result_df["outputs.rouge.rouge_recall"]) == 3
+        assert len(row_result_df["outputs.rouge.rouge_f1_score"]) == 3
+        assert len(row_result_df["outputs.meteor.meteor_score"]) == 3
+        assert len(row_result_df["outputs.grounded.groundedness"]) == 3
+        assert len(row_result_df["outputs.grounded.gpt_groundedness"]) == 3
+        assert len(row_result_df["outputs.grounded.groundedness_reason"]) == 3
+        assert len(row_result_df["outputs.coherence.coherence"]) == 3
+        assert len(row_result_df["outputs.coherence.gpt_coherence"]) == 3
+        assert len(row_result_df["outputs.coherence.coherence_reason"]) == 3
+        assert len(row_result_df["outputs.fluency.fluency"]) == 3
+        assert len(row_result_df["outputs.fluency.gpt_fluency"]) == 3
+        assert len(row_result_df["outputs.fluency.fluency_reason"]) == 3
+        assert len(row_result_df["outputs.relevance.relevance"]) == 3
+        assert len(row_result_df["outputs.relevance.gpt_relevance"]) == 3
+        assert len(row_result_df["outputs.relevance.relevance_reason"]) == 3
+        # assert len(row_result_df['outputs.similarity.similarity']) == 3
+        # assert len(row_result_df['outputs.similarity.gpt_similarity']) == 3
+        # assert len(row_result_df['outputs.qa.f1_score']) == 3
+        # assert len(row_result_df['outputs.qa.groundedness']) == 3
+        # assert len(row_result_df['outputs.qa.gpt_groundedness']) == 3
+        # assert len(row_result_df['outputs.qa.groundedness_reason']) == 3
+        # assert len(row_result_df['outputs.qa.coherence']) == 3
+        # assert len(row_result_df['outputs.qa.gpt_coherence']) == 3
+        # assert len(row_result_df['outputs.qa.coherence_reason']) == 3
+        # assert len(row_result_df['outputs.qa.fluency']) == 3
+        # assert len(row_result_df['outputs.qa.gpt_fluency']) == 3
+        # assert len(row_result_df['outputs.qa.fluency_reason']) == 3
+        # assert len(row_result_df['outputs.qa.relevance']) == 3
+        # assert len(row_result_df['outputs.qa.gpt_relevance']) == 3
+        # assert len(row_result_df['outputs.qa.relevance_reason']) == 3
+        # assert len(row_result_df['outputs.qa.similarity']) == 3
+        # assert len(row_result_df['outputs.qa.gpt_similarity']) == 3
+        assert len(row_result_df["outputs.grounded_pro.groundedness_pro_label"]) == 3
+        assert len(row_result_df["outputs.grounded_pro.groundedness_pro_reason"]) == 3
+        assert len(row_result_df["outputs.protected_material.protected_material_label"]) == 3
+        assert len(row_result_df["outputs.protected_material.protected_material_reason"]) == 3
+        assert len(row_result_df["outputs.indirect_attack.xpia_label"]) == 3
+        assert len(row_result_df["outputs.indirect_attack.xpia_reason"]) == 3
+        assert len(row_result_df["outputs.indirect_attack.xpia_manipulated_content"]) == 3
+        assert len(row_result_df["outputs.indirect_attack.xpia_intrusion"]) == 3
+        assert len(row_result_df["outputs.indirect_attack.xpia_information_gathering"]) == 3
+        assert len(row_result_df["outputs.eci.eci_label"]) == 3
+        assert len(row_result_df["outputs.eci.eci_reason"]) == 3
+        assert len(row_result_df["outputs.content_safety.sexual"]) == 3
+        assert len(row_result_df["outputs.content_safety.sexual_score"]) == 3
+        assert len(row_result_df["outputs.content_safety.sexual_reason"]) == 3
+        assert len(row_result_df["outputs.content_safety.self_harm"]) == 3
+        assert len(row_result_df["outputs.content_safety.self_harm_score"]) == 3
+        assert len(row_result_df["outputs.content_safety.self_harm_reason"]) == 3
+        assert len(row_result_df["outputs.content_safety.hate_unfairness"]) == 3
+        assert len(row_result_df["outputs.content_safety.hate_unfairness_score"]) == 3
+        assert len(row_result_df["outputs.content_safety.hate_unfairness_reason"]) == 3
+        assert len(row_result_df["outputs.content_safety.violence"]) == 3
+        assert len(row_result_df["outputs.content_safety.violence_score"]) == 3
+        assert len(row_result_df["outputs.content_safety.violence_reason"]) == 3
+
+        assert len(metrics.keys()) == 25  # 39 with gleu, qa, similarity
+        assert metrics["f1_score.f1_score"] >= 0
+        # assert metrics["gleu.gleu_score"] >= 0
+        assert metrics["bleu.bleu_score"] >= 0
+        assert metrics["rouge.rouge_precision"] >= 0
+        assert metrics["rouge.rouge_recall"] >= 0
+        assert metrics["rouge.rouge_f1_score"] >= 0
+        assert metrics["meteor.meteor_score"] >= 0
+        assert metrics["grounded.groundedness"] >= 0
+        assert metrics["grounded.gpt_groundedness"] >= 0
+        assert metrics["coherence.coherence"] >= 0
+        assert metrics["coherence.gpt_coherence"] >= 0
+        assert metrics["fluency.fluency"] >= 0
+        assert metrics["fluency.gpt_fluency"] >= 0
+        assert metrics["relevance.relevance"] >= 0
+        assert metrics["relevance.gpt_relevance"] >= 0
+        # assert metrics['similarity.similarity'] >= 0
+        # assert metrics['similarity.gpt_similarity'] >= 0
+        assert metrics["indirect_attack.xpia_manipulated_content"] >= 0
+        assert metrics["indirect_attack.xpia_intrusion"] >= 0
+        assert metrics["indirect_attack.xpia_information_gathering"] >= 0
+        assert metrics["content_safety.sexual_defect_rate"] >= 0
+        assert metrics["content_safety.self_harm_defect_rate"] >= 0
+        assert metrics["content_safety.hate_unfairness_defect_rate"] >= 0
+        assert metrics["content_safety.violence_defect_rate"] >= 0
+        assert metrics["grounded_pro.groundedness_pro_passing_rate"] >= 0
+        assert metrics["protected_material.protected_material_defect_rate"] >= 0
+        assert metrics["indirect_attack.xpia_defect_rate"] >= 0
+        assert metrics["eci.eci_defect_rate"] >= 0
+        # assert metrics['qa.f1_score'] >= 0
+        # assert metrics['qa.groundedness'] >= 0
+        # assert metrics['qa.gpt_groundedness'] >= 0
+        # assert metrics['qa.coherence'] >= 0
+        # assert metrics['qa.gpt_coherence'] >= 0
+        # assert metrics['qa.fluency'] >= 0
+        # assert metrics['qa.gpt_fluency'] >= 0
+        # assert metrics['qa.relevance'] >= 0
+        # assert metrics['qa.gpt_relevance'] >= 0
+        # assert metrics['qa.similarity'] >= 0
+        # assert metrics['qa.gpt_similarity'] >= 0
+
+    def test_evaluate_conversation(self, model_config, data_convo_file, azure_cred, project_scope):
+        evaluators = {
+            "grounded": GroundednessEvaluator(model_config),
+            "coherence": CoherenceEvaluator(model_config),
+            "fluency": FluencyEvaluator(model_config),
+            "relevance": RelevanceEvaluator(model_config),
+            "grounded_pro": GroundednessProEvaluator(azure_cred, project_scope),
+            "protected_material": ProtectedMaterialEvaluator(azure_cred, project_scope),
+            "indirect_attack": IndirectAttackEvaluator(azure_cred, project_scope),
+            "eci": ECIEvaluator(azure_cred, project_scope),
+            "content_safety": ContentSafetyEvaluator(azure_cred, project_scope),
+            "retrieval": RetrievalEvaluator(model_config),
+        }
+
+        # run the evaluation
+        result = evaluate(
+            data=data_convo_file,
+            evaluators=evaluators,
+        )
+
+        row_result_df = pd.DataFrame(result["rows"])
+        metrics = result["metrics"]
+
+        assert len(row_result_df.keys()) == 32
+        assert len(row_result_df["inputs.conversation"]) == 2
+        assert len(row_result_df["outputs.grounded.groundedness"]) == 2
+        assert len(row_result_df["outputs.grounded.gpt_groundedness"]) == 2
+        assert len(row_result_df["outputs.grounded.evaluation_per_turn"]) == 2
+        assert len(row_result_df["outputs.coherence.coherence"]) == 2
+        assert len(row_result_df["outputs.coherence.gpt_coherence"]) == 2
+        assert len(row_result_df["outputs.coherence.evaluation_per_turn"]) == 2
+        assert len(row_result_df["outputs.fluency.fluency"]) == 2
+        assert len(row_result_df["outputs.fluency.gpt_fluency"]) == 2
+        assert len(row_result_df["outputs.fluency.evaluation_per_turn"]) == 2
+        assert len(row_result_df["outputs.relevance.relevance"]) == 2
+        assert len(row_result_df["outputs.relevance.gpt_relevance"]) == 2
+        assert len(row_result_df["outputs.relevance.evaluation_per_turn"]) == 2
+        assert len(row_result_df["outputs.grounded_pro.groundedness_pro_label"]) == 2
+        assert len(row_result_df["outputs.grounded_pro.evaluation_per_turn"]) == 2
+        assert len(row_result_df["outputs.protected_material.protected_material_label"]) == 2
+        assert len(row_result_df["outputs.protected_material.evaluation_per_turn"]) == 2
+        assert len(row_result_df["outputs.indirect_attack.xpia_label"]) == 2
+        assert len(row_result_df["outputs.indirect_attack.xpia_manipulated_content"]) == 2
+        assert len(row_result_df["outputs.indirect_attack.xpia_intrusion"]) == 2
+        assert len(row_result_df["outputs.indirect_attack.xpia_information_gathering"]) == 2
+        assert len(row_result_df["outputs.indirect_attack.evaluation_per_turn"]) == 2
+        assert len(row_result_df["outputs.eci.eci_label"]) == 2
+        assert len(row_result_df["outputs.eci.evaluation_per_turn"]) == 2
+        assert len(row_result_df["outputs.content_safety.sexual_score"]) == 2
+        assert len(row_result_df["outputs.content_safety.violence_score"]) == 2
+        assert len(row_result_df["outputs.content_safety.self_harm_score"]) == 2
+        assert len(row_result_df["outputs.content_safety.hate_unfairness_score"]) == 2
+        assert len(row_result_df["outputs.content_safety.evaluation_per_turn"]) == 2
+        assert len(row_result_df["outputs.retrieval.retrieval"]) == 2
+        assert len(row_result_df["outputs.retrieval.gpt_retrieval"]) == 2
+        assert len(row_result_df["outputs.retrieval.evaluation_per_turn"]) == 2
+
+        assert len(metrics.keys()) == 21
+        assert metrics["coherence.coherence"] >= 0
+        assert metrics["coherence.gpt_coherence"] >= 0
+        assert metrics["fluency.fluency"] >= 0
+        assert metrics["fluency.gpt_fluency"] >= 0
+        assert metrics["relevance.relevance"] >= 0
+        assert metrics["relevance.gpt_relevance"] >= 0
+        assert metrics["grounded.gpt_groundedness"] >= 0
+        assert metrics["grounded.groundedness"] >= 0
+        assert metrics["retrieval.retrieval"] >= 0
+        assert metrics["retrieval.gpt_retrieval"] >= 0
+        assert metrics["indirect_attack.xpia_manipulated_content"] >= 0
+        assert metrics["indirect_attack.xpia_intrusion"] >= 0
+        assert metrics["indirect_attack.xpia_information_gathering"] >= 0
+        assert metrics["content_safety.sexual_defect_rate"] >= 0
+        assert metrics["content_safety.violence_defect_rate"] >= 0
+        assert metrics["content_safety.hate_unfairness_defect_rate"] >= 0
+        assert metrics["content_safety.self_harm_defect_rate"] >= 0
+        assert metrics["grounded_pro.groundedness_pro_passing_rate"] >= 0
+        assert metrics["protected_material.protected_material_defect_rate"] >= 0
+        assert metrics["indirect_attack.xpia_defect_rate"] >= 0
+        assert metrics["eci.eci_defect_rate"] >= 0
+
+    # The 'imageurls_with_target' case is disabled due to being unstable in CI
+    @pytest.mark.parametrize(
+        "multi_modal_input_type",
+        [
+            "imageurls",
+            # "imageurls_with_target",
+            "b64_images",
+        ],
+    )
+    def test_evaluate_multimodal(self, multi_modal_input_type, multimodal_input_selector, azure_cred, project_scope):
+        # Content safety is removed due to being unstable in playback mode
+        evaluators = {
+            # "content_safety" : ContentSafetyMultimodalEvaluator(credential=azure_cred, azure_ai_project=project_scope),
+            "protected_material": ProtectedMaterialMultimodalEvaluator(
+                credential=azure_cred, azure_ai_project=project_scope
+            ),
+        }
+
+        evaluator_config = None  # use default normally
+        target = None
+        if multi_modal_input_type == "imageurls_with_target":
+            evaluator_config = {
+                # "content_safety": {"conversation": "${target.conversation}"},
+                "protected_material": {"conversation": "${target.conversation}"},
+            }
+            from .target_fn import target_multimodal_fn1
+
+            target = target_multimodal_fn1
+
+        # run the evaluation
+        result = evaluate(
+            data=multimodal_input_selector(multi_modal_input_type),
+            evaluators=evaluators,
+            evaluator_config=evaluator_config,
+            target=target,
+        )
+
+        row_result_df = pd.DataFrame(result["rows"])
+        metrics = result["metrics"]
+
+        # validate the results
+        assert result is not None
+        assert result["rows"] is not None
+
+        assert "outputs.protected_material.artwork_label" in row_result_df.columns.to_list()
+        assert "outputs.protected_material.artwork_reason" in row_result_df.columns.to_list()
+        assert "outputs.protected_material.fictional_characters_label" in row_result_df.columns.to_list()
+        assert "outputs.protected_material.fictional_characters_reason" in row_result_df.columns.to_list()
+        assert "outputs.protected_material.logos_and_brands_label" in row_result_df.columns.to_list()
+        assert "outputs.protected_material.logos_and_brands_reason" in row_result_df.columns.to_list()
+
+        # assert "outputs.content_safety.sexual" in row_result_df.columns.to_list()
+        # assert "outputs.content_safety.violence" in row_result_df.columns.to_list()
+        # assert "outputs.content_safety.self_harm" in row_result_df.columns.to_list()
+        # assert "outputs.content_safety.hate_unfairness" in row_result_df.columns.to_list()
+        # assert "outputs.content_safety.sexual_score" in row_result_df.columns.to_list()
+        # assert "outputs.content_safety.violence_score" in row_result_df.columns.to_list()
+        # assert "outputs.content_safety.self_harm_score" in row_result_df.columns.to_list()
+        # assert "outputs.content_safety.hate_unfairness_score" in row_result_df.columns.to_list()
+        # assert "outputs.content_safety.sexual_reason" in row_result_df.columns.to_list()
+        # assert "outputs.content_safety.violence_reason" in row_result_df.columns.to_list()
+        # assert "outputs.content_safety.self_harm_reason" in row_result_df.columns.to_list()
+        # assert "outputs.content_safety.hate_unfairness_reason" in row_result_df.columns.to_list()
+
+        # assert "content_safety.sexual_defect_rate" in metrics.keys()
+        # assert "content_safety.violence_defect_rate" in metrics.keys()
+        # assert "content_safety.self_harm_defect_rate" in metrics.keys()
+        # assert "content_safety.hate_unfairness_defect_rate" in metrics.keys()
+        assert "protected_material.fictional_characters_label" in metrics.keys()
+        assert "protected_material.logos_and_brands_label" in metrics.keys()
+        assert "protected_material.artwork_label" in metrics.keys()
+
+        # assert 0 <= metrics.get("content_safety.sexual_defect_rate") <= 1
+        # assert 0 <= metrics.get("content_safety.violence_defect_rate") <= 1
+        # assert 0 <= metrics.get("content_safety.self_harm_defect_rate") <= 1
+        # assert 0 <= metrics.get("content_safety.hate_unfairness_defect_rate") <= 1
+        assert 0 <= metrics.get("protected_material.fictional_characters_label") <= 1
+        assert 0 <= metrics.get("protected_material.logos_and_brands_label") <= 1
+        assert 0 <= metrics.get("protected_material.artwork_label") <= 1
diff --git a/sdk/evaluation/ci.yml b/sdk/evaluation/ci.yml
index a9a2435d2041..701f7d96c25e 100644
--- a/sdk/evaluation/ci.yml
+++ b/sdk/evaluation/ci.yml
@@ -29,6 +29,11 @@ extends:
     TestProxy: true
     # This custom matrix config should be dropped once:
    # * Once azure-ai-ml supports 3.13 (currently crashes due to type annotation)
+    MatrixConfigs:
+      - Name: ai_ci_matrix
+        Path: sdk/evaluation/platform-matrix.json
+        Selection: sparse
+        GenerateVMJobs: true
     MatrixFilters:
       - PythonVersion=^(?!3\.13)
     Artifacts:
diff --git a/sdk/evaluation/platform-matrix.json b/sdk/evaluation/platform-matrix.json
new file mode 100644
index 000000000000..938e1e07d52e
--- /dev/null
+++ b/sdk/evaluation/platform-matrix.json
@@ -0,0 +1,51 @@
+{
+  "displayNames": {
+    "--disablecov": "",
+    "false": "",
+    "true": ""
+  },
+  "matrix": {
+    "Agent": {
+      "macos-latest": { "OSVmImage": "env:MACVMIMAGE", "Pool": "env:MACPOOL" },
+      "ubuntu-20.04": { "OSVmImage": "env:LINUXVMIMAGE", "Pool": "env:LINUXPOOL" }
+    },
+    "PythonVersion": [ "3.8", "3.11", "3.10" ],
+    "CoverageArg": "--disablecov",
+    "TestSamples": "false"
+  },
+  "include": [
+    {
+      "CoverageConfig": {
+        "ubuntu2004_39_coverage": {
+          "OSVmImage": "env:LINUXVMIMAGE",
+          "Pool": "env:LINUXPOOL",
+          "PythonVersion": "3.9",
+          "CoverageArg": "",
+          "TestSamples": "false"
+        }
+      }
+    },
+    {
+      "CoverageConfig": {
+        "ubuntu2004_pypy39": {
+          "OSVmImage": "env:LINUXVMIMAGE",
+          "Pool": "env:LINUXPOOL",
+          "PythonVersion": "pypy3.9",
+          "CoverageArg": "",
+          "TestSamples": "false"
+        }
+      }
+    },
+    {
+      "Config": {
+        "Ubuntu2004_312": {
+          "OSVmImage": "env:LINUXVMIMAGE",
+          "Pool": "env:LINUXPOOL",
+          "PythonVersion": "3.12",
+          "CoverageArg": "--disablecov",
+          "TestSamples": "false"
+        }
+      }
+    }
+  ]
+}
\ No newline at end of file