From f999e50760877d80e777a316c4e3af945c5d05f4 Mon Sep 17 00:00:00 2001 From: Diondra Peck Date: Mon, 28 Oct 2024 13:47:04 -0400 Subject: [PATCH 01/12] Update evaluate to allow optional params + add tests --- .../ai/evaluation/_evaluate/_evaluate.py | 4 +- .../tests/e2etests/test_evaluate.py | 47 +++++++++++++++++-- 2 files changed, 47 insertions(+), 4 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py index 689ce162122a..011a83759b45 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py @@ -287,7 +287,9 @@ def _validate_columns_for_evaluators( # Ignore the missing fields if "conversation" presents in the input data missing_inputs = [] else: - missing_inputs = [col for col in evaluator_params if col not in new_df.columns] + optional_params = evaluator._OPTIONAL_PARAMS if hasattr(evaluator, "_OPTIONAL_PARAMS") else [] + excluded_params = set(new_df.columns).union(optional_params) + missing_inputs = [col for col in evaluator_params if col not in excluded_params] # If "conversation" is the only parameter and it is missing, keep it in the missing inputs # Otherwise, remove it from the missing inputs diff --git a/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_evaluate.py b/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_evaluate.py index 948660387773..9a4e3116f380 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_evaluate.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_evaluate.py @@ -26,6 +26,12 @@ def data_file(): return os.path.join(data_path, "evaluate_test_data.jsonl") +@pytest.fixture +def data_file_no_query(): + data_path = os.path.join(pathlib.Path(__file__).parent.resolve(), "data") + return os.path.join(data_path, "evaluate_test_data_no_query.jsonl") + + @pytest.fixture def data_convo_file(): data_path = os.path.join(pathlib.Path(__file__).parent.resolve(), "data") @@ -98,8 +104,7 @@ def _get_run_from_run_history(flow_run_id, ml_client, project_scope): @pytest.mark.usefixtures("recording_injection", "recorded_test") @pytest.mark.localtest class TestEvaluate: - @pytest.mark.skip(reason="Temporary skip to merge 37201, will re-enable in subsequent pr") - def test_evaluate_with_groundedness_evaluator(self, model_config, data_file): + def test_evaluate_with_groundedness_evaluator_with_query(self, model_config, data_file): # data input_data = pd.read_json(data_file, lines=True) @@ -131,7 +136,43 @@ def test_evaluate_with_groundedness_evaluator(self, model_config, data_file): ) assert metrics.get("f1_score.f1_score") == list_mean_nan_safe(row_result_df["outputs.f1_score.f1_score"]) - assert row_result_df["outputs.grounded.groundedness"][2] in [4, 5] + assert row_result_df["outputs.grounded.groundedness"][2] in [3, 4, 5] + assert row_result_df["outputs.f1_score.f1_score"][2] == 1 + assert result["studio_url"] is None + + def test_evaluate_with_groundedness_evaluator_without_query(self, model_config, data_file_no_query): + # data + input_data = pd.read_json(data_file_no_query, lines=True) + + groundedness_eval = GroundednessEvaluator(model_config) + f1_score_eval = F1ScoreEvaluator() + + # run the evaluation + result = evaluate( + data=data_file_no_query, + evaluators={"grounded": groundedness_eval, "f1_score": f1_score_eval}, + ) + + row_result_df = pd.DataFrame(result["rows"]) + metrics = 
result["metrics"] + + # validate the results + assert result is not None + assert result["rows"] is not None + assert row_result_df.shape[0] == len(input_data) + + assert "outputs.grounded.groundedness" in row_result_df.columns.to_list() + assert "outputs.f1_score.f1_score" in row_result_df.columns.to_list() + + assert "grounded.groundedness" in metrics.keys() + assert "f1_score.f1_score" in metrics.keys() + + assert metrics.get("grounded.groundedness") == list_mean_nan_safe( + row_result_df["outputs.grounded.groundedness"] + ) + assert metrics.get("f1_score.f1_score") == list_mean_nan_safe(row_result_df["outputs.f1_score.f1_score"]) + + assert row_result_df["outputs.grounded.groundedness"][2] in [3, 4, 5] assert row_result_df["outputs.f1_score.f1_score"][2] == 1 assert result["studio_url"] is None From aa3118d0e0185c4e2be0aa31e5a8d1dfbb711257 Mon Sep 17 00:00:00 2001 From: Diondra Peck Date: Mon, 28 Oct 2024 13:49:26 -0400 Subject: [PATCH 02/12] Record tests --- sdk/evaluation/azure-ai-evaluation/assets.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/evaluation/azure-ai-evaluation/assets.json b/sdk/evaluation/azure-ai-evaluation/assets.json index 7144de427f88..99fe29491313 100644 --- a/sdk/evaluation/azure-ai-evaluation/assets.json +++ b/sdk/evaluation/azure-ai-evaluation/assets.json @@ -2,5 +2,5 @@ "AssetsRepo": "Azure/azure-sdk-assets", "AssetsRepoPrefixPath": "python", "TagPrefix": "python/evaluation/azure-ai-evaluation", - "Tag": "python/evaluation/azure-ai-evaluation_f0444ef220" + "Tag": "python/evaluation/azure-ai-evaluation_2dcae6ba0e" } From 4aca60175c5223b74612af4c59a9a8d5abf361ae Mon Sep 17 00:00:00 2001 From: Diondra Peck Date: Fri, 1 Nov 2024 16:15:45 -0400 Subject: [PATCH 03/12] Exclude optional params from 'missing inputs' warning --- .../azure/ai/evaluation/_evaluate/_evaluate.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py index 448979c71cee..37d443647c89 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py @@ -287,7 +287,9 @@ def _validate_columns_for_evaluators( # Ignore the missing fields if "conversation" presents in the input data missing_inputs = [] else: - optional_params = evaluator._OPTIONAL_PARAMS if hasattr(evaluator, "_OPTIONAL_PARAMS") else [] + optional_params = ( + evaluator._OPTIONAL_PARAMS if hasattr(evaluator, "_OPTIONAL_PARAMS") else [] # pylint: disable=protected-access + ) excluded_params = set(new_df.columns).union(optional_params) missing_inputs = [col for col in evaluator_params if col not in excluded_params] From 15efb4db5c8d0258549e0a32e574b0c6473d2756 Mon Sep 17 00:00:00 2001 From: Diondra Peck Date: Fri, 1 Nov 2024 16:52:53 -0400 Subject: [PATCH 04/12] Add tests --- .../azure-ai-evaluation/tests/conftest.py | 9 ++ .../tests/e2etests/test_evaluate.py | 133 +++++++++++++----- 2 files changed, 104 insertions(+), 38 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/tests/conftest.py b/sdk/evaluation/azure-ai-evaluation/tests/conftest.py index 4c27d1cf0c90..f4dfe7ee6a38 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/conftest.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/conftest.py @@ -142,11 +142,20 @@ def live_connection_file_values(): add_general_regex_sanitizer(regex=project_scope["project_name"], 
value=SanitizedValues.WORKSPACE_NAME) add_general_regex_sanitizer(regex=model_config["azure_endpoint"], value=mock_model_config["azure_endpoint"]) + def promptflow_root_run_id_sanitizer(): + """Sanitize the promptflow service isolation values.""" + add_general_regex_sanitizer( + value="root_run_id", + regex=r'"root_run_id": "azure_ai_evaluation_evaluators_common_base_eval_asyncevaluatorbase_[^"]+"', + replacement='"root_run_id": "azure_ai_evaluation_evaluators_common_base_eval_asyncevaluatorbase_SANITIZED"' + ) + azure_workspace_triad_sanitizer() azureopenai_connection_sanitizer() openai_stainless_default_headers() azure_ai_generative_sanitizer() live_connection_file_values() + promptflow_root_run_id_sanitizer() @pytest.fixture diff --git a/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_evaluate.py b/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_evaluate.py index d2160a21d5db..bd3e76ef266b 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_evaluate.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_evaluate.py @@ -119,7 +119,8 @@ def _get_run_from_run_history(flow_run_id, ml_client, project_scope): @pytest.mark.usefixtures("recording_injection", "recorded_test") @pytest.mark.localtest class TestEvaluate: - def test_evaluate_with_groundedness_evaluator_with_query(self, model_config, data_file): + @pytest.mark.skip(reason="Temporary skip to merge 37201, will re-enable in subsequent pr") + def test_evaluate_with_groundedness_evaluator(self, model_config, data_file): # data input_data = pd.read_json(data_file, lines=True) @@ -151,43 +152,7 @@ def test_evaluate_with_groundedness_evaluator_with_query(self, model_config, dat ) assert metrics.get("f1_score.f1_score") == list_mean_nan_safe(row_result_df["outputs.f1_score.f1_score"]) - assert row_result_df["outputs.grounded.groundedness"][2] in [3, 4, 5] - assert row_result_df["outputs.f1_score.f1_score"][2] == 1 - assert result["studio_url"] is None - - def test_evaluate_with_groundedness_evaluator_without_query(self, model_config, data_file_no_query): - # data - input_data = pd.read_json(data_file_no_query, lines=True) - - groundedness_eval = GroundednessEvaluator(model_config) - f1_score_eval = F1ScoreEvaluator() - - # run the evaluation - result = evaluate( - data=data_file_no_query, - evaluators={"grounded": groundedness_eval, "f1_score": f1_score_eval}, - ) - - row_result_df = pd.DataFrame(result["rows"]) - metrics = result["metrics"] - - # validate the results - assert result is not None - assert result["rows"] is not None - assert row_result_df.shape[0] == len(input_data) - - assert "outputs.grounded.groundedness" in row_result_df.columns.to_list() - assert "outputs.f1_score.f1_score" in row_result_df.columns.to_list() - - assert "grounded.groundedness" in metrics.keys() - assert "f1_score.f1_score" in metrics.keys() - - assert metrics.get("grounded.groundedness") == list_mean_nan_safe( - row_result_df["outputs.grounded.groundedness"] - ) - assert metrics.get("f1_score.f1_score") == list_mean_nan_safe(row_result_df["outputs.f1_score.f1_score"]) - - assert row_result_df["outputs.grounded.groundedness"][2] in [3, 4, 5] + assert row_result_df["outputs.grounded.groundedness"][2] in [4, 5] assert row_result_df["outputs.f1_score.f1_score"][2] == 1 assert result["studio_url"] is None @@ -766,3 +731,95 @@ def test_evaluate_aggregation(self, data_file, return_json, aggregate_return_jso @pytest.mark.skip(reason="TODO: Add test back") def test_prompty_with_threadpool_implementation(self): pass + + def 
test_evaluate_with_groundedness_evaluator_with_query(self, model_config, data_file): + # data + input_data = pd.read_json(data_file, lines=True) + + groundedness_eval = GroundednessEvaluator(model_config) + + # run the evaluation + result = evaluate( + data=data_file, + evaluators={"grounded": groundedness_eval}, + ) + + row_result_df = pd.DataFrame(result["rows"]) + metrics = result["metrics"] + + print(result["rows"]) + print(metrics) + + + # validate the results + assert result is not None + assert result["rows"] is not None + assert row_result_df.shape[0] == len(input_data) + assert "outputs.grounded.groundedness" in row_result_df.columns.to_list() + assert "grounded.groundedness" in metrics.keys() + assert metrics.get("grounded.groundedness") == list_mean_nan_safe( + row_result_df["outputs.grounded.groundedness"] + ) + assert row_result_df["outputs.grounded.groundedness"][2] in [3, 4, 5] + assert result["studio_url"] is None + + def test_evaluate_with_groundedness_evaluator_without_query(self, model_config, data_file_no_query): + # data + input_data = pd.read_json(data_file_no_query, lines=True) + + groundedness_eval = GroundednessEvaluator(model_config) + + # run the evaluation + result = evaluate( + data=data_file_no_query, + evaluators={"grounded": groundedness_eval}, + ) + + row_result_df = pd.DataFrame(result["rows"]) + metrics = result["metrics"] + + # validate the results + assert result is not None + assert result["rows"] is not None + assert row_result_df.shape[0] == len(input_data) + + assert "outputs.grounded.groundedness" in row_result_df.columns.to_list() + + assert "grounded.groundedness" in metrics.keys() + + assert metrics.get("grounded.groundedness") == list_mean_nan_safe( + row_result_df["outputs.grounded.groundedness"] + ) + + assert row_result_df["outputs.grounded.groundedness"][2] in [3, 4, 5] + assert result["studio_url"] is None + + def test_evaluate_with_groundedness_evaluator_with_convo(self, model_config, data_convo_file): + # data + input_data = pd.read_json(data_convo_file, lines=True) + + groundedness_eval = GroundednessEvaluator(model_config) + + # run the evaluation + result = evaluate( + data=data_convo_file, + evaluators={"grounded": groundedness_eval}, + ) + + row_result_df = pd.DataFrame(result["rows"]) + metrics = result["metrics"] + + # validate the results + assert result is not None + assert result["rows"] is not None + assert row_result_df.shape[0] == len(input_data) + + assert "outputs.grounded.groundedness" in row_result_df.columns.to_list() + + assert "grounded.groundedness" in metrics.keys() + + assert metrics.get("grounded.groundedness") == list_mean_nan_safe( + row_result_df["outputs.grounded.groundedness"] + ) + assert row_result_df["outputs.grounded.groundedness"][1] in [3, 4, 5] + assert result["studio_url"] is None From 3ee2f9c01e30d07f63fb41680aaa76f30561f3f7 Mon Sep 17 00:00:00 2001 From: Diondra Peck Date: Fri, 1 Nov 2024 17:00:13 -0400 Subject: [PATCH 05/12] Record tests --- sdk/evaluation/azure-ai-evaluation/assets.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/evaluation/azure-ai-evaluation/assets.json b/sdk/evaluation/azure-ai-evaluation/assets.json index 356e246839ab..1ea0a49b66cd 100644 --- a/sdk/evaluation/azure-ai-evaluation/assets.json +++ b/sdk/evaluation/azure-ai-evaluation/assets.json @@ -2,5 +2,5 @@ "AssetsRepo": "Azure/azure-sdk-assets", "AssetsRepoPrefixPath": "python", "TagPrefix": "python/evaluation/azure-ai-evaluation", - "Tag": "python/evaluation/azure-ai-evaluation_043418c052" + "Tag": 
"python/evaluation/azure-ai-evaluation_5ec60aae8a" } From 3e7f25a7a889138de67ba61cefd4d12f7c7251a4 Mon Sep 17 00:00:00 2001 From: Diondra Peck Date: Fri, 1 Nov 2024 17:34:14 -0400 Subject: [PATCH 06/12] Fix linting errors --- .../azure/ai/evaluation/_evaluate/_evaluate.py | 4 +++- sdk/evaluation/azure-ai-evaluation/tests/conftest.py | 2 +- .../azure-ai-evaluation/tests/e2etests/test_evaluate.py | 4 ---- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py index 37d443647c89..f879def1501a 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py @@ -288,7 +288,9 @@ def _validate_columns_for_evaluators( missing_inputs = [] else: optional_params = ( - evaluator._OPTIONAL_PARAMS if hasattr(evaluator, "_OPTIONAL_PARAMS") else [] # pylint: disable=protected-access + evaluator._OPTIONAL_PARAMS # pylint: disable=protected-access + if hasattr(evaluator, "_OPTIONAL_PARAMS") + else [] ) excluded_params = set(new_df.columns).union(optional_params) missing_inputs = [col for col in evaluator_params if col not in excluded_params] diff --git a/sdk/evaluation/azure-ai-evaluation/tests/conftest.py b/sdk/evaluation/azure-ai-evaluation/tests/conftest.py index f4dfe7ee6a38..9302f64c161c 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/conftest.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/conftest.py @@ -147,7 +147,7 @@ def promptflow_root_run_id_sanitizer(): add_general_regex_sanitizer( value="root_run_id", regex=r'"root_run_id": "azure_ai_evaluation_evaluators_common_base_eval_asyncevaluatorbase_[^"]+"', - replacement='"root_run_id": "azure_ai_evaluation_evaluators_common_base_eval_asyncevaluatorbase_SANITIZED"' + replacement='"root_run_id": "azure_ai_evaluation_evaluators_common_base_eval_asyncevaluatorbase_SANITIZED"', ) azure_workspace_triad_sanitizer() diff --git a/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_evaluate.py b/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_evaluate.py index bd3e76ef266b..b69c043e27c1 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_evaluate.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_evaluate.py @@ -747,10 +747,6 @@ def test_evaluate_with_groundedness_evaluator_with_query(self, model_config, dat row_result_df = pd.DataFrame(result["rows"]) metrics = result["metrics"] - print(result["rows"]) - print(metrics) - - # validate the results assert result is not None assert result["rows"] is not None From 9f75ed28da05ffc4a1817197ec46173bb3adc756 Mon Sep 17 00:00:00 2001 From: Diondra Peck Date: Fri, 1 Nov 2024 18:07:12 -0400 Subject: [PATCH 07/12] Merge from main and fix linting error --- .../azure/ai/evaluation/simulator/_direct_attack_simulator.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_direct_attack_simulator.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_direct_attack_simulator.py index 7278ff3be2de..9da9d930acc4 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_direct_attack_simulator.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_direct_attack_simulator.py @@ -179,9 +179,7 @@ async def __call__( if not randomization_seed: randomization_seed = randint(0, 
1000000) - regular_sim = AdversarialSimulator( - azure_ai_project=self.azure_ai_project, credential=self.credential - ) + regular_sim = AdversarialSimulator(azure_ai_project=self.azure_ai_project, credential=self.credential) regular_sim_results = await regular_sim( scenario=scenario, target=target, From 418c34bca848b0ab4d5f1eedfe96f78cbe5c4ead Mon Sep 17 00:00:00 2001 From: Diondra Peck Date: Fri, 1 Nov 2024 19:09:09 -0400 Subject: [PATCH 08/12] Add unit test for missing output --- .../tests/unittests/test_evaluate.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py index 9e26bf9a992b..24491fec5bfd 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py @@ -648,3 +648,13 @@ def test_optional_inputs_with_target(self, questions_file, questions_answers_bas ) # type: ignore assert double_override_results["rows"][0]["outputs.echo.echo_query"] == "new query" assert double_override_results["rows"][0]["outputs.echo.echo_response"] == "new response" + + def test_missing_inputs(self, questions_file): + """Test we are raising exception if required input is missing in data.""" + with pytest.raises(EvaluationException) as cm: + evaluate( + data=questions_file, + target=_target_fn, + evaluators={"f1": F1ScoreEvaluator()}, + ) + assert "Some evaluators are missing required inputs:\n- f1: ['ground_truth']\n\n" in cm.value.args[0] \ No newline at end of file From 5cef42d4bb4b24533e8e367ddeec5cb5eb3b1517 Mon Sep 17 00:00:00 2001 From: Diondra Peck Date: Fri, 1 Nov 2024 19:27:18 -0400 Subject: [PATCH 09/12] Update convo test to assert per turn results --- .../azure-ai-evaluation/tests/e2etests/test_evaluate.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_evaluate.py b/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_evaluate.py index b69c043e27c1..625cf021e9d3 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_evaluate.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_evaluate.py @@ -811,11 +811,12 @@ def test_evaluate_with_groundedness_evaluator_with_convo(self, model_config, dat assert row_result_df.shape[0] == len(input_data) assert "outputs.grounded.groundedness" in row_result_df.columns.to_list() - + assert "outputs.grounded.evaluation_per_turn" in row_result_df.columns.to_list() assert "grounded.groundedness" in metrics.keys() - assert metrics.get("grounded.groundedness") == list_mean_nan_safe( row_result_df["outputs.grounded.groundedness"] ) assert row_result_df["outputs.grounded.groundedness"][1] in [3, 4, 5] + assert row_result_df["outputs.grounded.evaluation_per_turn"][0]["groundedness"][0] in [3.0, 4.0, 5.0] + assert row_result_df["outputs.grounded.evaluation_per_turn"][0]["groundedness_reason"][0] is not None assert result["studio_url"] is None From 59506508395218f4537f76218a0398d93969b871 Mon Sep 17 00:00:00 2001 From: Diondra Peck Date: Fri, 1 Nov 2024 19:28:01 -0400 Subject: [PATCH 10/12] Run linting --- .../azure-ai-evaluation/tests/unittests/test_evaluate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py index 24491fec5bfd..f8e65f92a3d7 100644 --- 
a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py @@ -657,4 +657,4 @@ def test_missing_inputs(self, questions_file): target=_target_fn, evaluators={"f1": F1ScoreEvaluator()}, ) - assert "Some evaluators are missing required inputs:\n- f1: ['ground_truth']\n\n" in cm.value.args[0] \ No newline at end of file + assert "Some evaluators are missing required inputs:\n- f1: ['ground_truth']\n\n" in cm.value.args[0] From 262443e49e5c30073bfee53469ece74edba8512b Mon Sep 17 00:00:00 2001 From: Diondra Peck Date: Fri, 1 Nov 2024 19:55:38 -0400 Subject: [PATCH 11/12] Add test data --- .../tests/e2etests/data/evaluate_test_data_no_query.jsonl | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 sdk/evaluation/azure-ai-evaluation/tests/e2etests/data/evaluate_test_data_no_query.jsonl diff --git a/sdk/evaluation/azure-ai-evaluation/tests/e2etests/data/evaluate_test_data_no_query.jsonl b/sdk/evaluation/azure-ai-evaluation/tests/e2etests/data/evaluate_test_data_no_query.jsonl new file mode 100644 index 000000000000..e807a941f640 --- /dev/null +++ b/sdk/evaluation/azure-ai-evaluation/tests/e2etests/data/evaluate_test_data_no_query.jsonl @@ -0,0 +1,3 @@ +{"context":"AML API only","response":"To create a run using the Azure Machine Learning API, you first need to create an Experiment. Once you have an experiment, you can create a Run object that is associated with that experiment. Here is some Python code that demonstrates this process:\n\n```\nfrom azureml.core import Experiment, Run\nfrom azureml.core.workspace import Workspace\n\n# Define workspace and experiment\nws = Workspace.from_config()\nexp = Experiment(workspace=ws, name='my_experiment')\n\n# Create a new run\nrun = exp.start_logging()\n```\n\nIn this code, the `from_config()` method reads the configuration file that you created when you set up your Azure Machine Learning workspace. The `Experiment` constructor creates an Experiment object that is associated with your workspace, and the `start_logging()` method creates a new Run object that is associated with the Experiment. Now you can use the `run` object to log metrics, upload files, and track other information related to your machine learning experiment.", "ground_truth":"Paris is the capital of France."} +{"context":"Logging can be done using any OSS Sdk","response":"There are a few ways to log models in Azure Machine Learning. \n\nOne way is to use the `register_model()` method of the `Run` object. The `register_model()` method logs a model file in the Azure Machine Learning service workspace and makes it available for deployment. Here's an example:\n\n```python\nfrom azureml.core import Model\n\nmodel_path = '.\/outputs\/my_model.pkl'\nmodel = Model.register(workspace=ws, model_path=model_path, model_name='my_model')\n```\n\nThis code registers the model file located at `model_path` to the Azure Machine Learning service workspace with the name `my_model`. \n\nAnother way to log a model is to save it as an output of a `Run`. If your model generation code is part of a script or Jupyter notebook that runs as an Azure Machine Learning experiment, you can save the model file as an output of the `Run` object. 
Here's an example:\n\n```python\nfrom sklearn.linear_model import LogisticRegression\nfrom azureml.core.run import Run\n\n# Initialize a run object\nrun = Run.get_context()\n\n# Train your model\nX_train, y_train = ...\nclf = LogisticRegression().fit(X_train, y_train)\n\n# Save the model to the Run object's outputs directory\nmodel_path = 'outputs\/model.pkl'\njoblib.dump(value=clf, filename=model_path)\n\n# Log the model as a run artifact\nrun.upload_file(name=model_path, path_or_stream=model_path)\n```\n\nIn this code, `Run.get_context()` retrieves the current run context object, which you can use to track metadata and metrics for the run. After training your model, you can use `joblib.dump()` to save the model to a file, and then log the file as an artifact of the run using `run.upload_file()`.","ground_truth":"Paris is the capital of France."} +{"context":"France is in Europe","response":"Paris is the capital of France.", "ground_truth":"Paris is the capital of France."} \ No newline at end of file From d39139599bfe28b9d923e9f2e204d61b0fd27f42 Mon Sep 17 00:00:00 2001 From: Diondra Peck Date: Fri, 1 Nov 2024 20:03:13 -0400 Subject: [PATCH 12/12] Re-record test --- sdk/evaluation/azure-ai-evaluation/assets.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/evaluation/azure-ai-evaluation/assets.json b/sdk/evaluation/azure-ai-evaluation/assets.json index 1ea0a49b66cd..1e52dc68cc08 100644 --- a/sdk/evaluation/azure-ai-evaluation/assets.json +++ b/sdk/evaluation/azure-ai-evaluation/assets.json @@ -2,5 +2,5 @@ "AssetsRepo": "Azure/azure-sdk-assets", "AssetsRepoPrefixPath": "python", "TagPrefix": "python/evaluation/azure-ai-evaluation", - "Tag": "python/evaluation/azure-ai-evaluation_5ec60aae8a" + "Tag": "python/evaluation/azure-ai-evaluation_f6269c684c" }
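
A minimal sketch of the optional-parameter behavior these patches introduce, assuming a hypothetical evaluator class and illustrative column names; it mirrors the excluded_params logic added to _validate_columns_for_evaluators in _evaluate.py rather than calling the SDK's internal validation path directly.

import inspect
from typing import Optional

import pandas as pd


class HypotheticalGroundednessEvaluator:
    # Parameters listed here are treated as optional: they are not reported as
    # "missing inputs" when the corresponding column is absent from the data.
    _OPTIONAL_PARAMS = ["query"]

    def __call__(self, *, response: str, context: str, query: Optional[str] = None) -> dict:
        # A real evaluator would score groundedness with a judge model; return a placeholder.
        return {"groundedness": 5.0}


def missing_inputs_for(evaluator, df: pd.DataFrame) -> list:
    # Mirrors the patched check: a required parameter is "missing" only if it is
    # neither a data column nor listed in the evaluator's _OPTIONAL_PARAMS.
    evaluator_params = [
        name for name in inspect.signature(evaluator).parameters if name != "kwargs"
    ]
    optional_params = getattr(evaluator, "_OPTIONAL_PARAMS", [])
    excluded_params = set(df.columns).union(optional_params)
    return [col for col in evaluator_params if col not in excluded_params]


# Rows shaped like evaluate_test_data_no_query.jsonl: context and response, but no query column.
df = pd.DataFrame(
    [{"context": "France is in Europe", "response": "Paris is the capital of France."}]
)
print(missing_inputs_for(HypotheticalGroundednessEvaluator(), df))  # -> []

With data like evaluate_test_data_no_query.jsonl (no "query" column), the check returns an empty list, so the optional parameter no longer triggers a missing-input warning, while a genuinely required column such as ground_truth for F1ScoreEvaluator still does.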