sdk/evaluation/azure-ai-evaluation/CHANGELOG.md (7 changes: 4 additions & 3 deletions)

@@ -8,13 +8,14 @@
- The `parallel` parameter has been removed from composite evaluators: `QAEvaluator`, `ContentSafetyChatEvaluator`, and `ContentSafetyMultimodalEvaluator`. To control evaluator parallelism, you can now use the `_parallel` keyword argument, though please note that this private parameter may change in the future.

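For illustration, a minimal sketch of the new private knob. Whether `_parallel` is passed at call time, and `QAEvaluator`'s exact call signature, are assumptions here, not confirmed by the changelog:

```python
from azure.ai.evaluation import QAEvaluator

# Assumed Azure OpenAI model configuration shape; fill in real values.
model_config = {
    "azure_endpoint": "https://<your-endpoint>.openai.azure.com",
    "api_key": "<your-api-key>",
    "azure_deployment": "<your-deployment>",
}

qa_evaluator = QAEvaluator(model_config)

# Assumption: _parallel is accepted as a call-time keyword, per the note
# above; it is private and may change in future releases.
result = qa_evaluator(
    query="What is the capital of France?",
    response="Paris.",
    ground_truth="Paris is the capital of France.",
    _parallel=False,
)
```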
### Bugs Fixed
- Fixed an issue where the `output_path` parameter in the `evaluate` API did not support relative paths.
- Outputs of adversarial simulators are now of type `JsonLineList`, and the helper function `to_eval_qr_json_lines` outputs context from both user and assistant turns, along with `category` if it exists in the conversation.
- Fixed an issue where, during long-running simulations, the API token expired and caused a "Forbidden" error. Users can now set the environment variable `AZURE_TOKEN_REFRESH_INTERVAL` to refresh the token more frequently, preventing expiration and ensuring continuous operation of the simulation (see the sketch after this list).

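For example, the refresh interval could be set before kicking off a long-running simulation. A minimal sketch, assuming the variable takes a number of seconds; the values shown are placeholders:

```python
import os

# Placeholder interval (assumed seconds): refresh the token well before expiry.
os.environ["AZURE_TOKEN_REFRESH_INTERVAL"] = "600"

# Similarly, the experimental-feature warning introduced under "Other Changes"
# below can be silenced; the accepted value here is an assumption.
os.environ["AI_EVALS_DISABLE_EXPERIMENTAL_WARNING"] = "true"
```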
### Other Changes
- Refined error messages for service-based evaluators and simulators.
- Introduced the environment variable `AI_EVALS_DISABLE_EXPERIMENTAL_WARNING` to disable the warning message for experimental features.
- Changed the randomization pattern for `AdversarialSimulator` so that the Adversarial harm categories (e.g. Hate + Unfairness, Self-Harm, Violence, Sex) are represented in almost equal numbers in the `AdversarialSimulator` outputs. Previously, for 200 `max_simulation_results` a user might see 140 results belonging to the 'Hate + Unfairness' category and 40 results belonging to the 'Self-Harm' category. Now, users will see 50 results for each of Hate + Unfairness, Self-Harm, Violence, and Sex.
- For the `DirectAttackSimulator`, the prompt templates used to generate simulated outputs for each Adversarial harm category will no longer be in a randomized order by default. To override this behavior, pass `randomize_order=True` when you call the `DirectAttackSimulator`, for example:
```python
adversarial_simulator = DirectAttackSimulator(azure_ai_project=azure_ai_project, credential=DefaultAzureCredential())
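
# Sketch: override the default by passing randomize_order=True at call time,
# as described above. `scenario` and `target_callback` are assumed to be
# defined elsewhere in the calling code.
outputs = asyncio.run(
    adversarial_simulator(
        scenario=scenario,
        target=target_callback,
        randomize_order=True,
    )
)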
```

@@ -83,7 +84,7 @@ outputs = asyncio.run(custom_simulator(
- `SimilarityEvaluator`
- `RetrievalEvaluator`
- The following evaluators will now have a new key in their result output that includes the LLM reasoning behind the score. The new key follows the pattern "<metric_name>_reason". The reasoning is the result of a more detailed prompt template being used to generate the LLM response. Note that this requires the maximum number of tokens used to run these evaluators to be increased, as shown in the table below (a usage sketch follows the table).

| Evaluator | New `max_token` for Generation |
| --- | --- |
| `CoherenceEvaluator` | 800 |
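As an illustrative sketch (not part of the changelog itself), the reasoning can be read from the evaluator's result dict. The configuration shape and the exact key names below are assumptions, following the "<metric_name>_reason" pattern described above:

```python
from azure.ai.evaluation import CoherenceEvaluator

# Assumed Azure OpenAI model configuration shape; fill in real values.
model_config = {
    "azure_endpoint": "https://<your-endpoint>.openai.azure.com",
    "api_key": "<your-api-key>",
    "azure_deployment": "<your-deployment>",
}

coherence = CoherenceEvaluator(model_config)
result = coherence(
    query="What is the capital of France?",
    response="Paris is the capital of France.",
)

# Assumed key names, following the "<metric_name>_reason" pattern:
print(result["coherence"])         # numeric score
print(result["coherence_reason"])  # the LLM's reasoning behind the score
```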
```diff
@@ -36,8 +36,12 @@ def __init__(self, client: Union[CodeClient, ProxyClient]) -> None:
         self.client = client
         self._is_batch_timeout_set_by_system = False
         self._is_otel_timeout_set_by_system = False
+        self._original_cwd = os.getcwd()
 
     def __enter__(self) -> None:
+        # Preserve current working directory, as PF may change it without restoring it afterward
+        self._original_cwd = os.getcwd()
+
         if isinstance(self.client, CodeClient):
             ClientUserAgentUtil.append_user_agent(USER_AGENT)
             inject_openai_api()
@@ -64,6 +68,8 @@ def __exit__(
         exc_value: Optional[BaseException],
         exc_tb: Optional[types.TracebackType],
     ) -> None:
+        os.chdir(self._original_cwd)
+
         if isinstance(self.client, CodeClient):
             recover_openai_api()
 
```
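The fix is a standard save-and-restore pattern around the promptflow (PF) run. A minimal standalone sketch of the same idea (illustration only, not the SDK's class):

```python
import os
import tempfile

class CwdGuard:
    """Restore the working directory on exit, even if code inside the
    block changed it without cleaning up (as PF can)."""

    def __enter__(self) -> "CwdGuard":
        self._original_cwd = os.getcwd()
        return self

    def __exit__(self, exc_type, exc_value, exc_tb) -> None:
        os.chdir(self._original_cwd)

with CwdGuard():
    os.chdir(tempfile.gettempdir())  # simulate a dependency changing the cwd
# The original working directory has been restored at this point.
```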
```diff
@@ -17,8 +17,12 @@ class TargetRunContext:
 
     def __init__(self, upload_snapshot: bool) -> None:
         self._upload_snapshot = upload_snapshot
+        self._original_cwd = os.getcwd()
 
     def __enter__(self) -> None:
+        # Preserve current working directory, as PF may change it without restoring it afterward
+        self._original_cwd = os.getcwd()
+
         # Address "[WinError 32] The process cannot access the file" error,
         # caused by conflicts when the venv and target function are in the same directory.
         # Setting PF_FLOW_ENTRY_IN_TMP to true uploads only the flex entry file (flow.flex.yaml).
@@ -31,5 +35,7 @@ def __exit__(
         exc_value: Optional[BaseException],
         exc_tb: Optional[types.TracebackType],
     ) -> None:
+        os.chdir(self._original_cwd)
+
         if not self._upload_snapshot:
             os.environ.pop(PF_FLOW_ENTRY_IN_TMP, None)
```
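The same enter/exit discipline applies to the `PF_FLOW_ENTRY_IN_TMP` flag: it is set when the context opens and popped on exit so the process environment is not left dirty. A generic standalone sketch of that pattern (illustrative, not the SDK's code):

```python
import os
from contextlib import contextmanager
from typing import Iterator

@contextmanager
def temp_env_var(name: str, value: str) -> Iterator[None]:
    """Set an environment variable for the duration of a block, then restore it."""
    previous = os.environ.get(name)
    os.environ[name] = value
    try:
        yield
    finally:
        # Put back the prior value if there was one, else remove the variable.
        if previous is None:
            os.environ.pop(name, None)
        else:
            os.environ[name] = previous

# Flag name and value as used in the diff above:
with temp_env_var("PF_FLOW_ENTRY_IN_TMP", "true"):
    pass  # run the target function here
```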
```diff
@@ -391,7 +391,7 @@ def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_proj
         )
 
     output_dir = output_path if os.path.isdir(output_path) else os.path.dirname(output_path)
-    if not os.path.exists(output_dir):
+    if output_dir and not os.path.exists(output_dir):
         msg = f"The output directory '{output_dir}' does not exist. Please create the directory manually."
         raise EvaluationException(
             message=msg,
```
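The added `output_dir and` guard matters for bare relative filenames, where `os.path.dirname` returns an empty string; a quick illustration:

```python
import os.path

# A bare relative filename has no directory component:
print(os.path.dirname("eval_test_results.jsonl"))  # -> ""

# os.path.exists("") is False, so without the guard the old check would
# raise even though writing to the current working directory is fine.
print(os.path.exists(""))  # -> False
```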
```diff
@@ -698,7 +698,7 @@ def _print_summary(per_evaluator_results: Dict[str, Any]) -> None:
     if output_dict:
         print("======= Combined Run Summary (Per Evaluator) =======\n")
         print(json.dumps(output_dict, indent=4))
-        print("\n====================================================")
+        print("\n====================================================\n")
 
 
 def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
```
```diff
@@ -888,9 +888,9 @@ def eval_batch_run(
     result_df_dict = result_df.to_dict("records")
     result: EvaluationResult = {"rows": result_df_dict, "metrics": metrics, "studio_url": studio_url}  # type: ignore
 
-    _print_summary(per_evaluator_results)
-
     if output_path:
         _write_output(output_path, result)
 
+    _print_summary(per_evaluator_results)
+
     return result
```
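Together with the `_validate_and_load_data` guard above, these changes make a relative `output_path` work end to end, with the summary now printed after the results file is written. A usage sketch (file names and the evaluator key are illustrative):

```python
from azure.ai.evaluation import F1ScoreEvaluator, evaluate

# A relative output_path (even a bare filename) is now accepted; the
# results file lands relative to the current working directory.
result = evaluate(
    data="eval_test_data.jsonl",            # illustrative input file
    evaluators={"f1": F1ScoreEvaluator()},  # illustrative evaluator key
    output_path="eval_test_results.jsonl",
)
print(result["metrics"])
```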
```diff
@@ -211,6 +211,8 @@ def _write_output(path: Union[str, os.PathLike], data_dict: Any) -> None:
     with open(p, "w", encoding=DefaultOpenEncoding.WRITE) as f:
         json.dump(data_dict, f)
 
+    print(f'Evaluation results saved to "{p.resolve()}".\n')
+
 
 def _apply_column_mapping(
     source_df: pd.DataFrame, mapping_config: Optional[Dict[str, str]], inplace: bool = False
```
```diff
@@ -396,14 +396,18 @@ def test_evaluate_output_dir_not_exist(self, mock_model_config, questions_file):
 
         assert "The output directory './not_exist_dir' does not exist." in exc_info.value.args[0]
 
-    @pytest.mark.parametrize("use_pf_client", [True, False])
-    def test_evaluate_output_path(self, evaluate_test_data_jsonl_file, tmpdir, use_pf_client):
-        output_path = os.path.join(tmpdir, "eval_test_results.jsonl")
+    @pytest.mark.parametrize("use_relative_path", [True, False])
+    def test_evaluate_output_path(self, evaluate_test_data_jsonl_file, tmpdir, use_relative_path):
+        # output_path is a file
+        if use_relative_path:
+            output_path = "eval_test_results.jsonl"
+        else:
+            output_path = os.path.join(tmpdir, "eval_test_results.jsonl")
 
         result = evaluate(
             data=evaluate_test_data_jsonl_file,
             evaluators={"g": F1ScoreEvaluator()},
             output_path=output_path,
-            _use_pf_client=use_pf_client,
         )
 
         assert result is not None
```
```diff
@@ -415,6 +419,9 @@ def test_evaluate_output_path(self, evaluate_test_data_jsonl_file, tmpdir, use_relative_path):
             data_from_file = json.loads(content)
         assert result["metrics"] == data_from_file["metrics"]
 
+        os.remove(output_path)
+
+        # output_path is a directory
         result = evaluate(
             data=evaluate_test_data_jsonl_file,
             evaluators={"g": F1ScoreEvaluator()},
```