From 10b09138ae213cdbdf922ccf8366d8c1e31f9969 Mon Sep 17 00:00:00 2001 From: marcorusso97 Date: Wed, 3 Jun 2026 14:42:55 +0200 Subject: [PATCH 1/5] =?UTF-8?q?=E2=9C=A8=20feat:=20validate=20model=20avai?= =?UTF-8?q?lability=20before=20run=20execution?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- hackagent/attacks/orchestrator.py | 461 ++++++++++++++++++ .../attacks/test_orchestrator_extended.py | 182 +++++++ 2 files changed, 643 insertions(+) diff --git a/hackagent/attacks/orchestrator.py b/hackagent/attacks/orchestrator.py index 65cbb3f..cc0c183 100644 --- a/hackagent/attacks/orchestrator.py +++ b/hackagent/attacks/orchestrator.py @@ -24,11 +24,15 @@ import json import logging +import re import shutil import subprocess +import sys import time import threading from concurrent.futures import ThreadPoolExecutor +from contextlib import contextmanager, redirect_stderr, redirect_stdout +from io import StringIO from hackagent.logger import get_logger from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple from uuid import UUID @@ -98,6 +102,71 @@ class AdvPrefix(AttackOrchestrator): attack_type: str = None # Must be overridden by subclass attack_impl_class: type = None # Must be overridden by subclass + # Model-role extraction map used by pre-run availability preflight. + # Tuple format: (role_name, path_tuple, is_list) + _ATTACK_MODEL_ROLE_PATHS: Dict[ + str, Tuple[Tuple[str, Tuple[str, ...], bool], ...] + ] = { + "advprefix": ( + ("generator", ("generator",), False), + ("judge", ("judge",), False), + ("judge", ("judges",), True), + ), + "baseline": ( + ("judge", ("judge",), False), + ("judge", ("judges",), True), + ), + "flipattack": ( + ("judge", ("judge",), False), + ("judge", ("judges",), True), + ), + "tap": ( + ("attacker", ("attacker",), False), + ("judge", ("judge",), False), + ("judge", ("judges",), True), + ("on_topic_judge", ("on_topic_judge",), False), + ), + "pair": ( + ("attacker", ("attacker",), False), + ("scorer", ("scorer",), False), + ), + "autodan_turbo": ( + ("attacker", ("attacker",), False), + ("scorer", ("scorer",), False), + ("summarizer", ("summarizer",), False), + ("embedder", ("embedder",), False), + ), + "bon": ( + ("judge", ("judge",), False), + ("judge", ("judges",), True), + ), + "cipherchat": ( + ("judge", ("judge",), False), + ("judge", ("judges",), True), + ), + "h4rm3l": ( + ("judge", ("judge",), False), + ("judge", ("judges",), True), + ("decorator_llm", ("decorator_llm",), False), + ), + "pap": ( + ("attacker", ("attacker",), False), + ("judge", ("judge",), False), + ("judge", ("judges",), True), + ), + "indirect_prompt_injection": ( + ("attacker", ("attacker",), False), + ("judge", ("judge",), False), + ("judge", ("judges",), True), + ("embedder", ("rag_injection_params", "embedder"), False), + ), + } + + # Accepted aliases for attack names used by registry/UI labels. + _ATTACK_TYPE_ALIASES: Dict[str, str] = { + "autodanturbo": "autodan_turbo", + } + def __init__(self, hackagent_agent: "HackAgent"): """ Initialize orchestrator with HackAgent instance. @@ -332,6 +401,386 @@ def _validate_default_category_classifier_requirements( "explicitly in attack_config." ) + @staticmethod + @contextmanager + def _silence_preflight_internal_logs(): + """Temporarily silence internal logs/stdout/stderr during probes.""" + previous_disable = logging.root.manager.disable + swallowed_stdout = StringIO() + swallowed_stderr = StringIO() + logging.disable(logging.CRITICAL) + try: + with redirect_stdout(swallowed_stdout), redirect_stderr(swallowed_stderr): + yield + finally: + logging.disable(previous_disable) + + @staticmethod + def _supports_ansi_stdout() -> bool: + """Return whether stdout supports ANSI color/status updates.""" + stream = getattr(sys, "stdout", None) + return bool(stream and hasattr(stream, "isatty") and stream.isatty()) + + @classmethod + def _format_status_label(cls, ok: bool) -> str: + """Return status label, colorized when supported.""" + if not cls._supports_ansi_stdout(): + return "OK" if ok else "KO" + green = "\033[32m" + red = "\033[31m" + reset = "\033[0m" + return f"{green}OK{reset}" if ok else f"{red}KO{reset}" + + def _probe_model_target_with_progress( + self, target: Dict[str, Any] + ) -> Optional[str]: + """Probe one model target while streaming a single-line progress status.""" + role = str(target.get("role") or "unknown") + identifier = str(target.get("identifier") or "unknown") + prefix = f"Checking {role} ({identifier})" + display_stream = getattr(sys, "stdout", None) + if display_stream is None: + display_stream = sys.__stdout__ + + use_inline = self._supports_ansi_stdout() + spinner_stop = threading.Event() + spinner_thread: Optional[threading.Thread] = None + + if use_inline: + dots = (".", "..", "...") + + def _spinner() -> None: + idx = 0 + while not spinner_stop.is_set(): + frame = dots[idx % len(dots)] + idx += 1 + display_stream.write(f"\r{prefix} {frame}") + display_stream.flush() + time.sleep(0.2) + + spinner_thread = threading.Thread(target=_spinner, daemon=True) + spinner_thread.start() + else: + logger.info(f"{prefix} ...") + + error: Optional[str] + try: + with self._silence_preflight_internal_logs(): + error = self._probe_model_target(target) + except Exception as exc: + error = f"health check failed ({type(exc).__name__}): {exc}" + finally: + if use_inline: + spinner_stop.set() + if spinner_thread is not None: + spinner_thread.join(timeout=0.5) + + status_label = self._format_status_label(ok=not error) + if use_inline: + display_stream.write(f"\r{prefix} ... {status_label}\n") + display_stream.flush() + else: + logger.info(f"{prefix} ... {status_label}") + + return error + + @staticmethod + def _get_nested_config_value( + config: Dict[str, Any], path: Tuple[str, ...] + ) -> Optional[Any]: + """Return nested dict value for the given path, or ``None``.""" + current: Any = config + for key in path: + if not isinstance(current, dict): + return None + current = current.get(key) + return current + + @classmethod + def _normalize_attack_type_for_preflight(cls, attack_type: Any) -> str: + """Normalize attack names so role-path lookups are robust across aliases.""" + raw = str(attack_type or "").strip() + if not raw: + return "" + + candidates: List[str] = [] + + lowered = raw.lower() + candidates.append(lowered) + candidates.append(lowered.replace("-", "_").replace(" ", "_")) + + snake_case = re.sub(r"([a-z0-9])([A-Z])", r"\1_\2", raw) + candidates.append(snake_case.lower().replace("-", "_").replace(" ", "_")) + + deduped_candidates: List[str] = [] + seen: set[str] = set() + for candidate in candidates: + if not candidate or candidate in seen: + continue + seen.add(candidate) + deduped_candidates.append(candidate) + + for candidate in deduped_candidates: + alias = cls._ATTACK_TYPE_ALIASES.get(candidate) + if alias: + return alias + + alias = cls._ATTACK_TYPE_ALIASES.get(candidate.replace("_", "")) + if alias: + return alias + + if candidate in cls._ATTACK_MODEL_ROLE_PATHS: + return candidate + + return deduped_candidates[0] + + @staticmethod + def _normalize_model_role_config( + role: str, role_config: Any + ) -> Optional[Dict[str, Any]]: + """Normalize one role config into a model descriptor, if possible.""" + if not isinstance(role_config, dict): + return None + + identifier = ( + role_config.get("identifier") + or role_config.get("model") + or role_config.get("model_id") + or role_config.get("model_name") + or role_config.get("name") + ) + if not identifier: + return None + + endpoint = ( + role_config.get("endpoint") + or role_config.get("agent_endpoint") + or role_config.get("api_base") + or role_config.get("base_url") + or "" + ) + + agent_type = role_config.get("agent_type") or "" + if hasattr(agent_type, "value"): + agent_type = agent_type.value + + return { + "role": role, + "identifier": str(identifier), + "endpoint": str(endpoint), + "agent_type": str(agent_type), + "config": role_config, + "kind": "router_config", + } + + def _collect_model_preflight_targets( + self, + attack_config: Dict[str, Any], + *, + goal_labels_by_index: Optional[Dict[int, Dict[str, str]]] = None, + ) -> List[Dict[str, Any]]: + """Collect model endpoints that must be reachable before run start.""" + targets: List[Dict[str, Any]] = [] + + router_obj = getattr(self.hackagent_agent, "router", None) + backend_agent = ( + getattr(router_obj, "backend_agent", None) if router_obj else None + ) + if router_obj is not None and backend_agent is not None: + registration_key = str(getattr(backend_agent, "id", "")) + agent_instance = None + if registration_key and hasattr(router_obj, "get_agent_instance"): + try: + agent_instance = router_obj.get_agent_instance(registration_key) + except Exception: + agent_instance = None + + model_name = ( + getattr(agent_instance, "model_name", None) + or getattr(agent_instance, "litellm_model", None) + or getattr(backend_agent, "name", None) + or "target" + ) + endpoint = ( + getattr(agent_instance, "api_base_url", None) + or getattr(backend_agent, "endpoint", None) + or "" + ) + agent_type = getattr(backend_agent, "agent_type", "") + + targets.append( + { + "role": "target", + "identifier": str(model_name), + "endpoint": str(endpoint or ""), + "agent_type": str(agent_type or ""), + "kind": "existing_router", + "router": router_obj, + "registration_key": registration_key, + } + ) + + raw_attack_type = attack_config.get("attack_type") or self.attack_type + attack_type = self._normalize_attack_type_for_preflight(raw_attack_type) + role_specs = self._ATTACK_MODEL_ROLE_PATHS.get(attack_type) + if not role_specs: + return targets + + seen = { + ( + str(item.get("role", "")), + str(item.get("identifier", "")), + str(item.get("endpoint", "")), + str(item.get("agent_type", "")), + ) + for item in targets + } + + for role, path, is_list in role_specs: + value = self._get_nested_config_value(attack_config, path) + if value is None: + continue + + items = value if (is_list and isinstance(value, list)) else [value] + for item in items: + normalized = self._normalize_model_role_config(role, item) + if not normalized: + continue + key = ( + normalized["role"], + normalized["identifier"], + normalized["endpoint"], + normalized["agent_type"], + ) + if key in seen: + continue + seen.add(key) + targets.append(normalized) + + # Category classifier is part of goal tracking unless explicit labels + # are provided via intents (which disable per-goal classification). + if not goal_labels_by_index: + category_cfg = attack_config.get("category_classifier") + normalized = self._normalize_model_role_config( + "category_classifier", category_cfg + ) + if normalized: + key = ( + normalized["role"], + normalized["identifier"], + normalized["endpoint"], + normalized["agent_type"], + ) + if key not in seen: + targets.append(normalized) + + return targets + + @staticmethod + def _probe_router_registration( + router: Any, + registration_key: str, + ) -> Optional[str]: + """Issue a tiny completion request and return error text when unavailable.""" + try: + response = router.route_request( + registration_key=registration_key, + request_data={ + "messages": [{"role": "user", "content": "healthcheck"}], + "max_tokens": 1, + "temperature": 0.0, + }, + ) + except Exception as exc: + return f"request failed ({type(exc).__name__}): {exc}" + + # Some tests and custom adapters may return non-dict payloads. + # Treat these probes as inconclusive instead of hard-failing. + if not isinstance(response, dict): + return None + + error_message = response.get("error_message") + if error_message: + return str(error_message) + + return None + + def _probe_model_target(self, target: Dict[str, Any]) -> Optional[str]: + """Probe one model target and return an error string on failure.""" + kind = target.get("kind") + + if kind == "existing_router": + router = target.get("router") + registration_key = str(target.get("registration_key") or "") + if router is None or not registration_key: + return "missing router registration" + return self._probe_router_registration(router, registration_key) + + if kind == "router_config": + from hackagent.attacks.shared.router_factory import create_router + + try: + temp_router, registration_key = create_router( + backend=self.hackagent_agent.backend, + config=dict(target.get("config") or {}), + logger=logger, + router_name=f"preflight-{target.get('role', 'model')}", + ) + except Exception as exc: + return f"router init failed ({type(exc).__name__}): {exc}" + + return self._probe_router_registration(temp_router, registration_key) + + return "unknown preflight target type" + + def _validate_required_models_availability( + self, + attack_config: Dict[str, Any], + *, + goal_labels_by_index: Optional[Dict[int, Dict[str, str]]] = None, + ) -> Optional[str]: + """Return a formatted error message when required models are unavailable.""" + targets = self._collect_model_preflight_targets( + attack_config, + goal_labels_by_index=goal_labels_by_index, + ) + if not targets: + return None + + unavailable: List[Dict[str, str]] = [] + for target in targets: + error = self._probe_model_target_with_progress(target) + if not error: + continue + + unavailable.append( + { + "role": str(target.get("role") or "unknown"), + "identifier": str(target.get("identifier") or "unknown"), + "endpoint": str(target.get("endpoint") or ""), + "agent_type": str(target.get("agent_type") or "unknown"), + "error": error, + } + ) + + if unavailable: + details = "\n".join( + ( + f"- role={item['role']} " + f"identifier={item['identifier']} " + f"endpoint={item['endpoint']} " + f"error={item['error']}" + ) + for item in unavailable + ) + return ( + "Attack aborted: one or more required models are unavailable. " + "The run was not started. Unreachable models:\n" + f"{details}" + ) + + return None + def _load_goals_from_dataset(self, dataset_config: Dict[str, Any]) -> list: """ Load goals from a dataset configuration. @@ -714,6 +1163,18 @@ def execute( else: self._validate_default_category_classifier_requirements(attack_config) + availability_error = self._validate_required_models_availability( + attack_config, + goal_labels_by_index=goal_labels_by_index, + ) + if availability_error: + # Use the same logger/message style surfaced by HackAgent.hack so + # users get a visible rich ERROR line even when we abort gracefully. + get_logger("hackagent.agent").error( + f"Configuration error in HackAgent.hack: {availability_error}" + ) + return [] + # Enrich run config with expected goal cardinality so downstream views # can keep RUNNING until all expected goals are fully tracked. effective_run_config = dict(run_config_override or {}) diff --git a/tests/unit/attacks/test_orchestrator_extended.py b/tests/unit/attacks/test_orchestrator_extended.py index ad5c2ea..1f3dbb0 100644 --- a/tests/unit/attacks/test_orchestrator_extended.py +++ b/tests/unit/attacks/test_orchestrator_extended.py @@ -278,6 +278,188 @@ def test_skips_preflight_if_classifier_is_explicitly_configured(self): self.assertIsNotNone(results) +class TestRequiredModelAvailabilityPreflight(unittest.TestCase): + """Test fail-fast behavior for model availability preflight.""" + + def test_normalize_attack_type_for_preflight_accepts_autodan_aliases(self): + """AutoDAN aliases should resolve to autodan_turbo mapping key.""" + orch, _, _ = _make_orchestrator() + + self.assertEqual( + orch._normalize_attack_type_for_preflight("AutoDANTurbo"), + "autodan_turbo", + ) + self.assertEqual( + orch._normalize_attack_type_for_preflight("autodan-turbo"), + "autodan_turbo", + ) + + def test_collect_targets_uses_normalized_attack_type_for_autodan_roles(self): + """AutoDAN preflight must include attacker/scorer/summarizer/embedder roles.""" + orch, _, _ = _make_orchestrator() + orch.attack_type = "AutoDANTurbo" + orch.hackagent_agent.router = None + + attack_config = { + "attack_type": "autodan_turbo", + "attacker": { + "identifier": "a-model", + "endpoint": "http://localhost:1111", + "agent_type": "OPENAI_SDK", + }, + "scorer": { + "identifier": "s-model", + "endpoint": "http://localhost:2222", + "agent_type": "OPENAI_SDK", + }, + "summarizer": { + "identifier": "z-model", + "endpoint": "http://localhost:3333", + "agent_type": "OPENAI_SDK", + }, + "embedder": { + "identifier": "e-model", + "endpoint": "http://localhost:4444", + "agent_type": "OPENAI_SDK", + }, + } + + targets = orch._collect_model_preflight_targets( + attack_config, + goal_labels_by_index={0: {"category": "c", "subcategory": "s"}}, + ) + + roles = {item["role"] for item in targets} + self.assertIn("attacker", roles) + self.assertIn("scorer", roles) + self.assertIn("summarizer", roles) + self.assertIn("embedder", roles) + + def test_validate_required_models_availability_reports_model_and_endpoint(self): + """Error should include role, identifier, and endpoint for unavailable models.""" + orch, _, _ = _make_orchestrator() + + with patch.object( + AttackOrchestrator, + "_collect_model_preflight_targets", + return_value=[ + { + "role": "attacker", + "identifier": "gemma3:4b", + "endpoint": "http://localhost:11434", + "agent_type": "OLLAMA", + "kind": "router_config", + "config": { + "identifier": "gemma3:4b", + "endpoint": "http://localhost:11434", + "agent_type": "OLLAMA", + }, + } + ], + ): + with patch.object( + AttackOrchestrator, + "_probe_model_target", + return_value="model not found", + ): + message = orch._validate_required_models_availability( + attack_config={"goals": ["test"]} + ) + + self.assertIsInstance(message, str) + self.assertIn("required models are unavailable", message) + self.assertIn("gemma3:4b", message) + self.assertIn("http://localhost:11434", message) + + def test_validate_required_models_availability_lists_multiple_models(self): + """When target and judge are unavailable, both must appear in the error list.""" + orch, _, _ = _make_orchestrator() + + with patch.object( + AttackOrchestrator, + "_collect_model_preflight_targets", + return_value=[ + { + "role": "target", + "identifier": "google/gemma-3-27b-it", + "endpoint": "https://openrouter.ai/api/v1", + "agent_type": "OPENAI_SDK", + "kind": "existing_router", + }, + { + "role": "judge", + "identifier": "mistralai/mistral-small-3.1", + "endpoint": "https://openrouter.ai/api/v1", + "agent_type": "OPENAI_SDK", + "kind": "router_config", + "config": { + "identifier": "mistralai/mistral-small-3.1", + "endpoint": "https://openrouter.ai/api/v1", + "agent_type": "OPENAI_SDK", + }, + }, + ], + ): + with patch.object( + AttackOrchestrator, + "_probe_model_target", + side_effect=[ + "target invalid model", + "judge invalid model", + ], + ): + message = orch._validate_required_models_availability( + attack_config={"goals": ["test"]} + ) + + self.assertIsInstance(message, str) + self.assertIn("Unreachable models:\n- role=target", message) + self.assertIn("\n- role=judge", message) + self.assertIn("identifier=google/gemma-3-27b-it", message) + self.assertIn("identifier=mistralai/mistral-small-3.1", message) + + def test_execute_aborts_before_db_records_when_model_unavailable(self): + """Run must not start when preflight detects an unavailable model.""" + orch, _, _ = _make_orchestrator() + + attack_config = {"goals": ["test"], "attack_type": "baseline"} + + with patch.object( + AttackOrchestrator, + "_validate_default_category_classifier_requirements", + return_value=None, + ): + with patch.object( + AttackOrchestrator, + "_validate_required_models_availability", + return_value=( + "Attack aborted: one or more required models are unavailable. " + "The run was not started. Unreachable models:\n" + "- role=target identifier=test-model endpoint=http://localhost:11434 " + "error=model not found" + ), + ): + with patch.object( + AttackOrchestrator, + "_create_server_attack_record", + return_value=_VALID_ATK_ID, + ) as mock_create_atk: + with patch.object( + AttackOrchestrator, + "_create_server_run_record", + return_value=_VALID_RUN_ID, + ) as mock_create_run: + results = orch.execute( + attack_config=attack_config, + run_config_override=None, + fail_on_run_error=False, + ) + + self.assertEqual(results, []) + mock_create_atk.assert_not_called() + mock_create_run.assert_not_called() + + class TestAttackOrchestratorHTTPResponseParsing(unittest.TestCase): """Test HTTP response parsing helpers in depth.""" From 98f040352c71b537ef574e1879e7c7f84ec07ead Mon Sep 17 00:00:00 2001 From: marcorusso97 Date: Wed, 3 Jun 2026 15:27:08 +0200 Subject: [PATCH 2/5] =?UTF-8?q?=F0=9F=90=9B=20fix:=20fixed=20google=20adk?= =?UTF-8?q?=20integration=20test?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- hackagent/router/providers/adk.py | 8 +++++++- tests/integration/conftest.py | 3 ++- tests/integration/router/test_adk_agent.py | 4 ++++ 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/hackagent/router/providers/adk.py b/hackagent/router/providers/adk.py index 5899cdb..40a7bec 100644 --- a/hackagent/router/providers/adk.py +++ b/hackagent/router/providers/adk.py @@ -176,8 +176,14 @@ def _create_session( "Accept": "application/json", } payload = initial_state or {} + session_timeout = min(30, max(1, int(self.timeout))) try: - response = requests.post(url, headers=headers, json=payload, timeout=30) + response = requests.post( + url, + headers=headers, + json=payload, + timeout=session_timeout, + ) response.raise_for_status() return except requests.exceptions.HTTPError as http_err: diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 8c45d03..e7f44d4 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -482,11 +482,12 @@ def google_adk_available(google_adk_agent_url: Optional[str]) -> bool: @pytest.fixture(scope="module") def google_adk_config(google_adk_agent_url: Optional[str]) -> Dict[str, Any]: """Return configuration dictionary for Google ADK adapter (module-scoped for speed).""" + adk_timeout = int(os.getenv("TEST_GOOGLE_ADK_TIMEOUT", "45")) return { "name": "multi_tool_agent", "endpoint": google_adk_agent_url or "http://localhost:8000", "user_id": "test_user", - "timeout": 120, + "timeout": max(1, adk_timeout), } diff --git a/tests/integration/router/test_adk_agent.py b/tests/integration/router/test_adk_agent.py index 20f8ad8..33c198e 100644 --- a/tests/integration/router/test_adk_agent.py +++ b/tests/integration/router/test_adk_agent.py @@ -33,6 +33,8 @@ logger = logging.getLogger(__name__) +ADK_TEST_TIMEOUT_SECONDS = 45 + @pytest.mark.integration @pytest.mark.google_adk @@ -378,6 +380,7 @@ def test_router_creates_adk_adapter( name="multi_tool_agent", agent_type=AgentTypeEnum.GOOGLE_ADK, endpoint=google_adk_agent_url, + adapter_operational_config={"timeout": ADK_TEST_TIMEOUT_SECONDS}, ) # Verify adapter was created @@ -414,6 +417,7 @@ def test_router_handles_adk_request( name="multi_tool_agent", agent_type=AgentTypeEnum.GOOGLE_ADK, endpoint=google_adk_agent_url, + adapter_operational_config={"timeout": ADK_TEST_TIMEOUT_SECONDS}, ) # Route a request From af3dba7815ce86f5889d91d0f0d4d8074aed7d37 Mon Sep 17 00:00:00 2001 From: marcorusso97 Date: Wed, 3 Jun 2026 15:53:13 +0200 Subject: [PATCH 3/5] =?UTF-8?q?=F0=9F=92=9A=20ci:=20validate=20commit=20me?= =?UTF-8?q?ssages=20against=20pr=20head=20sha=20only?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index e08e68c..12570af 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -31,7 +31,7 @@ jobs: run: uv sync --group dev - name: Check commit messages in PR - run: uv run cz check --rev-range ${{ github.event.pull_request.base.sha }}..${{ github.sha }} + run: uv run cz check --rev-range ${{ github.event.pull_request.base.sha }}..${{ github.event.pull_request.head.sha }} python-checks: name: Linting and Formatting From a212034c2a6e5ceb58a1136943511c2b4b57d377 Mon Sep 17 00:00:00 2001 From: marcorusso97 Date: Fri, 5 Jun 2026 15:13:47 +0200 Subject: [PATCH 4/5] =?UTF-8?q?=F0=9F=90=9B=20fix:=20optimized=20preflight?= =?UTF-8?q?=20check=20on=20models?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- hackagent/attacks/orchestrator.py | 186 +++++++++--- .../techniques/autodan_turbo/attack.py | 30 ++ hackagent/attacks/techniques/base.py | 23 ++ .../attacks/techniques/baseline/attack.py | 24 ++ hackagent/attacks/techniques/h4rm3l/attack.py | 39 +++ .../attacks/techniques/h4rm3l/decorators.py | 21 ++ .../attacks/techniques/h4rm3l/generation.py | 17 +- hackagent/attacks/techniques/tap/attack.py | 33 +++ tests/unit/attacks/test_orchestrator.py | 11 +- .../attacks/test_orchestrator_extended.py | 279 +++++++++++++++++- 10 files changed, 595 insertions(+), 68 deletions(-) diff --git a/hackagent/attacks/orchestrator.py b/hackagent/attacks/orchestrator.py index cc0c183..1c87d16 100644 --- a/hackagent/attacks/orchestrator.py +++ b/hackagent/attacks/orchestrator.py @@ -42,6 +42,7 @@ from hackagent.errors import HackAgentError from hackagent.attacks.techniques.config import ( DEFAULT_CATEGORY_CLASSIFIER_AGENT_TYPE, + DEFAULT_CATEGORY_CLASSIFIER_ENDPOINT, DEFAULT_CATEGORY_CLASSIFIER_IDENTIFIER, ) from hackagent.server.storage.enums import StatusEnum @@ -435,9 +436,10 @@ def _probe_model_target_with_progress( self, target: Dict[str, Any] ) -> Optional[str]: """Probe one model target while streaming a single-line progress status.""" - role = str(target.get("role") or "unknown") + role = self._format_target_roles(target) identifier = str(target.get("identifier") or "unknown") - prefix = f"Checking {role} ({identifier})" + optional_suffix = " [optional]" if not target.get("required", True) else "" + prefix = f"Checking {role} ({identifier}){optional_suffix}" display_stream = getattr(sys, "stdout", None) if display_stream is None: display_stream = sys.__stdout__ @@ -573,6 +575,27 @@ def _normalize_model_role_config( "kind": "router_config", } + @staticmethod + def _format_target_roles(target: Dict[str, Any]) -> str: + """Format logical role labels for user-facing progress/error messages.""" + roles = target.get("roles") + if isinstance(roles, list): + normalized = [str(role) for role in roles if role] + if normalized: + return ",".join(normalized) + + role = target.get("role") + return str(role or "unknown") + + @staticmethod + def _preflight_target_key(target: Dict[str, Any]) -> Tuple[str, str, str]: + """Build deduplication key for effective model endpoint checks.""" + return ( + str(target.get("identifier") or ""), + str(target.get("endpoint") or ""), + str(target.get("agent_type") or ""), + ) + def _collect_model_preflight_targets( self, attack_config: Dict[str, Any], @@ -581,6 +604,52 @@ def _collect_model_preflight_targets( ) -> List[Dict[str, Any]]: """Collect model endpoints that must be reachable before run start.""" targets: List[Dict[str, Any]] = [] + targets_by_key: Dict[Tuple[str, str, str], Dict[str, Any]] = {} + + def _register_target( + target: Dict[str, Any], + *, + role_name: Optional[str] = None, + required: bool = True, + ) -> None: + key = self._preflight_target_key(target) + if not any(key): + return + + roles = target.get("roles") + if isinstance(roles, list) and roles: + normalized_roles = [str(role) for role in roles if role] + else: + normalized_roles = [str(role_name or target.get("role") or "unknown")] + + existing = targets_by_key.get(key) + if existing is None: + normalized_target = dict(target) + normalized_target["roles"] = [] + for role in normalized_roles: + if role not in normalized_target["roles"]: + normalized_target["roles"].append(role) + normalized_target["role"] = normalized_target["roles"][0] + normalized_target["required"] = bool(required) + targets.append(normalized_target) + targets_by_key[key] = normalized_target + return + + for role in normalized_roles: + if role not in existing["roles"]: + existing["roles"].append(role) + existing["role"] = existing["roles"][0] + existing["required"] = bool(existing.get("required", True) or required) + + # Prefer probing already-built router registrations when available. + if ( + existing.get("kind") == "router_config" + and target.get("kind") == "existing_router" + ): + existing["kind"] = "existing_router" + existing["router"] = target.get("router") + existing["registration_key"] = target.get("registration_key") + existing.pop("config", None) router_obj = getattr(self.hackagent_agent, "router", None) backend_agent = ( @@ -608,7 +677,7 @@ def _collect_model_preflight_targets( ) agent_type = getattr(backend_agent, "agent_type", "") - targets.append( + _register_target( { "role": "target", "identifier": str(model_name), @@ -617,62 +686,82 @@ def _collect_model_preflight_targets( "kind": "existing_router", "router": router_obj, "registration_key": registration_key, - } + }, + role_name="target", + required=True, ) - raw_attack_type = attack_config.get("attack_type") or self.attack_type - attack_type = self._normalize_attack_type_for_preflight(raw_attack_type) - role_specs = self._ATTACK_MODEL_ROLE_PATHS.get(attack_type) - if not role_specs: - return targets - - seen = { - ( - str(item.get("role", "")), - str(item.get("identifier", "")), - str(item.get("endpoint", "")), - str(item.get("agent_type", "")), - ) - for item in targets - } + attack_owned_roles: Optional[List[Dict[str, Any]]] = None + attack_impl = getattr(self, "attack_impl_class", None) + if attack_impl and hasattr(attack_impl, "get_effective_model_roles"): + try: + attack_owned_roles = attack_impl.get_effective_model_roles( + attack_config, + goal_labels_by_index=goal_labels_by_index, + ) + except Exception as exc: + logger.warning( + "Attack-owned preflight role resolution failed for %s: %s", + getattr(attack_impl, "__name__", "unknown"), + exc, + ) + attack_owned_roles = None - for role, path, is_list in role_specs: - value = self._get_nested_config_value(attack_config, path) - if value is None: - continue + if attack_owned_roles is not None: + for role_item in attack_owned_roles: + if not isinstance(role_item, dict): + continue - items = value if (is_list and isinstance(value, list)) else [value] - for item in items: - normalized = self._normalize_model_role_config(role, item) - if not normalized: + role = str(role_item.get("role") or "").strip() + role_config = role_item.get("config") + required = bool(role_item.get("required", True)) + if not role: continue - key = ( - normalized["role"], - normalized["identifier"], - normalized["endpoint"], - normalized["agent_type"], - ) - if key in seen: + + normalized = self._normalize_model_role_config(role, role_config) + if not normalized: continue - seen.add(key) - targets.append(normalized) + + _register_target(normalized, role_name=role, required=required) + else: + raw_attack_type = attack_config.get("attack_type") or self.attack_type + attack_type = self._normalize_attack_type_for_preflight(raw_attack_type) + role_specs = self._ATTACK_MODEL_ROLE_PATHS.get(attack_type) + + if role_specs: + for role, path, is_list in role_specs: + value = self._get_nested_config_value(attack_config, path) + if value is None: + continue + + items = value if (is_list and isinstance(value, list)) else [value] + for item in items: + normalized = self._normalize_model_role_config(role, item) + if not normalized: + continue + _register_target(normalized, role_name=role, required=True) # Category classifier is part of goal tracking unless explicit labels # are provided via intents (which disable per-goal classification). if not goal_labels_by_index: - category_cfg = attack_config.get("category_classifier") + if self._uses_default_category_classifier(attack_config): + category_cfg = { + "identifier": DEFAULT_CATEGORY_CLASSIFIER_IDENTIFIER, + "endpoint": DEFAULT_CATEGORY_CLASSIFIER_ENDPOINT, + "agent_type": DEFAULT_CATEGORY_CLASSIFIER_AGENT_TYPE, + } + else: + category_cfg = attack_config.get("category_classifier") + normalized = self._normalize_model_role_config( "category_classifier", category_cfg ) if normalized: - key = ( - normalized["role"], - normalized["identifier"], - normalized["endpoint"], - normalized["agent_type"], + _register_target( + normalized, + role_name="category_classifier", + required=True, ) - if key not in seen: - targets.append(normalized) return targets @@ -747,15 +836,22 @@ def _validate_required_models_availability( if not targets: return None + probe_optional_roles = bool( + attack_config.get("_preflight_probe_optional_roles", False) + ) + unavailable: List[Dict[str, str]] = [] for target in targets: + if not target.get("required", True) and not probe_optional_roles: + continue + error = self._probe_model_target_with_progress(target) if not error: continue unavailable.append( { - "role": str(target.get("role") or "unknown"), + "role": self._format_target_roles(target), "identifier": str(target.get("identifier") or "unknown"), "endpoint": str(target.get("endpoint") or ""), "agent_type": str(target.get("agent_type") or "unknown"), diff --git a/hackagent/attacks/techniques/autodan_turbo/attack.py b/hackagent/attacks/techniques/autodan_turbo/attack.py index 010287e..2c49b57 100644 --- a/hackagent/attacks/techniques/autodan_turbo/attack.py +++ b/hackagent/attacks/techniques/autodan_turbo/attack.py @@ -106,6 +106,36 @@ def _validate_config(self): if not self.config.get("attacker", {}).get("identifier"): raise ValueError("attacker.identifier required") + @classmethod + def get_effective_model_roles( + cls, + attack_config: Dict[str, Any], + *, + goal_labels_by_index: Dict[int, Dict[str, str]] | None = None, + ) -> List[Dict[str, Any]]: + """Resolve AutoDAN-Turbo preflight roles with optional embedder semantics.""" + _ = goal_labels_by_index + + roles: List[Dict[str, Any]] = [] + for role_name in ("attacker", "scorer", "summarizer"): + role_config = attack_config.get(role_name) + if isinstance(role_config, dict): + roles.append({"role": role_name, "config": role_config}) + + embedder = attack_config.get("embedder") + if isinstance(embedder, dict): + roles.append( + { + "role": "embedder", + "config": embedder, + "required": bool( + attack_config.get("_preflight_require_embedder", False) + ), + } + ) + + return roles + def _get_pipeline_steps(self): """Disable BaseAttack static pipeline; orchestration is manual in ``run``. diff --git a/hackagent/attacks/techniques/base.py b/hackagent/attacks/techniques/base.py index ddadb9f..98b47c8 100644 --- a/hackagent/attacks/techniques/base.py +++ b/hackagent/attacks/techniques/base.py @@ -133,6 +133,29 @@ def _setup(self): """ self._setup_logging() + @classmethod + def get_effective_model_roles( + cls, + attack_config: Dict[str, Any], + *, + goal_labels_by_index: Optional[Dict[int, Dict[str, str]]] = None, + ) -> Optional[List[Dict[str, Any]]]: + """Return attack-owned preflight model roles, or ``None`` for fallback mapping. + + The orchestrator consumes this hook before using its legacy static role map. + Each returned item should be a dict with: + + - ``role``: logical role label (e.g. ``judge``) + - ``config``: model/router config dict for that role + - ``required`` (optional): whether this role must pass preflight + + Returning ``None`` delegates role discovery to orchestrator fallback logic. + Returning ``[]`` means no attack-specific model roles are required. + """ + _ = attack_config + _ = goal_labels_by_index + return None + def _setup_logging(self): """ Configure logging to console for this attack instance. diff --git a/hackagent/attacks/techniques/baseline/attack.py b/hackagent/attacks/techniques/baseline/attack.py index 7d325f9..e63b4ad 100644 --- a/hackagent/attacks/techniques/baseline/attack.py +++ b/hackagent/attacks/techniques/baseline/attack.py @@ -120,6 +120,30 @@ def _validate_config(self): f"Unknown objective: {objective}. Available: {list(OBJECTIVES.keys())}" ) + @classmethod + def get_effective_model_roles( + cls, + attack_config: Dict[str, Any], + *, + goal_labels_by_index: Optional[Dict[int, Dict[str, str]]] = None, + ) -> List[Dict[str, Any]]: + """Return only the model roles needed by the effective baseline evaluator.""" + _ = goal_labels_by_index + + evaluator_type = str(attack_config.get("evaluator_type", "llm_judge")).lower() + if evaluator_type != "llm_judge": + return [] + + judges = attack_config.get("judges") + if isinstance(judges, list) and judges: + return [{"role": "judge", "config": judge} for judge in judges] + + judge_config = attack_config.get("judge_config") + if isinstance(judge_config, dict): + return [{"role": "judge", "config": judge_config}] + + return [] + def _get_pipeline_steps(self) -> List[Dict]: """ Define the two baseline pipeline stage descriptors. diff --git a/hackagent/attacks/techniques/h4rm3l/attack.py b/hackagent/attacks/techniques/h4rm3l/attack.py index 5f8257c..a029e2f 100644 --- a/hackagent/attacks/techniques/h4rm3l/attack.py +++ b/hackagent/attacks/techniques/h4rm3l/attack.py @@ -29,6 +29,7 @@ from . import generation, evaluation from .config import DEFAULT_H4RM3L_CONFIG, PRESET_PROGRAMS +from .decorators import program_uses_llm_assisted_decorators def _recursive_update(target_dict, source_dict): @@ -116,6 +117,44 @@ def _validate_config(self): if syntax_version not in (1, 2): raise ValueError(f"syntax_version must be 1 or 2, got {syntax_version}") + @classmethod + def get_effective_model_roles( + cls, + attack_config: Dict[str, Any], + *, + goal_labels_by_index: Optional[Dict[int, Dict[str, str]]] = None, + ) -> List[Dict[str, Any]]: + """Resolve h4rm3l preflight roles from effective runtime program semantics.""" + _ = goal_labels_by_index + + roles: List[Dict[str, Any]] = [] + + judges = attack_config.get("judges") + if isinstance(judges, list) and judges: + roles.extend({"role": "judge", "config": judge} for judge in judges) + else: + judge = attack_config.get("judge") + if isinstance(judge, dict): + roles.append({"role": "judge", "config": judge}) + + params = attack_config.get("h4rm3l_params") + if not isinstance(params, dict): + params = {} + + program_ref = params.get("program", "IdentityDecorator()") + syntax_version = params.get("syntax_version", 2) + resolved_program = PRESET_PROGRAMS.get(program_ref, program_ref) + + if program_uses_llm_assisted_decorators(resolved_program, syntax_version): + decorator_llm = attack_config.get("decorator_llm") + if not isinstance(decorator_llm, dict): + decorator_llm = DEFAULT_H4RM3L_CONFIG.get("decorator_llm") + + if isinstance(decorator_llm, dict): + roles.append({"role": "decorator_llm", "config": decorator_llm}) + + return roles + def _get_pipeline_steps(self) -> List[Dict]: """Define the two-stage attack pipeline.""" return [ diff --git a/hackagent/attacks/techniques/h4rm3l/decorators.py b/hackagent/attacks/techniques/h4rm3l/decorators.py index e3a6c09..63a4b22 100644 --- a/hackagent/attacks/techniques/h4rm3l/decorators.py +++ b/hackagent/attacks/techniques/h4rm3l/decorators.py @@ -69,6 +69,7 @@ def has_prompting_interface() -> bool: LLM_ASSISTED_DECORATOR_NAMES = { "TranslateDecorator", + "TranslateBackDecorator", "PAPDecorator", "PersonaDecorator", "PersuasiveDecorator", @@ -1238,3 +1239,23 @@ def compile_program( return _compile_v2(program) else: raise ValueError(f"Unknown syntax_version={syntax_version}; expected 1 or 2") + + +def program_uses_llm_assisted_decorators( + program: str, + syntax_version: int = 2, +) -> bool: + """Return whether a program chain contains at least one LLM-assisted decorator. + + This helper is used by both preflight role resolution and runtime setup to + keep decorator-LLM activation semantics consistent. + """ + try: + _, decorator_steps = compile_program_with_steps(program, syntax_version) + return any( + is_llm_assisted_decorator_name(step.__class__.__name__) + for step in decorator_steps + ) + except Exception: + # Best-effort fallback when program cannot be compiled yet. + return any(name in str(program) for name in LLM_ASSISTED_DECORATOR_NAMES) diff --git a/hackagent/attacks/techniques/h4rm3l/generation.py b/hackagent/attacks/techniques/h4rm3l/generation.py index a73cd9e..37ef2db 100644 --- a/hackagent/attacks/techniques/h4rm3l/generation.py +++ b/hackagent/attacks/techniques/h4rm3l/generation.py @@ -23,6 +23,7 @@ compile_program_with_steps, has_prompting_interface, is_llm_assisted_decorator_name, + program_uses_llm_assisted_decorators, set_prompting_interface, ) @@ -145,18 +146,10 @@ def execute( logger.info(f"Using custom program: {resolved_program[:80]}...") # Set up LLM-assisted decorators if needed - llm_keywords = [ - "TranslateDecorator", - "PAPDecorator", - "PersonaDecorator", - "PersuasiveDecorator", - "SynonymDecorator", - "ResearcherDecorator", - "VillainDecorator", - "VisualObfuscationDecorator", - "TransformFxDecorator", - ] - needs_llm = any(kw in resolved_program for kw in llm_keywords) + needs_llm = program_uses_llm_assisted_decorators( + resolved_program, + syntax_version, + ) decoration_llm_identifier = None decoration_llm_endpoint = None diff --git a/hackagent/attacks/techniques/tap/attack.py b/hackagent/attacks/techniques/tap/attack.py index a079b57..624e5cc 100644 --- a/hackagent/attacks/techniques/tap/attack.py +++ b/hackagent/attacks/techniques/tap/attack.py @@ -177,6 +177,39 @@ def _validate_config(self) -> None: if value is None or value < 1: raise ValueError(f"tap_params.{key} must be >= 1") + @classmethod + def get_effective_model_roles( + cls, + attack_config: Dict[str, Any], + *, + goal_labels_by_index: Optional[Dict[int, Dict[str, str]]] = None, + ) -> List[Dict[str, Any]]: + """Resolve TAP runtime roles with on-topic fallback behavior.""" + _ = goal_labels_by_index + + roles: List[Dict[str, Any]] = [] + + attacker = attack_config.get("attacker") + if isinstance(attacker, dict): + roles.append({"role": "attacker", "config": attacker}) + + judges = attack_config.get("judges") + if isinstance(judges, list) and judges: + resolved_judges = judges + else: + judge = attack_config.get("judge") + resolved_judges = [judge] if isinstance(judge, dict) else [] + + roles.extend({"role": "judge", "config": judge} for judge in resolved_judges) + + on_topic_judge = attack_config.get("on_topic_judge") + if isinstance(on_topic_judge, dict): + roles.append({"role": "on_topic_judge", "config": on_topic_judge}) + elif resolved_judges: + roles.append({"role": "on_topic_judge", "config": resolved_judges[0]}) + + return roles + def _get_pipeline_steps(self) -> List[Dict]: """ Define the two TAP pipeline stages. diff --git a/tests/unit/attacks/test_orchestrator.py b/tests/unit/attacks/test_orchestrator.py index 09fb33c..d1e2954 100644 --- a/tests/unit/attacks/test_orchestrator.py +++ b/tests/unit/attacks/test_orchestrator.py @@ -239,9 +239,18 @@ def test_get_attack_impl_kwargs(self): @patch.object(AttackOrchestrator, "_create_server_attack_record") @patch.object(AttackOrchestrator, "_create_server_run_record") + @patch.object( + AttackOrchestrator, + "_validate_required_models_availability", + return_value=None, + ) @patch.object(AttackOrchestrator, "_execute_local_attack") def test_execute_full_workflow( - self, mock_execute_local, mock_create_run, mock_create_attack + self, + mock_execute_local, + mock_validate_models, + mock_create_run, + mock_create_attack, ): """Test full execute workflow.""" diff --git a/tests/unit/attacks/test_orchestrator_extended.py b/tests/unit/attacks/test_orchestrator_extended.py index 1f3dbb0..1ac4ed2 100644 --- a/tests/unit/attacks/test_orchestrator_extended.py +++ b/tests/unit/attacks/test_orchestrator_extended.py @@ -9,7 +9,16 @@ from uuid import uuid4 from hackagent.attacks.orchestrator import AttackOrchestrator +from hackagent.attacks.techniques.autodan_turbo.attack import AutoDANTurboAttack from hackagent.attacks.techniques.base import BaseAttack +from hackagent.attacks.techniques.baseline.attack import BaselineAttack +from hackagent.attacks.techniques.h4rm3l.attack import H4rm3lAttack +from hackagent.attacks.techniques.tap.attack import TAPAttack +from hackagent.attacks.techniques.config import ( + DEFAULT_CATEGORY_CLASSIFIER_AGENT_TYPE, + DEFAULT_CATEGORY_CLASSIFIER_ENDPOINT, + DEFAULT_CATEGORY_CLASSIFIER_IDENTIFIER, +) from hackagent.errors import HackAgentError @@ -48,9 +57,14 @@ class TestAttackOrchestratorExecuteFlow(unittest.TestCase): @patch.object( AttackOrchestrator, "_create_server_attack_record", return_value=_VALID_ATK_ID ) + @patch.object( + AttackOrchestrator, + "_validate_required_models_availability", + return_value=None, + ) @patch.object(AttackOrchestrator, "_execute_local_attack", return_value=["result"]) def test_execute_updates_run_status_to_running( - self, mock_exec, mock_create_atk, mock_create_run + self, mock_exec, mock_validate_models, mock_create_atk, mock_create_run ): """Test that execute updates run to RUNNING status.""" orch, hack_agent, _ = _make_orchestrator() @@ -79,9 +93,14 @@ def test_execute_updates_run_status_to_running( @patch.object( AttackOrchestrator, "_create_server_attack_record", return_value=_VALID_ATK_ID ) + @patch.object( + AttackOrchestrator, + "_validate_required_models_availability", + return_value=None, + ) @patch.object(AttackOrchestrator, "_execute_local_attack", return_value=["result"]) def test_execute_updates_run_status_to_completed( - self, mock_exec, mock_create_atk, mock_create_run + self, mock_exec, mock_validate_models, mock_create_atk, mock_create_run ): """Test that execute updates run to COMPLETED on success.""" orch, hack_agent, _ = _make_orchestrator() @@ -109,11 +128,16 @@ def test_execute_updates_run_status_to_completed( @patch.object( AttackOrchestrator, "_create_server_attack_record", return_value=_VALID_ATK_ID ) + @patch.object( + AttackOrchestrator, + "_validate_required_models_availability", + return_value=None, + ) @patch.object( AttackOrchestrator, "_execute_local_attack", side_effect=RuntimeError("Boom") ) def test_execute_updates_run_status_to_failed_on_error( - self, mock_exec, mock_create_atk, mock_create_run + self, mock_exec, mock_validate_models, mock_create_atk, mock_create_run ): """Test that execute updates run to FAILED on exception.""" orch, hack_agent, _ = _make_orchestrator() @@ -142,9 +166,14 @@ def test_execute_updates_run_status_to_failed_on_error( @patch.object( AttackOrchestrator, "_create_server_attack_record", return_value=_VALID_ATK_ID ) + @patch.object( + AttackOrchestrator, + "_validate_required_models_availability", + return_value=None, + ) @patch.object(AttackOrchestrator, "_execute_local_attack", return_value=["result"]) def test_execute_continues_when_status_update_fails( - self, mock_exec, mock_create_atk, mock_create_run + self, mock_exec, mock_validate_models, mock_create_atk, mock_create_run ): """Test that execute continues even if status update fails.""" orch, hack_agent, _ = _make_orchestrator() @@ -295,9 +324,10 @@ def test_normalize_attack_type_for_preflight_accepts_autodan_aliases(self): ) def test_collect_targets_uses_normalized_attack_type_for_autodan_roles(self): - """AutoDAN preflight must include attacker/scorer/summarizer/embedder roles.""" + """AutoDAN attack-owned roles include optional embedder and required core roles.""" orch, _, _ = _make_orchestrator() orch.attack_type = "AutoDANTurbo" + orch.attack_impl_class = AutoDANTurboAttack orch.hackagent_agent.router = None attack_config = { @@ -329,11 +359,166 @@ def test_collect_targets_uses_normalized_attack_type_for_autodan_roles(self): goal_labels_by_index={0: {"category": "c", "subcategory": "s"}}, ) - roles = {item["role"] for item in targets} - self.assertIn("attacker", roles) - self.assertIn("scorer", roles) - self.assertIn("summarizer", roles) - self.assertIn("embedder", roles) + required_by_role = {} + for item in targets: + for role in item.get("roles", [item.get("role")]): + required_by_role[role] = item.get("required", True) + + self.assertIn("attacker", required_by_role) + self.assertIn("scorer", required_by_role) + self.assertIn("summarizer", required_by_role) + self.assertIn("embedder", required_by_role) + self.assertTrue(required_by_role["attacker"]) + self.assertTrue(required_by_role["scorer"]) + self.assertTrue(required_by_role["summarizer"]) + self.assertFalse(required_by_role["embedder"]) + + def test_collect_targets_deduplicates_tap_judge_and_on_topic_when_shared(self): + """TAP judge and fallback on_topic_judge should collapse into one probe target.""" + orch, _, _ = _make_orchestrator() + orch.attack_type = "tap" + orch.attack_impl_class = TAPAttack + orch.hackagent_agent.router = None + + attack_config = { + "attack_type": "tap", + "attacker": { + "identifier": "att-model", + "endpoint": "http://localhost:1111", + "agent_type": "OPENAI_SDK", + }, + "judge": { + "identifier": "judge-model", + "endpoint": "http://localhost:2222", + "agent_type": "OPENAI_SDK", + }, + "on_topic_judge": None, + } + + targets = orch._collect_model_preflight_targets(attack_config) + judge_targets = [ + t for t in targets if str(t.get("identifier")) == "judge-model" + ] + + self.assertEqual(len(judge_targets), 1) + self.assertIn("judge", judge_targets[0].get("roles", [])) + self.assertIn("on_topic_judge", judge_targets[0].get("roles", [])) + + def test_collect_targets_keeps_classifier_when_intents_are_not_used(self): + """Category classifier remains preflighted unless explicit goal labels are provided.""" + orch, _, _ = _make_orchestrator() + orch.attack_type = "baseline" + orch.attack_impl_class = BaselineAttack + orch.hackagent_agent.router = None + + attack_config = { + "attack_type": "baseline", + "evaluator_type": "pattern", + "category_classifier": { + "identifier": "cc-model", + "endpoint": "http://localhost:9999", + "agent_type": "OPENAI_SDK", + }, + } + + targets = orch._collect_model_preflight_targets( + attack_config, + goal_labels_by_index=None, + ) + classifier_targets = [ + t for t in targets if "category_classifier" in t.get("roles", []) + ] + self.assertEqual(len(classifier_targets), 1) + + def test_collect_targets_uses_default_classifier_when_not_specified(self): + """When classifier is omitted and intents are not used, default classifier is preflighted.""" + orch, _, _ = _make_orchestrator() + orch.attack_type = "baseline" + orch.attack_impl_class = BaselineAttack + orch.hackagent_agent.router = None + + attack_config = { + "attack_type": "baseline", + "evaluator_type": "pattern", + } + + targets = orch._collect_model_preflight_targets( + attack_config, + goal_labels_by_index=None, + ) + classifier_targets = [ + t for t in targets if "category_classifier" in t.get("roles", []) + ] + + self.assertEqual(len(classifier_targets), 1) + self.assertEqual( + classifier_targets[0].get("identifier"), + DEFAULT_CATEGORY_CLASSIFIER_IDENTIFIER, + ) + self.assertEqual( + classifier_targets[0].get("endpoint"), + DEFAULT_CATEGORY_CLASSIFIER_ENDPOINT, + ) + self.assertEqual( + classifier_targets[0].get("agent_type"), + DEFAULT_CATEGORY_CLASSIFIER_AGENT_TYPE, + ) + + def test_collect_targets_h4rm3l_requires_decorator_llm_for_llm_program(self): + """h4rm3l must preflight decorator_llm when program uses LLM-assisted decorators.""" + orch, _, _ = _make_orchestrator() + orch.attack_type = "h4rm3l" + orch.attack_impl_class = H4rm3lAttack + orch.hackagent_agent.router = None + + attack_config = { + "attack_type": "h4rm3l", + "h4rm3l_params": {"program": "translate_zulu", "syntax_version": 2}, + "decorator_llm": { + "identifier": "decorator-model", + "endpoint": "http://localhost:8888", + "agent_type": "OPENAI_SDK", + }, + "judges": [ + { + "identifier": "judge-model", + "endpoint": "http://localhost:2222", + "agent_type": "OPENAI_SDK", + } + ], + } + + targets = orch._collect_model_preflight_targets(attack_config) + roles = {role for item in targets for role in item.get("roles", [])} + self.assertIn("decorator_llm", roles) + + def test_collect_targets_h4rm3l_skips_decorator_llm_for_non_llm_program(self): + """h4rm3l should not preflight decorator_llm for non-LLM decorator chains.""" + orch, _, _ = _make_orchestrator() + orch.attack_type = "h4rm3l" + orch.attack_impl_class = H4rm3lAttack + orch.hackagent_agent.router = None + + attack_config = { + "attack_type": "h4rm3l", + "h4rm3l_params": {"program": "identity", "syntax_version": 2}, + "decorator_llm": { + "identifier": "decorator-model", + "endpoint": "http://localhost:8888", + "agent_type": "OPENAI_SDK", + }, + "judges": [ + { + "identifier": "judge-model", + "endpoint": "http://localhost:2222", + "agent_type": "OPENAI_SDK", + } + ], + } + + targets = orch._collect_model_preflight_targets(attack_config) + roles = {role for item in targets for role in item.get("roles", [])} + self.assertNotIn("decorator_llm", roles) def test_validate_required_models_availability_reports_model_and_endpoint(self): """Error should include role, identifier, and endpoint for unavailable models.""" @@ -418,6 +603,80 @@ def test_validate_required_models_availability_lists_multiple_models(self): self.assertIn("identifier=google/gemma-3-27b-it", message) self.assertIn("identifier=mistralai/mistral-small-3.1", message) + def test_validate_required_models_availability_skips_optional_roles_by_default( + self, + ): + """Optional roles should not be probed unless explicitly enabled.""" + orch, _, _ = _make_orchestrator() + + with patch.object( + AttackOrchestrator, + "_collect_model_preflight_targets", + return_value=[ + { + "role": "embedder", + "roles": ["embedder"], + "identifier": "optional-embedder", + "endpoint": "http://localhost:9999", + "agent_type": "OPENAI_SDK", + "kind": "router_config", + "required": False, + "config": { + "identifier": "optional-embedder", + "endpoint": "http://localhost:9999", + "agent_type": "OPENAI_SDK", + }, + } + ], + ): + with patch.object(AttackOrchestrator, "_probe_model_target") as mock_probe: + message = orch._validate_required_models_availability( + attack_config={"goals": ["test"]} + ) + + self.assertIsNone(message) + mock_probe.assert_not_called() + + def test_validate_required_models_availability_probes_optional_when_enabled(self): + """Optional roles are probed when _preflight_probe_optional_roles is set.""" + orch, _, _ = _make_orchestrator() + + with patch.object( + AttackOrchestrator, + "_collect_model_preflight_targets", + return_value=[ + { + "role": "embedder", + "roles": ["embedder"], + "identifier": "optional-embedder", + "endpoint": "http://localhost:9999", + "agent_type": "OPENAI_SDK", + "kind": "router_config", + "required": False, + "config": { + "identifier": "optional-embedder", + "endpoint": "http://localhost:9999", + "agent_type": "OPENAI_SDK", + }, + } + ], + ): + with patch.object( + AttackOrchestrator, + "_probe_model_target", + return_value="optional role unavailable", + ) as mock_probe: + message = orch._validate_required_models_availability( + attack_config={ + "goals": ["test"], + "_preflight_probe_optional_roles": True, + } + ) + + self.assertIsInstance(message, str) + self.assertIn("optional role unavailable", message) + mock_probe.assert_called_once() + def test_execute_aborts_before_db_records_when_model_unavailable(self): """Run must not start when preflight detects an unavailable model.""" orch, _, _ = _make_orchestrator() From 467f89c9591ffe6bf89e0117430391b459b11ecf Mon Sep 17 00:00:00 2001 From: marcorusso97 Date: Fri, 5 Jun 2026 15:30:01 +0200 Subject: [PATCH 5/5] =?UTF-8?q?=F0=9F=93=9D=20docs:=20documented=20preflig?= =?UTF-8?q?ht=20parameters?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/docs/attacks/autodan_turbo.md | 7 +++++++ docs/docs/attacks/index.mdx | 14 ++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/docs/docs/attacks/autodan_turbo.md b/docs/docs/attacks/autodan_turbo.md index 2c74333..004ea08 100644 --- a/docs/docs/attacks/autodan_turbo.md +++ b/docs/docs/attacks/autodan_turbo.md @@ -179,6 +179,13 @@ AutoDAN-Turbo uses a top-level `embedder` config for strategy retrieval. This ro | `embedder.endpoint` | Endpoint used by the embedder router | `http://localhost:11434` | | `embedder.agent_type` | Router adapter type for the embedder | `OLLAMA` | +### Preflight Controls (Advanced) + +| Parameter | Scope | Description | Default | +|-----------|-------|-------------|---------| +| `_preflight_require_embedder` | AutoDAN-Turbo | When `true`, `embedder` is treated as required during preflight availability checks. | `false` | +| `_preflight_probe_optional_roles` | Global (all attacks) | When `true`, preflight also probes roles marked optional by attack-specific role resolution. | `false` | + ### Role Models | Role | Required keys | diff --git a/docs/docs/attacks/index.mdx b/docs/docs/attacks/index.mdx index f8e318b..bf236ea 100644 --- a/docs/docs/attacks/index.mdx +++ b/docs/docs/attacks/index.mdx @@ -46,6 +46,20 @@ All attacks support loading goals from AI safety benchmarks like **AgentHarm**, :::tip Shared Category Classifier All attacks accept a top-level `category_classifier` config block to classify each goal at tracking time. You can customize model, endpoint, and adapter type directly in `attack_config`. + +Preflight behavior: + +- If you provide `intents` (with explicit labels), category-classifier preflight is skipped. +- If you use `goals` or `dataset` and do **not** provide `category_classifier`, HackAgent preflights the default classifier config automatically. +::: + +:::note Advanced Preflight Flags +HackAgent also supports internal preflight-control flags in `attack_config`: + +- `_preflight_probe_optional_roles` (all attacks): when `true`, preflight probes optional roles too. By default optional roles are skipped. +- `_preflight_require_embedder` (AutoDAN-Turbo only): when `true`, `embedder` is treated as required in preflight. + +These are advanced/debug controls and are usually not needed in standard runs. ::: ---