Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions docs/docs/datasets/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ Instead of manually specifying `goals`, use the `dataset` parameter to load goal
- 🎯 **Presets** — 30+ ready-to-use AI safety benchmarks (AgentHarm, JailbreakBench, BeaverTails, etc.)
- 🤗 **HuggingFace Hub** — Any public or private dataset from HuggingFace
- 📁 **Local files** — JSON, JSONL, CSV, or TXT files from your filesystem
- 🧭 **Intent taxonomy selection** — Pick OmniSafeBench categories/subcategories with `intents`

```mermaid
graph LR
Expand Down Expand Up @@ -129,6 +130,32 @@ attack_config = {
results = agent.hack(attack_config=attack_config)
```

### 4. Selecting Intent Categories (OmniSafeBench)

When you want category-balanced goals without manually writing prompts, use
`intents` to select categories and subcategories directly from the
OmniSafeBench taxonomy.

```python
attack_config = {
"attack_type": "h4rm3l",
"intents": [
{
"category": "A",
"subcategories": ["A1", "A2"],
"samples_per_subcategory": 2,
}
],
}
```

HackAgent maps these codes to the canonical labels shown in results and on the dashboard,
for example `A. Ethical and Social Risks` / `A1. Bias and Discrimination`.

Taxonomy source: [OmniSafeBench-MM](https://github.com/jiaxiaojunQAQ/OmniSafeBench-MM/).

[See full guide: Selecting intent categories →](./selecting-intent-categories.md)

---

## Common Dataset Options
Expand Down Expand Up @@ -172,6 +199,7 @@ When both `shuffle` and `offset` are used, shuffling happens **first**, then off
## Next Steps

- 📖 [**Datasets Tutorial**](../getting-started/datasets-tutorial.mdx) — Complete walkthrough with examples
- 🧭 [**Selecting intent categories**](./selecting-intent-categories.md) — Use taxonomy categories/subcategories with strings, enums, or label codes
- 🎯 [**Presets**](./presets.md) — All 30+ pre-configured benchmarks
- 🤗 [**HuggingFace Provider**](./huggingface.md) — Load any HuggingFace dataset
- 📁 [**File Provider**](./file.md) — Load from local JSON, CSV, or TXT files
Expand Down
201 changes: 201 additions & 0 deletions docs/docs/datasets/selecting-intent-categories.md

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions docs/docs/getting-started/datasets-tutorial.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@ sidebar_position: 2
This quick-start tutorial covers only the basics you need to start using datasets in HackAgent.
Presets are pre-configured benchmark datasets. They are the fastest way to run standardized evaluations.

If you want to select goals by risk taxonomy (OmniSafeBench) instead of loading a full dataset,
you can use `intents` with categories and subcategories. See
[Selecting intent categories](../datasets/selecting-intent-categories.md) for details.

### Basic CLI Example

```bash
Expand Down
1 change: 1 addition & 0 deletions docs/sidebars.ts
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ const sidebars: SidebarsConfig = {
id: 'datasets/index',
},
items: [
'datasets/selecting-intent-categories',
'datasets/presets',
'datasets/huggingface',
'datasets/file',
Expand Down
59 changes: 55 additions & 4 deletions hackagent/attacks/orchestrator.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,19 +195,33 @@ def _prepare_attack_params(self, attack_config: Dict[str, Any]) -> Dict[str, Any
# Check for direct goals first
goals = attack_config.get("goals")
dataset_config = attack_config.get("dataset")
intents_config = attack_config.get("intents")
goal_labels_by_index: Optional[Dict[int, Dict[str, str]]] = None

if goals is not None and dataset_config is not None:
logger.warning(
"Both 'goals' and 'dataset' provided. Using 'goals' directly."
)
dataset_config = None
if goals is not None and intents_config is not None:
logger.warning(
"Both 'goals' and 'intents' provided. Using 'goals' directly."
)
intents_config = None

if intents_config is not None and dataset_config is not None:
logger.warning("Both 'intents' and 'dataset' provided. Using 'intents'.")
dataset_config = None

if dataset_config is not None:
if intents_config is not None:
goals, goal_labels_by_index = self._load_goals_from_intents(intents_config)
elif dataset_config is not None:
# Load goals from dataset source
goals = self._load_goals_from_dataset(dataset_config)
elif goals is None:
raise ValueError(
f"'{self.attack_type}' requires either 'goals' (list) or 'dataset' (config)"
f"'{self.attack_type}' requires either 'goals' (list), "
"'dataset' (config), or 'intents' (config)"
)

if not isinstance(goals, list):
Expand All @@ -217,7 +231,10 @@ def _prepare_attack_params(self, attack_config: Dict[str, Any]) -> Dict[str, Any
raise ValueError(f"'goals' list is empty for {self.attack_type}")

logger.info(f"Prepared {len(goals)} goals for {self.attack_type} attack")
return {"goals": goals}
params: Dict[str, Any] = {"goals": goals}
if goal_labels_by_index:
params["_goal_labels_by_index"] = goal_labels_by_index
return params

@staticmethod
def _uses_default_category_classifier(attack_config: Dict[str, Any]) -> bool:
Expand Down Expand Up @@ -354,6 +371,26 @@ def _load_goals_from_dataset(self, dataset_config: Dict[str, Any]) -> list:
logger.error(f"Failed to load goals from dataset: {e}", exc_info=True)
raise ValueError(f"Failed to load goals from dataset: {e}") from e

def _load_goals_from_intents(
    self, intents_config: Any
) -> Tuple[List[str], Dict[int, Dict[str, str]]]:
    """Resolve an ``intents`` config into goals and their per-index labels.

    Delegates to ``load_goals_from_intents_config`` and hands back its
    ``(goals, goal_labels_by_index)`` pair unchanged.

    Args:
        intents_config: The value of the ``intents`` key from the attack
            config; it is passed to the loader as-is.

    Returns:
        A tuple of the goal list and the mapping from goal index to labels.

    Raises:
        ValueError: If loading (or reporting the loaded counts) fails for
            any reason; the original exception is chained as the cause.
    """
    # Resolved at call time rather than at module import.
    from hackagent.datasets.intents import load_goals_from_intents_config

    logger.info("Loading goals from intents taxonomy config")

    try:
        loaded_goals, labels_by_index = load_goals_from_intents_config(
            intents_config
        )
        logger.info(
            "Loaded %s goals from intents across %s labeled entries",
            len(loaded_goals),
            len(labels_by_index),
        )
        return loaded_goals, labels_by_index
    except Exception as e:
        # Same text is used for the log record and the re-raised error.
        failure_message = f"Failed to load goals from intents: {e}"
        logger.error(failure_message, exc_info=True)
        raise ValueError(failure_message) from e

def _get_attack_impl_kwargs(
self,
attack_config: Dict[str, Any],
Expand Down Expand Up @@ -664,9 +701,16 @@ def execute(
"""
# 1. Validate parameters
attack_params = self._prepare_attack_params(attack_config)
goal_labels_by_index = attack_params.pop("_goal_labels_by_index", None)

# Fail-fast preflight before creating Attack/Run DB records.
self._validate_default_category_classifier_requirements(attack_config)
# Skip this when intents already provide explicit category labels.
if goal_labels_by_index:
logger.info(
"Using explicit intents taxonomy labels: category classifier preflight skipped"
)
else:
self._validate_default_category_classifier_requirements(attack_config)

# Enrich run config with expected goal cardinality so downstream views
# can keep RUNNING until all expected goals are fully tracked.
Expand Down Expand Up @@ -710,6 +754,13 @@ def execute(
except Exception as e:
logger.warning(f"Failed to update run status to RUNNING: {e}")

if goal_labels_by_index:
attack_config = {
**attack_config,
"_goal_labels_by_index": goal_labels_by_index,
"_disable_goal_category_classifier": True,
}

# Make the event bus available to the technique impl and to the
# tracker via the shared config bag (alongside _run_id / _backend).
if _tui_event_bus is not None:
Expand Down
4 changes: 4 additions & 0 deletions hackagent/attacks/techniques/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,10 @@ def _initialize_coordinator(
logger=self.logger,
attack_type=attack_type,
category_classifier_config=self.config.get("category_classifier"),
preclassified_goal_labels_by_index=self.config.get("_goal_labels_by_index"),
disable_goal_category_classifier=bool(
self.config.get("_disable_goal_category_classifier")
),
goals=goals,
initial_metadata=initial_metadata,
goal_index_start=goal_index_start,
Expand Down
1 change: 1 addition & 0 deletions hackagent/attacks/techniques/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,7 @@ class GoalsDatasetConfig(BaseModel):

goals: List[str] = Field(default_factory=list)
dataset: Optional[Union[str, Dict[str, Any]]] = None
intents: Optional[Union[List[Dict[str, Any]], Dict[str, Any]]] = None


class RunConfig(BaseModel):
Expand Down
8 changes: 8 additions & 0 deletions hackagent/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,11 @@
"""

from hackagent.datasets.base import DatasetProvider
from hackagent.datasets.intents import (
IntentCategory,
IntentSubcategory,
load_goals_from_intents_config,
)
from hackagent.datasets.presets import PRESETS, get_preset, list_presets
from hackagent.datasets.registry import (
get_provider,
Expand All @@ -42,9 +47,12 @@

__all__ = [
"DatasetProvider",
"IntentCategory",
"IntentSubcategory",
"PRESETS",
"get_preset",
"get_provider",
"load_goals_from_intents_config",
"list_presets",
"load_goals",
"load_goals_from_config",
Expand Down
Loading
Loading