Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
685bb3f
✨ feat: add guardrails support to router and attack techniques
RPaolino May 14, 2026
97663ad
refactor: unify guardrail response detection via adapter_type
RPaolino May 15, 2026
2440ba1
fix: preserve guardrail info in PAIR/TAP trace recordings
RPaolino May 15, 2026
628755c
✨ feat: dashboard improvements and guardrail visualization
RPaolino May 15, 2026
108d3b7
fix: correct goal index offset in TAP generation and evaluation
RPaolino May 17, 2026
aac7a29
fix: return raw guardrail dict from TAP _query_target
RPaolino May 18, 2026
1526432
fix: add _goal_index_offset to TAP config_keys
RPaolino May 18, 2026
b8490a8
fix: record per-prefix evaluation traces in AdvPrefix
RPaolino May 18, 2026
8ec09c7
fix: AutoDAN-Turbo trace parsing + dashboard guardrail display
RPaolino May 18, 2026
f4fb634
✨ feat(dashboard): add run filters, goal filters, and UX improvements
RPaolino May 18, 2026
3879ee3
fix: guardrail blocked
RPaolino May 18, 2026
e465b01
fix: consistent tables between dashboard and history tabs
RPaolino May 19, 2026
9809a39
fix: advprefix/baseline detail tables always showing mitigated
May 25, 2026
91cf581
feat: added copy to clipboard button
RPaolino May 25, 2026
d9b849f
fix: removed Report tab
RPaolino May 25, 2026
e6c44a2
fix: History tab using the whole panel width
RPaolino May 25, 2026
47a08af
fix: consistent Run Details and Comparison panels
RPaolino May 25, 2026
21f92b1
feat: improved comparison visualization
RPaolino May 25, 2026
86af06f
feat: download plots in svg format
RPaolino May 25, 2026
bd88d76
feat: added documentation, cli and tui support of guardrails
RPaolino May 27, 2026
10f7605
fix: prevent TAP attacker from seeing guardrail internals on block
RPaolino May 27, 2026
0985b1d
fix: remove guardrails keys from attack config dict
RPaolino May 28, 2026
a8e4b1d
refactor(dashboard): extract attack card renderers into per-attack mi…
RPaolino May 28, 2026
039f747
style: format router.py with ruff
RPaolino May 28, 2026
18c4401
✅ test(dashboard-tests): additional tests for the dashboard
May 30, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions hackagent/attacks/evaluator/evaluation_step.py
Original file line number Diff line number Diff line change
Expand Up @@ -1251,12 +1251,14 @@ def _update_tracker(

explanation = " | ".join(notes_parts) if notes_parts else ""

_prefix = item.get("prefix", "") or ""
self._tracker.add_evaluation_trace(
ctx=goal_ctx,
evaluation_result=eval_result,
score=item.get("best_score", 0.0),
explanation=explanation,
evaluator_name=f"{evaluator_prefix}_{'_'.join(judges_used)}",
metadata={"prefix": _prefix} if _prefix else None,
)

# Keys whose presence signals that results were already evaluated
Expand Down
2 changes: 2 additions & 0 deletions hackagent/attacks/orchestrator.py
Original file line number Diff line number Diff line change
Expand Up @@ -419,6 +419,8 @@ def _get_attack_impl_kwargs(
run_config_for_attack = dict(run_config_override or {})
# Run-level dashboard metadata and guardrail settings must not leak into strict attack configs.
run_config_for_attack.pop("expected_total_goals", None)
run_config_for_attack.pop("before_guardrail", None)
run_config_for_attack.pop("after_guardrail", None)

return {
"config": {
Expand Down
1 change: 1 addition & 0 deletions hackagent/attacks/techniques/advprefix/attack.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,7 @@ def _get_pipeline_steps(self):
"filter_len",
"n_prefixes_per_goal",
"max_ce",
"_tracker", # For per-goal evaluation traces
],
"input_data_arg_name": "input_data",
"required_args": ["logger", "client", "config"],
Expand Down
6 changes: 6 additions & 0 deletions hackagent/attacks/techniques/advprefix/evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,12 @@ def execute(self, input_data: List[Dict]) -> List[Dict]:
judge_keys = self._build_judge_keys_from_data(evaluated_data)
self._sync_to_server(evaluated_data, judge_keys)

# Enrich items with best_score/success, then record per-prefix evaluation
# traces on the goal tracker so the dashboard can attribute jailbreaks to
# specific prefixes.
self._enrich_items_with_scores(evaluated_data)
self._update_tracker(evaluated_data)

# Aggregation
self.logger.info(
f"Aggregation: Aggregating {len(evaluated_data)} evaluation results"
Expand Down
2 changes: 2 additions & 0 deletions hackagent/attacks/techniques/tap/attack.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,7 @@ def _get_pipeline_steps(self) -> List[Dict]:
"max_judge_retries",
"organization_id",
"_tracker",
"_goal_index_offset",
],
"input_data_arg_name": "goals",
"required_args": ["logger", "agent_router", "config", "client"],
Expand All @@ -237,6 +238,7 @@ def _get_pipeline_steps(self) -> List[Dict]:
"max_judge_retries",
"organization_id",
"_tracker",
"_goal_index_offset",
],
"input_data_arg_name": "input_data",
"required_args": ["logger", "config", "client"],
Expand Down
3 changes: 2 additions & 1 deletion hackagent/attacks/techniques/tap/evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,6 +258,7 @@ def execute(
success_threshold = tap_params.get("success_score_threshold", 10)

tracker = config.get("_tracker")
_goal_offset = int(config.get("_goal_index_offset", 0))

for idx, item in enumerate(input_data):
best_prompt = item.get("best_prompt")
Expand All @@ -277,7 +278,7 @@ def execute(
item["is_success"] = best_score >= success_threshold

if tracker:
goal_ctx = tracker.get_goal_context(idx)
goal_ctx = tracker.get_goal_context(_goal_offset + idx)
if goal_ctx and best_score is not None:
tracker.add_evaluation_trace(
ctx=goal_ctx,
Expand Down
13 changes: 9 additions & 4 deletions hackagent/attacks/techniques/tap/generation.py
Original file line number Diff line number Diff line change
Expand Up @@ -895,6 +895,7 @@ def execute(
n_streams = tap_params.get("n_streams", 4)

tracker: Optional[Tracker] = config.get("_tracker")
_goal_offset = int(config.get("_goal_index_offset", 0))

executor = TapExecutor(
config=config,
Expand All @@ -921,9 +922,11 @@ def execute(
_goal_pool.submit(
executor.run_single_goal,
goal=goal,
goal_index=i,
goal_index=_goal_offset + i,
goal_tracker=tracker,
goal_ctx=tracker.get_goal_context(i) if tracker else None,
goal_ctx=tracker.get_goal_context(_goal_offset + i)
if tracker
else None,
progress_bar=progress_bar,
task=task,
): i
Expand All @@ -946,10 +949,12 @@ def execute(
}
else:
for i, goal in enumerate(goals):
goal_ctx = tracker.get_goal_context(i) if tracker else None
goal_ctx = (
tracker.get_goal_context(_goal_offset + i) if tracker else None
)
results_map[i] = executor.run_single_goal(
goal=goal,
goal_index=i,
goal_index=_goal_offset + i,
goal_tracker=tracker,
goal_ctx=goal_ctx,
progress_bar=progress_bar,
Expand Down
43 changes: 43 additions & 0 deletions hackagent/router/router.py
Original file line number Diff line number Diff line change
Expand Up @@ -508,6 +508,49 @@ def route_request(
raw_request=request_data,
registration_key=registration_key,
)
# --- After guardrail: check the model response before returning it ---
if self.after_guardrail is not None:
_response_text = (
response.get("processed_response")
or response.get("generated_text")
or ""
)
_response_text = str(_response_text).strip()
if not _response_text:
# Nothing to classify — skip silently.
logger.debug(
"after_guardrail: empty response text for agent %s, skipping check.",
registration_key,
)
else:
_gr = self.after_guardrail.check(_response_text)
if not _gr.is_safe:
logger.warning(
"after_guardrail blocked response for agent %s: %s",
registration_key,
_gr.explanation,
)
return {
"raw_request": request_data,
"processed_response": None,
"generated_text": None,
"raw_response_status": 200,
"raw_response_headers": None,
"raw_response_body": None,
"agent_specific_data": {
"guardrail": "after_guardrail_censored",
"side": "after",
"message": "Response censored: flagged as unsafe by guardrail.",
"categories": getattr(_gr, "categories", []),
"reasoning": _gr.explanation,
},
"error_message": None,
"error_category": None,
"agent_id": registration_key,
"adapter_type": "guardrail",
}

return response

# --- After guardrail: check the model response before returning it ---
if self.after_guardrail is not None:
Expand Down
Loading
Loading