From fa2eb6fac4bf69399a8080c305b0940fb8110c93 Mon Sep 17 00:00:00 2001 From: Goncalo Paulo Date: Fri, 7 Nov 2025 10:48:33 +0000 Subject: [PATCH 1/2] Update fuzz docstring --- delphi/scorers/classifier/fuzz.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/delphi/scorers/classifier/fuzz.py b/delphi/scorers/classifier/fuzz.py index db76afb6..4d43d071 100644 --- a/delphi/scorers/classifier/fuzz.py +++ b/delphi/scorers/classifier/fuzz.py @@ -38,6 +38,10 @@ def __init__( it harder for models to generate anwers in the correct format. log_prob: Whether to use log probabilities to allow for AUC calculation. generation_kwargs: Additional generation kwargs. + temperature: Which temperature to use for the scorer model. + fuzz_type: Which type of fuzzing to use. Default uses non-activating + examples and highlights n_incorrect tokens. Active uses activating + examples and highlights non-activating tokens. """ super().__init__( client=client, From 25ed88adeb86d0d6f2e1746ce15cab021b74eaa0 Mon Sep 17 00:00:00 2001 From: Goncalo Paulo Date: Fri, 7 Nov 2025 10:49:36 +0000 Subject: [PATCH 2/2] Add fuzz type to config --- delphi/__main__.py | 1 + delphi/config.py | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/delphi/__main__.py b/delphi/__main__.py index a1cf6ca2..d69d7b10 100644 --- a/delphi/__main__.py +++ b/delphi/__main__.py @@ -256,6 +256,7 @@ def scorer_postprocess(result, score_dir): n_examples_shown=run_cfg.num_examples_per_scorer_prompt, verbose=run_cfg.verbose, log_prob=run_cfg.log_probs, + fuzz_type=run_cfg.fuzz_type, ) elif scorer_name == "detection": scorer = DetectionScorer( diff --git a/delphi/config.py b/delphi/config.py index 0cf6452e..09a9fbb0 100644 --- a/delphi/config.py +++ b/delphi/config.py @@ -160,6 +160,10 @@ class RunConfig(Serializable): ) """Scorer methods to score latent explanations. Options are 'fuzz', 'detection', and 'simulation'.""" + fuzz_type: Literal["default", "active"] = "default" + """Type of fuzzing to use for the fuzz scorer. 
'default' uses non-activating + examples and highlights n_incorrect tokens. 'active' uses activating examples + and highlights non-activating tokens.""" name: str = "" """The name of the run. Results are saved in a directory with this name."""