In [1]:
# Setup: import the chat target, the refusal scorer, and the PyRIT initialization helpers
from pyrit.prompt_target import OpenAIChatTarget
from pyrit.score import SelfAskRefusalScorer
from pyrit.setup import IN_MEMORY, initialize_pyrit_async

# Initialize PyRIT with the IN_MEMORY memory database type
await initialize_pyrit_async(memory_db_type=IN_MEMORY)
# Chat target reused by the cells below. It is constructed with no arguments, so its
# configuration presumably comes from the environment files loaded at startup (verify).
target = OpenAIChatTarget()

Found default environment files: ['C:\\Users\\rlundeen\\.pyrit\\.env', 'C:\\Users\\rlundeen\\.pyrit\\.env.local']
Loaded environment file: C:\Users\rlundeen\.pyrit\.env
Loaded environment file: C:\Users\rlundeen\.pyrit\.env.local


## Running a Scorer Evaluation

The simplest way to evaluate a scorer is to call `evaluate_async()` on the scorer instance.
This uses the scorer's configured `evaluation_file_mapping` to find the appropriate datasets.

In [None]:
# Create a refusal scorer backed by the chat target from the setup cell
refusal_scorer = SelfAskRefusalScorer(chat_target=target)

# Run evaluation with a single scoring trial - no file mapping is passed, so this
# uses the scorer's default evaluation_file_mapping
results = await refusal_scorer.evaluate_async(num_scorer_trials=1)

In [None]:
# `results` maps each dataset name to its metrics object; emit one five-line report per dataset
for name, metrics in results.items():
    print(
        f"Dataset: {name}",
        f"  Accuracy: {metrics.accuracy:.2%}",
        f"  Precision: {metrics.precision:.3f}",
        f"  Recall: {metrics.recall:.3f}",
        f"  F1 Score: {metrics.f1_score:.3f}",
        sep="\n",
    )

## Understanding Evaluation File Mapping

Each scorer can define an `evaluation_file_mapping` that specifies which CSV datasets to use.
This is a list of `ScorerEvalDatasetFiles` objects, each pairing a set of human-labeled dataset files (`human_labeled_datasets_files`, given as paths or glob patterns) with a `result_file`.

In [None]:
# Show which dataset files and result file each mapping entry points at
print("Refusal scorer's evaluation_file_mapping:")
for mapping in refusal_scorer.evaluation_file_mapping:
    print(
        f"  Datasets: {mapping.human_labeled_datasets_files}",
        f"  Result file: {mapping.result_file}",
        sep="\n",
    )

## Running with Multiple Trials

To measure scorer consistency, run multiple trials by setting `num_scorer_trials` to a value
greater than 1. The `trial_scores` array contains the raw scores from each trial.

In [None]:
# Run with multiple trials to measure scorer variance
results = await refusal_scorer.evaluate_async(num_scorer_trials=3)

for name, metrics in results.items():
    print(f"Dataset: {name}")
    print(f"  Accuracy: {metrics.accuracy:.2%}")
    
    # Access raw trial scores
    if metrics.trial_scores is not None:
        print(f"  Trial scores shape: {metrics.trial_scores.shape}")
        print(f"  First response across trials: {metrics.trial_scores[:, 0]}")

## Using Custom File Mappings

You can override the scorer's default `evaluation_file_mapping` by passing a custom `file_mapping` to `evaluate_async()`:

In [None]:
from pyrit.score.scorer_evaluation import ScorerEvalDatasetFiles

# Define custom file mapping
custom_mapping = [
    ScorerEvalDatasetFiles(
        human_labeled_datasets_files=["refusal_scorer/mini_refusal.csv"],
        result_file="mini_refusal_results.jsonl"
    )
]

# Run with custom mapping
results = await refusal_scorer.evaluate_async(
    file_mapping=custom_mapping,
    num_scorer_trials=1
)

for name, metrics in results.items():
    print(f"{name}: Accuracy = {metrics.accuracy:.2%}")

## Checking Existing Metrics from Registry

The PyRIT team maintains a registry of scorer evaluation results on official datasets.
You can check if your scorer configuration has been evaluated:

In [None]:
# Look up any metrics already recorded for this scorer
existing_metrics = refusal_scorer.get_scorer_metrics()

# Handle the "nothing recorded" case first, then list what was found
if not existing_metrics:
    print("No existing metrics found in registry.")
else:
    print("Found existing metrics for this scorer configuration:")
    for name, metrics in existing_metrics.items():
        print(f"  {name}: Accuracy = {metrics.accuracy:.2%}")

## Browsing All Registered Evaluations

You can browse all registered scorer evaluations to compare performance:

In [None]:
from pyrit.score.scorer_evaluation.scorer_metrics_io import load_all_objective_metrics

# Fetch every objective scorer evaluation returned by the registry loader
all_entries = load_all_objective_metrics()

print(f"Total registered evaluations: {len(all_entries)}")

# Summarize only the first five entries to keep the output short
for entry in all_entries[:5]:
    scorer_type = entry.get("__type__", "Unknown")
    metrics = entry.get("metrics", {})
    accuracy = metrics.get("accuracy", "N/A")

    # Floats are rendered as percentages; anything else (e.g. the "N/A" fallback) is shown as-is
    shown = f"{accuracy:.2%}" if isinstance(accuracy, float) else f"{accuracy}"
    print(f"  {scorer_type}: {shown}")

## Using ScorerEvaluator Directly

For more control, you can use `ScorerEvaluator` directly:

In [None]:
from pyrit.score import ScorerEvaluator

# Create an evaluator bound to the refusal scorer
evaluator = ScorerEvaluator.from_scorer(refusal_scorer)

# Run evaluation against `custom_mapping` (defined in the "Using Custom File Mappings"
# cell) with a single scoring trial
results = await evaluator.run_evaluation_async(
    dataset_files=custom_mapping,
    num_scorer_trials=1,
    add_to_registry=False  # Set True only for official evaluations
)

# Print the accuracy for each entry in the results
for name, metrics in results.items():
    print(f"{name}: Accuracy = {metrics.accuracy:.2%}")

## Understanding the Metrics

For objective (true/false) scorers, the metrics are:

- **Accuracy**: Proportion of all predictions that were correct
- **Precision**: Proportion of positive predictions that were actually positive
- **Recall**: Proportion of actual positives that were predicted positive
- **F1 Score**: Harmonic mean of precision and recall
- **Accuracy Standard Error**: Statistical uncertainty in the accuracy estimate

In [None]:
from dataclasses import asdict

# Dump every field of the first result's metrics object
if results:
    first_metrics = next(iter(results.values()))

    # Drop trial_scores so the printout stays compact
    metrics_dict = asdict(first_metrics)
    metrics_dict.pop("trial_scores", None)

    print("All metrics:")
    for key, value in metrics_dict.items():
        # Floats get four decimal places; every other value uses its default formatting
        rendered = f"{value:.4f}" if isinstance(value, float) else f"{value}"
        print(f"  {key}: {rendered}")

## Best Practices

1. **Start small**: Use `mini_*.csv` datasets for quick iteration
2. **Multiple trials**: Use `num_scorer_trials=3` or higher for production evaluations
3. **Check registry first**: Use `get_scorer_metrics()` before running expensive evaluations
4. **Compare configurations**: Evaluate different models/prompts to find optimal settings
5. **Analyze trial_scores**: Identify inconsistent examples where the scorer disagrees with itself