In [1]:
# Setup
from pyrit.prompt_target import OpenAIChatTarget
from pyrit.score import SelfAskRefusalScorer, SelfAskTrueFalseScorer
from pyrit.setup import IN_MEMORY, initialize_pyrit_async

# Initialize PyRIT with the in-memory database option (IN_MEMORY) for this notebook session.
await initialize_pyrit_async(memory_db_type=IN_MEMORY)  # type: ignore
# Chat target handed to the scorers created in the cells below. No arguments are
# given here, so its configuration presumably comes from the loaded .env files — TODO confirm.
target = OpenAIChatTarget()

Found default environment files: ['C:\\Users\\rlundeen\\.pyrit\\.env', 'C:\\Users\\rlundeen\\.pyrit\\.env.local']
Loaded environment file: C:\Users\rlundeen\.pyrit\.env
Loaded environment file: C:\Users\rlundeen\.pyrit\.env.local


## Running a Scorer Evaluation

The simplest way to evaluate a scorer is to call `evaluate_async()` directly on the scorer instance. This method:
- Automatically detects the scorer type (objective vs harm)
- Uses standard column names from the CSV
- Returns metrics immediately without saving files
- Optionally includes raw trial scores for debugging

In [2]:
# Evaluate a refusal scorer
refusal_scorer = SelfAskRefusalScorer(chat_target=target)

# Run evaluation on mini dataset (10 examples)
metrics = await refusal_scorer.evaluate_async(
    "pyrit/score/scorer_evals/true_false/mini_refusal.csv",
    num_scorer_trials=1,  # Number of times to score each response
)

print(f"Accuracy: {metrics.accuracy:.2%}")
print(f"Precision: {metrics.precision:.3f}")
print(f"Recall: {metrics.recall:.3f}")
print(f"F1 Score: {metrics.f1_score:.3f}")

FileNotFoundError: Path not found: C:\git\PyRIT\doc\code\scoring\pyrit\score\scorer_evals\true_false\mini_refusal.csv

## Accessing All Metrics

The returned `ObjectiveScorerMetrics` object contains all evaluation metrics:

In [None]:
from dataclasses import asdict

# Convert the metrics dataclass to a plain dict, then drop the trial_scores
# entry so the printed summary stays compact.
all_metrics = asdict(metrics)
all_metrics.pop("trial_scores", None)
print(all_metrics)

## Analyzing Trial Scores

When `num_scorer_trials > 1`, you can analyze score variance across trials:

In [None]:
# Run with multiple trials to measure scorer consistency
metrics_multi = await refusal_scorer.evaluate_async(
    "pyrit/score/scorer_evals/true_false/mini_refusal.csv",
    num_scorer_trials=3,
)

# Access raw trial scores
trial_scores = metrics_multi.trial_scores
print(f"Trial scores shape: {trial_scores.shape}")  # (num_trials, num_responses)
print(f"First response scored across trials: {trial_scores[:, 0]}")

## Evaluating Different Scorers

You can evaluate any true/false scorer using the same API:

In [None]:
from pathlib import Path
from pyrit.common.path import DATASETS_PATH

# Evaluate a custom true/false scorer
custom_scorer = SelfAskTrueFalseScorer(
    true_false_question_path=DATASETS_PATH / "score" / "true_false_question" / "task_achieved.yaml",
    chat_target=target
)

# Use a different evaluation dataset
metrics = await custom_scorer.evaluate_async(
    "pyrit/score/scorer_evals/true_false/privacy.csv"
)

print(f"Privacy Detection - Accuracy: {metrics.accuracy:.2%}, F1: {metrics.f1_score:.3f}")

## Auto-Discovery of Objective Datasets

By default, when `evaluate_async()` is called without a dataset path, objective scorers automatically evaluate against all CSV files in their dataset directory and return the aggregated metrics under the `"combined"` key:

In [None]:
# Auto-discovery happens automatically - no need to manually list files
results = await custom_scorer.evaluate_async()

# Get combined metrics across all objective datasets
combined = results["combined"]
print(f"Combined metrics across all objective datasets:")
print(f"  Accuracy: {combined.accuracy:.1%}")
print(f"  Precision: {combined.precision:.3f}")
print(f"  Recall: {combined.recall:.3f}")
print(f"  F1 Score: {combined.f1_score:.3f}")

## Browsing Official Scorer Performance

The PyRIT team maintains a registry of scorer evaluation results on official consolidated datasets. You can browse these to compare scorer performance:

In [None]:
from pyrit.score.scorer_evaluation.scorer_metrics_utility import load_all_objective_metrics

# Load all registered objective scorer evaluations
all_entries = load_all_objective_metrics()

print(f"Total registered evaluations: {len(all_entries)}")

# Browse first few entries
for i, entry in enumerate(all_entries[:3], 1):
    scorer_type = entry.get('__type__', 'Unknown')
    # Use a distinct name so the notebook-level `metrics` result object from
    # the earlier evaluation cells is not overwritten by a plain dict.
    entry_metrics = entry.get('metrics', {})
    accuracy = entry_metrics.get('accuracy', 'N/A')
    # Format as a percentage only when a float is present; otherwise print the
    # value (e.g. the 'N/A' placeholder) unchanged.
    accuracy_text = f"{accuracy:.2%}" if isinstance(accuracy, float) else f"{accuracy}"

    print(f"\nEntry {i}:")
    print(f"  Scorer: {scorer_type}")
    print(f"  Accuracy: {accuracy_text}")
    print(f"  Dataset Version: {entry.get('dataset_version', 'N/A')}")

## Checking if Your Scorer Configuration Has Registry Metrics

You can check if your specific scorer configuration has been evaluated on official datasets:

In [None]:
# Look up official registry results for this exact scorer configuration
registry_metrics = refusal_scorer.get_scorer_metrics_from_registry()

# Handle the "nothing registered" case first, then report the stored metrics
if not registry_metrics:
    print("No official metrics found for this scorer configuration.")
    print("Run evaluate_async() to generate metrics.")
else:
    print("This scorer configuration has official metrics:")
    print(f"  Accuracy: {registry_metrics.accuracy:.2%}")
    print(f"  F1 Score: {registry_metrics.f1_score:.3f}")

## Adding to the Official Registry (PyRIT Team Only)

When evaluating scorers on official consolidated datasets, PyRIT maintainers can add results to the registry:

In [None]:
# Only set add_to_registry=True when using official consolidated datasets.
# NOTE: the dataset path below is relative to the current working directory;
# run from the repository root or pass an absolute path.
# metrics = await refusal_scorer.evaluate_async(
#     "pyrit/score/scorer_evals/true_false/CONSOLIDATED_true_false_objective_dataset.csv",
#     add_to_registry=True,  # Appends to official registry JSONL
# )

## Understanding Scorer Identifiers

Each scorer has a unique identifier based on its configuration. This enables comparing different scorer setups:

In [None]:
# View scorer identifier
identifier = refusal_scorer.get_identifier()
print(f"Scorer Type: {identifier.get('__type__')}")
print(f"Configuration Hash: {identifier.get('hash')}")
print("\nFull identifier (compact):")
# Only the first five key/value pairs are shown to keep the output short
for key, value in list(identifier.items())[:5]:
    print(f"  {key}: {value}")

## Best Practices

1. **Start with small datasets**: Use mini_*.csv files (10-30 examples) for quick iteration
2. **Multiple trials for production**: Use `num_scorer_trials=3` or higher to measure scorer variance
3. **Check trial_scores**: Analyze `metrics.trial_scores` to identify inconsistent examples
4. **Compare configurations**: Evaluate different scorer setups (prompts, models, temperatures) to find optimal settings
5. **Registry lookup first**: Check `get_scorer_metrics_from_registry()` before running expensive evaluations

## Next Steps

- Create custom evaluation datasets with your own human labels
- Compare multiple scorer configurations systematically
- Use evaluation results to tune scorer prompts and parameters
- Integrate scorer evaluation into your CI/CD pipeline for regression testing