# Benchmark Orchestrator - optional

In [None]:
# Import necessary packages
import json

from pyrit.common import IN_MEMORY, initialize_pyrit
from pyrit.datasets import fetch_wmdp_dataset
from pyrit.memory import CentralMemory
from pyrit.models import (
    PromptRequestResponse,
    QuestionAnsweringDataset,
    QuestionAnsweringEntry,
    QuestionChoice,
)
from pyrit.orchestrator import QuestionAnsweringBenchmarkOrchestrator
from pyrit.prompt_target import OpenAIChatTarget
from pyrit.score.question_answer_scorer import QuestionAnswerScorer

In [None]:
# Initialize PyRIT (load environment files and set central memory instance).
# IN_MEMORY is passed as the memory database type — presumably a non-persistent
# store that lasts only for this session; confirm against the PyRIT docs.
initialize_pyrit(memory_db_type=IN_MEMORY)

In [None]:
# Set up the Azure OpenAI prompt target.
# No arguments are passed here; endpoint and credentials presumably come from
# the environment — TODO confirm which variables OpenAIChatTarget reads.
target = OpenAIChatTarget()

In [None]:
# Define helper function for scoring
def score_responses(responses: list[PromptRequestResponse]) -> None:
    """Print per-response scoring details followed by a correct/total tally.

    For each response, the first score attached to the first request piece is
    used. Its ``score_metadata`` is parsed as JSON and the ``correct_answer``
    and ``scored_answer`` entries are printed alongside the ``score_value``.
    A response counts as correct when its ``score_value`` equals the string
    ``"True"``.

    A response with no request pieces, or whose first piece has no scores, is
    reported as unscored instead of raising ``IndexError``. It still counts
    toward the total, but not toward the number of correct answers.

    Args:
        responses: Responses to summarize; each is expected to expose
            ``request_pieces[0].scores``.

    Raises:
        KeyError: If a score's metadata lacks ``correct_answer`` or
            ``scored_answer``.
    """
    correct_count = 0
    for response in responses:
        pieces = response.request_pieces
        scores = pieces[0].scores if pieces else None
        if not scores:
            # Without a score there is no metadata to report; keep going so a
            # single unscored response does not abort the whole summary.
            print("Was answer correct: unknown (response has no score)")
            continue

        # Only the first score of the first piece is considered.
        score = scores[0]
        score_metadata = json.loads(score.score_metadata)
        correct_answer = score_metadata["correct_answer"]
        received_answer = score_metadata["scored_answer"]
        print(f"Was answer correct: {score.score_value}")
        print(f"Correct Answer: {correct_answer}")
        print(f"Answer Received: {received_answer}")
        # score_value is compared as text, so only the exact string "True" counts.
        correct_count += int(score.score_value == "True")
    print(f"Correct / Total: {correct_count} / {len(responses)}")

In [None]:
# Build a small demo dataset for the Q/A model.
# In each entry, correct_answer appears to hold the index of the right choice
# as a string (e.g. "0" for Paris at index 0).
def _numbered_choices(*texts):
    """Return one QuestionChoice per text, indexed by its position."""
    return [QuestionChoice(index=position, text=text) for position, text in enumerate(texts)]


_capital_cities = ("Paris", "London", "Berlin", "Madrid")

_demo_questions = [
    QuestionAnsweringEntry(
        question="What is the capital of France?",
        answer_type="str",
        correct_answer="0",
        # A fresh list per entry, so entries never share a choices object.
        choices=_numbered_choices(*_capital_cities),
    ),
    QuestionAnsweringEntry(
        question="What is the capital of Germany?",
        answer_type="str",
        correct_answer="2",
        choices=_numbered_choices(*_capital_cities),
    ),
    QuestionAnsweringEntry(
        question="Which of the following is a prime number?",
        answer_type="str",
        correct_answer="3",
        choices=_numbered_choices("507963", "507962", "507960", "507961"),
    ),
]

qa_ds = QuestionAnsweringDataset(
    name="demo dataset",
    version="1.0",
    description="A dataset for testing the question answering model",
    author="PyRIT Team",
    group="",
    source="",
    questions=_demo_questions,
)

# Create the scorer
qa_scorer = QuestionAnswerScorer()

# Create the orchestrator with the target under test and the Q/A scorer.
# The dataset is not bound here; it is supplied per call to send_prompts_async.
benchmark_orchestrator = QuestionAnsweringBenchmarkOrchestrator(
    objective_target=target, scorers=[qa_scorer], verbose=True
)

# Sends prompts associated with dataset.
# Top-level `await` requires a notebook (or another environment with a running
# event loop); the `type: ignore` presumably silences type checkers that reject
# `await` outside a function.
responses = await benchmark_orchestrator.send_prompts_async(dataset=qa_ds)  # type: ignore

In [None]:
# Print whether each demo answer was correct, plus the correct/total tally
score_responses(responses=responses)

In [None]:
# Fetch WMDP dataset for Q/A Model Testing (no category argument is passed)

wmdp_ds = fetch_wmdp_dataset()
# Keep only the first three questions (presumably to keep the demo run short)
wmdp_ds.questions = wmdp_ds.questions[:3]

# Evaluate the Q/A Model response, reusing the existing benchmark orchestrator
responses = await benchmark_orchestrator.send_prompts_async(dataset=wmdp_ds)  # type: ignore

In [None]:
# Print whether each answer was correct, plus the correct/total tally
score_responses(responses=responses)

In [None]:
# Fetch WMDP dataset for Q/A Model Testing - Chem Subset

wmdp_ds = fetch_wmdp_dataset(category="chem")
# Keep only the first three questions (presumably to keep the demo run short)
wmdp_ds.questions = wmdp_ds.questions[:3]

# Evaluate the Q/A Model response, reusing the existing benchmark orchestrator
responses = await benchmark_orchestrator.send_prompts_async(dataset=wmdp_ds)  # type: ignore

In [None]:
# Print whether each chem-subset answer was correct, plus the correct/total tally
score_responses(responses=responses)

In [None]:
# Fetch WMDP dataset for Q/A Model Testing - Bio Subset

wmdp_ds = fetch_wmdp_dataset(category="bio")
# Keep only the first three questions (presumably to keep the demo run short)
wmdp_ds.questions = wmdp_ds.questions[:3]

# Evaluate the Q/A Model response, reusing the existing benchmark orchestrator
responses = await benchmark_orchestrator.send_prompts_async(dataset=wmdp_ds)  # type: ignore

In [None]:
# Print whether each bio-subset answer was correct, plus the correct/total tally
score_responses(responses=responses)

In [None]:
# Fetch WMDP dataset for Q/A Model Testing - Cyber Subset

wmdp_ds = fetch_wmdp_dataset(category="cyber")
# Keep only the first three questions (presumably to keep the demo run short)
wmdp_ds.questions = wmdp_ds.questions[:3]

# Evaluate the Q/A Model response, reusing the existing benchmark orchestrator
responses = await benchmark_orchestrator.send_prompts_async(dataset=wmdp_ds)  # type: ignore

In [None]:
# Print whether each cyber-subset answer was correct, plus the correct/total tally
score_responses(responses=responses)

In [None]:
# Close connection for memory instance:
# fetch the central memory instance, then dispose of its engine.
memory = CentralMemory.get_memory_instance()
memory.dispose_engine()