<a href="https://colab.research.google.com/github/EffiSciencesResearch/ML4G-2.0/blob/master/workshops/evals/evals.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>
# Evals

## Contents

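- Goals
- Setup
- Running a simple eval in Inspect
  - Datasets
  - Plans and solvers
  - Scorers
  - Running the evals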

## Goals

- Get experience using Inspect, the open-source evals framework from the UK AI Safety Institute.
- Get experience conducting evals.

## Setup

In [None]:
try:
    import google.colab

    IN_COLAB = True
except:
    IN_COLAB = False

if IN_COLAB:
    %pip install inspect-ai openai
else:
    !pip install inspect-ai openai

In [None]:
import os
from inspect_ai import Task, task, eval
from inspect_ai.dataset import example_dataset
from inspect_ai.scorer import model_graded_fact
from inspect_ai.solver import chain_of_thought, generate, self_critique

from inspect_ai.log._log import EvalSample, EvalLog
from inspect_ai.model._chat_message import ChatMessageUser, ChatMessageAssistant

In [None]:
# Store the OpenAI API key in this process's environment.
# Replace `your-api-key` with an actual OpenAI API key before running.
os.environ.update(OPENAI_API_KEY="your-api-key")

## Running a simple eval in Inspect

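In this section we build and run a small evaluation with Inspect. An eval has three ingredients, covered in turn below: a dataset of samples, a plan (a list of solvers) that produces the model's answer, and a scorer that grades that answer.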

### Datasets

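We use `theory_of_mind`, one of the example datasets that ship with Inspect, loaded with `example_dataset`. The cell below loads it and prints how many samples it contains.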

In [None]:
# Load the "theory_of_mind" example dataset through Inspect's
# `example_dataset` helper, then report how many samples it holds.
dataset = example_dataset("theory_of_mind")
sample_count = len(dataset)
print(sample_count)

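Each sample has an `input` (what the model is asked) and a `target` (the expected answer). The cell below prints both for one sample.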

In [None]:
# Pick one sample (index 2) to look at.
sample = dataset[2]

# Show the content of the first element of the sample's input, followed by a
# blank line.
first_input_message = sample.input[0]
print(first_input_message.content, end="\n\n")

# Show the target (the answer) recorded for this sample.
print(sample.target)

Appropriate text

### Plans and solvers

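A plan is the list of solvers that turns a sample's input into the model's final answer; the solvers run in the order listed. The plan used in this notebook is `chain_of_thought()`, then `generate()`, then `self_critique()`.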

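`chain_of_thought()` wraps the question in a prompt that asks the model to reason step by step before answering. Its default template is reproduced below; `{prompt}` is replaced by the sample's input.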

In [None]:
# Chain-of-thought prompt template: `{prompt}` is the placeholder for the
# question, followed by the instruction to reason step by step and finish with
# an "ANSWER: ..." line.
DEFAULT_COT_TEMPLATE = "\n".join(
    [
        "",
        "{prompt}",
        "",
        (
            "Before answering, reason in a step-by-step manner as to get the "
            "right answer. Provide your answer at the end on its own line in "
            'the form "ANSWER: $ANSWER" (without quotes) where $ANSWER is the '
            "answer to the question."
        ),
        "",
    ]
)

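`self_critique()` uses two further templates, reproduced below: the first asks a model to critique the answer, and the second asks for an improved answer given that critique.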

In [None]:
# Critique prompt template: presents `{question}` and `{completion}` between
# data markers and asks for a critique of the answer. The text ends with
# "Critique: " (trailing space, no final newline).
DEFAULT_CRITIQUE_TEMPLATE = "\n".join(
    [
        "",
        (
            "Given the following question and answer, please critique the answer. "
            "A good answer comprehensively answers the question and NEVER refuses to "
            "answer. If the answer is already correct do not provide critique - "
            "simply respond 'The original answer is fully correct'."
        ),
        "",
        "[BEGIN DATA]",
        "***",
        "[Question]: {question}",
        "***",
        "[Answer]: {completion}",
        "***",
        "[END DATA]",
        "",
        "Critique: ",
    ]
)


# Revision prompt template: presents `{question}`, the initial `{completion}`
# and the `{critique}` between data markers, then asks for an improved answer
# ending with an "ANSWER: ..." line.
DEFAULT_CRITIQUE_COMPLETION_TEMPLATE = "\n".join(
    [
        "",
        (
            "Given the following question, initial answer and critique please "
            "generate an improved answer to the question:"
        ),
        "",
        "[BEGIN DATA]",
        "***",
        "[Question]: {question}",
        "***",
        "[Answer]: {completion}",
        "***",
        "[Critique]: {critique}",
        "***",
        "[END DATA]",
        "",
        (
            "If the original answer is already correct, just repeat the original "
            "answer exactly. Provide your answer at the end on its own line in "
            'the form "ANSWER: $ANSWER" (without quotes) where $ANSWER is the '
            "answer to the question."
        ),
        "",
    ]
)

### Scorers

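A scorer decides whether the model's answer is correct. Here we use `model_graded_fact`, which asks a grader model (gpt-4o-mini in this notebook) to compare the submitted answer with the sample's target. The default prompt templates for that grader are reproduced below the scorer definition.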

In [None]:
# Scorer built with `model_graded_fact`, using gpt-4o-mini as the grading model.
grader_model = "openai/gpt-4o-mini"
scorer = model_graded_fact(model=grader_model)

In [None]:
# Grader prompt template: presents `{question}`, the expert `{criterion}` and
# the submitted `{answer}` between data markers, asks whether the submission
# contains the expert content, and ends with the `{instructions}` placeholder.
DEFAULT_MODEL_GRADED_FACT_TEMPLATE = "\n".join(
    [
        "",
        (
            "You are comparing a submitted answer to an expert answer on a given "
            "question. Here is the data:"
        ),
        "",
        "[BEGIN DATA]",
        "************",
        "[Question]: {question}",
        "************",
        "[Expert]: {criterion}",
        "************",
        "[Submission]: {answer}",
        "************",
        "[END DATA]",
        "",
        (
            "Compare the factual content of the submitted answer with the expert "
            "answer. Ignore any differences in style, grammar, or punctuation."
        ),
        "",
        "Does the submission contain the content in the expert answer?",
        "",
        "{instructions}",
        "",
    ]
)

# Grading instructions: tell the grader to reason step by step and finish with
# 'GRADE: $LETTER'. `{partial_letter}` and `{partial_prompt}` are placeholders
# inserted between the "C" (correct) and "I" (incorrect) options.
DEFAULT_INSTRUCTIONS = "\n".join(
    [
        "",
        (
            "After assessing the submitted answer, reply with 'GRADE: $LETTER' "
            "(without quotes) where LETTER is one of C{partial_letter}I.  Please "
            'choose ONE option for the grade: either "C" for correct answers, '
            '{partial_prompt}or "I" for incorrect answers.'
        ),
        "",
        (
            "For example, after reviewing a correct answer you might write "
            "'GRADE: C' or after reviewing an incorrect answer you might write "
            "'GRADE: I'."
        ),
        "",
        (
            "First, write out in a step by step manner your reasoning about the "
            "criterion to be sure that your conclusion is correct. Avoid simply "
            "stating the correct answers at the outset. Then, end with your answer "
            "formatted as 'GRADE: $LETTER' (without quotes) where LETTER is one of "
            "C{partial_letter}I."
        ),
        "",
    ]
)

### Running the evals

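We now bundle the dataset, plan and scorer into a `Task` and run it with `eval`. To keep the run quick and cheap, the task uses only the first three samples of the dataset. The helper functions that follow print the message history and the grader's verdict for each sample.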

In [None]:
@task
def theory_of_mind(model: str):
    """Build the theory-of-mind task.

    The task uses the first three samples of the module-level ``dataset``, a
    plan of ``chain_of_thought`` -> ``generate`` -> ``self_critique``, and the
    module-level ``scorer``.

    Args:
        model: Model name handed to ``self_critique``.

    Returns:
        The configured ``Task``.
    """
    first_three_samples = dataset[0:3]
    solver_steps = [
        chain_of_thought(),
        generate(),
        self_critique(model=model),
    ]
    return Task(dataset=first_three_samples, plan=solver_steps, scorer=scorer)

In [None]:
# Run the task with gpt-4o-mini as both the evaluated model and the model given
# to the task. NOTE: `eval` here is the function imported from `inspect_ai`,
# not Python's builtin `eval`.
model = "openai/gpt-4o-mini"
tom_task = theory_of_mind(model)
logs = eval(tom_task, model=model)

In [None]:
def print_info_from_sample(sample: EvalSample, scorer_name: str = "model_graded_fact"):
    """Print the chat history and the scorer verdict for one evaluated sample.

    Only user and assistant messages are printed; other message types in
    ``sample.messages`` are skipped.

    Args:
        sample: The evaluated sample to display.
        scorer_name: Key of the score to report in ``sample.scores``. Defaults
            to ``"model_graded_fact"``, the scorer used in this notebook.
    """
    print("---MESSAGE HISTORY---")
    for message in sample.messages:
        if isinstance(message, ChatMessageUser):
            speaker = "User"
        elif isinstance(message, ChatMessageAssistant):
            speaker = "Assistant"
        else:
            continue
        print(f"{speaker}: {message.content}\n")

    print("---SCORER INFORMATION---")
    # `scores` can be None (e.g. the sample was never scored) or may lack this
    # scorer's entry; report that instead of raising TypeError/KeyError.
    score = (sample.scores or {}).get(scorer_name)
    if score is None:
        print(f"Score: unavailable (no '{scorer_name}' score recorded)")
        return

    print(f"Score: {'correct' if score.value == 'C' else 'incorrect'}")
    print(f"Explanation: {score.explanation}")


def print_info_from_logs(logs: list[EvalLog]):
    """Print every sample recorded in the first eval log of ``logs``.

    Only ``logs[0]`` is displayed. Each sample is wrapped in numbered
    START/END banners and rendered with ``print_info_from_sample``.

    Args:
        logs: Eval logs to display, e.g. the value returned by ``eval``.
    """
    if not logs:
        print("No eval logs to display.")
        return

    # `samples` can be None when the log holds no sample data (presumably when
    # the run failed before any sample completed); treat that as empty instead
    # of raising TypeError.
    samples = logs[0].samples or []
    if not samples:
        print("The first eval log contains no samples.")
        return

    for i, sample in enumerate(samples):
        print(f"===START OF SAMPLE {i}===")
        print_info_from_sample(sample)
        print(f"===END OF SAMPLE {i}===")

In [None]:
# Display the message history and scorer information for the samples in `logs`.
print_info_from_logs(logs)