### Benchmark Claude 3.7 Sonnet on the AIW ("Alice in Wonderland") dataset

https://github.com/LAION-AI/AIW


In [1]:
!pip install -qU  datasets huggingface_hub backoff anthropic


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import getpass
# Ask for the API key interactively at run time instead of hard-coding it in the notebook.
anthropic_api_key = getpass.getpass("Anthropic API key: ") 


In [3]:
from datasets import load_dataset

# Fetch the "Alice in Wonderland" prompt set from Hugging Face.
dataset = load_dataset("marianna13/aiw-prompts")

# Evaluate on the "test" split when one exists, otherwise fall back to "train".
split_name = "test" if "test" in dataset else "train"
data = dataset[split_name]

print(data[:5])  # Inspect the first 5 rows


  from .autonotebook import tqdm as notebook_tqdm


{'id': [1, 2, 3, 4, 5], 'text': ["Alice has 4 brothers and she also has 1 sister. How many sisters does Alice's brother have? To answer the question, DO NOT OUTPUT ANY TEXT EXCEPT following format that contains final answer: ### Answer:", "Alice has 4 sisters and she also has 1 brother. How many sisters does Alice's brother have? To answer the question, DO NOT OUTPUT ANY TEXT EXCEPT following format that contains final answer: ### Answer:", "Alice has four brothers and she also has one sister. How many sisters does Alice's brother have? To answer the question, DO NOT OUTPUT ANY TEXT EXCEPT following format that contains final answer: ### Answer:", "Alice has four sisters and she also has one brother. How many sisters does Alice's brother have? To answer the question, DO NOT OUTPUT ANY TEXT EXCEPT following format that contains final answer: ### Answer:", "Alice has 4 brothers and she also has 1 sister. How many sisters does Alice's brother have?"], 'right_answer': ['2', '5', '2', '5', 

In [12]:
import anthropic

# Debug switch; nothing in this cell reads it.
debug = True

# Async client shared by the cells below, authenticated with the key entered earlier.
async_client = anthropic.AsyncAnthropic(api_key=anthropic_api_key)
async def call_claude(
    client,
    user_message,
    use_extended_thinking=False,
    model="claude-3-7-sonnet-20250219",
    max_tokens=1050,
    thinking_budget_tokens=1024,
):
    """Send a single user message to Claude and return the raw API response.

    Args:
        client: Object exposing an awaitable ``messages.create(**kwargs)``;
            the notebook passes an ``anthropic.AsyncAnthropic`` instance.
        user_message: Prompt text sent as the only ``user`` turn.
        use_extended_thinking: If True, enable a thinking block with a budget
            of ``thinking_budget_tokens``; otherwise sample with
            ``temperature=0.1``.
        model: Model identifier forwarded to the API.
        max_tokens: Number of tokens reserved for the visible answer.
        thinking_budget_tokens: Thinking budget; only used when
            ``use_extended_thinking`` is True.

    Returns:
        Whatever ``client.messages.create`` returns, or ``None`` if the call
        raised (the error is printed, not re-raised, so a long benchmark run
        keeps going).
    """
    # Build the request once; the two modes differ only in a few keys.
    request = {
        "model": model,
        "messages": [{"role": "user", "content": user_message}],
    }

    if use_extended_thinking:
        request["thinking"] = {
            "type": "enabled",
            "budget_tokens": thinking_budget_tokens,
        }
        # The API's max_tokens limit covers thinking AND the visible answer.
        # Sending 1050 with a 1024-token budget left only ~26 tokens for the
        # answer, so add the budget on top of the answer allowance.
        request["max_tokens"] = thinking_budget_tokens + max_tokens
    else:
        request["max_tokens"] = max_tokens
        request["temperature"] = 0.1

    try:
        return await client.messages.create(**request)
    except Exception as e:
        # Best-effort by design: callers treat None as "failed to answer".
        print(f"An error occurred: {e}")
        return None


In [None]:
answer = await call_claude(async_client,"what is the capital of England",use_extended_thinking=False)

if answer:
    for content_item in answer.content:
        if content_item.type == "text":
            print(content_item.text)
        if content_item.type == "thinking":
            print(content_item.thinking)




The capital of England is London. London is not only the capital of England but also the capital of the United Kingdom, which consists of England, Scotland, Wales, and Northern Ireland.


In [15]:
# Print the raw response object returned by call_claude for inspection.
print(answer)

Message(id='msg_01XKYyDtWK2EUFHSmTX5g9GH', content=[ThinkingBlock(signature='EuYBCkQYAiJA+3TKVwEX06rggoxmDmKl72FAOL4k8btTOivSobi+BfJ0XimsMVIWSaq4VqFIjAaUAQ+Bycl/mUIaHFztbRc8ABIM7SwSMIXVKcJkU2+wGgyQm9qMIwXoWA9P6KoiMPiE26rNu8baSp76ZqvsqTuTYm709CAlXUMf5lmegYzEUwq70vjZLE00iaiZEHAZIipQjHaxqZYX3AAA3ImxerhHHXAkiI5qdAkVjCsVQLBYzIKtks3RT9Xvr68fh6CHL4O3YTuXHd62JzHDe2+mlbXgjdqHT7NniHPMoE66nC4LYFY=', thinking="The capital of England is London. London is both the capital city of England and the United Kingdom. It's located in the southeast of England on the River Thames. London is a major global city with a history dating back to Roman times. It's one of the world's most important financial, cultural, and political centers.", type='thinking'), TextBlock(citations=None, text="The capital of England is London. London is also the capital city of the United Kingdom as a whole. It's located in the southeastern part of England on the River Thames and is a major global center for culture, finance, politic

In [None]:
from tqdm.notebook import tqdm

# Validate the model against the dataset
results = []
total_output_tokens = 0
max_single_answer_tokens = 0

for i in range(len(data["text"])):
    prompt = data["text"][i]
    expected = data["right_answer"][i]  

    print(f"prompt: {prompt}\ncorrect answer: {expected}")

    answer = await call_claude(async_client,prompt)
    if answer:
        generated = answer.content[0].text
        print(f"generated answer: {generated}\n")
        total_output_tokens += answer.usage.output_tokens
        if max_single_answer_tokens < total_output_tokens: max_single_answer_tokens = total_output_tokens
        
    else:
        print("failed to answer")

    # Record the results
    results.append({
        "prompt": prompt,
        "expected": expected,
        "generated": generated
    })

print(f"maximum tokens for a single answer: {max_single_answer_tokens}")


prompt: Alice has 4 brothers and she also has 1 sister. How many sisters does Alice's brother have? To answer the question, DO NOT OUTPUT ANY TEXT EXCEPT following format that contains final answer: ### Answer:
correct answer: 2
generated answer: ### Answer: 2

prompt: Alice has 4 sisters and she also has 1 brother. How many sisters does Alice's brother have? To answer the question, DO NOT OUTPUT ANY TEXT EXCEPT following format that contains final answer: ### Answer:
correct answer: 5
generated answer: ### Answer: 5

prompt: Alice has four brothers and she also has one sister. How many sisters does Alice's brother have? To answer the question, DO NOT OUTPUT ANY TEXT EXCEPT following format that contains final answer: ### Answer:
correct answer: 2
generated answer: ### Answer: 2

prompt: Alice has four sisters and she also has one brother. How many sisters does Alice's brother have? To answer the question, DO NOT OUTPUT ANY TEXT EXCEPT following format that contains final answer: ### A

In [34]:
import json

def write_results(filename = "sonnet37_results.json"):
    """Save the notebook-level ``results`` and ``total_output_tokens`` as JSON.

    Both values are read from module globals; ``filename`` is overwritten if
    it already exists. A confirmation line is printed once the file is written.
    """
    with open(filename, "w") as out_file:
        payload = {
            "results": results,
            "total_output_tokens": total_output_tokens,
        }
        json.dump(payload, out_file, indent=4)

    print(f"Results saved to {filename}")


In [35]:
# NOTE: only a cell's last expression is displayed, so this len() value is computed but never shown.
len(results)
write_results()

Results saved to sonnet37_results.json


In [18]:
import os
import json

def load_results(filename):
    """
    Read a JSON file and return its parsed content.

    If ``filename`` does not exist, a notice is printed and None is returned.
    """
    if not os.path.exists(filename):
        print(f"{filename} does not exist.")
        return None

    with open(filename, "r") as source:
        parsed = json.load(source)
    return parsed

In [27]:
import re

debug = False

def print_results(results):
    """Score benchmark answers and print a summary of the counts.

    Args:
        results: Either the dict written by ``write_results`` (records under
            the ``"results"`` key) or a plain list of records. Each record
            needs ``"generated"`` and ``"expected"`` entries. ``None`` is
            accepted and reported as nothing to summarize.

    Side effects:
        Adds a ``"numeric_answer"`` entry to every record that is not counted
        as a timeout: the digits captured after ``### Answer:``, or ``None``
        when the answer does not follow that format.

    Returns:
        None; the counts are printed.
    """
    if results is None:
        print("No results to summarize.")
        return

    # Accept both the saved-file layout and the in-memory list built by the
    # benchmark loops.
    records = results["results"] if isinstance(results, dict) else results

    total_timeouts = 0
    total_correct = 0
    total_incorrect = 0
    total_no_answer = 0

    for result in records:
        # A record saved without an answer (None) is scored as malformatted
        # instead of raising TypeError.
        generated = result["generated"] or ""

        # Single quotes inside the f-strings keep this valid before Python 3.12.
        if debug: print(f"generated: {result['generated']}\nexpected:{result['expected']}")

        # NOTE(review): any answer containing "504" is counted as a timeout,
        # presumably matching an HTTP 504 error string — confirm.
        if "504" in generated:
            total_timeouts += 1
            continue

        match = re.search(r"### Answer:\s*(\d+)", generated)
        if match is None:
            result["numeric_answer"] = None
            if debug: print(f"malformatted answer: {result['generated']}")
            total_no_answer +=1
            continue

        result["numeric_answer"] = match.group(1)  # The captured digits
        # String comparison: the expected answers are stored as digit strings.
        if result["numeric_answer"] == result["expected"]:
            total_correct += 1
        else:
            total_incorrect += 1

    print(f"Results:\n\ttimeout: {total_timeouts}\n\tcorrect:{total_correct}\n\tincorrect:{total_incorrect}\n\tmalformatted answer: {total_no_answer}")


In [28]:
# Use the in-memory run when there is one; otherwise fall back to the saved file.
if "results" not in globals():
    results = load_results("sonnet37_results.json")

if results is None:
    print("No results available to summarize.")
else:
    # The benchmark loops leave `results` as a plain list, whereas load_results()
    # returns the {"results": [...], ...} dict that print_results() indexes into.
    print_results(results if isinstance(results, dict) else {"results": results})

Results:
	timeout: 0
	correct:206
	incorrect:18
	malformatted answer: 50


In [None]:

# Validate the model against the dataset - using extended thinking
results = []
total_output_tokens = 0
max_single_answer_tokens = 0

for i in range(len(data["text"])):
    prompt = data["text"][i]
    expected = data["right_answer"][i]  

    print(f"prompt: {prompt}\ncorrect answer: {expected}")

    answer = await call_claude(async_client,prompt,use_extended_thinking=True)
    if answer:
        for content in answer.content:
            if content.type=="text":
                generated = content.text
                print(f"generated answer: {generated}\n")

            elif content.type=="thinking":
                print(f"thought process={content.thinking}")

        total_output_tokens += answer.usage.output_tokens
        if max_single_answer_tokens < answer.usage.output_tokens: max_single_answer_tokens = answer.usage.output_tokens
        
    else:
        print("failed to answer")

    # Record the results
    results.append({
        "prompt": prompt,
        "expected": expected,
        "generated": generated
    })

print(f"maximum tokens for a single answer: {max_single_answer_tokens}")