### Use the AIW ("Alice in Wonderland") dataset to benchmark o1-preview

AIW project repository: https://github.com/LAION-AI/AIW


In [2]:
!pip install -qU openai datasets tqdm 

In [2]:
import getpass
# Prompt for the API key interactively so it is never written into the
# notebook source or echoed into a cell output.
openai_api_key = getpass.getpass("OpenAI API key: ")


In [4]:
from datasets import load_dataset

# Fetch the "Alice in Wonderland" (AIW) prompts from Hugging Face
dataset = load_dataset("marianna13/aiw-prompts")

# Benchmark on the 'test' split when it exists; otherwise fall back to 'train'
split_name = 'test' if 'test' in dataset else 'train'
data = dataset[split_name]

# Peek at the first 5 rows to check the column layout
print(data[:5])


{'id': [1, 2, 3, 4, 5], 'text': ["Alice has 4 brothers and she also has 1 sister. How many sisters does Alice's brother have? To answer the question, DO NOT OUTPUT ANY TEXT EXCEPT following format that contains final answer: ### Answer:", "Alice has 4 sisters and she also has 1 brother. How many sisters does Alice's brother have? To answer the question, DO NOT OUTPUT ANY TEXT EXCEPT following format that contains final answer: ### Answer:", "Alice has four brothers and she also has one sister. How many sisters does Alice's brother have? To answer the question, DO NOT OUTPUT ANY TEXT EXCEPT following format that contains final answer: ### Answer:", "Alice has four sisters and she also has one brother. How many sisters does Alice's brother have? To answer the question, DO NOT OUTPUT ANY TEXT EXCEPT following format that contains final answer: ### Answer:", "Alice has 4 brothers and she also has 1 sister. How many sisters does Alice's brother have?"], 'right_answer': ['2', '5', '2', '5', 

In [5]:
from openai import AsyncOpenAI

# Shared async OpenAI client, authenticated with the key entered above;
# query_openai() below sends its requests through this client.
client = AsyncOpenAI(api_key=openai_api_key)

async def query_openai(prompt, model="o1-preview-2024-09-12"):
    """Send ``prompt`` as a single user message and return the model's reply.

    Args:
        prompt: Text sent as the only (user-role) message of the conversation.
        model: Name of the chat-completions model to query.

    Returns:
        A ``(text, usage)`` tuple. On success ``text`` is the whitespace-stripped
        content of the first choice (an empty string when the API returns no
        content) and ``usage`` is the ``usage`` object of the response.
        If the request raises an ``Exception``, ``text`` is ``str(exception)``
        and ``usage`` is an empty dict, so callers can test ``if usage:`` to
        tell the two cases apart.
    """
    try:
        response = await client.chat.completions.create(
            model=model,
            messages=[
                {"role": "user", "content": prompt},
            ],
        )
        # The message content can be None; coalesce it to "" so that a
        # completed (and billed) call still reports its token usage instead of
        # falling into the error path via an AttributeError on `.strip()`.
        content = response.choices[0].message.content or ""
        return content.strip(), response.usage
    except Exception as e:
        # Best-effort: surface the failure as text rather than aborting the
        # caller's loop; the falsy usage value marks the result as an error.
        return str(e), {}

In [16]:
answer, usage = await query_openai("what is the capital of England")
print(answer)
print(usage)


The capital of England is London.
CompletionUsage(completion_tokens=215, prompt_tokens=14, total_tokens=229, completion_tokens_details=CompletionTokensDetails(accepted_prediction_tokens=0, audio_tokens=0, reasoning_tokens=192, rejected_prediction_tokens=0), prompt_tokens_details=PromptTokensDetails(audio_tokens=0, cached_tokens=0))


In [15]:
from tqdm.notebook import tqdm

# Validate OpenAI's model against the dataset
results = []
total_output_tokens = 0
max_single_answer_tokens = 0

o1_output_token_cost = 60./1e6

for i in range(len(data["text"])):
    prompt = data["text"][i]
    expected = data["right_answer"][i]  

    print(f"prompt: {prompt}\ncorrect answer: {expected}")

    # Query the OpenAI model
    generated, usage = await query_openai(prompt)
    print(f"generated answer: {generated}\n")

    # Record the results
    results.append({
        "prompt": prompt,
        "expected": expected,
        "generated": generated
    })

    # Keep track of output tokens and total costs
    if usage: # may be null in case of an error
        total_output_tokens += usage.completion_tokens
        if usage.completion_tokens > max_single_answer_tokens: max_single_answer_tokens = usage.completion_tokens
        print(f"total output tokens cost: ${total_output_tokens*o1_output_token_cost}\n")

print(f"maximum tokens for a single answer: {max_single_answer_tokens}")


prompt: Alice has 4 brothers and she also has 1 sister. How many sisters does Alice's brother have? To answer the question, DO NOT OUTPUT ANY TEXT EXCEPT following format that contains final answer: ### Answer:
correct answer: 2
generated answer: ### Answer: 2

total output tokens cost: $0.01272

prompt: Alice has 4 sisters and she also has 1 brother. How many sisters does Alice's brother have? To answer the question, DO NOT OUTPUT ANY TEXT EXCEPT following format that contains final answer: ### Answer:
correct answer: 5
generated answer: ### Answer: 5

total output tokens cost: $0.04068

prompt: Alice has four brothers and she also has one sister. How many sisters does Alice's brother have? To answer the question, DO NOT OUTPUT ANY TEXT EXCEPT following format that contains final answer: ### Answer:
correct answer: 2
generated answer: ### Answer: 2

total output tokens cost: $0.06108

prompt: Alice has four sisters and she also has one brother. How many sisters does Alice's brother ha

CancelledError: 

In [16]:
import json

filename = "o1_preview_results.json"

# Collect the per-prompt records and the run totals into a single payload
run_summary = {
    "results": results,
    "total_output_tokens": total_output_tokens,
    "max_single_answer_tokens": max_single_answer_tokens,
    "total_cost": total_output_tokens * o1_output_token_cost,
}

# Write the payload to disk as indented JSON
with open(filename, "w") as out_file:
    json.dump(run_summary, out_file, indent=4)

print(f"Results saved to {filename}")


Results saved to o1_preview_results.json


In [18]:
# Number of prompt/answer records collected in `results`
len(results)

38