### Use the AIW dataset to benchmark DeepSeek R1

Dataset source: the AIW ("Alice in Wonderland") benchmark from LAION — https://github.com/LAION-AI/AIW


In [2]:
!pip install -qU  datasets tqdm huggingface_hub together backoff

In [3]:
import getpass
# Ask for the Together API key interactively so the secret is never written into the notebook source.
together_api_key = getpass.getpass("Together API key: ") 


In [4]:
from datasets import load_dataset

# Fetch the "Alice in Wonderland" (AIW) prompts from the Hugging Face Hub.
dataset = load_dataset("marianna13/aiw-prompts")

# Use the 'test' split when the dataset ships one; otherwise fall back to 'train'.
if 'test' in dataset:
    data = dataset['test']
else:
    data = dataset['train']

# Peek at the first 5 rows to confirm the column layout.
print(data[:5])


  from .autonotebook import tqdm as notebook_tqdm


{'id': [1, 2, 3, 4, 5], 'text': ["Alice has 4 brothers and she also has 1 sister. How many sisters does Alice's brother have? To answer the question, DO NOT OUTPUT ANY TEXT EXCEPT following format that contains final answer: ### Answer:", "Alice has 4 sisters and she also has 1 brother. How many sisters does Alice's brother have? To answer the question, DO NOT OUTPUT ANY TEXT EXCEPT following format that contains final answer: ### Answer:", "Alice has four brothers and she also has one sister. How many sisters does Alice's brother have? To answer the question, DO NOT OUTPUT ANY TEXT EXCEPT following format that contains final answer: ### Answer:", "Alice has four sisters and she also has one brother. How many sisters does Alice's brother have? To answer the question, DO NOT OUTPUT ANY TEXT EXCEPT following format that contains final answer: ### Answer:", "Alice has 4 brothers and she also has 1 sister. How many sisters does Alice's brother have?"], 'right_answer': ['2', '5', '2', '5', 

In [36]:
import backoff, asyncio
from together import AsyncTogether

# Verbose flag read by async_chat_completion_togetherai: when True, the raw
# response choices are printed for every request. Later cells reassign this
# same global to False before scoring.
debug = True

# Together async client, authenticated with the API key entered earlier.
async_client = AsyncTogether(api_key=together_api_key)

@backoff.on_exception( backoff.expo, Exception, max_tries=3, )
async def async_chat_completion_togetherai(message, model="deepseek-ai/DeepSeek-R1", max_tokens=1000):
    """Send one chat-completion request to Together and return ``(content, usage)``.

    The call is wrapped by ``backoff``: on any exception it is retried with
    exponential waits, up to 3 attempts in total, after which the last
    exception propagates to the caller.

    Args:
        message: List of chat message dicts, e.g.
            ``[{"role": "user", "content": "..."}]``.
        model: Together model identifier to query.
        max_tokens: Cap on generated tokens. NOTE(review): the recorded outputs
            show the model's ``<think>`` trace inside ``content``, so a low cap
            may cut a response off before the final answer -- confirm that the
            default is large enough for the benchmark.

    Returns:
        A tuple of the first choice's message content and the response's usage
        object, or ``{}`` in place of usage when the response carries none.
    """
    # Reuse the module-level client instead of building a new one on every
    # call (and on every retry); a single request needs no asyncio.gather.
    response = await async_client.chat.completions.create(
        model=model,
        messages=message,
        max_tokens=max_tokens,
    )

    if debug:
        print(response.choices)

    content = response.choices[0].message.content
    # Normalise a missing usage record to an empty dict so callers can test its truthiness.
    return content, (response.usage or {})

async def query_deepseek_togetherai(question):
    """Ask the model a single user question and never raise.

    Args:
        question: The prompt text, sent as one ``user`` chat message.

    Returns:
        Whatever ``async_chat_completion_togetherai`` returns on success. If
        that call still fails once its retries are used up, returns
        ``(str(error), {})`` so the caller gets the error text as the answer
        and an empty usage record.
    """
    chat_messages = [{"role":"user","content":question}]
    try:
        reply = await async_chat_completion_togetherai(chat_messages)
    except Exception as err:
        # Best effort: surface the failure as the answer text with empty usage.
        return str(err), {}
    return reply


In [33]:
# Smoke test: send one simple question and show the returned answer text and usage record.
answer, usage = await query_deepseek_togetherai("what is the capital of England")
print(answer)
print(usage)


[ChatCompletionChoicesData(index=0, logprobs=None, seed=None, finish_reason=<FinishReason.StopSequence: 'stop'>, message=ChatCompletionMessage(role=<MessageRole.ASSISTANT: 'assistant'>, content='<think>\nOkay, so the user is asking, "What is the capital of England?" Let me think about how to approach this. First, I need to recall basic geography facts. England is part of the United Kingdom, right? The UK consists of England, Scotland, Wales, and Northern Ireland. Each of these countries has their own capitals, but the overall capital of the UK is London.\n\nWait, maybe the user is confused because sometimes people refer to the UK as a whole and England specifically. So I should make sure whether they\'re asking about England\'s capital or the UK\'s capital. But since the question specifically mentions England, the answer should be London. However, I should also clarify that London is the capital of both England and the UK to avoid confusion. Let me double-check: yes, London is indeed t

In [38]:
from tqdm.notebook import tqdm

# Validate DeepSeek's R1 model against the dataset
results = []
total_output_tokens = 0
max_single_answer_tokens = 0

r1_token_cost = 7./1e6 #based on Together.ai's pricing 

for i in range(len(data["text"])):
    prompt = data["text"][i]
    expected = data["right_answer"][i]  

    print(f"prompt: {prompt}\ncorrect answer: {expected}")

    generated, usage = await query_deepseek_togetherai(prompt)
    print(f"generated answer: {generated}\n")

    # Record the results
    results.append({
        "prompt": prompt,
        "expected": expected,
        "generated": generated
    })

    # Keep track of output tokens and total costs
    if usage: # may be null in case of an error
        total_output_tokens += usage.completion_tokens
        if usage.completion_tokens > max_single_answer_tokens: max_single_answer_tokens = usage.completion_tokens
        print(f"total output tokens cost: ${total_output_tokens*r1_token_cost}\n")

print(f"maximum tokens for a single answer: {max_single_answer_tokens}")


prompt: Alice has 4 brothers and she also has 1 sister. How many sisters does Alice's brother have? To answer the question, DO NOT OUTPUT ANY TEXT EXCEPT following format that contains final answer: ### Answer:
correct answer: 2
[ChatCompletionChoicesData(index=0, logprobs=None, seed=None, finish_reason=<FinishReason.StopSequence: 'stop'>, message=ChatCompletionMessage(role=<MessageRole.ASSISTANT: 'assistant'>, content='<think>\nOkay, let\'s try to figure this out. So the problem is: Alice has 4 brothers and 1 sister. How many sisters does Alice\'s brother have?\n\nHmm, first, let\'s break down the family members. Alice has 4 brothers, which means those brothers are her siblings. She also has 1 sister. So, including Alice herself, how many sisters are there in total?\n\nWait, Alice is a sister to her brothers. So if she has 1 sister, that means there are two girls in the family: Alice and her sister. Because each sister would share the same brothers. So the total siblings would be Alic

In [39]:
import json

def write_r1_results(filename = "r1_results_togetherai.json"):
    """Save the benchmark state held in module-level variables to a JSON file.

    Reads the globals ``results``, ``total_output_tokens``,
    ``max_single_answer_tokens`` and ``r1_token_cost``; the stored
    ``total_cost`` is ``total_output_tokens * r1_token_cost``.

    Args:
        filename: Path of the JSON file to create or overwrite.
    """
    payload = {
        "results": results,
        "total_output_tokens": total_output_tokens,
        "max_single_answer_tokens": max_single_answer_tokens,
        "total_cost": total_output_tokens * r1_token_cost,
    }

    # Write the whole payload in one go, indented for readability.
    with open(filename, "w") as out_file:
        json.dump(payload, out_file, indent=4)

    print(f"Results saved to {filename}")


In [40]:
# Sanity check: one result is appended per prompt, so this is the number of prompts that were run.
len(results)

274

In [7]:
import re

# Verbose flag: when True, print_results echoes each generated/expected pair.
debug = False

def print_results(results):
    """Score benchmark results and print a summary of the four outcome counts.

    Each result is classified as exactly one of:
      * correct / incorrect -- the text contains ``### Answer: <digits>``; the
        LAST such occurrence is compared (as a string) with ``expected``, so a
        draft answer written earlier in the reasoning trace is not graded;
      * timeout -- no answer was found and the text contains ``504``;
      * malformatted -- anything else.

    Side effect: sets ``result["numeric_answer"]`` to the extracted digits, or
    to ``None`` for malformatted entries. Timeout entries are left untouched.

    Args:
        results: Iterable of dicts with at least the keys ``"generated"`` and
            ``"expected"``.
    """
    total_timeouts = 0
    total_correct = 0
    total_incorrect = 0
    total_no_answer = 0

    for result in results:
        # A missing/None generation is treated as empty text instead of crashing.
        generated = result["generated"] or ""

        # Single quotes inside the f-string keep this valid before Python 3.12.
        if debug: print(f"generated: {generated}\nexpected:{result['expected']}")

        # Look for an answer first: a valid response that merely mentions
        # "504" must not be mistaken for a gateway timeout.
        answers = re.findall(r"### Answer:\s*(\d+)", generated)
        if answers:
            result["numeric_answer"] = answers[-1]  # The final captured digits
            if result["numeric_answer"] == result["expected"]:
                total_correct += 1
            else:
                total_incorrect += 1
        elif "504" in generated:
            total_timeouts += 1
        else:
            result["numeric_answer"] = None
            if debug: print(f"malformatted answer: {generated}")
            total_no_answer +=1

    print(f"Results:\n\ttimeout: {total_timeouts}\n\tcorrect:{total_correct}\n\tincorrect:{total_incorrect}\n\tmalformatted answer: {total_no_answer}")


In [42]:
import os
import json

def load_r1_results(filename="r1_results_togetherai.json"):
    """
    Read previously saved benchmark results from a JSON file.

    Returns the parsed JSON content of ``filename`` when the file exists.
    When it does not, prints a notice and returns None.
    """
    # Guard clause: nothing to load if the file is absent.
    if not os.path.exists(filename):
        print(f"{filename} does not exist.")
        return None

    with open(filename, "r") as saved_file:
        return json.load(saved_file)

In [44]:
import re

# Verbose flag: when True, print_results echoes each generated/expected pair.
debug = False

# NOTE(review): print_results is also defined in an earlier cell of this
# notebook; this later definition is the one in effect. Keep only one copy.
def print_results(results):
    """Score benchmark results and print a summary of the four outcome counts.

    Each result is classified as exactly one of:
      * correct / incorrect -- the text contains ``### Answer: <digits>``; the
        LAST such occurrence is compared (as a string) with ``expected``, so a
        draft answer written earlier in the reasoning trace is not graded;
      * timeout -- no answer was found and the text contains ``504``;
      * malformatted -- anything else.

    Side effect: sets ``result["numeric_answer"]`` to the extracted digits, or
    to ``None`` for malformatted entries. Timeout entries are left untouched.

    Args:
        results: Iterable of dicts with at least the keys ``"generated"`` and
            ``"expected"``.
    """
    total_timeouts = 0
    total_correct = 0
    total_incorrect = 0
    total_no_answer = 0

    for result in results:
        # A missing/None generation is treated as empty text instead of crashing.
        generated = result["generated"] or ""

        # Single quotes inside the f-string keep this valid before Python 3.12.
        if debug: print(f"generated: {generated}\nexpected:{result['expected']}")

        # Look for an answer first: a valid response that merely mentions
        # "504" must not be mistaken for a gateway timeout.
        answers = re.findall(r"### Answer:\s*(\d+)", generated)
        if answers:
            result["numeric_answer"] = answers[-1]  # The final captured digits
            if result["numeric_answer"] == result["expected"]:
                total_correct += 1
            else:
                total_incorrect += 1
        elif "504" in generated:
            total_timeouts += 1
        else:
            result["numeric_answer"] = None
            if debug: print(f"malformatted answer: {generated}")
            total_no_answer +=1

    print(f"Results:\n\ttimeout: {total_timeouts}\n\tcorrect:{total_correct}\n\tincorrect:{total_incorrect}\n\tmalformatted answer: {total_no_answer}")

In [45]:
# Score the in-memory results from the benchmark run and print the outcome counts.
print_results(results)

Results:
	timeout: 0
	correct:169
	incorrect:19
	malformatted answer: 86
