In [None]:
# Optional export: uncomment the lines below to write `data` to a JSON file.
# with open("data_out.json", "w") as f:
#     json.dump(data, f, indent=2)
# print("Data saved to data_out.json")

# Completion banner. The leading character is U+2713 (CHECK MARK).
print("✓ Notebook execution complete!")
print("This self-contained notebook demonstrates the DKW benchmark data collection process.")

## 6. Optional: Export to File

If you want to save the data to a JSON file (as the original script did), uncomment and run the cell below:

In [None]:
# Summarise the collected records: difficulty scores and question lengths.
import statistics

# One value per record; both lists are empty when `data` is empty.
difficulties = [item["difficulty"] for item in data]
question_lengths = [len(item["question"]) for item in data]

print("=== Data Analysis ===")
print(f"Total examples: {len(data)}")

if not data:
    # min()/max() raise ValueError and statistics.mean()/median() raise
    # StatisticsError on an empty sequence, so report and skip instead of
    # leaving the cell in an error state.
    print("\nNo examples collected; skipping summary statistics.")
else:
    print("\nDifficulty scores:")
    print(f"  Min: {min(difficulties):.3f}")
    print(f"  Max: {max(difficulties):.3f}")
    print(f"  Average: {statistics.mean(difficulties):.3f}")
    print(f"  Median: {statistics.median(difficulties):.3f}")

    print("\nQuestion lengths:")
    print(f"  Min: {min(question_lengths)} characters")
    print(f"  Max: {max(question_lengths)} characters")
    print(f"  Average: {statistics.mean(question_lengths):.1f} characters")

## 5. Data Analysis

Let's analyze the collected data to understand the difficulty distribution and question characteristics:

In [None]:
# Run the collection step; `data` is reused by the cells that follow.
data = collect_data()

# Report how many records came back, then preview at most the first three.
print(f"Collected {len(data)} examples")
print("\nFirst 3 examples:")
for idx, record in enumerate(data[:3]):
    print(f"\nExample {idx}:")
    print(json.dumps(record, indent=2))

## 4. Execute Data Collection

Now let's run the data collection function and display the results. Instead of saving to a file, we'll display the data directly in the notebook:

In [None]:
def collect_data(split: str = "test[:200]"):
    """Collect benchmark data for DKW controller evaluation.

    Loads the ``"main"`` configuration of the ``"gsm8k"`` dataset through
    ``load_dataset`` and converts every example into a plain dict.

    Args:
        split: Split expression forwarded to ``load_dataset``. The default,
            ``"test[:200]"``, is the value this function previously
            hard-coded, so existing zero-argument calls behave as before.

    Returns:
        A list with one dict per example, in dataset order. Each dict has
        the keys ``"id"`` (``"example_NNN"``, zero-padded position),
        ``"question"``, ``"answer"`` and ``"difficulty"``.
    """
    # Load HuggingFace dataset
    ds = load_dataset("gsm8k", "main", split=split)

    return [
        {
            "id": f"example_{i:03d}",
            "question": example["question"],
            "answer": example["answer"],
            # Simple proxy: question length in characters divided by 100.
            "difficulty": len(example["question"]) / 100,
        }
        for i, example in enumerate(ds)
    ]

## 3. Data Collection Function

The main function that loads data from HuggingFace's GSM8K dataset and processes it for our benchmark:

In [None]:
# Hand-written example records showing the per-item layout
# (the kind of content normally found in data_out.json).
sample_output = [
    {"id": "example_000", "question": "What is 2+2?", "answer": "4", "difficulty": 0.15},
    {"id": "example_001", "question": "If x=5, what is 2x?", "answer": "10", "difficulty": 0.22},
    {"id": "example_002", "question": "Solve: 3y + 6 = 15", "answer": "y=3", "difficulty": 0.28},
]

# Pretty-print the records as indented JSON under a short heading.
print("Sample output structure:")
print(json.dumps(sample_output, indent=2))

## 2. Sample Data Structure

Here's the structure of the data that would normally be saved to `data_out.json`. This shows what our processing function will generate:

In [None]:
"""Dataset collection script for DKW benchmark."""
import json
from datasets import load_dataset

## 1. Imports and Setup

First, let's import the necessary libraries for dataset loading and JSON handling.

# DKW Benchmark Dataset Collection

**Artifact Name:** data.py  
**Description:** Dataset collection script for DKW benchmark

This notebook demonstrates how to collect and process benchmark data for DKW controller evaluation using the GSM8K dataset from HuggingFace.

## Overview
This notebook will:
1. Load the GSM8K dataset from HuggingFace
2. Process the data to extract questions, answers, and difficulty scores
3. Display the results in an interactive format
4. Show sample output that would normally be saved to `data_out.json`