In [None]:
# Show the processed records as the JSON text the file export would contain
print("JSON Export Format:", "=" * 30, sep="\n")
json_output = json.dumps(processed_data, indent=2)
print(json_output)

# Optional: uncomment the lines below to write the same JSON to disk
# with open("data_out.json", "w") as f:
#     json.dump(processed_data, f, indent=2)
# print("\nData saved to data_out.json")

## Export Format

This shows the data in the same JSON format that would be saved to `data_out.json` in the original script.

In [None]:
# Pretty-print every processed record
print("Processed Benchmark Data:", "=" * 50, sep="\n")
pprint.pprint(processed_data, width=80)

# Summarise the example count and, when any examples exist, the difficulty spread
print("\nData Summary:")
print(f"- Total examples: {len(processed_data)}")
if processed_data:
    difficulties = [record['difficulty'] for record in processed_data]
    lowest, highest = min(difficulties), max(difficulties)
    print(f"- Difficulty range: {lowest:.2f} - {highest:.2f}")
    mean_difficulty = sum(difficulties) / len(difficulties)
    print(f"- Average difficulty: {mean_difficulty:.2f}")

## Results

Display the processed benchmark data with all metadata.

In [None]:
# Run the collection step over the inlined sample examples
processed_data = collect_data(sample_dataset)

# Report how many examples the collection step produced
num_collected = len(processed_data)
print(f"Successfully collected {num_collected} examples")

## Execute Data Processing

Process the sample data and generate the benchmark dataset.

In [None]:
def collect_data(dataset: List[Dict[str, str]]) -> List[Dict[str, Any]]:
    """Build the benchmark records used for DKW controller evaluation.

    Each input example is copied into a new dict together with a
    zero-padded positional ID and a difficulty score derived from the
    length of its question.

    Args:
        dataset: Examples, each providing 'question' and 'answer' keys.

    Returns:
        One record per input example, in input order, with the keys
        'id', 'question', 'answer' and 'difficulty'.
    """
    return [
        {
            "id": f"example_{index:03d}",
            "question": entry["question"],
            "answer": entry["answer"],
            # len() of the question divided by 100 is a simple proxy for difficulty
            "difficulty": len(entry["question"]) / 100,
        }
        for index, entry in enumerate(dataset)
    ]

## Data Collection Function

`collect_data` is the main function: it processes the raw dataset and adds metadata (an ID and a difficulty score) to each example for benchmark evaluation.

In [None]:
# Inlined stand-in data that simulates the HuggingFace GSM8K dataset format
# Original loading call: ds = load_dataset("gsm8k", "main", split="test[:200]")

sample_dataset = [
    {"question": "What is 2+2?", "answer": "4"},
    {"question": "If x=5, what is 2x?", "answer": "10"},
    {"question": "Solve: 3y + 6 = 15", "answer": "y=3"},
]

print(f"Loaded {len(sample_dataset)} sample examples")

## Sample Data

Here we define a few sample examples in the GSM8K question/answer format. In the original script, this data would be loaded from the GSM8K test set via HuggingFace datasets, but we've inlined it for self-containment.

In [None]:
"""Dataset collection script for DKW benchmark."""
import json
from typing import List, Dict, Any
import pprint

## Overview

This notebook contains a self-contained version of the dataset collection script. It processes benchmark data for DKW controller evaluation by:

1. **Loading sample data** (inlined here; the original script loaded it from the HuggingFace GSM8K dataset)
2. **Processing examples** with metadata including ID, question, answer, and difficulty
3. **Displaying results** in a structured format

All data has been inlined to make this notebook completely runnable without external dependencies.

# Dataset Collection for DKW Benchmark

**Artifact: data.py**  
This notebook demonstrates dataset collection and processing for DKW controller evaluation. The original script has been converted into an interactive format with inlined data for complete self-containment.