In [None]:
# Example extension: Filter data by difficulty threshold
def filter_by_difficulty(data, min_difficulty=0.0, max_difficulty=1.0):
    """Return the examples whose difficulty lies inside an inclusive range.

    Args:
        data: Iterable of example dicts, each carrying a 'difficulty' entry.
        min_difficulty: Lower bound; examples equal to it are kept.
        max_difficulty: Upper bound; examples equal to it are kept.

    Returns:
        A new list holding the matching examples in their original order.

    Raises:
        KeyError: If an example has no 'difficulty' key.
    """
    kept = []
    for example in data:
        score = example['difficulty']
        # Both ends are inclusive, so a boundary value passes the check.
        if min_difficulty <= score and score <= max_difficulty:
            kept.append(example)
    return kept

# Example usage
# Both bounds of filter_by_difficulty are inclusive, so an example whose
# difficulty is exactly 0.5 is counted in both groups below.
print("=== Filtering Example ===")
easy_questions = filter_by_difficulty(collected_data, max_difficulty=0.5)
# Difficulty is len(question) / 100 and therefore can exceed 1.0. Lift the
# default upper bound of 1.0 so long questions are not dropped from the
# "hard" group (otherwise they would land in neither group).
hard_questions = filter_by_difficulty(
    collected_data, min_difficulty=0.5, max_difficulty=float("inf")
)

print(f"Total examples: {len(collected_data)}")
print(f"Easy questions (difficulty ≤ 0.5): {len(easy_questions)}")
# The label uses "≥" because the lower bound is inclusive.
print(f"Hard questions (difficulty ≥ 0.5): {len(hard_questions)}")

# Show example of easy question (skipped when no example qualifies as easy)
if easy_questions:
    print("\nExample easy question:")
    print(f"  Question: {easy_questions[0]['question'][:50]}...")
    print(f"  Difficulty: {easy_questions[0]['difficulty']:.3f}")

## 6. Customization and Extensions

This notebook is designed to be easily modified. Here are some ways you can customize it:

### To use real HuggingFace data:
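1. Keep the `from datasets import load_dataset` import in the first code cell and add the `load_dataset("gsm8k", "main", split="test[:200]")` call (shown in the `collect_data()` docstring) to the body of `collect_data()`
2. Replace the `simulated_examples` list with the examples from the loaded dataset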
3. Ensure you have the `datasets` library installed: `pip install datasets`

### To modify difficulty calculation:
- The current difficulty is based on question length (`len(question) / 100`), so it is not capped at 1.0: any question longer than 100 characters scores above 1.0
- You could implement more sophisticated metrics like:
  - Number of mathematical operations
  - Presence of specific keywords
  - Answer complexity

### To add more data processing:
- Add data validation
- Implement filtering based on difficulty thresholds
- Add data cleaning steps
- Include additional metadata fields

In [None]:
# The original script persists its result to disk:
#     with open("data_out.json", "w") as f:
#         json.dump(data, f, indent=2)
# This cell only previews the serialized form instead of writing a file.

json_output = json.dumps(collected_data, indent=2)
print("=== JSON Output (first 500 characters) ===")
if len(json_output) > 500:
    # Long payloads are cut to 500 characters and marked with an ellipsis.
    print(json_output[:500] + "...")
else:
    print(json_output)

print("\n=== Summary ===")
print(f"Collected {len(collected_data)} examples")
print("JSON data structure ready for DKW benchmark evaluation")

# To actually save the file, uncomment the three lines below:
# with open("data_out.json", "w") as f:
#     json.dump(collected_data, f, indent=2)
# print("Data saved to data_out.json")

## 5. Data Export and Saving

This section demonstrates how the original script would save data to a JSON file. In this self-contained version, we'll show the JSON output without actually writing to disk.

In [None]:
# Explore the collected data
print("=== Data Structure ===")
print(f"Total examples: {len(collected_data)}")

if not collected_data:
    # An empty collection would otherwise crash on collected_data[0] and on
    # the statistics below (division by zero, min()/max() of an empty list).
    print("No examples collected; nothing to explore.")
else:
    print(f"Keys in each example: {list(collected_data[0].keys())}")

    # Print at most the first five examples so a larger dataset does not
    # flood the cell output.
    print("\n=== Sample Examples ===")
    for i, example in enumerate(collected_data[:5]):
        print(f"\nExample {i+1}:")
        print(f"  ID: {example['id']}")
        print(f"  Question: {example['question'][:50]}...")
        print(f"  Answer: {example['answer'][:50]}...")
        print(f"  Difficulty: {example['difficulty']:.3f}")

    # Calculate some basic statistics over every example (not just the sample)
    difficulties = [ex['difficulty'] for ex in collected_data]
    print("\n=== Statistics ===")
    print(f"Average difficulty: {sum(difficulties)/len(difficulties):.3f}")
    print(f"Min difficulty: {min(difficulties):.3f}")
    print(f"Max difficulty: {max(difficulties):.3f}")

## 4. Data Exploration and Analysis

Let's explore the collected data and understand its structure and characteristics.

In [None]:
def collect_data():
    """Build the benchmark examples used for DKW controller evaluation.

    The original script reads its examples from a HuggingFace dataset:
    ds = load_dataset("gsm8k", "main", split="test[:200]")

    This notebook version stays self-contained by using a few hard-coded
    question/answer pairs in place of that download.

    Returns:
        A list of dicts with the keys "id", "question", "answer" and
        "difficulty", one per simulated example, in definition order.
    """

    # Hard-coded stand-ins for the records the dataset would provide
    simulated_examples = [
        {"question": "Janet's ducks lay 16 eggs per day. She eats 3 for breakfast and bakes 4 for her friends. How many eggs does she have left?", "answer": "Janet's ducks lay 16 eggs per day. She eats 3 for breakfast, so she has 16 - 3 = 13 eggs left. She bakes 4 for her friends, so she has 13 - 4 = 9 eggs left. The answer is 9."},
        {"question": "A robe takes 2 bolts of blue fiber and half that much white fiber. How many bolts are needed total?", "answer": "The robe takes 2 bolts of blue fiber. It takes half that much white fiber, so it takes 2 / 2 = 1 bolt of white fiber. So the total is 2 + 1 = 3 bolts. The answer is 3."},
        {"question": "If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are there now?", "answer": "There were originally 3 cars. Then 2 more cars arrive. So there are 3 + 2 = 5 cars now. The answer is 5."},
    ]

    return [
        {
            "id": f"example_{index:03d}",
            "question": record["question"],
            "answer": record["answer"],
            # Question length divided by 100 acts as a rough difficulty proxy
            "difficulty": len(record["question"]) / 100,
        }
        for index, record in enumerate(simulated_examples)
    ]

# Run the collector once; later cells read the module-level `collected_data`.
collected_data = collect_data()
print("Collected", len(collected_data), "examples")
print("First example: " + collected_data[0]['id'])

## 3. Data Collection Function

This function demonstrates how the original script would collect data from the GSM8K dataset. For demonstration purposes, we'll use simulated data to keep the notebook self-contained.

In [None]:
# Inline stand-in for the records the GSM8K-based script would produce,
# so this notebook does not have to read an external JSON file.
sample_data_output = [
    dict(id="example_000", question="What is 2+2?", answer="4", difficulty=0.15),
    dict(id="example_001", question="If x=5, what is 2x?", answer="10", difficulty=0.22),
    dict(id="example_002", question="Solve: 3y + 6 = 15", answer="y=3", difficulty=0.28),
]

print(f"Sample dataset contains {len(sample_data_output)} examples")

## 2. Sample Output Data (Self-Contained)

Since we want this notebook to be completely self-contained, we'll inline sample data that would typically be generated from the GSM8K dataset. This allows the notebook to run without external dependencies.

In [None]:
"""Dataset collection script for DKW benchmark."""
import json
from datasets import load_dataset

## 1. Import Required Libraries

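We'll import the necessary libraries for data manipulation and dataset loading. Note that `datasets` is a third-party package (`pip install datasets`) and is only needed when loading the real GSM8K data; the simulated examples in this notebook use only `json`.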

# Dataset Collection Script - DKW Benchmark

**Artifact ID:** dataset_001  
**Name:** data.py  

This notebook demonstrates a dataset collection script for DKW benchmark controller evaluation. The original script loads data from the GSM8K dataset and processes it for benchmarking purposes; this notebook uses a small set of simulated GSM8K-style examples instead.

## Overview
- Loads HuggingFace dataset (GSM8K)
- Processes examples with ID, question, answer, and difficulty metrics
- Creates self-contained data structure for further analysis