#### Imports

In [1]:
import json
import random
from typing import Dict, List, Optional, Tuple

import numpy as np

#### Define categories and vocabulary

In [2]:
# Vocabulary grouped by semantic category (ten words each).
# 'misc' holds the filler words; every other key can serve as a counting target.
categories = {
    'fruits': [
        'apple', 'banana', 'orange', 'grape', 'strawberry',
        'pineapple', 'mango', 'cherry', 'peach', 'watermelon',
    ],
    'cities': [
        'Paris', 'Tokyo', 'London', 'Sydney', 'Cairo',
        'Mumbai', 'Toronto', 'Berlin', 'Rio', 'Bangkok',
    ],
    'animals': [
        'dog', 'cat', 'elephant', 'tiger', 'rabbit',
        'horse', 'dolphin', 'eagle', 'snake', 'penguin',
    ],
    'sports': [
        'soccer', 'basketball', 'tennis', 'swimming', 'baseball',
        'golf', 'hockey', 'volleyball', 'boxing', 'cycling',
    ],
    'professions': [
        'doctor', 'teacher', 'engineer', 'lawyer', 'chef',
        'nurse', 'pilot', 'mechanic', 'architect', 'firefighter',
    ],
    'misc': [
        'chair', 'computer', 'book', 'mirror', 'clock',
        'bicycle', 'phone', 'lamp', 'bottle', 'keyboard',
    ],
}

In [3]:
# Category labels in the order they were declared (iterating a dict yields its keys)
category_names = list(categories)

In [4]:
# Flatten every category's word list into a single vocabulary list
all_words = [
    word
    for category_items in categories.values()
    for word in category_items
]

print(f"Total vocabulary size: {len(all_words)} words")
print(f"Categories: {category_names}")

Total vocabulary size: 60 words
Categories: ['fruits', 'cities', 'animals', 'sports', 'professions', 'misc']


#### Generate dataset

In [5]:
def generate_dataset(
    num_samples: int,
    min_len: int,
    max_len: int,
    categories: Dict[str, List[str]],
    seed: Optional[int] = None,
) -> List[Dict]:
    """
    Generate a dataset of word counting examples.

    For each example a target category is chosen (never 'misc'), a random
    number of distinct words from that category is placed in a list, the
    rest of the list is filled with words drawn with replacement from the
    'misc' category, and the list is shuffled.

    Args:
        num_samples: Number of examples to generate
        min_len: Minimum length of each word list (must be >= 0)
        max_len: Maximum length of each word list (must be >= min_len)
        categories: Dictionary of category names to word lists; must
            contain a 'misc' entry used as filler vocabulary
        seed: Optional seed. When given, a private ``random.Random(seed)``
            is used, so the call is reproducible and the global ``random``
            state is left untouched. When None (the default), the
            module-level ``random`` generator is used.
    Returns:
        List of dictionaries, each containing:
        - 'type': the target category
        - 'list': the word list
        - 'answer': the count of target category words
    Raises:
        ValueError: If min_len is negative or greater than max_len.
        KeyError: If categories has no 'misc' entry.
    """
    # Fail fast with a clear message instead of an intermittent error from
    # random.randint deep inside the sampling loop.
    if min_len < 0 or min_len > max_len:
        raise ValueError(
            f"Invalid length bounds: min_len={min_len}, max_len={max_len} "
            "(require 0 <= min_len <= max_len)"
        )

    # The random module and Random instances expose the same sampling methods,
    # so either can be used through the same name.
    rng = random if seed is None else random.Random(seed)

    # Categories that can be the target (excluding 'misc')
    target_categories = [cat for cat in categories.keys() if cat != 'misc']

    # Use only misc words for non-target items
    misc_words = categories['misc']

    dataset = []

    for _ in range(num_samples):
        # Randomly select target category (not misc)
        target_category = rng.choice(target_categories)
        target_words = categories[target_category]

        # Randomly select list length
        list_length = rng.randint(min_len, max_len)

        # Randomly decide how many target items to include; capped by the
        # category size because target words are sampled without replacement
        num_target_items = rng.randint(0, min(list_length, len(target_words)))

        # Sample target items (without replacement from the target category)
        target_items = rng.sample(target_words, num_target_items)

        # Fill the remaining slots from misc words only (repeats allowed)
        remaining_slots = list_length - num_target_items
        non_target_items = rng.choices(misc_words, k=remaining_slots)

        # Combine and shuffle the list so target words are not grouped at the front
        word_list = target_items + non_target_items
        rng.shuffle(word_list)

        dataset.append({
            'type': target_category,
            'list': word_list,
            'answer': num_target_items,
        })

    return dataset

In [6]:
# Small demo run: ten examples with lists of 5 to 12 words
dataset = generate_dataset(
    num_samples=10,
    min_len=5,
    max_len=12,
    categories=categories,
)

In [7]:
# Show the first seven generated examples, each followed by a blank line
for i, example in enumerate(dataset[:7]):
    print(
        f"Example {i+1}:",
        f"Type: {example['type']}",
        f"List: {example['list']}",
        f"Answer: ({example['answer']})",
        "",
        sep="\n",
    )

Example 1:
Type: professions
List: ['firefighter', 'lamp', 'clock', 'bottle', 'mirror', 'mirror', 'phone', 'clock', 'computer', 'clock', 'computer', 'clock']
Answer: (1)

Example 2:
Type: animals
List: ['keyboard', 'clock', 'chair', 'eagle', 'keyboard', 'clock', 'book', 'lamp', 'chair', 'clock']
Answer: (1)

Example 3:
Type: cities
List: ['phone', 'computer', 'bottle', 'chair', 'Paris', 'chair', 'Sydney', 'computer']
Answer: (2)

Example 4:
Type: animals
List: ['dolphin', 'cat', 'book', 'horse', 'tiger']
Answer: (4)

Example 5:
Type: cities
List: ['Toronto', 'Tokyo', 'Sydney', 'Rio', 'Cairo', 'Paris', 'Mumbai']
Answer: (7)

Example 6:
Type: fruits
List: ['mango', 'pineapple', 'banana', 'grape', 'cherry', 'peach', 'watermelon', 'strawberry', 'orange']
Answer: (9)

Example 7:
Type: animals
List: ['book', 'bottle', 'lamp', 'mirror', 'phone', 'mirror', 'computer']
Answer: (0)



#### Save dataset

In [8]:
def save_dataset(dataset: List[Dict], filename: str) -> None:
    """Save dataset to JSONL format.

    Writes one JSON object per line, in the order the examples appear in
    ``dataset``. An existing file at ``filename`` is overwritten.

    Args:
        dataset: Examples to write; each must be JSON-serializable
        filename: Path of the output file
    Raises:
        OSError: If the file cannot be opened for writing.
        TypeError: If an example contains a value json.dumps cannot serialize.
    """
    # Explicit encoding so the file does not depend on the platform default
    with open(filename, 'w', encoding='utf-8') as f:
        for example in dataset:
            f.write(json.dumps(example) + '\n')

In [9]:
# Generate and save your dataset
# Seed the global RNG first so that re-running this cell (for example after
# Restart & Run All) regenerates exactly the same benchmark file.
random.seed(42)
dataset = generate_dataset(num_samples=5000, min_len=5, max_len=15, categories=categories)
save_dataset(dataset, 'word_counting_dataset.jsonl')

#### Load dataset

In [10]:
def load_dataset(filename: str) -> List[Dict]:
    """Load dataset from JSONL format.

    Reads the file line by line and parses each non-blank line as one JSON
    document. Blank or whitespace-only lines (such as a trailing empty line)
    are skipped instead of raising a decode error.

    Args:
        filename: Path of the JSONL file to read
    Returns:
        List of parsed examples, in file order
    Raises:
        OSError: If the file cannot be opened for reading.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    dataset = []
    # Explicit encoding so reading does not depend on the platform default
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            record = line.strip()
            if not record:
                # Tolerate empty lines between or after records
                continue
            dataset.append(json.loads(record))
    return dataset

In [11]:
# Read the saved JSONL file back in when it is time to benchmark
loaded_dataset = load_dataset(filename='word_counting_dataset.jsonl')