In [1]:
import os
import json

# Directory paths (relative paths, so they resolve against the current working directory)
query_dir = '.'  # Root that is walked recursively for query JSON files (the folder with No-tools, Non-replaceable, etc.)
tools_dir = '../tools'   # Root of the tool definitions, looked up as <category_name>/<tool_name>.json

def update_query_file(query_api, tool_file_path, query_file_path):
    """
    Copy the parameter definitions of one API from a tool file into a query entry.

    The tool file's 'api_list' is searched for the first entry whose 'name'
    equals query_api['api_name']. On a match, query_api is updated in place
    with that entry's 'required_parameters' and 'optional_parameters' (each
    falls back to an empty list when the tool entry lacks the key). When no
    entry matches, a message is printed so the query can be inspected manually.

    Args:
        query_api: Dict describing one API of a query; must contain 'api_name'.
            Modified in place.
        tool_file_path: Path of the tool JSON file (e.g., ../tools/Art/artchicago.json).
        query_file_path: Path of the query file that query_api came from; only
            used in the "not found" message.

    Returns:
        None.
    """
    # JSON text is UTF-8, so do not rely on the platform's default encoding
    with open(tool_file_path, 'r', encoding='utf-8') as f:
        tool_data = json.load(f)

    # Find the matching API in the tool file's api_list
    for tool_api in tool_data.get('api_list', []):
        # Skip malformed entries without a 'name' instead of raising KeyError,
        # which would abort the search before a later entry could match
        if 'name' in tool_api and tool_api['name'] == query_api['api_name']:
            # Copy required and optional parameters from tool to query
            query_api['required_parameters'] = tool_api.get('required_parameters', [])
            query_api['optional_parameters'] = tool_api.get('optional_parameters', [])
            break
    else:
        # for/else: reached only when the loop finished without a match
        print(f"API '{query_api['api_name']}' not found in {tool_file_path} (Found in: {query_file_path})")

def process_query_file(query_file_path):
    """
    Refresh the parameter lists of every API referenced in one query file.

    The file is expected to hold a JSON list of queries, each with an optional
    'api_list'. For every API entry the matching tool file
    <tools_dir>/<category_name>/<tool_name lowercased>.json is looked up; if it
    exists, update_query_file copies the tool's parameters into the entry,
    otherwise a message is printed for manual inspection. The query file is
    then rewritten in place with 4-space indentation, whether or not any entry
    changed.

    Args:
        query_file_path: Path of the query JSON file to update.

    Raises:
        KeyError: If an API entry lacks 'category_name' or 'tool_name'.
    """
    # JSON text is UTF-8, so do not rely on the platform's default encoding
    with open(query_file_path, 'r', encoding='utf-8') as f:
        query_data = json.load(f)

    # Loop through each API in the query JSON
    for query in query_data:
        for api in query.get('api_list', []):
            category_name = api['category_name']
            tool_name = api['tool_name'].lower()  # Tool files are looked up by lowercase name

            # Build the tool file path (e.g., ../tools/Art/artchicago.json)
            tool_file_path = os.path.join(tools_dir, category_name, f"{tool_name}.json")

            if os.path.exists(tool_file_path):
                update_query_file(api, tool_file_path, query_file_path)
            else:
                # If tool file is not found, print the query name and file path for manual inspection
                print(f"Tool file for '{tool_name}' not found at {tool_file_path} (Found in: {query_file_path})")

    # Write the updated query JSON file back over the original
    with open(query_file_path, 'w', encoding='utf-8') as f:
        json.dump(query_data, f, indent=4)

def traverse_query_directory():
    """
    Walk query_dir recursively and run process_query_file on every file
    whose name ends with '.json'.
    """
    for folder, _subfolders, filenames in os.walk(query_dir):
        json_names = [name for name in filenames if name.endswith('.json')]
        for name in json_names:
            process_query_file(os.path.join(folder, name))

# Entry point: update every query file under query_dir, then print a completion marker
if __name__ == "__main__":
    traverse_query_directory()
    print("Done")

Done


In [10]:
import os
import re
from collections import defaultdict

# Directory where your datasets (No-Tools, Replaceable, Original, etc.) are located.
# Its immediate subdirectories are treated as datasets, and their subdirectories as categories.
base_dir = '.'

def get_query_number(filename):
    """
    Return the integer 'n' of a filename that starts with query_n (this also
    covers query_n_y), or None when the name has no such prefix.
    """
    found = re.match(r'query_(\d+)', filename)
    return int(found.group(1)) if found else None

def traverse_folders(base_dir):
    """
    Collect the query numbers found under base_dir/<dataset>/<category>/.

    Returns a nested defaultdict mapping dataset name -> category name ->
    set of query numbers. Entries that are not directories are skipped at the
    dataset and category levels, and a dataset or category only appears in the
    result once at least one query file was found in it.
    """
    collected = defaultdict(lambda: defaultdict(set))

    for dataset in os.listdir(base_dir):
        dataset_path = os.path.join(base_dir, dataset)
        if not os.path.isdir(dataset_path):
            continue

        for category in os.listdir(dataset_path):
            category_path = os.path.join(dataset_path, category)
            if not os.path.isdir(category_path):
                continue

            for entry in os.listdir(category_path):
                number = get_query_number(entry)
                if number is not None:
                    collected[dataset][category].add(number)

    return collected

def print_result(result):
    """
    Print each dataset, followed by its categories and the sorted list of
    query numbers collected for each category.
    """
    for dataset_name in result:
        print(f"Dataset: {dataset_name}")
        for category_name, numbers in result[dataset_name].items():
            print(
                f"  Category: {category_name}",
                f"    Query list: {sorted(numbers)}",
                sep="\n",
            )

# Entry point: scan base_dir and print the query numbers found per dataset and category
if __name__ == "__main__":
    result = traverse_folders(base_dir)
    print_result(result)

Dataset: Underspecified
  Category: Finance
    Query list: [1, 2, 4, 5, 11, 12, 13, 35, 36, 110, 114, 134, 143, 150, 186, 187, 188, 263, 265, 278, 279, 281]
  Category: Language
    Query list: [2, 3, 4, 5, 7, 9, 12, 14, 15, 16, 17, 20]
  Category: Content
    Query list: [1, 2, 3, 4, 5, 6, 12, 13, 14, 21, 35, 36, 37, 106, 107, 108, 109, 117, 120, 121, 154, 155, 156, 157, 159, 160, 161, 162, 163, 166, 167, 168, 169, 170, 171, 172, 201, 202, 206, 207, 208]
  Category: Weather
    Query list: [1, 3, 4, 5, 6, 7, 8, 9, 10, 11]
  Category: Food
    Query list: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
  Category: Art
    Query list: [1, 2, 3, 4, 5, 6, 7, 9, 11, 19, 26]
  Category: Jobs
    Query list: [1]
  Category: Music
    Query list: [1, 2, 4, 5, 6]
  Category: Geo
    Query list: [2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 16, 19, 22, 24]
  Category: Games
    Query list: [1, 4, 21, 22, 23, 33, 34, 42, 43, 44, 63, 159, 160, 218, 219, 432, 586, 587, 588, 673]
  Category: Transportation
    Que

In [13]:
import os
import re
from collections import defaultdict

# Directory where your datasets (No-Tools, Replaceable, Original, etc.) are located.
# Its immediate subdirectories are treated as datasets, and their subdirectories as categories.
base_dir = '.'

def get_query_number(filename):
    """
    Parse the leading query_<n> prefix of a filename (query_n or query_n_y)
    and return n as an int; return None if the prefix is absent.
    """
    prefix_match = re.match(r'query_(\d+)', filename)
    if prefix_match is None:
        return None
    digits = prefix_match.group(1)
    return int(digits)

def traverse_folders(base_dir):
    """
    Count the distinct query numbers found under base_dir/<dataset>/<category>/.

    Returns a nested defaultdict mapping dataset name -> category name ->
    number of unique query numbers. Every category directory gets an entry,
    including those with no query files (count 0); non-directory entries are
    skipped at the dataset and category levels.
    """
    counts = defaultdict(lambda: defaultdict(int))

    for dataset in os.listdir(base_dir):
        dataset_path = os.path.join(base_dir, dataset)
        if not os.path.isdir(dataset_path):
            continue

        for category in os.listdir(dataset_path):
            category_path = os.path.join(dataset_path, category)
            if not os.path.isdir(category_path):
                continue

            # A set collapses query_n / query_n_y variants into one query number
            numbers = {get_query_number(entry) for entry in os.listdir(category_path)}
            numbers.discard(None)  # names without a query_<n> prefix
            counts[dataset][category] = len(numbers)

    return counts

def print_result(result):
    """
    Print each dataset followed by its categories and the total number of
    queries recorded for each category.
    """
    for dataset in result:
        print(f"Dataset: {dataset}")
        per_category = result[dataset]
        for category in per_category:
            print(f"  Category: {category}")
            print(f"    Total queries: {per_category[category]}")

# Entry point: scan base_dir and print the number of unique queries per dataset and category
if __name__ == "__main__":
    result = traverse_folders(base_dir)
    print_result(result)


Dataset: Underspecified
  Category: Finance
    Total queries: 22
  Category: Language
    Total queries: 12
  Category: Content
    Total queries: 41
  Category: Weather
    Total queries: 10
  Category: Food
    Total queries: 11
  Category: Art
    Total queries: 11
  Category: Jobs
    Total queries: 1
  Category: Music
    Total queries: 5
  Category: Geo
    Total queries: 16
  Category: Games
    Total queries: 20
  Category: Transportation
    Total queries: 5
  Category: Science
    Total queries: 28
  Category: FakeData
    Total queries: 3
  Category: Inspiration
    Total queries: 18
  Category: OpenData
    Total queries: 18
  Category: Sports
    Total queries: 6
  Category: Government
    Total queries: 18
  Category: Calendar
    Total queries: 2
  Category: DeveloperTools
    Total queries: 14
  Category: Health
    Total queries: 9
Dataset: Non-replaceable
  Category: Finance
    Total queries: 21
  Category: Language
    Total queries: 5
  Category: Content
    Total

In [None]:
import os
import shutil
import re

# Base directory where No-tools and Original datasets are located
# (each dataset is read from <base_dir>/<dataset>/<category>/)
base_dir = '.'

# Create a new base directory for pruned datasets
# (output is written to <pruned_base_dir>/<dataset>/<category>/)
pruned_base_dir = './pruned_datasets'

# Maximum number of queries to keep per category
max_queries_to_keep = 53

# Dictionary of queries to keep for each category: category name -> query numbers
# whose files are always copied before any additional queries are added.
# NOTE(review): the lists appear to mirror the query numbers printed for the
# 'Underspecified' dataset by the listing cell — TODO confirm.
queries_to_keep = {
    'Finance': [1, 2, 4, 5, 11, 12, 13, 35, 36, 110, 114, 134, 143, 150, 186, 187, 188, 263, 265, 278, 279, 281],
    'Language': [2, 3, 4, 5, 7, 9, 12, 14, 15, 16, 17, 20],
    'Content': [1, 2, 3, 4, 5, 6, 12, 13, 14, 21, 35, 36, 37, 106, 107, 108, 109, 117, 120, 121, 154, 155, 156, 157, 159, 160, 161, 162, 163, 166, 167, 168, 169, 170, 171, 172, 201, 202, 206, 207, 208],
    'Weather': [1, 3, 4, 5, 6, 7, 8, 9, 10, 11],
    'Food': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11],
    'Art': [1, 2, 3, 4, 5, 6, 7, 9, 11, 19, 26],
    'Jobs': [1],
    'Music': [1, 2, 4, 5, 6],
    'Geo': [2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 16, 19, 22, 24],
    'Games': [1, 4, 21, 22, 23, 33, 34, 42, 43, 44, 63, 159, 160, 218, 219, 432, 586, 587, 588, 673],
    'Transportation': [1, 2, 3, 4, 7],
    'Science': [1, 2, 3, 4, 5, 23, 27, 30, 33, 34, 35, 36, 38, 39, 43, 45, 48, 49, 51, 52, 53, 55, 56, 61, 64, 83, 84, 92],
    'FakeData': [1, 2, 3],
    'Inspiration': [15, 16, 17, 19, 21, 24, 26, 27, 32, 34, 37, 39, 43, 48, 51, 58, 59, 62],
    'OpenData': [1, 3, 4, 6, 8, 9, 10, 12, 14, 16, 17, 18, 26, 28, 52, 59, 72, 78],
    'Sports': [2, 3, 4, 5, 6, 7],
    'Government': [2, 3, 5, 6, 7, 8, 9, 14, 15, 16, 17, 21, 22, 23, 24, 25, 27, 28],
    'Calendar': [1, 2],
    'DeveloperTools': [1, 2, 3, 4, 6, 85, 86, 89, 103, 107, 110, 111, 201, 202],
    'Health': [1, 2, 3, 4, 5, 6, 7, 8, 9]
}

def get_query_number(filename):
    """Returns n for a filename starting with query_n (or query_n_y); None otherwise."""
    number = None
    hit = re.match(r'query_(\d+)', filename)
    if hit:
        number = int(hit.group(1))
    return number

def copy_selected_queries(category_path, pruned_category_path, category_name):
    """
    Copies every file in category_path whose query number is listed in
    queries_to_keep[category_name] into pruned_category_path.

    Returns the set of query numbers that were copied; the set is empty when
    the category has no entry in queries_to_keep (the directory is then not
    even listed).
    """
    selected = set()
    if category_name not in queries_to_keep:
        return selected

    wanted = queries_to_keep[category_name]
    for name in os.listdir(category_path):
        number = get_query_number(name)
        if number not in wanted:
            continue
        shutil.copyfile(
            os.path.join(category_path, name),
            os.path.join(pruned_category_path, name),
        )
        selected.add(number)

    return selected

def copy_remaining_queries(category_path, pruned_category_path, copied_queries, max_queries):
    """
    Copies additional queries until the total reaches max_queries.

    Queries are counted by unique query number, the same unit used for
    copied_queries: all files sharing one number (query_n, query_n_y, ...)
    count as a single query and are copied together. Files are visited in
    sorted filename order so the selection is the same on every run.

    Args:
        category_path: Source directory containing the query files.
        pruned_category_path: Existing destination directory.
        copied_queries: Set of query numbers already copied for this category;
            their files are skipped. Not modified.
        max_queries: Upper bound on len(copied_queries) plus the number of
            query numbers added here. If the bound is already met, nothing
            is copied.

    Returns:
        None.
    """
    # Query numbers admitted by this call (kept separate so the caller's set is untouched)
    added_queries = set()

    for query_file in sorted(os.listdir(category_path)):
        query_number = get_query_number(query_file)
        if query_number is None or query_number in copied_queries:
            continue

        if query_number not in added_queries:
            # Check the budget BEFORE copying, so a category that is already
            # at the limit does not receive one extra query.
            if len(copied_queries) + len(added_queries) >= max_queries:
                # No early exit: variants of an admitted number may sort later
                # (e.g. query_1.json < query_10.json < query_1_2.json).
                continue
            added_queries.add(query_number)

        src = os.path.join(category_path, query_file)
        dest = os.path.join(pruned_category_path, query_file)
        shutil.copyfile(src, dest)

def prune_dataset(dataset):
    """
    Build the pruned copy of one dataset under pruned_base_dir.

    For every category directory of base_dir/<dataset>, the matching output
    directory is created, the queries selected for that category are copied,
    and further queries are then added up to max_queries_to_keep.
    """
    source_root = os.path.join(base_dir, dataset)
    target_root = os.path.join(pruned_base_dir, dataset)

    # Make sure the dataset's output folder exists before processing categories
    os.makedirs(target_root, exist_ok=True)

    for category in os.listdir(source_root):
        source_dir = os.path.join(source_root, category)
        if not os.path.isdir(source_dir):
            continue  # only directories are treated as categories

        target_dir = os.path.join(target_root, category)
        os.makedirs(target_dir, exist_ok=True)

        # First the selected queries, then fill up to the per-category limit
        already_copied = copy_selected_queries(source_dir, target_dir, category)
        copy_remaining_queries(source_dir, target_dir, already_copied, max_queries_to_keep)

# Entry point: build pruned copies of the No-tools and Original datasets under pruned_base_dir
if __name__ == "__main__":
    # Create the base pruned directory
    os.makedirs(pruned_base_dir, exist_ok=True)

    # Prune the No-tools and Original datasets by copying the selected and remaining queries
    prune_dataset('No-tools')
    prune_dataset('Original')

    print("Pruning complete!")
