In [8]:
import itertools
import json
import random
import csv
import os

In [2]:
# Dict mapping each feature name to its domain, i.e. the collection (list, set, range) of values that feature can take:
# {f1: {f1_v1, f1_v2, ...}, f2: {...}}
# e.g. {"size": ["s", "m", "l"], "material": {"leather", "cotton", ...}}
# Note: an item might need to take on multiple values for some features, e.g. material: cotton+leather+... . Perhaps these should be LLM-generated...

# Keyword vocabulary: one list of values per feature.
toy_furniture_aspect_dict = {
    "type": ["table", "bed", "chair", "dresser", "nightstand", "couch"],
    "style": ["modern", "vintage", "classic", "beachhouse", "cottage"],
    "color": ["black", "white", "blue", "red", "purple", "green", "yellow", "orange"],
}

# Synonym vocabulary. It is positionally aligned with toy_furniture_aspect_dict:
# the entry at index i of each list is the synonym of the keyword at index i of
# the same feature (e.g. "table" -> "desk", "black" -> "charcoal"). Keep both
# dicts in the same order when editing either one.
toy_furniture_synonym_dict = {
    "type": ["desk", "bunk", "stool", "chest", "bedstand", "loveseat"],
    "style": ["sleek", "retro", "timeless", "coastal", "rustic"],
    "color": ["charcoal", "ivory", "cobalt", "burgundy", "plum", "emerald", "gold", "bronze"],
}

# Fail fast if the two vocabularies drift apart: a missing feature or a list of
# a different length would silently misalign keyword and synonym entries.
assert toy_furniture_synonym_dict.keys() == toy_furniture_aspect_dict.keys(), \
    "keyword and synonym dicts must define the same features"
assert all(
    len(toy_furniture_synonym_dict[feature]) == len(keywords)
    for feature, keywords in toy_furniture_aspect_dict.items()
), "every feature needs exactly one synonym per keyword"

In [9]:
# Function to write every combination of the aspect dictionary's values to a JSON file
def write_all_permutations(aspect_dict, key_order, filename):
    """Write the Cartesian product of the selected feature domains to a JSON file.

    One document is produced per combination of values (one value per feature
    in ``key_order``). Documents are numbered ``d1``, ``d2``, ... in the order
    ``itertools.product`` yields them, i.e. the last feature in ``key_order``
    varies fastest.

    Args:
        aspect_dict: Mapping of feature name -> iterable of values for that
            feature. Values are converted with ``str`` so non-string domains
            (e.g. a ``range``) are accepted. Use ordered collections (list,
            tuple, range) if docIDs must be reproducible; the iteration order
            of a set of strings is not stable across interpreter runs.
        key_order: Sequence of feature names that fixes both which features
            are used and the order of their values in each document's text.
        filename: Path of the JSON file to write. Missing parent directories
            are created.

    Returns:
        dict: ``{docID: {'text': "<value_1> <value_2> ..."}}`` — the same
        object that is serialized to ``filename``.

    Raises:
        KeyError: If a name in ``key_order`` is not a key of ``aspect_dict``.
    """
    aspect_values = [aspect_dict[key] for key in key_order]

    # Build {docID: {'text': ...}} with the values joined in key_order.
    data = {
        f"d{i}": {'text': " ".join(map(str, values))}
        for i, values in enumerate(itertools.product(*aspect_values), start=1)
    }

    # Create the target directory if needed. dirname is '' for a bare file
    # name, and os.makedirs('') would raise, so only call it for real paths.
    directory = os.path.dirname(filename)
    if directory:
        os.makedirs(directory, exist_ok=True)

    # Write data to JSON file
    with open(filename, 'w', encoding='utf-8') as json_file:
        json.dump(data, json_file, indent=4)

    return data

In [6]:

def write_all_permutations_ndjson(aspect_dict, key_order, filename):
    """Write the Cartesian product of the selected feature domains as NDJSON.

    One document is produced per combination of values (one value per feature
    in ``key_order``). Each document is written on its own line as
    ``{"docID": "d<i>", "text": "<value_1> <value_2> ..."}``, numbered from
    ``d1`` in the order ``itertools.product`` yields the combinations (the
    last feature in ``key_order`` varies fastest).

    Args:
        aspect_dict: Mapping of feature name -> iterable of values for that
            feature. Values are converted with ``str`` so non-string domains
            (e.g. a ``range``) are accepted. Use ordered collections (list,
            tuple, range) if docIDs must be reproducible; the iteration order
            of a set of strings is not stable across interpreter runs.
        key_order: Sequence of feature names that fixes both which features
            are used and the order of their values in each document's text.
        filename: Path of the NDJSON file to write. Missing parent
            directories are created; a bare file name (no directory part)
            is written to the current working directory.

    Returns:
        dict: ``{docID: {'text': text}}`` for every document written.

    Raises:
        KeyError: If a name in ``key_order`` is not a key of ``aspect_dict``.
    """
    aspect_values = [aspect_dict[key] for key in key_order]

    # Ensure the directory exists before writing to the file. dirname is ''
    # for a bare file name, and os.makedirs('') raises FileNotFoundError, so
    # only create a directory when the path actually names one.
    directory = os.path.dirname(filename)
    if directory:
        os.makedirs(directory, exist_ok=True)

    data = {}
    # Write each combination as a JSON object on its own line.
    with open(filename, 'w', encoding='utf-8') as ndjson_file:
        for i, values in enumerate(itertools.product(*aspect_values), start=1):
            doc_id = f"d{i}"
            text = " ".join(map(str, values))
            data[doc_id] = {'text': text}
            ndjson_file.write(json.dumps({'docID': doc_id, 'text': text}) + '\n')

    return data


In [17]:
# Build the keyword corpus: one document per (color, style, type) combination
data = write_all_permutations_ndjson(
    aspect_dict=toy_furniture_aspect_dict,
    key_order=["color", "style", "type"],
    filename='keywords/collection.jsonl',
)

# Build the synonym corpus from the synonym dictionary, using the same key order
# as for the keyword corpus above
synonym_data = write_all_permutations_ndjson(
    aspect_dict=toy_furniture_synonym_dict,
    key_order=["color", "style", "type"],
    filename='synonyms/collection.jsonl',
)

In [18]:
# Select 10 random docs to serve as (pseudo) queries - these should be trivial on the initial corpus
random.seed(72)  # fixed seed so the same documents are sampled on every run
docID_sample = random.sample(list(data.keys()), 10)
# Query ID is the sampled document's ID prefixed with "q"; query text is the document text itself
random_toy_furniture_keyword_queries = {f'q{docID}': data[docID]['text'] for docID in docID_sample}

# Write the queries to a tab-separated file, one "<qID>\t<query text>" row per query
with open('keywords/queries.tsv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file, delimiter='\t')
    writer.writerows(random_toy_furniture_keyword_queries.items())

# Generate qrels file for keyword queries
# Format: <qID> 0 <dID> <relevance>
# Each query has exactly one relevant document: the one it was sampled from.
qrels_entries = [f"q{docID} 0 {docID} 1" for docID in docID_sample]

# Write qrels to a file
filepath = "keywords/qrels.qrels"
with open(filepath, 'w', encoding='utf-8') as qrels_file:
    qrels_file.writelines(f"{entry}\n" for entry in qrels_entries)

In [20]:

# Generate synonym queries by looking up the sampled docIDs in the synonym data.
# Query IDs follow the same "q<docID>" scheme as the keyword queries.
random_toy_furniture_synonym_queries = {
    f'q{docID}': synonym_data[docID]['text'] for docID in docID_sample
}

# Write the synonym queries to a tab-separated file, one "<qID>\t<query text>" row per query
with open('synonyms/queries.tsv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file, delimiter='\t')
    writer.writerows(random_toy_furniture_synonym_queries.items())

# Generate qrels file for synonym queries
# Format: <qID> 0 <dID> <relevance>
# Each query has exactly one relevant document: the synonym document with the sampled docID.
qrels_synonym_entries = [f"q{docID} 0 {docID} 1" for docID in docID_sample]

# Write synonym qrels to a file
synonym_filepath = "synonyms/qrels.qrels"
with open(synonym_filepath, 'w', encoding='utf-8') as qrels_file:
    qrels_file.writelines(f"{entry}\n" for entry in qrels_synonym_entries)
