In [1]:
import csv
import json
import os
import random

import pandas as pd

In [None]:
# Generate a tiny subset of MS-MARCO:
# - select a small subset (e.g. 100 queries) from the development set
# - generate a new qrels file based on the selected queries
# - build the document corpus by:
#   - only considering queries for which there is a qrels entry (about half are missing)
#   - adding all documents marked relevant to the subset of queries in qrels
#   - sampling X (e.g. 10K) random documents not already in qrels
# - save queries and qrels in the original .tsv format and the documents as JSONL,
#   keeping the same qIDs and dIDs

In [7]:

def generate_subset(queries_file, qrels_file, documents_file, output_queries_file, output_qrels_file, output_documents_file, num_queries=100, num_random_docs=10000):
    """Generate a smaller subset of the dataset including queries, qrels, and documents.

    Only queries that have at least one qrels entry are eligible. The document
    subset is the union of every document referenced by the selected qrels and
    a random sample of documents that are not referenced by them. Sampling uses
    a fixed seed (``random_state=42``), so repeated runs on the same input
    select the same rows.

    Args:
        queries_file (str): Path to the queries TSV file (``qID<TAB>text``, no header).
        qrels_file (str): Path to the qrels TSV file
            (``qID<TAB>zero<TAB>docID<TAB>binary_relevance``, no header).
        documents_file (str): Path to the documents TSV file (``docID<TAB>text``, no header).
        output_queries_file (str): Path to save the subset queries TSV file.
        output_qrels_file (str): Path to save the subset qrels TSV file.
        output_documents_file (str): Path to save the subset documents as JSONL,
            one ``{"docID": ..., "text": ...}`` object per line.
        num_queries (int): Number of queries to include in the subset (default is 100).
            If fewer eligible queries exist, all of them are used.
        num_random_docs (int): Number of random non-relevant documents to include
            (default is 10000). If fewer are available, all of them are used.

    Returns:
        None. The three output files are written as a side effect and a
        completion message is printed.
    """
    # Create the parent directory of every output file. A bare file name has an
    # empty dirname, which os.makedirs would reject, so it is skipped.
    for output_path in (output_queries_file, output_qrels_file, output_documents_file):
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

    # Free-text TSV files are not CSV-quoted: a text that starts with a double
    # quote must be read literally instead of opening a (possibly multi-line)
    # quoted field. Default NA parsing is disabled so that a text such as
    # "null" or "NA" stays a string.
    text_tsv_options = {
        "sep": "\t",
        "header": None,
        "quoting": csv.QUOTE_NONE,
        "keep_default_na": False,
    }
    chunk_size = 1000000

    # Load the queries
    df_queries = pd.read_csv(queries_file, names=["qID", "text"], dtype={"qID": str, "text": str}, **text_tsv_options)

    # Load the qrels
    df_qrels = pd.read_csv(qrels_file, sep="\t", header=None, names=["qID", "zero", "docID", "binary_relevance"], dtype={"qID": str, "docID": str, "binary_relevance": int})

    # Filter queries to include only those that have relevant documents in qrels
    valid_query_ids = df_qrels["qID"].unique()
    df_queries = df_queries[df_queries["qID"].isin(valid_query_ids)]

    # Select a random subset of queries, never asking for more than exist
    n_queries = min(num_queries, len(df_queries))
    if n_queries < num_queries:
        print(f"Only {n_queries} queries with qrels are available; requested {num_queries}.")
    subset_queries = df_queries.sample(n=n_queries, random_state=42)
    subset_query_ids = set(subset_queries["qID"])

    # Filter qrels to include only those for the selected queries
    subset_qrels = df_qrels[df_qrels["qID"].isin(subset_query_ids)]

    # Get the set of relevant document IDs from the filtered qrels
    relevant_doc_ids = set(subset_qrels["docID"])

    # First pass over the collection: read document IDs only and split them
    # into IDs referenced by the selected qrels and all remaining IDs.
    id_chunks = pd.read_csv(documents_file, names=["docID", "text"], dtype={"docID": str}, usecols=[0], chunksize=chunk_size, **text_tsv_options)

    relevant_docs = []
    non_relevant_docs = []
    for chunk in id_chunks:
        is_relevant = chunk["docID"].isin(relevant_doc_ids)
        relevant_docs.append(chunk[is_relevant])
        non_relevant_docs.append(chunk[~is_relevant])

    # Combine all relevant documents
    df_relevant_documents = pd.concat(relevant_docs)

    # Sample additional random documents that are not already in the relevant set
    non_relevant_docs_combined = pd.concat(non_relevant_docs)
    n_random_docs = min(num_random_docs, len(non_relevant_docs_combined))
    if n_random_docs < num_random_docs:
        print(f"Only {n_random_docs} non-relevant documents are available; requested {num_random_docs}.")
    random_docs = non_relevant_docs_combined.sample(n=n_random_docs, random_state=42)

    # Second pass over the collection: load the text of the selected documents only
    doc_ids_to_load = set(df_relevant_documents["docID"]).union(random_docs["docID"])
    text_chunks = pd.read_csv(documents_file, names=["docID", "text"], dtype={"docID": str, "text": str}, chunksize=chunk_size, **text_tsv_options)

    final_documents = pd.concat(
        [chunk[chunk["docID"].isin(doc_ids_to_load)] for chunk in text_chunks]
    ).drop_duplicates(subset=["docID"])

    # Save the subset queries and qrels. Query text is written unquoted so the
    # output keeps the plain "qID<TAB>text" layout of the input file.
    subset_queries.to_csv(output_queries_file, sep="\t", index=False, header=False, quoting=csv.QUOTE_NONE)
    subset_qrels.to_csv(output_qrels_file, sep="\t", index=False, header=False)

    # Save documents to JSONL (json.dumps escapes non-ASCII, so the file is pure ASCII)
    with open(output_documents_file, 'w', encoding="utf-8") as f:
        for doc_id, text in zip(final_documents["docID"], final_documents["text"]):
            f.write(json.dumps({"docID": doc_id, "text": text}) + "\n")

    print("Subset generation complete.")



In [8]:
# Size of the subset to generate.
num_queries = 100
num_random_docs = 10000

# Directory holding the original dataset files. It can be overridden through
# the MSMARCO_ORIGINAL_DIR environment variable so the notebook is not tied to
# one machine; the previous hard-coded location remains the default.
original_dir = os.environ.get('MSMARCO_ORIGINAL_DIR', 'C:/Users/anton/source/data/LLM-QPP/MSMARCO/original')

# All three output files go into one directory named after the subset size.
subset_dir = f"subset_q{num_queries}_d{num_random_docs}"

# Example usage
generate_subset(
    queries_file=f"{original_dir}/queries/queries.dev.tsv",
    qrels_file=f"{original_dir}/qrels.dev.tsv",
    documents_file=f"{original_dir}/collection/collection.tsv",
    output_queries_file=f"{subset_dir}/queries.tsv",
    output_qrels_file=f"{subset_dir}/qrels.qrels",
    output_documents_file=f"{subset_dir}/collection.jsonl",
    num_queries=num_queries,
    num_random_docs=num_random_docs
)

Subset generation complete.


In [3]:
# Read the JSONL file into a DataFrame.
# docID is forced to str: with default dtype inference pandas converts the
# numeric-looking ID strings to int64, which would no longer match the string
# IDs used in the queries and qrels files.
# NOTE(review): this path points at a "subsetTEST_q10_d100" directory, not the
# "subset_q100_d10000" directory written by the generate_subset call above;
# confirm that this is the intended file.
output_documents_file = "subsetTEST_q10_d100/collection.jsonl"
df_documents = pd.read_json(output_documents_file, lines=True, dtype={"docID": str})

# Display the first few rows of the DataFrame
print(df_documents.head())

    docID                                               text
0  108035  Ever wonder how much people playing music on t...
1  159521  Hospitality industry. The hospitality industry...
2  175537  A tremor is a repetitive movement of a part of...
3  210222  Hardware is the physical parts of the computer...
4  319963  Chronic Neutropenia Individuals with the diagn...
