In [2]:
import pandas as pd
import random
import os
import json
from collections import Counter

In [3]:
# Inspect the NLQA data formats: preview the first records of the corpus file.
file_path = 'C:/Users/anton/source/data/LLM-QPP/nq/corpus.jsonl'
# lines=True parses the file as JSON Lines (one JSON object per line);
# nrows=5 stops after the first 5 records instead of loading the whole file.
df = pd.read_json(file_path, lines=True, nrows=5)  # Reads only 5 lines
print(df)

    _id              title                                               text  \
0  doc0  Minority interest  In accounting, minority interest (or non-contr...   
1  doc1  Minority interest  It is, however, possible (such as through spec...   
2  doc2  Minority interest  The reporting of 'minority interest' is a cons...   
3  doc3  Minority interest  Some investors have expressed concern that the...   
4  doc4  Minority interest  Minority interest is an integral part of the e...   

  metadata  
0       {}  
1       {}  
2       {}  
3       {}  
4       {}  


In [4]:
# Preview the first records of the queries file.
# NOTE: this rebinds the notebook-level names `file_path` and `df`.
file_path = 'C:/Users/anton/source/data/LLM-QPP/nq/queries.jsonl'
# lines=True parses the file as JSON Lines; nrows=5 limits the read to 5 records.
df = pd.read_json(file_path, lines=True, nrows=5)  # Reads only 5 lines
print(df)

     _id                                               text metadata
0  test0  what is non controlling interest on balance sheet       {}
1  test1     how many episodes are in chicago fire season 4       {}
2  test2    who sings love will keep us alive by the eagles       {}
3  test3          who is the leader of the ontario pc party       {}
4  test4    nitty gritty dirt band fishin in the dark album       {}


In [None]:
#Qrels:
#query-id	corpus-id	score
#test0	doc0	1
#test0	doc1	1
#test1	doc6	1

In [5]:
#generate a smaller subset of dataset
#select a small subset (e.g. 100 queries) from the development set
#generate a new qrels file based on the selected queries
#build the document corpus by:
# - only considering queries for which there is a qrels entry (about half are missing)
# - adding all documents marked relevant to the subset of queries in qrels
# - sampling X (e.g 10K) random documents not already in qrels
# - save queries and qrels as .tsv and the corpus as .jsonl, keeping the original qIDs and dIDs

In [24]:
import os
import pandas as pd
import json

def _read_jsonl_records(path):
    """Yield one parsed JSON object per non-empty line of ``path``; malformed lines are reported and skipped."""
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                # Blank lines (e.g. a trailing newline) carry no record.
                continue
            try:
                yield json.loads(line)
            except json.JSONDecodeError:
                print(f"Skipping malformed line: {line.strip()}")


def _to_msmarco_doc(doc):
    """Convert a corpus record into the {'docID', 'text'} layout, prepending the title to the text."""
    return {"docID": doc["_id"], "text": f"{doc['title']}. {doc['text']}"}


def generate_nlqa_subset(queries_file, qrels_file, corpus_file, output_queries_file, output_qrels_file, output_corpus_file, num_queries=100, num_random_docs=10000):
    """
    Generate a smaller subset of the NLQA dataset including queries (TSV), qrels (TSV), and documents (JSONL).
    The output corpus JSONL will align with MS MARCO format: 'docID' and 'text'.

    Steps:
      1. Load the queries and keep only those that have at least one qrels entry.
      2. Sample up to ``num_queries`` of them (fixed seed 42).
      3. Keep the qrels rows of the sampled queries.
      4. Build the corpus from every document those qrels reference plus up to
         ``num_random_docs`` randomly sampled other documents (fixed seed 42).

    Arguments:
    - queries_file: Path to the queries JSONL file (records with '_id' and 'text').
    - qrels_file: Path to the qrels TSV file; its first row is treated as a header
      and the columns are read as qID, docID, score.
    - corpus_file: Path to the corpus JSONL file (records with '_id', 'title', 'text').
    - output_queries_file: Destination TSV (no header) with columns qID, text.
    - output_qrels_file: Destination TSV (no header) with columns qID, 0, docID, score.
      The constant 0 column is inserted as the second field (presumably to match the
      TREC qrels layout -- confirm against the consumer of this file).
    - output_corpus_file: Destination JSONL with one {'docID', 'text'} object per line,
      where text is "<title>. <text>".
    - num_queries: Number of queries to sample; capped at the number of queries with qrels.
    - num_random_docs: Number of non-relevant documents to sample; capped at the number
      of documents not referenced by the selected qrels.

    Returns:
    - None. Results are written to the three output files.

    Notes:
    - All non-relevant corpus records are held in memory before sampling, so memory
      use grows with the corpus size.
    - Blank or malformed JSONL lines are skipped in both the queries and corpus files.
    """
    # Ensure every output directory exists. A bare filename has no directory
    # component, and os.makedirs("") would raise, so skip that case.
    for output_file in (output_queries_file, output_qrels_file, output_corpus_file):
        output_dir = os.path.dirname(output_file)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

    # Load the queries JSONL
    queries = pd.DataFrame(list(_read_jsonl_records(queries_file))).rename(columns={"_id": "qID"})

    # Load qrels TSV
    qrels = pd.read_csv(qrels_file, sep="\t", header=0, names=["qID", "docID", "score"], dtype={"qID": str, "docID": str, "score": int})

    # Filter queries to include only those with relevant documents in qrels
    valid_query_ids = qrels["qID"].unique()
    filtered_queries = queries[queries["qID"].isin(valid_query_ids)]

    # Sample a subset of queries; cap the size so a small dataset does not raise.
    n_queries = min(num_queries, len(filtered_queries))
    if n_queries < num_queries:
        print(f"Only {len(filtered_queries)} queries have qrels entries; sampling {n_queries} instead of {num_queries}.")
    if n_queries > 0:
        subset_queries = filtered_queries.sample(n=n_queries, random_state=42)
    else:
        subset_queries = filtered_queries.iloc[:0]
    subset_query_ids = set(subset_queries["qID"])

    # Filter qrels for selected queries. Copy so the column insertion below
    # works on an independent frame rather than a slice of `qrels`.
    subset_qrels = qrels[qrels["qID"].isin(subset_query_ids)].copy()

    # Get relevant document IDs
    relevant_doc_ids = set(subset_qrels["docID"])

    # Load and process corpus JSONL
    relevant_docs = []
    non_relevant_docs = []
    for doc in _read_jsonl_records(corpus_file):
        if doc["_id"] in relevant_doc_ids:
            relevant_docs.append(_to_msmarco_doc(doc))
        else:
            non_relevant_docs.append(doc)

    # Surface qrels entries that point at documents absent from the corpus.
    missing_doc_ids = relevant_doc_ids - {doc["docID"] for doc in relevant_docs}
    if missing_doc_ids:
        print(f"Warning: {len(missing_doc_ids)} relevant docIDs from qrels were not found in the corpus.")

    # Sample additional non-relevant documents; cap the size so a small corpus does not raise.
    n_random_docs = min(num_random_docs, len(non_relevant_docs))
    if n_random_docs < num_random_docs:
        print(f"Only {len(non_relevant_docs)} non-relevant documents available; sampling {n_random_docs} instead of {num_random_docs}.")
    if n_random_docs > 0:
        sampled_non_relevant_docs = pd.DataFrame(non_relevant_docs).sample(n=n_random_docs, random_state=42).to_dict(orient="records")
    else:
        sampled_non_relevant_docs = []

    final_corpus = relevant_docs + [_to_msmarco_doc(doc) for doc in sampled_non_relevant_docs]

    # Save output files
    # Save queries as TSV
    subset_queries[["qID", "text"]].to_csv(output_queries_file, sep="\t", index=False, header=False)
    # Save qrels as TSV
    subset_qrels.insert(1, "zero", 0)
    subset_qrels.to_csv(output_qrels_file, sep="\t", index=False, header=False)
    # Save documents as JSONL (explicit encoding, matching how the inputs are read)
    with open(output_corpus_file, 'w', encoding="utf-8") as f:
        for doc in final_corpus:
            f.write(json.dumps(doc) + "\n")

    print("NLQA subset generation complete.")

# Example usage:
# generate_nlqa_subset(
#     queries_file='path/to/queries.jsonl',
#     qrels_file='path/to/qrels.tsv',
#     corpus_file='path/to/corpus.jsonl',
#     output_queries_file='output/queries_subset.tsv',
#     output_qrels_file='output/qrels_subset.tsv',
#     output_corpus_file='output/collection_subset.jsonl',
#     num_queries=100,
#     num_random_docs=10000
# )


In [32]:
# Subset size: number of sampled queries and number of randomly sampled
# documents added on top of the documents referenced by the selected qrels.
num_queries = 100 
num_random_docs = 100000

# Directory holding the original dataset files (queries.jsonl, corpus.jsonl, qrels/test.tsv).
# NOTE: absolute, machine-specific path; other cells in this notebook also read `original_dir`.
original_dir = 'C:/Users/anton/source/data/LLM-QPP/nq'

# Generate the subset. Outputs are written to a directory relative to the
# current working directory, named after the two size parameters above.
generate_nlqa_subset(
    queries_file=f"{original_dir}/queries.jsonl",
    qrels_file=f"{original_dir}/qrels/test.tsv",
    corpus_file=f"{original_dir}/corpus.jsonl",
    output_queries_file=f"subset_q{num_queries}_d{num_random_docs}/queries.tsv",
    output_qrels_file=f"subset_q{num_queries}_d{num_random_docs}/qrels.qrels",
    output_corpus_file=f"subset_q{num_queries}_d{num_random_docs}/collection.jsonl",
    num_queries=num_queries,
    num_random_docs=num_random_docs
)

NLQA subset generation complete.


In [23]:
# Sanity-check a generated subset: read its collection JSONL file into a DataFrame.
# NOTE(review): this is an absolute path to a `subset_q100_d10000` directory, whereas the
# generation cell writes to a relative `subset_q{num_queries}_d{num_random_docs}` directory --
# confirm this is the file that was intended to be inspected.
output_documents_file = "C:/Users/anton/source/repos/llm-qpp/data/NLQA/subset_q100_d10000/collection.jsonl"
# lines=True parses the file as JSON Lines (one JSON object per line).
df_documents = pd.read_json(output_documents_file, lines=True)

# Display the first few rows of the DataFrame
print(df_documents.head())

     docID                                               text
0   doc449  Wake Island. With the annexation of Hawaii in ...
1   doc450  Wake Island. On January 17, 1899, under orders...
2   doc908  Bull. Other than the few bulls needed for bree...
3   doc916  Bull. Many cattle ranches and stations run bul...
4  doc1420  Jesse Bennett. Dr. Jesse Bennett (July 10, 176...


In [31]:
import pandas as pd
import json

def count_docids_in_corpus(corpus_file):
    """
    Report how many document IDs a corpus JSONL file contains and how many are distinct.

    Every line of the file is parsed as a JSON object and its "_id" field is
    collected. Three lines are printed: the total number of IDs, the number of
    unique IDs, and the difference between the two (the duplicates).

    Arguments:
    - corpus_file: Path to the corpus.jsonl file.

    Returns:
    - None. The counts are only printed.
    """
    seen_ids = set()
    n_total = 0

    # Stream the file once: tally the lines read and remember each distinct ID.
    with open(corpus_file, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            record = json.loads(raw_line)
            seen_ids.add(record["_id"])
            n_total += 1

    n_unique = len(seen_ids)
    n_duplicates = n_total - n_unique

    # Report
    print(f"Total number of docIDs: {n_total}")
    print(f"Number of unique docIDs: {n_unique}")
    print(f"Number of duplicate docIDs: {n_duplicates}")


In [30]:
# Check the full original corpus for duplicate document IDs.
# NOTE: relies on `original_dir`, which is assigned in the subset-generation cell;
# that cell must have been run first on a fresh kernel.
corpus_file=f"{original_dir}/corpus.jsonl"
count_docids_in_corpus(corpus_file)

Total number of docIDs: 2681468
Number of unique docIDs: 2681468
Number of duplicate docIDs: 0
