In [11]:
import json
import random
import csv
import pandas as pd
from pathlib import Path

# Paths
try:
    base_dir = Path(__file__).resolve().parent
except NameError:
    # __file__ is not defined in some notebook environments; fall back to CWD
    base_dir = Path.cwd()

sources_dir = base_dir / "sources"
sampled_dir = base_dir / "sampled"

vacos_path = sources_dir / "VACOS_NLQ_data.jsonl"
debateqa_path = sources_dir / "DebateQA_dev.jsonl"
hotpot_path = sources_dir / "hotpot_dev_distractor_v1.json"
pinocchio_realworld_path = sources_dir / "Pinocchios_Lite" / "realworld.jsonl"
pinocchio_domain_path = sources_dir / "Pinocchios_Lite" / "domainSpecific.jsonl"
quora_path = sources_dir / "quora_questions_pairs_test.csv"

# Target total number of queries to collect
TARGET_TOTAL = 1000

# Relative weights for each dataset (can adjust these)
vacos_weight = 1.0
debateqa_weight = 1.0
hotpot_weight = 1.0
pinocchio_weight = 1.0
quora_weight = 1.0

# Mapping from source file to canonical dataset name
SOURCE_NAME_MAP = {
    "VACOS_NLQ_data.jsonl": "VACOS",
    "DebateQA_dev.jsonl": "DebateQA",
    "hotpot_dev_distractor_v1.json": "HotpotQA",
    "Pinocchios_Lite/realworld.jsonl": "Pinocchios",
    "Pinocchios_Lite/domainSpecific.jsonl": "Pinocchios",
    "quora_questions_pairs_test.csv": "QuoraQuestions",
}


In [12]:
# Count how many entries each source dataset has

def count_jsonl_lines(path: Path) -> int:
    n = 0
    with path.open("r", encoding="utf-8") as f:
        for line in f:
            if line.strip():
                n += 1
    return n


def count_hotpot_items(path: Path) -> int:
    """Count number of top-level items in hotpot_dev_distractor_v1.json.

    Handles common structures:
    - top-level list
    - dict with a "data" list
    """
    with path.open("r", encoding="utf-8") as f:
        data = json.load(f)
    if isinstance(data, list):
        return len(data)
    if isinstance(data, dict):
        if "data" in data and isinstance(data["data"], list):
            return len(data["data"])
        return 1
    return 0


def count_csv_rows(path: Path, has_header: bool = True) -> int:
    n = 0
    with path.open("r", encoding="utf-8") as f:
        for _ in f:
            n += 1
    if has_header and n > 0:
        n -= 1
    return n


# Get dataset sizes
vacos_count = count_jsonl_lines(vacos_path)
debateqa_count = count_jsonl_lines(debateqa_path)
hotpot_count = count_hotpot_items(hotpot_path)
pinocchio_realworld_count = count_jsonl_lines(pinocchio_realworld_path)
pinocchio_domain_count = count_jsonl_lines(pinocchio_domain_path)
pinocchio_count = pinocchio_realworld_count + pinocchio_domain_count
quora_count = count_csv_rows(quora_path)

print("VACOS_NLQ_data.jsonl rows:", vacos_count)
print("DebateQA_dev.jsonl rows:", debateqa_count)
print("hotpot_dev_distractor_v1.json items:", hotpot_count)

print("Pinocchios_Lite/realworld.jsonl rows:", pinocchio_realworld_count)
print("Pinocchios_Lite/domainSpecific.jsonl rows:", pinocchio_domain_count)
print("Total Pinocchios rows:", pinocchio_count)

print("quora_questions_pairs_test.csv rows:", quora_count)

# Compute per-dataset target sizes based on weights
total_weight = vacos_weight + debateqa_weight + hotpot_weight + pinocchio_weight + quora_weight

raw_vacos_target = int(TARGET_TOTAL * vacos_weight / total_weight)
raw_debateqa_target = int(TARGET_TOTAL * debateqa_weight / total_weight)
raw_hotpot_target = int(TARGET_TOTAL * hotpot_weight / total_weight)
raw_pinocchio_target = int(TARGET_TOTAL * pinocchio_weight / total_weight)
# Last one takes the remainder to ensure we sum to TARGET_TOTAL
raw_quora_target = TARGET_TOTAL - (raw_vacos_target + raw_debateqa_target + raw_hotpot_target + raw_pinocchio_target)

# Clamp by available counts
vacos_target = min(raw_vacos_target, vacos_count)
debateqa_target = min(raw_debateqa_target, debateqa_count)
hotpot_target = min(raw_hotpot_target, hotpot_count)
pinocchio_target = min(raw_pinocchio_target, pinocchio_count)
quora_target = min(raw_quora_target, quora_count)

print("Planned sample sizes (vacos, debateqa, hotpot, pinocchio, quora):",
      vacos_target, debateqa_target, hotpot_target, pinocchio_target, quora_target)
print("Sum of planned samples:",
      vacos_target + debateqa_target + hotpot_target + pinocchio_target + quora_target)


VACOS_NLQ_data.jsonl rows: 3540
DebateQA_dev.jsonl rows: 1941
hotpot_dev_distractor_v1.json items: 7405
Pinocchios_Lite/realworld.jsonl rows: 200
Pinocchios_Lite/domainSpecific.jsonl rows: 200
Total Pinocchios rows: 400
quora_questions_pairs_test.csv rows: 2345805
Planned sample sizes (vacos, debateqa, hotpot, pinocchio, quora): 200 200 200 200 200
Sum of planned samples: 1000


In [13]:
# Extract and sample from VACOS_NLQ_data.jsonl (field: "text")

VACOS_SUFFIX = " Please provide some product recommendations for me"


def strip_outer_double_quotes(text: str) -> str:
    if not isinstance(text, str):
        return text
    text = text.strip()
    # Remove matching outer quote pairs repeatedly
    while len(text) >= 2 and text[0] == '"' and text[-1] == '"':
        text = text[1:-1].strip()
    # Finally, drop any remaining double quotes inside
    text = text.replace('"', "")
    return text


def load_sample_vacos(path: Path, target_n: int, seed: int = 42) -> pd.DataFrame:
    random.seed(seed)
    texts = []
    with path.open("r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            obj = json.loads(line)
            if "text" in obj:
                cleaned = strip_outer_double_quotes(obj["text"])
                texts.append(cleaned)
    n_total = len(texts)
    n_sample = min(max(1, target_n), n_total)
    sampled = random.sample(texts, n_sample)
    # Append suffix to each VACOS query
    sampled = [s + VACOS_SUFFIX for s in sampled]
    return pd.DataFrame({"query": sampled})


vacos_df = load_sample_vacos(vacos_path, vacos_target)
vacos_df.to_csv(sampled_dir / "VACOS_NLQ_sampled.csv", index=False, quoting=csv.QUOTE_ALL)
vacos_df.head()


Unnamed: 0,query
0,"It's a simple, light bomber-style jacket has a..."
1,"This laptop will have High spec, good memory, ..."
2,I would order the newest macbook pro and proba...
3,"I'll get something loose, and easy to move aro..."
4,I want an apple laptop which is fast and relia...


In [14]:
# Extract and sample from DebateQA_dev.jsonl (field: "question")


def load_sample_debateqa(path: Path, target_n: int, seed: int = 42) -> pd.DataFrame:
    random.seed(seed)
    questions = []
    with path.open("r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            obj = json.loads(line)
            if "question" in obj:
                questions.append(obj["question"])
    n_total = len(questions)
    n_sample = min(max(1, target_n), n_total)
    sampled = random.sample(questions, n_sample)
    return pd.DataFrame({"query": sampled})


debateqa_df = load_sample_debateqa(debateqa_path, debateqa_target)
debateqa_df.to_csv(sampled_dir / "DebateQA_sampled.csv", index=False, quoting=csv.QUOTE_ALL)
debateqa_df.head()


Unnamed: 0,query
0,Are women more demanding these days? If so wha...
1,What is the worst that could happen with Trump...
2,Should growth hormone and antibiotic use in li...
3,When does political correctness become lies an...
4,"How could religion teach it's members peace, g..."


In [15]:
# Extract and sample from hotpot_dev_distractor_v1.json (field: "questions")


def load_sample_hotpot(path: Path, target_n: int, seed: int = 42) -> pd.DataFrame:
    """Load HotpotQA-style JSON and sample from its questions.

    This function is robust to two common structures:
    - Top-level list of objects, each with a "questions" or "question" field
    - Top-level dict with a "data" list inside
    """
    random.seed(seed)
    questions = []
    with path.open("r", encoding="utf-8") as f:
        data = json.load(f)

    def extract_q(obj):
        if isinstance(obj, dict):
            if "questions" in obj and isinstance(obj["questions"], list):
                for q in obj["questions"]:
                    if isinstance(q, str):
                        questions.append(q)
            elif "question" in obj and isinstance(obj["question"], str):
                questions.append(obj["question"])

    if isinstance(data, list):
        for item in data:
            extract_q(item)
    elif isinstance(data, dict):
        # Common HotpotQA format: {"data": [...]} or flat dict with questions
        if "data" in data and isinstance(data["data"], list):
            for item in data["data"]:
                extract_q(item)
        else:
            extract_q(data)

    n_total = len(questions)
    if n_total == 0:
        raise ValueError("No questions found in hotpot_dev_distractor_v1.json")
    n_sample = min(max(1, target_n), n_total)
    sampled = random.sample(questions, n_sample)
    return pd.DataFrame({"query": sampled})


hotpot_df = load_sample_hotpot(hotpot_path, hotpot_target)
hotpot_df.to_csv(sampled_dir / "hotpot_dev_distractor_sampled.csv", index=False, quoting=csv.QUOTE_ALL)
hotpot_df.head()


Unnamed: 0,query
0,In what year was the university where Sergei A...
1,Black Book starred the actress and writer of w...
2,Which actor does American Beauty and American ...
3,Ken Pruitt was a Republican member of an uppe...
4,"Between Greyia and Calibanus, which genus cont..."


In [16]:
# Extract and sample from Pinocchios_Lite realworld + domainSpecific (field: "claim")


def load_sample_pinocchio(paths, target_n: int, seed: int = 42) -> pd.DataFrame:
    random.seed(seed)
    claims = []
    for path in paths:
        with path.open("r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                obj = json.loads(line)
                if "claim" in obj:
                    claims.append(obj["claim"])
    n_total = len(claims)
    n_sample = min(max(1, target_n), n_total)
    sampled = random.sample(claims, n_sample)
    return pd.DataFrame({"query": sampled})


pinocchio_df = load_sample_pinocchio([pinocchio_realworld_path, pinocchio_domain_path], pinocchio_target)
pinocchio_df.to_csv(sampled_dir / "Pinocchios_realworld_sampled.csv", index=False, quoting=csv.QUOTE_ALL)
pinocchio_df.head()


Unnamed: 0,query
0,Is Bone Marrow transplantation used to treat a...
1,Did the entire state of Florida lead the natio...
2,Are ninety six percent of employers in the Gra...
3,Does JMJD6 slow down the hydroxylation of C-te...
4,Does Oregon remain among the top five hungries...


In [17]:
# Extract and sample from quora_questions_pairs_test.csv (field: "question2")


def load_sample_quora(path: Path, target_n: int, seed: int = 42) -> pd.DataFrame:
    # Only load the needed column for efficiency
    df = pd.read_csv(path, usecols=["question2"])
    total = len(df)
    n_sample = min(max(1, target_n), total)
    sampled = df["question2"].sample(n=n_sample, random_state=seed).reset_index(drop=True)
    return pd.DataFrame({"query": sampled})


quora_df = load_sample_quora(quora_path, quora_target)
quora_df.to_csv(sampled_dir / "Quora_questions_sampled.csv", index=False, quoting=csv.QUOTE_ALL)
quora_df.head()


Unnamed: 0,query
0,Can I be source of energy that create universe?
1,Is email marketing good for small businesses?
2,"If there I determine when to use had, have, ha..."
3,How do two or more alters become a single alte...
4,What do feel pain when leaves are cut?


In [None]:
# Combine sampled queries into a single DataFrame and save

def normalize_query_text(text: str) -> str:
    if not isinstance(text, str):
        return text
    # Strip leading/trailing whitespace
    text = text.strip()
    # Remove matching outer double-quote pairs repeatedly
    while len(text) >= 2 and text[0] == '"' and text[-1] == '"':
        text = text[1:-1].strip()
    # Finally, drop any remaining double quotes inside so CSV only adds the outer layer
    text = text.replace('"', "")
    return text


combined_df = pd.concat(
    [
        vacos_df.assign(source=SOURCE_NAME_MAP["VACOS_NLQ_data.jsonl"]),
        debateqa_df.assign(source=SOURCE_NAME_MAP["DebateQA_dev.jsonl"]),
        hotpot_df.assign(source=SOURCE_NAME_MAP["hotpot_dev_distractor_v1.json"]),
        pinocchio_df.assign(source=SOURCE_NAME_MAP["Pinocchios_Lite/realworld.jsonl"]),
        quora_df.assign(source=SOURCE_NAME_MAP["quora_questions_pairs_test.csv"]),
    ],
    ignore_index=True,
)

# Normalize query text to avoid extra layers of quotes inside the CSV
combined_df["query"] = combined_df["query"].apply(normalize_query_text)

print("Total queries collected:", len(combined_df))

# Write CSV manually so that header is unquoted and all data fields are quoted once
output_path = base_dir / "combined_queries_1000.csv"
with output_path.open("w", encoding="utf-8", newline="") as f:
    # Header without quotes
    f.write("query,source\n")
    # All data fields fully quoted
    writer = csv.writer(f, quoting=csv.QUOTE_ALL)
    for row in combined_df.itertuples(index=False):
        writer.writerow([row.query, row.source])

combined_df.head()


Total queries collected: 1000


Unnamed: 0,query,source
0,"It's a simple, light bomber-style jacket has a...",VACOS
1,"This laptop will have High spec, good memory, ...",VACOS
2,I would order the newest macbook pro and proba...,VACOS
3,"I'll get something loose, and easy to move aro...",VACOS
4,I want an apple laptop which is fast and relia...,VACOS
