In [None]:
#!/usr/bin/env python3

import os
import random
import pandas as pd
from tqdm import tqdm

import chromadb
from chromadb.api.types import Documents, Metadatas

# --------------------------- CONFIG ---------------------------

# Input CSV of positive pairs; main() requires the columns `query` and `chunk_id`.
INPUT_CSV = "../LLM Caller/generated_pairs_without_commas.csv"
# Output CSV written by main() with the columns query, chunk, label.
OUTPUT_CSV = "training_pairs.csv"

# ChromaDB persistent directory. NOTE: despite the name, this value is a
# relative path, so it is resolved against the current working directory.
ABSOLUTE_DB_PATH = "../VectorDB/chroma_Data_with_BERT_embeddings"
# Name of the collection that main() opens inside that directory.
COLLECTION_NAME = "HP_Chunks_BERT_Embeddings_collection"

# Maximum number of negative (label 0) chunks sampled for each input row.
NUM_NEGATIVES_PER_QUERY = 5

# ---------------------------------------------------------------


def load_chroma_collection(path, name):
    """Open the persistent Chroma store at *path* and return collection *name*.

    Args:
        path: Directory passed to ``chromadb.PersistentClient``.
        name: Collection name passed to ``get_collection``.

    Returns:
        Whatever ``get_collection`` returns for that name.
    """
    return chromadb.PersistentClient(path=path).get_collection(name=name)


def fetch_chunk_by_id(collection, chunk_id):
    """Look up a single record by ID and return its document text.

    Args:
        collection: Object whose ``get(ids=..., include=...)`` returns a
            mapping with a ``"documents"`` list.
        chunk_id: ID of the record to fetch.

    Returns:
        The first entry of the returned ``"documents"`` list.

    Raises:
        ValueError: If the returned ``"documents"`` list is empty.
    """
    documents = collection.get(ids=[chunk_id], include=["documents"])["documents"]
    if len(documents) > 0:
        return documents[0]
    raise ValueError(f"Chunk ID {chunk_id} not found in ChromaDB.")


def get_all_chunk_ids(collection):
    """Return the IDs of every record whose metadata flags it as a chunk.

    Args:
        collection: Object whose ``get(include=["metadatas"])`` returns a
            mapping with parallel ``"ids"`` and ``"metadatas"`` lists.

    Returns:
        A list of IDs, in the order ``collection.get`` returned them, for
        the records whose metadata has ``ischunk`` set to exactly ``True``.
    """
    results = collection.get(include=["metadatas"])
    # A metadata entry may be None instead of a dict (presumably for records
    # stored without metadata -- confirm for the Chroma version in use).
    # Such records are treated as "not a chunk" rather than crashing on
    # `None.get`.
    return [
        record_id
        for record_id, meta in zip(results["ids"], results["metadatas"])
        if meta and meta.get("ischunk") is True
    ]


def _sample_negative_ids(all_chunk_ids, positive_id, k):
    """Return up to *k* distinct IDs drawn uniformly, never *positive_id*."""
    # Draw one spare candidate: if the positive is among the draws it is
    # dropped and k remain; otherwise the spare is trimmed off. Either way
    # the result is a uniform sample of the non-positive IDs, without
    # rebuilding a filtered copy of the whole ID list for every query.
    candidates = random.sample(all_chunk_ids, min(k + 1, len(all_chunk_ids)))
    return [cid for cid in candidates if cid != positive_id][:k]


def main():
    """Build a labelled (query, chunk, label) training CSV from positive pairs.

    Reads INPUT_CSV (columns ``query`` and ``chunk_id``), loads the text of
    every chunk in the Chroma collection, and for each input row emits one
    positive pair (label 1) plus up to NUM_NEGATIVES_PER_QUERY randomly
    sampled negative pairs (label 0). The result is written to OUTPUT_CSV.

    Negative sampling uses the global ``random`` state, which is not seeded
    here, so the output differs between runs.

    Raises:
        ValueError: If the CSV lacks a required column, or a row references
            a chunk ID that is not among the collection's chunk records.
    """
    print("[INFO] Loading CSV...")
    # Chroma IDs are strings; forcing str stops pandas from inferring
    # numeric-looking IDs as integers, which would never match the lookup.
    df = pd.read_csv(INPUT_CSV, dtype={"chunk_id": str})
    # Explicit raise instead of `assert`, which is stripped under `python -O`.
    if not {"query", "chunk_id"} <= set(df.columns):
        raise ValueError("CSV must contain: query, chunk_id")

    print("[INFO] Connecting to ChromaDB...")
    collection = load_chroma_collection(ABSOLUTE_DB_PATH, COLLECTION_NAME)

    # Gather all chunk IDs
    print("[INFO] Fetching all chunk IDs from Chroma...")
    all_chunk_ids = get_all_chunk_ids(collection)

    # For fast lookup, build a dict of id -> chunk_text
    print("[INFO] Building chunk lookup table...")
    id_to_doc = {}
    for cid in tqdm(all_chunk_ids, desc="Loading chunks"):
        id_to_doc[cid] = fetch_chunk_by_id(collection, cid)

    print(f"[INFO] Total chunks: {len(id_to_doc)}")

    final_rows = []

    print("[INFO] Generating positive and negative pairs...")
    pairs = zip(df["query"], df["chunk_id"])
    for query, positive_id in tqdm(pairs, total=len(df)):
        # -------------------- POSITIVE PAIR --------------------
        if positive_id not in id_to_doc:
            raise ValueError(f"Positive chunk ID {positive_id} missing from Chroma.")

        final_rows.append({
            "query": query,
            "chunk": id_to_doc[positive_id],
            "label": 1,
        })

        # -------------------- NEGATIVE PAIRS --------------------
        neg_ids = _sample_negative_ids(
            all_chunk_ids, positive_id, NUM_NEGATIVES_PER_QUERY
        )
        final_rows.extend(
            {"query": query, "chunk": id_to_doc[neg_id], "label": 0}
            for neg_id in neg_ids
        )

    # Save final CSV
    output_df = pd.DataFrame(final_rows)
    output_df.to_csv(OUTPUT_CSV, index=False)

    print(f"[INFO] Done! Saved {len(output_df)} pairs to {OUTPUT_CSV}")


# Run the pipeline only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()

[INFO] Loading CSV...
[INFO] Connecting to ChromaDB...
[INFO] Fetching all chunk IDs from Chroma...
[INFO] Building chunk lookup table...


Loading chunks: 100%|██████████| 4014/4014 [00:04<00:00, 893.74it/s]


[INFO] Total chunks: 4014
[INFO] Generating positive and negative pairs...


100%|██████████| 20067/20067 [00:05<00:00, 3881.66it/s]


[INFO] Done! Saved 120402 pairs to training_pairs.csv
