1st Attempt

In [None]:
#!/usr/bin/env python3

import os
import time
import pandas as pd
from tqdm import tqdm
from typing import List
from datetime import datetime

import chromadb
from chromadb.api.types import Documents, Metadatas
from google import genai

# -------------------- Configuration --------------------
# NOTE: despite its name this is a relative path; it is resolved against the
# current working directory of the process.
ABSOLUTE_DB_PATH = "../VectorDB/chroma_Data"
COLLECTION_NAME = "harry_potter_collection"

# SECURITY: the API key is read from the environment instead of being stored
# in the source. A literal key used to be committed on this line; treat it as
# compromised and revoke/rotate it. An empty string means "no key configured".
API_KEY = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY", "")

BATCH_SIZE = 5                 # Smaller batch to respect RPM
NUM_QUERIES_PER_CHUNK = 5      # Queries requested from the model per chunk
GEMINI_MODEL = "gemini-2.0-flash"
OUTPUT_CSV = "generated_pairs.csv"

# Gemini 2.0 Flash rate limits
MAX_RPM = 15                   # Requests per minute
MAX_RPD = 200                  # Requests per day

# Derived sleep time between requests in seconds
SECONDS_PER_REQUEST = 60 / MAX_RPM

# -------------------- Gemini Utilities --------------------
def init_gemini_client(api_key: str):
    """Build a Gemini client authenticated with ``api_key``.

    Before the client is constructed, the key is also exported to the
    ``GENAI_API_KEY`` environment variable of the current process.
    """
    os.environ.update(GENAI_API_KEY=api_key)
    gemini_client = genai.Client(api_key=api_key)
    return gemini_client

def ask_gemini_generate_queries(client, chunk_text: str, chunk_id: str, num_queries: int) -> List[str]:
    """
    Ask Gemini to generate user-style queries for a given chunk.
    Includes few-shot examples to ensure short, natural queries.

    Args:
        client: Object exposing ``models.generate_content(model=..., contents=...)``.
        chunk_text: Text of the chunk the queries should be about.
        chunk_id: Identifier of the chunk; only interpolated into the prompt.
        num_queries: Number of queries requested; also the maximum returned.

    Returns:
        Up to ``num_queries`` cleaned query strings. List markers and quotes
        wrapping a whole query are removed; blank lines, bare bullets and
        header lines ending in ":" are dropped. Returns an empty list when
        the response has no text.
    """
    import re  # local import so this block does not rely on file-level imports

    prompt = f"""
You are an AI that generates realistic search queries a user might input to an LLM or search system.
Each query should be short, relevant, and reflect what someone might actually ask.

Here are a few examples:

Example 1:
Chunk: "Harry receives his first letter from Hogwarts, but Uncle Vernon tries to stop him."
Queries:
- "How did Harry get his Hogwarts letter?"
- "Why did Uncle Vernon hide Harry's letter?"
- "First Hogwarts letter incident"

Example 2:
Chunk: "Hagrid visits Harry to explain that he is a wizard."
Queries:
- "Who is Hagrid and why did he visit Harry?"
- "How did Harry find out he is a wizard?"
- "Hagrid tells Harry he's a wizard"

Now, generate {num_queries} short user queries for the following chunk:
Chunk ID: {chunk_id}
Chunk Text: "{chunk_text}"
Queries:
- 
"""
    response = client.models.generate_content(
        model=GEMINI_MODEL,
        contents=prompt
    )

    # Guard against a response without text (``text`` being None) instead of
    # failing on ``None.strip()``.
    text = (response.text or "").strip()

    # Leading list markers: "-", "*", "•" or a number such as "1." / "2)".
    list_marker = re.compile(r"^(?:[-*\u2022]+|\d+[.)])\s*")
    double_quotes = "\"\u201c\u201d"

    queries: List[str] = []
    for raw_line in text.splitlines():
        candidate = list_marker.sub("", raw_line.strip()).strip()

        # The few-shot examples show quoted queries, so the model tends to
        # quote its answers too. Unwrap only when the quotes enclose the whole
        # line and no other double quote appears inside.
        if (
            len(candidate) >= 2
            and candidate[0] in double_quotes
            and candidate[-1] in double_quotes
            and not any(ch in double_quotes for ch in candidate[1:-1])
        ):
            candidate = candidate[1:-1].strip()

        # Skip blanks, bare bullets and header lines such as "Queries:".
        if not candidate or candidate.endswith(":"):
            continue
        queries.append(candidate)

    return queries[:num_queries]

# -------------------- Main workflow --------------------
def main():
    """Generate query/chunk pairs for every stored chunk and write them to CSV.

    Steps:
      1. Open the persistent Chroma collection and keep the documents whose
         metadata has ``ischunk`` set to ``True``.
      2. Ask Gemini for ``NUM_QUERIES_PER_CHUNK`` queries per chunk, pausing
         ``SECONDS_PER_REQUEST`` after every attempt and stopping for good
         once ``MAX_RPD`` requests have been attempted.
      3. Write one ``query,chunk_id`` row per generated query to ``OUTPUT_CSV``.
    """
    # Initialize persistent ChromaDB client
    client_db = chromadb.PersistentClient(path=ABSOLUTE_DB_PATH)
    print(f"[INFO] ChromaDB client initialized at: {ABSOLUTE_DB_PATH}")

    # Access the existing collection directly
    collection = client_db.get_collection(name=COLLECTION_NAME)
    print(f"[INFO] Using existing collection: {COLLECTION_NAME}")

    # Fetch all documents and their metadata
    results = collection.get(include=["documents", "metadatas"])
    chunks = [
        {"id": meta["id"], "text": doc}  # Use ID stored in metadata
        for doc, meta in zip(results["documents"], results["metadatas"])
        # ``meta and`` skips entries without metadata instead of crashing.
        if meta and meta.get("ischunk") is True
    ]
    print(f"[INFO] Found {len(chunks)} chunks (ischunk=True)")

    # Initialize Gemini client
    gemini = init_gemini_client(API_KEY)

    all_pairs = []
    total_requests_today = 0
    limit_reached = False

    for i in tqdm(range(0, len(chunks), BATCH_SIZE), desc="Processing chunk-batches"):
        batch = chunks[i : i + BATCH_SIZE]

        for chunk in batch:
            if total_requests_today >= MAX_RPD:
                print(f"[INFO] Reached daily limit of {MAX_RPD} requests. Stopping.")
                limit_reached = True
                break

            # Count the attempt up front so failed calls also use up the
            # daily budget (conservative: a failing API cannot be retried
            # without bound).
            total_requests_today += 1
            try:
                queries = ask_gemini_generate_queries(
                    gemini, chunk["text"], chunk["id"], NUM_QUERIES_PER_CHUNK
                )
            except Exception as e:
                # Best effort: one failing chunk must not abort the whole run.
                print(f"[ERROR] Failed to generate for chunk {chunk['id']}: {e}")
            else:
                all_pairs.extend(
                    {"query": q, "chunk_id": chunk["id"]} for q in queries
                )

            # Sleep to respect RPM. This runs after failures too, so errors
            # do not lead to back-to-back requests.
            time.sleep(SECONDS_PER_REQUEST)

        if limit_reached:
            # Leave the outer loop as well; otherwise every remaining batch
            # would print the limit message again and sleep.
            break

        # Optional: extra pause between batches
        time.sleep(1)

    # Save results. Explicit columns keep the CSV header even when no pair
    # was generated.
    df = pd.DataFrame(all_pairs, columns=["query", "chunk_id"])
    df.to_csv(OUTPUT_CSV, index=False)
    print(f"[INFO] Saved {len(df)} query-chunk pairs to {OUTPUT_CSV}")

# -------------------- Run --------------------
if __name__ == "__main__":
    main()

2nd Attempt: Gemini with round-robin API key rotation

In [None]:
#!/usr/bin/env python3

import os
import time
import pandas as pd
from tqdm import tqdm
from typing import List
from datetime import datetime

import chromadb
from chromadb.api.types import Documents, Metadatas
from google import genai

# ==================== Settings ====================
# Where the persistent Chroma store lives and which collection to read.
ABSOLUTE_DB_PATH = "../VectorDB/chroma_Data"
COLLECTION_NAME = "harry_potter_collection"

# Five API keys, handed out one request at a time (round-robin).
API_KEYS = ["API_KEY_1", "API_KEY_2", "API_KEY_3", "API_KEY_4", "API_KEY_5"]

# Generation settings.
GEMINI_MODEL = "gemini-2.0-flash"
NUM_QUERIES_PER_CHUNK = 5
BATCH_SIZE = 5  # kept small to respect RPM
OUTPUT_CSV = "generated_pairs.csv"

# Gemini 2.0 Flash rate limits: requests per minute / requests per day.
MAX_RPM, MAX_RPD = 15, 200

# Pause between two consecutive requests, in seconds (derived from MAX_RPM).
SECONDS_PER_REQUEST = 60 / MAX_RPM

# -------------------- Gemini Utilities --------------------
def init_gemini_client(api_key: str):
    """Return a Gemini client that authenticates with ``api_key``.

    As a side effect the key is stored in the ``GENAI_API_KEY`` environment
    variable of the running process before the client is created.
    """
    os.environ.update(GENAI_API_KEY=api_key)
    new_client = genai.Client(api_key=api_key)
    return new_client

# Round-robin state
rr_index = 0          # position in API_KEYS of the key to hand out next
_client_cache = {}    # API key -> client already built for that key

def get_next_client():
    """
    Returns a Gemini client using the next API key in round robin fashion.

    One client is built per key and reused afterwards, so
    ``init_gemini_client`` runs once per key rather than once per request.

    Raises:
        IndexError: if ``API_KEYS`` is empty.
    """
    global rr_index
    if not API_KEYS:
        # Same exception type an empty list raised before, with a message
        # that names the cause.
        raise IndexError("API_KEYS is empty; configure at least one API key")

    # The modulo keeps the lookup valid even if API_KEYS shrank after
    # rr_index was last advanced.
    key = API_KEYS[rr_index % len(API_KEYS)]
    rr_index = (rr_index + 1) % len(API_KEYS)

    if key not in _client_cache:
        _client_cache[key] = init_gemini_client(key)
    return _client_cache[key]

def ask_gemini_generate_queries(client, chunk_text: str, chunk_id: str, num_queries: int) -> List[str]:
    """
    Ask Gemini to generate user-style queries for a given chunk.
    Includes few-shot examples to ensure short, natural queries.

    Args:
        client: Object exposing ``models.generate_content(model=..., contents=...)``.
        chunk_text: Text of the chunk the queries should be about.
        chunk_id: Identifier of the chunk; only interpolated into the prompt.
        num_queries: Number of queries requested; also the maximum returned.

    Returns:
        Up to ``num_queries`` cleaned query strings. List markers and quotes
        wrapping a whole query are removed; blank lines, bare bullets and
        header lines ending in ":" are dropped. Returns an empty list when
        the response has no text.
    """
    import re  # local import so this block does not rely on file-level imports

    prompt = f"""
You are an AI that generates realistic search queries a user might input to an LLM or search system.
Each query should be short, relevant, and reflect what someone might actually ask.

Here are a few examples:

Example 1:
Chunk: "Harry receives his first letter from Hogwarts, but Uncle Vernon tries to stop him."
Queries:
- "How did Harry get his Hogwarts letter?"
- "Why did Uncle Vernon hide Harry's letter?"
- "First Hogwarts letter incident"

Example 2:
Chunk: "Hagrid visits Harry to explain that he is a wizard."
Queries:
- "Who is Hagrid and why did he visit Harry?"
- "How did Harry find out he is a wizard?"
- "Hagrid tells Harry he's a wizard"

Now, generate {num_queries} short user queries for the following chunk:
Chunk ID: {chunk_id}
Chunk Text: "{chunk_text}"
Queries:
- 
"""
    response = client.models.generate_content(
        model=GEMINI_MODEL,
        contents=prompt
    )

    # Guard against a response without text (``text`` being None) instead of
    # failing on ``None.strip()``.
    text = (response.text or "").strip()

    # Leading list markers: "-", "*", "•" or a number such as "1." / "2)".
    list_marker = re.compile(r"^(?:[-*\u2022]+|\d+[.)])\s*")
    double_quotes = "\"\u201c\u201d"

    queries: List[str] = []
    for raw_line in text.splitlines():
        candidate = list_marker.sub("", raw_line.strip()).strip()

        # The few-shot examples show quoted queries, so the model tends to
        # quote its answers too. Unwrap only when the quotes enclose the whole
        # line and no other double quote appears inside.
        if (
            len(candidate) >= 2
            and candidate[0] in double_quotes
            and candidate[-1] in double_quotes
            and not any(ch in double_quotes for ch in candidate[1:-1])
        ):
            candidate = candidate[1:-1].strip()

        # Skip blanks, bare bullets and header lines such as "Queries:".
        if not candidate or candidate.endswith(":"):
            continue
        queries.append(candidate)

    return queries[:num_queries]

# -------------------- Main workflow --------------------
def main():
    """Generate query/chunk pairs with round-robin Gemini keys and save them.

    Steps:
      1. Open the persistent Chroma collection and keep the documents whose
         metadata has ``ischunk`` set to ``True``.
      2. For every chunk, take the next client from ``get_next_client`` and
         ask for ``NUM_QUERIES_PER_CHUNK`` queries, pausing
         ``SECONDS_PER_REQUEST`` after every attempt and stopping for good
         once ``MAX_RPD`` requests have been attempted.
      3. Write one ``query,chunk_id`` row per generated query to ``OUTPUT_CSV``.
    """
    # Initialize persistent ChromaDB client
    client_db = chromadb.PersistentClient(path=ABSOLUTE_DB_PATH)
    print(f"[INFO] ChromaDB client initialized at: {ABSOLUTE_DB_PATH}")

    # Access the existing collection directly
    collection = client_db.get_collection(name=COLLECTION_NAME)
    print(f"[INFO] Using existing collection: {COLLECTION_NAME}")

    # Fetch all documents and their metadata
    results = collection.get(include=["documents", "metadatas"])
    chunks = [
        {"id": meta["id"], "text": doc}  # Use ID stored in metadata
        for doc, meta in zip(results["documents"], results["metadatas"])
        # ``meta and`` skips entries without metadata instead of crashing.
        if meta and meta.get("ischunk") is True
    ]
    print(f"[INFO] Found {len(chunks)} chunks (ischunk=True)")

    all_pairs = []
    # NOTE(review): this budget counts attempts across all keys together,
    # not per key. Confirm that is the intended meaning of MAX_RPD here.
    total_requests_today = 0
    limit_reached = False

    for i in tqdm(range(0, len(chunks), BATCH_SIZE), desc="Processing chunk-batches"):
        batch = chunks[i : i + BATCH_SIZE]

        for chunk in batch:
            if total_requests_today >= MAX_RPD:
                print(f"[INFO] Reached daily limit of {MAX_RPD} requests. Stopping.")
                limit_reached = True
                break

            # ***** Get next Gemini client using round robin *****
            gemini = get_next_client()

            # Count the attempt up front so failed calls also use up the
            # daily budget (conservative: a failing API cannot be retried
            # without bound).
            total_requests_today += 1
            try:
                queries = ask_gemini_generate_queries(
                    gemini, chunk["text"], chunk["id"], NUM_QUERIES_PER_CHUNK
                )
            except Exception as e:
                # Best effort: one failing chunk must not abort the whole run.
                print(f"[ERROR] Failed to generate for chunk {chunk['id']}: {e}")
            else:
                all_pairs.extend(
                    {"query": q, "chunk_id": chunk["id"]} for q in queries
                )

            # Sleep to respect RPM. This runs after failures too, so errors
            # do not lead to back-to-back requests.
            time.sleep(SECONDS_PER_REQUEST)

        if limit_reached:
            # Leave the outer loop as well; otherwise every remaining batch
            # would print the limit message again and sleep.
            break

        # Optional: extra pause between batches
        time.sleep(1)

    # Save results. Explicit columns keep the CSV header even when no pair
    # was generated.
    df = pd.DataFrame(all_pairs, columns=["query", "chunk_id"])
    df.to_csv(OUTPUT_CSV, index=False)
    print(f"[INFO] Saved {len(df)} query-chunk pairs to {OUTPUT_CSV}")

# -------------------- Run --------------------
if __name__ == "__main__":
    main()

3rd Attempt: Groq with round-robin API key rotation

In [None]:
#!/usr/bin/env python3

import os
import time
import pandas as pd
from tqdm import tqdm
from typing import List
from datetime import datetime

import chromadb
from chromadb.api.types import Documents, Metadatas
from groq import Groq

# ==================== Settings ====================
# Where the persistent Chroma store lives and which collection to read.
ABSOLUTE_DB_PATH = "../VectorDB/chroma_Data"
COLLECTION_NAME = "harry_potter_collection"

# Five Groq API keys, handed out one request at a time (round-robin).
API_KEYS = ["GROQ_KEY_1", "GROQ_KEY_2", "GROQ_KEY_3", "GROQ_KEY_4", "GROQ_KEY_5"]

# Any Groq-supported model can be used; this is the fast LLaMA model.
GROQ_MODEL = "llama-3.1-8b-instant"

NUM_QUERIES_PER_CHUNK = 5
BATCH_SIZE = 5
OUTPUT_CSV = "generated_pairs.csv"

# Groq rate limits (adjust manually to stay safe):
# requests per minute / requests per day.
MAX_RPM, MAX_RPD = 15, 200

# Pause between two consecutive requests, in seconds (derived from MAX_RPM).
SECONDS_PER_REQUEST = 60 / MAX_RPM

# -------------------- Groq Utilities --------------------
def init_groq_client(api_key: str):
    """Return a ``Groq`` client constructed with ``api_key``."""
    groq_client = Groq(api_key=api_key)
    return groq_client

# Round-robin state
rr_index = 0          # position in API_KEYS of the key to hand out next
_client_cache = {}    # API key -> client already built for that key

def get_next_client():
    """
    Returns a Groq client using the next API key in round robin fashion.

    One client is built per key and reused afterwards, so
    ``init_groq_client`` runs once per key rather than once per request.

    Raises:
        IndexError: if ``API_KEYS`` is empty.
    """
    global rr_index
    if not API_KEYS:
        # Same exception type an empty list raised before, with a message
        # that names the cause.
        raise IndexError("API_KEYS is empty; configure at least one API key")

    # The modulo keeps the lookup valid even if API_KEYS shrank after
    # rr_index was last advanced.
    key = API_KEYS[rr_index % len(API_KEYS)]
    rr_index = (rr_index + 1) % len(API_KEYS)

    if key not in _client_cache:
        _client_cache[key] = init_groq_client(key)
    return _client_cache[key]

def ask_groq_generate_queries(client, chunk_text: str, chunk_id: str, num_queries: int) -> List[str]:
    """
    Ask Groq LLM to generate user-style queries for a given chunk.
    Few-shot prompt unchanged.

    Args:
        client: Object exposing ``chat.completions.create(model=..., messages=...)``.
        chunk_text: Text of the chunk the queries should be about.
        chunk_id: Identifier of the chunk; only interpolated into the prompt.
        num_queries: Number of queries requested; also the maximum returned.

    Returns:
        Up to ``num_queries`` cleaned query strings. List markers and quotes
        wrapping a whole query are removed; blank lines, bare bullets and
        header lines ending in ":" are dropped. Returns an empty list when
        the response has no choices or no message content.
    """
    import re  # local import so this block does not rely on file-level imports

    prompt = f"""
You are an AI that generates realistic search queries a user might input to an LLM or search system.
Each query should be short, relevant, and reflect what someone might actually ask.

Here are a few examples:

Example 1:
Chunk: "Harry receives his first letter from Hogwarts, but Uncle Vernon tries to stop him."
Queries:
- "How did Harry get his Hogwarts letter?"
- "Why did Uncle Vernon hide Harry's letter?"
- "First Hogwarts letter incident"

Example 2:
Chunk: "Hagrid visits Harry to explain that he is a wizard."
Queries:
- "Who is Hagrid and why did he visit Harry?"
- "How did Harry find out he is a wizard?"
- "Hagrid tells Harry he's a wizard"

Now, generate {num_queries} short user queries for the following chunk:
Chunk ID: {chunk_id}
Chunk Text: "{chunk_text}"
Queries:
-
"""

    response = client.chat.completions.create(
        model=GROQ_MODEL,
        messages=[{"role": "user", "content": prompt}]
    )

    # Guard against an empty ``choices`` list and a None ``content`` instead
    # of failing on indexing / ``None.strip()``.
    choices = response.choices or []
    content = choices[0].message.content if choices else None
    text = (content or "").strip()

    # Leading list markers: "-", "*", "•" or a number such as "1." / "2)".
    list_marker = re.compile(r"^(?:[-*\u2022]+|\d+[.)])\s*")
    double_quotes = "\"\u201c\u201d"

    queries: List[str] = []
    for raw_line in text.splitlines():
        candidate = list_marker.sub("", raw_line.strip()).strip()

        # The few-shot examples show quoted queries, so the model tends to
        # quote its answers too. Unwrap only when the quotes enclose the whole
        # line and no other double quote appears inside.
        if (
            len(candidate) >= 2
            and candidate[0] in double_quotes
            and candidate[-1] in double_quotes
            and not any(ch in double_quotes for ch in candidate[1:-1])
        ):
            candidate = candidate[1:-1].strip()

        # Skip blanks, bare bullets and header lines such as "Queries:".
        if not candidate or candidate.endswith(":"):
            continue
        queries.append(candidate)

    return queries[:num_queries]

# -------------------- Main workflow --------------------
def main():
    """Generate query/chunk pairs with round-robin Groq keys and save them.

    Steps:
      1. Open the persistent Chroma collection and keep the documents whose
         metadata has ``ischunk`` set to ``True``.
      2. For every chunk, take the next client from ``get_next_client`` and
         ask for ``NUM_QUERIES_PER_CHUNK`` queries, pausing
         ``SECONDS_PER_REQUEST`` after every attempt and stopping for good
         once ``MAX_RPD`` requests have been attempted.
      3. Write one ``query,chunk_id`` row per generated query to ``OUTPUT_CSV``.
    """
    # Initialize persistent ChromaDB client
    client_db = chromadb.PersistentClient(path=ABSOLUTE_DB_PATH)
    print(f"[INFO] ChromaDB client initialized at: {ABSOLUTE_DB_PATH}")

    # Access the existing collection directly
    collection = client_db.get_collection(name=COLLECTION_NAME)
    print(f"[INFO] Using existing collection: {COLLECTION_NAME}")

    # Fetch all documents and their metadata
    results = collection.get(include=["documents", "metadatas"])
    chunks = [
        {"id": meta["id"], "text": doc}
        for doc, meta in zip(results["documents"], results["metadatas"])
        # ``meta and`` skips entries without metadata instead of crashing.
        if meta and meta.get("ischunk") is True
    ]
    print(f"[INFO] Found {len(chunks)} chunks (ischunk=True)")

    all_pairs = []
    # NOTE(review): this budget counts attempts across all keys together,
    # not per key. Confirm that is the intended meaning of MAX_RPD here.
    total_requests_today = 0
    limit_reached = False

    for i in tqdm(range(0, len(chunks), BATCH_SIZE), desc="Processing chunk-batches"):
        batch = chunks[i : i + BATCH_SIZE]

        for chunk in batch:
            if total_requests_today >= MAX_RPD:
                print(f"[INFO] Reached daily limit of {MAX_RPD} requests. Stopping.")
                limit_reached = True
                break

            # ***** Get next Groq client using round robin *****
            groq_client = get_next_client()

            # Count the attempt up front so failed calls also use up the
            # daily budget (conservative: a failing API cannot be retried
            # without bound).
            total_requests_today += 1
            try:
                queries = ask_groq_generate_queries(
                    groq_client, chunk["text"], chunk["id"], NUM_QUERIES_PER_CHUNK
                )
            except Exception as e:
                # Best effort: one failing chunk must not abort the whole run.
                print(f"[ERROR] Failed to generate for chunk {chunk['id']}: {e}")
            else:
                all_pairs.extend(
                    {"query": q, "chunk_id": chunk["id"]} for q in queries
                )

            # Sleep to respect RPM. This runs after failures too, so errors
            # do not lead to back-to-back requests.
            time.sleep(SECONDS_PER_REQUEST)

        if limit_reached:
            # Leave the outer loop as well; otherwise every remaining batch
            # would print the limit message again and sleep.
            break

        time.sleep(1)

    # Save results. Explicit columns keep the CSV header even when no pair
    # was generated.
    df = pd.DataFrame(all_pairs, columns=["query", "chunk_id"])
    df.to_csv(OUTPUT_CSV, index=False)
    print(f"[INFO] Saved {len(df)} query-chunk pairs to {OUTPUT_CSV}")

# -------------------- Run --------------------
if __name__ == "__main__":
    main()