In [1]:
import pandas as pd
from dotenv import load_dotenv
import os
from groq import Groq
import time
from tqdm import tqdm
import json

In [2]:
# Load the combined FreshQA table from the Excel workbook (path is relative to the
# notebook's working directory) into the DataFrame used by every later cell.
df = pd.read_excel("FreshQADataset_combined.xlsx")
# NOTE(review): the preview below shows an "Unnamed: 0" column, which suggests the
# workbook was saved together with its index -- confirm whether it should be read
# with index_col=0 or dropped before the frame is exported again.
df.head()

Unnamed: 0,id,question,needle,relevant_source,irrelevant_source
0,42,By how many seats do Democrats currently contr...,Democrats currently do not control the US Senate.,https://en.wikipedia.org/wiki/United_States_Se...,https://en.wikipedia.org/wiki/Parliament_of_th...
1,43,How long has Elon Musk been X Corp.'s CEO?,Elon Musk is no longer X Corp.'s CEO.,https://en.wikipedia.org/wiki/X_Corp.#:~:text=...,https://en.wikipedia.org/wiki/OpenAI
2,44,Where will the FIFA World Cup be hosted this y...,There won't be a FIFA World Cup this year.,https://en.wikipedia.org/wiki/FIFA_World_Cup#:...,https://en.wikipedia.org/wiki/Premier_League
3,92,Alphabet's market capitalization reached its h...,The all-time highest value of Alphabet was in ...,https://en.wikipedia.org/wiki/List_of_public_c...,https://en.wikipedia.org/wiki/Google_DeepMind
4,95,Which Republican was elected Speaker of the Ho...,No one received a majority of the votes on the...,https://en.wikipedia.org/wiki/January_2023_Spe...,https://en.wikipedia.org/wiki/October_2023_Spe...


In [3]:
# df.drop(columns=["statements_misleading"], inplace=True)
# df

In [4]:
def build_rewrite_prompt(question, needle):
    """Compose the instruction prompt used to refine one question/needle pair.

    The prompt tells the model to either rewrite the needle into a standalone
    sentence, rewrite the question so it matches the needle, or leave both
    untouched, and to answer with a JSON object holding the keys ``question``
    and ``needle``.

    Args:
        question: The original question; interpolated with f-string formatting.
        needle: The original key answer; interpolated the same way.

    Returns:
        The full prompt as a single newline-terminated string.
    """
    prompt_lines = [
        "You are helping refine a question-answer dataset for retrieval.",
        "Given the question and the needle (i.e. the key answer), do one of the following:",
        "1. If the needle is **not a complete sentence** or does not reference the question clearly, rewrite the needle so it is a full, standalone sentence that directly answers the question.",
        "2. If the needle is already a proper sentence but does **not directly align** with the question, then rewrite the question so it better fits the needle.",
        "3. If everything is fine, return both unchanged.",
        "",  # blank line separating the rules from the output-format section
        "Return the final result in this exact JSON format:",
        '{"question": "<rewritten_or_original_question>", "needle": "<rewritten_or_original_needle>"}',
        "",  # blank line before the separator
        "---",
        f"Original Question: {question}",
        f"Original Needle: {needle}",
    ]
    # The prompt ends with a trailing newline after the needle line.
    return "\n".join(prompt_lines) + "\n"


In [5]:
# Load variables from a .env file into the process environment, then read the API
# key stored under the name "snlp_api_key". os.getenv returns None when that
# variable is not set, so a missing key is not detected on this line.
load_dotenv()
groq_api_key = os.getenv("snlp_api_key")

# Initialize the Groq client; the request helpers defined later in this notebook
# use this module-level `client` for their chat-completion calls.
client = Groq(api_key=groq_api_key)

def rewrite_question_and_needle(question, needle, max_retries=5, backoff_seconds=2.0):
    """Ask the chat model to refine one question/needle pair.

    The prompt comes from ``build_rewrite_prompt`` and the reply is requested
    in JSON mode; it must decode to an object with the keys ``question`` and
    ``needle``.

    Failed attempts (API errors, invalid JSON, missing keys) are retried a
    bounded number of times with exponential backoff, instead of looping
    forever without any delay. This keeps a permanent failure such as a bad
    API key from hanging the notebook and hammering the API.

    Args:
        question: Original question text.
        needle: Original answer text.
        max_retries: Maximum number of attempts before giving up.
        backoff_seconds: Delay before the second attempt; doubled after every
            further failure. Pass 0 to retry immediately.

    Returns:
        Tuple ``(question, needle)`` taken from the model's JSON reply.

    Raises:
        RuntimeError: If every attempt failed; the last underlying error is
            chained as ``__cause__``.
    """
    # The prompt does not change between attempts, so build it once.
    prompt = build_rewrite_prompt(question, needle)
    last_error = None

    for attempt in range(1, max_retries + 1):
        try:
            completion = client.chat.completions.create(
                model="llama-3.3-70b-versatile",
                messages=[{"role": "user", "content": prompt}],
                temperature=0.7,
                max_tokens=512,
                top_p=1.0,
                stream=False,
                response_format={"type": "json_object"},
                stop=None,
            )
            content = completion.choices[0].message.content
            data = json.loads(content)
            return data["question"], data["needle"]
        except Exception as e:
            last_error = e
            print(f"Error: {e}")
            if attempt < max_retries:
                print("Retrying now...")
                # Exponential backoff: backoff, 2*backoff, 4*backoff, ...
                time.sleep(backoff_seconds * 2 ** (attempt - 1))

    raise RuntimeError(
        f"Could not refine the question/needle pair after {max_retries} attempts"
    ) from last_error


In [6]:
# Refined values are collected in row order so they can be attached as columns below.
updated_questions = []
updated_needles = []

# Pace requests by loop position, not by the DataFrame index label: the label only
# equals the position for a default 0..n-1 index, so label-based pacing would pause
# at the wrong rows (or never) once the frame has been filtered or re-indexed.
for position, (idx, row) in enumerate(
    tqdm(df.iterrows(), total=len(df), desc="Refining Q&A pairs")
):
    q, n = row.get("question", ""), row.get("needle", "")

    # Pause before every 5th request (but not before the very first one).
    if position > 0 and position % 5 == 0:
        print("⏳ Rate limit pause: sleeping for 15 seconds...")
        time.sleep(15)

    new_q, new_n = rewrite_question_and_needle(q, n)
    updated_questions.append(new_q)
    updated_needles.append(new_n)

# Update the DataFrame (the lists are assigned positionally, one entry per row)
df['question_refined'] = updated_questions
df['needle_refined'] = updated_needles


Refining Q&A pairs:   0%|          | 0/150 [00:00<?, ?it/s]

Refining Q&A pairs:   3%|▎         | 5/150 [00:01<00:49,  2.92it/s]

⏳ Rate limit pause: sleeping for 15 seconds...


Refining Q&A pairs:   7%|▋         | 10/150 [00:18<03:25,  1.47s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Refining Q&A pairs:  10%|█         | 15/150 [00:35<03:39,  1.63s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Refining Q&A pairs:  13%|█▎        | 20/150 [00:52<03:33,  1.64s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Refining Q&A pairs:  17%|█▋        | 25/150 [01:09<03:29,  1.67s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Refining Q&A pairs:  20%|██        | 30/150 [01:25<03:17,  1.65s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Refining Q&A pairs:  23%|██▎       | 35/150 [01:42<03:11,  1.67s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Refining Q&A pairs:  27%|██▋       | 40/150 [01:59<03:01,  1.65s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Refining Q&A pairs:  30%|███       | 45/150 [02:16<02:52,  1.65s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Refining Q&A pairs:  33%|███▎      | 50/150 [02:33<02:45,  1.66s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Refining Q&A pairs:  37%|███▋      | 55/150 [02:49<02:35,  1.64s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Refining Q&A pairs:  40%|████      | 60/150 [03:06<02:28,  1.65s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Refining Q&A pairs:  43%|████▎     | 65/150 [03:23<02:18,  1.63s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Refining Q&A pairs:  47%|████▋     | 70/150 [03:40<02:16,  1.70s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Refining Q&A pairs:  50%|█████     | 75/150 [03:57<02:08,  1.71s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Refining Q&A pairs:  53%|█████▎    | 80/150 [04:14<01:57,  1.68s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Refining Q&A pairs:  57%|█████▋    | 85/150 [04:31<01:48,  1.67s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Refining Q&A pairs:  60%|██████    | 90/150 [04:48<01:40,  1.68s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Refining Q&A pairs:  63%|██████▎   | 95/150 [05:05<01:30,  1.65s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Refining Q&A pairs:  67%|██████▋   | 100/150 [05:21<01:22,  1.65s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Refining Q&A pairs:  70%|███████   | 105/150 [05:38<01:13,  1.63s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Refining Q&A pairs:  73%|███████▎  | 110/150 [05:55<01:06,  1.65s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Refining Q&A pairs:  77%|███████▋  | 115/150 [06:12<00:57,  1.64s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Refining Q&A pairs:  80%|████████  | 120/150 [06:30<00:52,  1.76s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Refining Q&A pairs:  83%|████████▎ | 125/150 [06:47<00:42,  1.70s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Refining Q&A pairs:  87%|████████▋ | 130/150 [07:03<00:33,  1.66s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Refining Q&A pairs:  90%|█████████ | 135/150 [07:20<00:24,  1.65s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Refining Q&A pairs:  93%|█████████▎| 140/150 [07:34<00:14,  1.42s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Refining Q&A pairs:  97%|█████████▋| 145/150 [07:51<00:08,  1.65s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Refining Q&A pairs: 100%|██████████| 150/150 [08:08<00:00,  3.26s/it]


In [7]:
# Boolean flags marking the rows whose text was altered by the refinement step.
df['question_changed'] = df['question'].ne(df['question_refined'])
df['needle_changed'] = df['needle'].ne(df['needle_refined'])

In [8]:
# Prompt builder for the misleading-sentence generation step
def build_prompt(question, answer):
    """Compose the prompt that asks for five sentences contradicting the needle.

    The prompt names the exact JSON shape of the reply: a single object whose
    only key is ``"sentences"``. The response parser looks the result up under
    that key, and an unspecified format let the model choose other key names,
    which surfaced as repeated ``Error: 'sentences'`` retries.

    Args:
        question: The (refined) question, included for context.
        answer: The needle sentence that the generated sentences contradict.

    Returns:
        The prompt string.
    """
    return (
        "Give me 5 sentences that slightly contradict this sentence (i.e. add semantic noise to the needle for the needle in haystack test):\n"
        f"Q: {question}\n"
        f"Needle: {answer}\n\n"
        "Give it to me in JSON format, as a single object with exactly one key named \"sentences\" "
        "whose value is a list of the 5 sentences as strings, like this:\n"
        '{"sentences": ["<sentence 1>", "<sentence 2>", "<sentence 3>", "<sentence 4>", "<sentence 5>"]}'
    )

# Get model response from Groq API
def _extract_sentences(payload):
    """Return the generated sentences from the decoded JSON reply.

    Prefers the ``"sentences"`` key. If the model used another key name but
    the object holds exactly one list value, that list is used, so a usable
    reply is not thrown away. Anything else raises ``KeyError`` so the caller
    retries.
    """
    if "sentences" in payload:
        return payload["sentences"]
    list_values = [value for value in payload.values() if isinstance(value, list)]
    if len(list_values) == 1:
        return list_values[0]
    raise KeyError("sentences")


def get_contradictions(question, answer, max_retries=5, backoff_seconds=2.0):
    """Ask the chat model for sentences that slightly contradict the needle.

    The prompt comes from ``build_prompt`` and the reply is requested in JSON
    mode. Failed attempts (API errors, invalid JSON, no sentence list found)
    are retried a bounded number of times with exponential backoff, instead of
    looping forever without any delay.

    Args:
        question: Question text passed to the prompt builder.
        answer: Needle sentence passed to the prompt builder.
        max_retries: Maximum number of attempts before giving up.
        backoff_seconds: Delay before the second attempt; doubled after every
            further failure. Pass 0 to retry immediately.

    Returns:
        The value stored under ``"sentences"`` in the reply, or the single
        list value of the reply object when that key is absent.

    Raises:
        RuntimeError: If every attempt failed; the last underlying error is
            chained as ``__cause__``.
    """
    # The prompt does not change between attempts, so build it once.
    prompt = build_prompt(question, answer)
    last_error = None

    for attempt in range(1, max_retries + 1):
        try:
            completion = client.chat.completions.create(
                model="llama-3.3-70b-versatile",
                messages=[{"role": "user", "content": prompt}],
                temperature=1.0,
                max_tokens=1024,
                top_p=1.0,
                stream=False,
                response_format={"type": "json_object"},
                stop=None,
            )
            content = completion.choices[0].message.content
            return _extract_sentences(json.loads(content))
        except Exception as e:
            last_error = e
            print(f"Error: {e}")
            if attempt < max_retries:
                print("Retrying now...")
                # Exponential backoff: backoff, 2*backoff, 4*backoff, ...
                time.sleep(backoff_seconds * 2 ** (attempt - 1))

    raise RuntimeError(
        f"Could not generate misleading sentences after {max_retries} attempts"
    ) from last_error

# Store misleading sentences (one entry per row, in row order)
misleading_statements = []

# Generate misleading sentences per row.
# Requests are paced by loop position, not by the DataFrame index label: the label
# only equals the position for a default 0..n-1 index.
for position, (idx, row) in enumerate(
    tqdm(df.iterrows(), total=len(df), desc="Generating misleading sentences")
):
    question = row.get("question_refined", "")
    needle = row.get("needle_refined", "")
    # Pause before every 5th request (but not before the very first one).
    if position > 0 and position % 5 == 0:
        print("⏳ Rate limit pause: sleeping for 15 seconds...")
        time.sleep(15)
    misleading = get_contradictions(question, needle)
    misleading_statements.append(misleading)

# Add to DataFrame
df['statements_misleading'] = misleading_statements

# Save to Excel; the confirmation message reports the file that was actually written
output_path = "FreshQADataset_with_misleading_combined.xlsx"
df.to_excel(output_path, index=False)
print(f"✅ Data saved to {output_path}")

Generating misleading sentences:   3%|▎         | 5/150 [00:03<01:38,  1.47it/s]

⏳ Rate limit pause: sleeping for 15 seconds...


Generating misleading sentences:   7%|▋         | 10/150 [00:22<04:22,  1.87s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Generating misleading sentences:  10%|█         | 15/150 [00:41<04:29,  2.00s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Generating misleading sentences:  13%|█▎        | 20/150 [00:59<04:20,  2.00s/it]

⏳ Rate limit pause: sleeping for 15 seconds...
Error: 'sentences'
Retrying now...


Generating misleading sentences:  17%|█▋        | 25/150 [01:19<04:24,  2.12s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Generating misleading sentences:  20%|██        | 30/150 [01:37<04:01,  2.01s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Generating misleading sentences:  21%|██▏       | 32/150 [01:54<08:52,  4.52s/it]

Error: 'sentences'
Retrying now...


Generating misleading sentences:  23%|██▎       | 35/150 [01:57<04:04,  2.13s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Generating misleading sentences:  27%|██▋       | 40/150 [02:15<03:37,  1.98s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Generating misleading sentences:  29%|██▊       | 43/150 [02:32<05:57,  3.34s/it]

Error: 'sentences'
Retrying now...


Generating misleading sentences:  30%|███       | 45/150 [02:34<03:39,  2.09s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Generating misleading sentences:  33%|███▎      | 50/150 [02:53<03:24,  2.04s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Generating misleading sentences:  37%|███▋      | 55/150 [03:11<03:04,  1.94s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Generating misleading sentences:  40%|████      | 60/150 [03:29<02:55,  1.95s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Generating misleading sentences:  43%|████▎     | 65/150 [03:48<02:47,  1.96s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Generating misleading sentences:  47%|████▋     | 70/150 [04:06<02:42,  2.03s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Generating misleading sentences:  50%|█████     | 75/150 [04:25<02:24,  1.93s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Generating misleading sentences:  53%|█████▎    | 80/150 [04:43<02:21,  2.02s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Generating misleading sentences:  57%|█████▋    | 85/150 [05:02<02:10,  2.01s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Generating misleading sentences:  60%|██████    | 90/150 [05:20<01:57,  1.96s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Generating misleading sentences:  63%|██████▎   | 95/150 [05:40<01:57,  2.13s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Generating misleading sentences:  67%|██████▋   | 100/150 [05:58<01:40,  2.01s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Generating misleading sentences:  70%|███████   | 105/150 [06:16<01:28,  1.97s/it]

⏳ Rate limit pause: sleeping for 15 seconds...
Error: 'sentences'
Retrying now...


Generating misleading sentences:  73%|███████▎  | 110/150 [06:36<01:23,  2.09s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Generating misleading sentences:  77%|███████▋  | 115/150 [06:54<01:10,  2.00s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Generating misleading sentences:  80%|████████  | 120/150 [07:13<00:59,  1.99s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Generating misleading sentences:  83%|████████▎ | 125/150 [07:32<00:51,  2.07s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Generating misleading sentences:  87%|████████▋ | 130/150 [07:50<00:40,  2.01s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Generating misleading sentences:  90%|█████████ | 135/150 [08:09<00:30,  2.00s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Generating misleading sentences:  93%|█████████▎| 140/150 [08:28<00:20,  2.01s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Generating misleading sentences:  97%|█████████▋| 145/150 [08:46<00:10,  2.01s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Generating misleading sentences: 100%|██████████| 150/150 [09:08<00:00,  3.65s/it]

✅ Data saved to FreshQADataset_with_misleading.xlsx





In [9]:
# Serialize the whole DataFrame to a JSON string: orient="records" gives a list with
# one object per row, indent=4 pretty-prints it, and force_ascii=False keeps
# non-ASCII characters as-is instead of escaping them.
json_output = df.to_json(orient="records", indent=4, force_ascii=False)
# Write the string to context.json with an explicit UTF-8 encoding, so the
# unescaped non-ASCII characters do not depend on the platform default encoding.
with open("context.json", "w", encoding="utf-8") as f:
    f.write(json_output)