In [1]:
import pandas as pd
from dotenv import load_dotenv
import os
from groq import Groq
import time
from tqdm import tqdm
import json

In [2]:
# Load the FreshQA question/needle table from the Excel workbook in the working directory.
df = pd.read_excel("Fresh_QA_without_misleading_statements.xlsx")
# Preview the first rows to sanity-check the columns that were read.
df.head()

Unnamed: 0,id,question,needle,real_needle,context_relevant,context_irrelevant,statements_misleading
0,43,How long has Elon Musk been X Corp.'s CEO?,Elon Musk is no longer X Corp.'s CEO.,Elon Musk is no longer X Corp.'s CEO.,43.txt,43.txt,"['test1', 'test2', 'test3']"
1,44,Where will the FIFA World Cup be hosted this y...,There won't be a FIFA World Cup this year.,There won't be a FIFA World Cup this year.,44.txt,44.txt,"['test1', 'test2', 'test3']"
2,92,Alphabet's market capitalization reached its h...,The all-time highest value of Alphabet was in ...,The all-time highest value of Alphabet was in ...,92.txt,92.txt,"['test1', 'test2', 'test3']"
3,95,Which Republican was elected Speaker of the Ho...,No one received a majority of the votes on the...,No one received a majority of the votes on the...,95.txt,95.txt,"['test1', 'test2', 'test3']"
4,96,"In January 2023, the NHC revised the fatality ...","The reported death toll decreased to 1,392","The reported death toll decreased to 1,392",96.txt,96.txt,"['test1', 'test2', 'test3']"


In [3]:
# Drop the placeholder `statements_misleading` column; real values are generated
# further down in this notebook.
# Rebinding `df` (instead of `inplace=True`) together with `errors="ignore"` keeps
# this cell idempotent: re-running it after the column is already gone no longer
# raises KeyError.
df = df.drop(columns=["statements_misleading"], errors="ignore")
df

Unnamed: 0,id,question,needle,real_needle,context_relevant,context_irrelevant
0,43,How long has Elon Musk been X Corp.'s CEO?,Elon Musk is no longer X Corp.'s CEO.,Elon Musk is no longer X Corp.'s CEO.,43.txt,43.txt
1,44,Where will the FIFA World Cup be hosted this y...,There won't be a FIFA World Cup this year.,There won't be a FIFA World Cup this year.,44.txt,44.txt
2,92,Alphabet's market capitalization reached its h...,The all-time highest value of Alphabet was in ...,The all-time highest value of Alphabet was in ...,92.txt,92.txt
3,95,Which Republican was elected Speaker of the Ho...,No one received a majority of the votes on the...,No one received a majority of the votes on the...,95.txt,95.txt
4,96,"In January 2023, the NHC revised the fatality ...","The reported death toll decreased to 1,392","The reported death toll decreased to 1,392",96.txt,96.txt
...,...,...,...,...,...,...
79,586,What is the name of the most recent hurricane ...,Milton,Milton,586.txt,586.txt
80,587,What is King Gizzard’s most recent studio album?,Flight b741,Flight b741,587.txt,587.txt
81,588,Which comedy series won the most recent Primet...,Hacks (Season 3),Hacks (Season 3),588.txt,588.txt
82,589,What institution won the most recent ACM-ICPC ...,Peking University,Peking University,589.txt,589.txt


In [5]:
def build_rewrite_prompt(question, needle):
    """Compose the instruction prompt used to refine one question/needle pair.

    Args:
        question: The original question text.
        needle: The original needle (the key answer) for that question.

    Returns:
        A newline-terminated prompt string that lists the three rewrite rules,
        the exact JSON reply format, and finally the original question and needle.
    """
    prompt_lines = [
        "You are helping refine a question-answer dataset for retrieval.",
        "Given the question and the needle (i.e. the key answer), do one of the following:",
        "1. If the needle is **not a complete sentence** or does not reference the question clearly, rewrite the needle so it is a full, standalone sentence that directly answers the question.",
        "2. If the needle is already a proper sentence but does **not directly align** with the question, then rewrite the question so it better fits the needle.",
        "3. If everything is fine, return both unchanged.",
        "",
        "Return the final result in this exact JSON format:",
        '{"question": "<rewritten_or_original_question>", "needle": "<rewritten_or_original_needle>"}',
        "",
        "---",
        f"Original Question: {question}",
        f"Original Needle: {needle}",
        # The empty last entry yields the trailing newline after the needle line.
        "",
    ]
    return "\n".join(prompt_lines)


In [6]:
# Load environment variables from a local .env file, then read the Groq API key.
load_dotenv()
# The key is looked up under the variable name "snlp_api_key";
# os.getenv returns None when that variable is not set.
groq_api_key = os.getenv("snlp_api_key")

# Initialize the shared Groq client used by the completion requests in this notebook.
client = Groq(api_key=groq_api_key)

def rewrite_question_and_needle(question, needle, max_retries=None, retry_delay=1.0):
    """Ask the Groq chat model to align a question with its needle.

    Builds the rewrite prompt once, requests a JSON-object completion and reads
    the ``question`` and ``needle`` fields from the parsed reply. Failed attempts
    (API errors, invalid JSON, missing keys) are retried with exponential
    backoff instead of being re-issued immediately.

    Args:
        question: Original question text.
        needle: Original needle (answer) text.
        max_retries: Maximum number of retries after a failed attempt. ``None``
            (the default) keeps retrying until a request succeeds.
        retry_delay: Seconds to wait before the first retry; the wait doubles
            after each further failure and is capped at 60 seconds.

    Returns:
        A ``(question, needle)`` tuple taken from the model's JSON reply.

    Raises:
        RuntimeError: If ``max_retries`` is set and that many retries also failed.
    """
    # The prompt is identical for every attempt, so build it outside the loop.
    prompt = build_rewrite_prompt(question, needle)
    wait = retry_delay
    failures = 0
    while True:
        try:
            completion = client.chat.completions.create(
                model="llama-3.3-70b-versatile",
                messages=[{"role": "user", "content": prompt}],
                temperature=0.7,
                max_tokens=512,
                top_p=1.0,
                stream=False,
                response_format={"type": "json_object"},
                stop=None,
            )
            content = completion.choices[0].message.content
            data = json.loads(content)
            return data["question"], data["needle"]
        except Exception as e:
            failures += 1
            print(f"Error: {e}")
            if max_retries is not None and failures > max_retries:
                raise RuntimeError(
                    f"rewrite_question_and_needle failed after {failures} attempts"
                ) from e
            # Back off before retrying so a persistent failure (rate limit,
            # bad credentials, outage) does not turn into a tight request loop.
            print(f"Retrying in {wait:.1f} seconds...")
            time.sleep(wait)
            wait = min(wait * 2, 60.0)


In [7]:
# Refine every (question, needle) pair with the model, collecting the results
# in row order so they can be attached as new columns afterwards.
updated_questions, updated_needles = [], []

for row_label, record in tqdm(df.iterrows(), total=len(df), desc="Refining Q&A pairs"):
    original_question = record.get("question", "")
    original_needle = record.get("needle", "")

    # Cool down on every fifth row label (but not on the very first row).
    if row_label > 0 and row_label % 5 == 0:
        print("⏳ Rate limit pause: sleeping for 15 seconds...")
        time.sleep(15)

    refined_question, refined_needle = rewrite_question_and_needle(
        original_question, original_needle
    )
    updated_questions.append(refined_question)
    updated_needles.append(refined_needle)

# Attach the refined texts next to the original columns.
df['question_refined'] = updated_questions
df['needle_refined'] = updated_needles


Refining Q&A pairs:   6%|▌         | 5/84 [00:01<00:30,  2.61it/s]

⏳ Rate limit pause: sleeping for 15 seconds...


Refining Q&A pairs:  12%|█▏        | 10/84 [00:18<01:52,  1.52s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Refining Q&A pairs:  18%|█▊        | 15/84 [00:36<01:54,  1.66s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Refining Q&A pairs:  24%|██▍       | 20/84 [00:53<01:46,  1.67s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Refining Q&A pairs:  30%|██▉       | 25/84 [01:10<01:39,  1.69s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Refining Q&A pairs:  36%|███▌      | 30/84 [01:26<01:29,  1.66s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Refining Q&A pairs:  42%|████▏     | 35/84 [01:43<01:20,  1.65s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Refining Q&A pairs:  48%|████▊     | 40/84 [02:00<01:12,  1.66s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Refining Q&A pairs:  54%|█████▎    | 45/84 [02:17<01:05,  1.67s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Refining Q&A pairs:  60%|█████▉    | 50/84 [02:35<00:59,  1.76s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Refining Q&A pairs:  65%|██████▌   | 55/84 [02:52<00:49,  1.69s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Refining Q&A pairs:  71%|███████▏  | 60/84 [03:09<00:40,  1.68s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Refining Q&A pairs:  77%|███████▋  | 65/84 [03:26<00:32,  1.69s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Refining Q&A pairs:  83%|████████▎ | 70/84 [03:42<00:23,  1.65s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Refining Q&A pairs:  89%|████████▉ | 75/84 [03:59<00:14,  1.65s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Refining Q&A pairs:  95%|█████████▌| 80/84 [04:16<00:06,  1.69s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Refining Q&A pairs: 100%|██████████| 84/84 [04:34<00:00,  3.26s/it]


In [8]:
# Flag, per row, whether the refinement pass altered the question or the needle.
for field in ("question", "needle"):
    df[f"{field}_changed"] = df[field] != df[f"{field}_refined"]

In [10]:
# Prompt builder for the misleading-sentence generation step
def build_prompt(question, answer):
    """Build the prompt that asks the model for sentences contradicting a needle.

    The reply shape is spelled out explicitly (a JSON object with a single
    "sentences" key holding a list) because the parsed reply is indexed with
    ``["sentences"]`` by its consumer. Asking only for "JSON format" let the
    model choose its own key, which surfaced as ``Error: 'sentences'`` retries.

    Args:
        question: The question the needle answers.
        answer: The needle sentence to contradict.

    Returns:
        The prompt string to send as the user message.
    """
    return (
        "Give me 5 sentences that slightly contradict this sentence (i.e. add semantic noise to the needle for the needle in haystack test):\n"
        f"Q: {question}\n"
        f"Needle: {answer}\n\n"
        'Give it to me in JSON format, as an object with exactly one key "sentences" '
        "whose value is a list of the 5 sentences as strings."
    )

# Get model response from Groq API
def get_contradictions(question, answer, max_retries=None, retry_delay=1.0):
    """Request misleading sentences for one question/needle pair.

    Builds the prompt once, requests a JSON-object completion and returns the
    value stored under the ``"sentences"`` key of the parsed reply. Failed
    attempts (API errors, invalid JSON, missing key) are retried with
    exponential backoff instead of being re-issued immediately.

    Args:
        question: The question the needle answers.
        answer: The needle sentence to contradict.
        max_retries: Maximum number of retries after a failed attempt. ``None``
            (the default) keeps retrying until a request succeeds.
        retry_delay: Seconds to wait before the first retry; the wait doubles
            after each further failure and is capped at 60 seconds.

    Returns:
        The value under ``"sentences"`` in the model's JSON reply.

    Raises:
        RuntimeError: If ``max_retries`` is set and that many retries also failed.
    """
    # The prompt is identical for every attempt, so build it outside the loop.
    prompt = build_prompt(question, answer)
    wait = retry_delay
    failures = 0
    while True:
        try:
            completion = client.chat.completions.create(
                model="llama-3.3-70b-versatile",
                messages=[{"role": "user", "content": prompt}],
                temperature=1.0,
                max_tokens=1024,
                top_p=1.0,
                stream=False,
                response_format={"type": "json_object"},
                stop=None,
            )
            content = completion.choices[0].message.content
            data = json.loads(content)["sentences"]
            return data
        except Exception as e:
            failures += 1
            print(f"Error: {e}")
            if max_retries is not None and failures > max_retries:
                raise RuntimeError(
                    f"get_contradictions failed after {failures} attempts"
                ) from e
            # Back off before retrying so a persistent failure (rate limit,
            # bad credentials, outage) does not turn into a tight request loop.
            print(f"Retrying in {wait:.1f} seconds...")
            time.sleep(wait)
            wait = min(wait * 2, 60.0)

# Collect one entry of misleading sentences per row, in row order.
misleading_statements = []
for row_label, record in tqdm(df.iterrows(), total=len(df), desc="Generating misleading sentences"):
    refined_question = record.get("question_refined", "")
    refined_needle = record.get("needle_refined", "")
    # Cool down on every fifth row label (but not on the very first row).
    if row_label > 0 and row_label % 5 == 0:
        print("⏳ Rate limit pause: sleeping for 15 seconds...")
        time.sleep(15)
    misleading_statements.append(get_contradictions(refined_question, refined_needle))

# Attach the generated sentences as a new column.
df['statements_misleading'] = misleading_statements

# Persist the enriched table to an Excel workbook (without the index column).
df.to_excel("FreshQADataset_with_misleading.xlsx", index=False)
print("✅ Data saved to FreshQADataset_with_misleading.xlsx")

Generating misleading sentences:   6%|▌         | 5/84 [00:03<00:57,  1.37it/s]

⏳ Rate limit pause: sleeping for 15 seconds...


Generating misleading sentences:  12%|█▏        | 10/84 [00:22<02:13,  1.81s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Generating misleading sentences:  18%|█▊        | 15/84 [00:40<02:14,  1.95s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Generating misleading sentences:  24%|██▍       | 20/84 [00:59<02:08,  2.00s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Generating misleading sentences:  30%|██▉       | 25/84 [01:17<02:01,  2.05s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Generating misleading sentences:  33%|███▎      | 28/84 [01:34<03:06,  3.32s/it]

Error: 'sentences'
Retrying now...


Generating misleading sentences:  36%|███▌      | 30/84 [01:37<01:55,  2.13s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Generating misleading sentences:  40%|████      | 34/84 [01:54<02:08,  2.57s/it]

Error: 'sentences'
Retrying now...


Generating misleading sentences:  42%|████▏     | 35/84 [01:56<01:48,  2.20s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Generating misleading sentences:  48%|████▊     | 40/84 [02:14<01:27,  2.00s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Generating misleading sentences:  54%|█████▎    | 45/84 [02:33<01:20,  2.05s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Generating misleading sentences:  60%|█████▉    | 50/84 [02:52<01:08,  2.01s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Generating misleading sentences:  65%|██████▌   | 55/84 [03:11<00:59,  2.05s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Generating misleading sentences:  71%|███████▏  | 60/84 [03:29<00:48,  2.04s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Generating misleading sentences:  77%|███████▋  | 65/84 [03:48<00:38,  2.00s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Generating misleading sentences:  83%|████████▎ | 70/84 [04:06<00:27,  1.97s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Generating misleading sentences:  89%|████████▉ | 75/84 [04:25<00:17,  2.00s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Generating misleading sentences:  95%|█████████▌| 80/84 [04:44<00:08,  2.00s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Generating misleading sentences: 100%|██████████| 84/84 [05:02<00:00,  3.60s/it]

✅ Data saved to FreshQADataset_with_misleading.xlsx





In [11]:
# Serialize the final table as a list of JSON records (one object per row);
# force_ascii=False keeps non-ASCII characters unescaped in the output.
json_output = df.to_json(force_ascii=False, indent=4, orient="records")
with open("context.json", mode="w", encoding="utf-8") as context_file:
    context_file.write(json_output)