In [1]:
import pandas as pd
from dotenv import load_dotenv
import os
from groq import Groq
import time
from tqdm import tqdm
import json

In [2]:
df = pd.read_excel("FreshQADataset_filtered.xlsx")
df.head()

Unnamed: 0,id,question,needle,relevant_source,irrelevant_source
0,42,By how many seats do Democrats currently contr...,Democrats currently do not control the US Senate.,https://en.wikipedia.org/wiki/United_States_Se...,
1,43,How long has Elon Musk been X Corp.'s CEO?,Elon Musk is no longer X Corp.'s CEO.,https://en.wikipedia.org/wiki/X_Corp.#:~:text=...,https://en.wikipedia.org/wiki/OpenAI
2,44,Where will the FIFA World Cup be hosted this y...,There won't be a FIFA World Cup this year.,https://en.wikipedia.org/wiki/FIFA_World_Cup#:...,https://en.wikipedia.org/wiki/Premier_League
3,92,Alphabet's market capitalization reached its h...,The all-time highest value of Alphabet was in ...,https://en.wikipedia.org/wiki/List_of_public_c...,https://en.wikipedia.org/wiki/Google_DeepMind
4,95,Which Republican was elected Speaker of the Ho...,No one received a majority of the votes on the...,https://en.wikipedia.org/wiki/January_2023_Spe...,https://en.wikipedia.org/wiki/October_2023_Spe...


In [3]:
# Load .env and API key
load_dotenv()
groq_api_key = os.getenv("snlp_api_key")

# Initialize Groq client
client = Groq(api_key=groq_api_key)

# Prompt builder
def build_prompt(sentence):
    return f"Give me 5 sentences that slightly contradict this sentence (i.e. add semantic noise to the sentence):\n\n\"{sentence}\" \n\n Give it to me in JSON format."

# Get model response from Groq API
def get_contradictions(sentence):
    try:
        prompt = build_prompt(sentence)
        completion = client.chat.completions.create(
            model="llama-3.3-70b-versatile",
            messages=[{"role": "user", "content": prompt}],
            temperature=1.0,
            max_tokens=1024,
            top_p=1.0,
            stream=False,
            response_format={"type": "json_object"},
            stop=None,
        )
        content = completion.choices[0].message.content
        data = json.loads(content)["sentences"]
        return data
    except Exception as e:
        print(f"Error: {e}")
        return []

# Store misleading sentences
misleading_statements = []

for idx, needle in enumerate(tqdm(df['needle'], desc="Generating misleading sentences")):
    if idx > 0 and idx % 5 == 0:
        print("⏳ Rate limit pause: sleeping for 15 seconds...")
        time.sleep(15)
    misleading = get_contradictions(needle)
    misleading_statements.append(misleading)

# Add to DataFrame
df['statements_misleading'] = misleading_statements

# Save to Excel
df.to_excel("FreshQADataset_with_misleading.xlsx", index=False)
print("✅ Data saved to FreshQADataset_with_misleading.xlsx")


Generating misleading sentences:   4%|▍         | 5/120 [00:03<01:15,  1.52it/s]

⏳ Rate limit pause: sleeping for 15 seconds...


Generating misleading sentences:   8%|▊         | 10/120 [00:21<03:20,  1.83s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Generating misleading sentences:  12%|█▎        | 15/120 [00:39<03:09,  1.81s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Generating misleading sentences:  17%|█▋        | 20/120 [00:57<03:00,  1.80s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Generating misleading sentences:  21%|██        | 25/120 [01:15<02:53,  1.83s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Generating misleading sentences:  25%|██▌       | 30/120 [01:32<02:42,  1.81s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Generating misleading sentences:  29%|██▉       | 35/120 [01:50<02:40,  1.89s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Generating misleading sentences:  33%|███▎      | 40/120 [02:08<02:28,  1.85s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Generating misleading sentences:  38%|███▊      | 45/120 [02:25<02:13,  1.78s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Generating misleading sentences:  42%|████▏     | 50/120 [02:43<02:12,  1.90s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Generating misleading sentences:  46%|████▌     | 55/120 [03:01<01:59,  1.84s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Generating misleading sentences:  50%|█████     | 60/120 [03:19<01:52,  1.87s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Generating misleading sentences:  54%|█████▍    | 65/120 [03:36<01:40,  1.83s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Generating misleading sentences:  58%|█████▊    | 70/120 [03:54<01:31,  1.83s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Generating misleading sentences:  61%|██████    | 73/120 [04:11<02:31,  3.23s/it]

Error: 'sentences'


Generating misleading sentences:  62%|██████▎   | 75/120 [04:12<01:24,  1.87s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Generating misleading sentences:  67%|██████▋   | 80/120 [04:30<01:13,  1.83s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Generating misleading sentences:  71%|███████   | 85/120 [04:47<01:04,  1.85s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Generating misleading sentences:  75%|███████▌  | 90/120 [05:05<00:55,  1.84s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Generating misleading sentences:  79%|███████▉  | 95/120 [05:23<00:45,  1.84s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Generating misleading sentences:  83%|████████▎ | 100/120 [05:40<00:36,  1.81s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Generating misleading sentences:  88%|████████▊ | 105/120 [05:58<00:26,  1.78s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Generating misleading sentences:  92%|█████████▏| 110/120 [06:16<00:18,  1.88s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Generating misleading sentences:  96%|█████████▌| 115/120 [06:33<00:09,  1.82s/it]

⏳ Rate limit pause: sleeping for 15 seconds...


Generating misleading sentences: 100%|██████████| 120/120 [06:51<00:00,  3.43s/it]


PermissionError: [Errno 13] Permission denied: 'FreshQADataset_with_misleading.xlsx'

In [4]:
# Save to Excel
df.to_excel("FreshQADataset_with_misleading.xlsx", index=False)
print("✅ Data saved to FreshQADataset_with_misleading.xlsx")

✅ Data saved to FreshQADataset_with_misleading.xlsx


In [5]:
df.to_json("dataset_with_misleading_quotes.json")