In [5]:
from openai import OpenAI
import json
import csv
from tqdm import tqdm
import random
import re
import os

# Read the key from the OPENAI_API_KEY environment variable so the secret never
# has to be typed into (and saved with) the notebook. Falls back to the
# original empty string when the variable is not set.
api_key = os.environ.get("OPENAI_API_KEY", "")
# Initialize the OpenAI client
client = OpenAI(
    api_key=api_key
)

# Only neutral instances

In [7]:
# Read the neutral-instance JSON file as UTF-8 text and parse it into `data`.
with open("raw_data/healthver_climate-fever_all_neutral.json", mode='r', encoding='utf-8') as f:
    data = json.loads(f.read())

In [42]:
import json
from tqdm import tqdm
from openai import OpenAI  # make sure your OpenAI client is installed and set up


# ========== PROMPTS ==========

# Instruction text for the question-generation step. generate_questions()
# embeds it in an f-string ahead of the claim, so the literal JSON braces below
# are emitted as-is; they would need escaping if this text were ever passed
# through str.format().
prompt_qa = """Given a claim, your task is to:
Read the claim carefully.

Generate two questions based on the claim that refer to information mentioned or implied in it:

- One WH-question.
- One Yes/No question.

Additionally:
- Try to focus on specific elements or details from the claim that could naturally invite questions.

For the WH-question, classify it as one of the following two types:
- Function Question – A question with a single definitive answer (e.g., “What is the capital of France?”).
- List Question – A question that may have multiple valid answers (e.g., “What causes air pollution?”).

Your response must follow the exact JSON format shown below:
{
  "wh_question": "<Your WH-question here>",
  "question_type": "<Function Question or List Question>",
  "yes_no_question": "<Your Yes/No question here>"
}
"""

# Template for the answerability check. The {evidence} and {question}
# placeholders are filled with str.format() in
# check_evidence_answers_question(); the model is instructed to reply with a
# bare "Yes" or "No".
prompt_verification = """You are given an evidence sentence and a question.

Task: Determine whether the evidence provides an answer to the question.

Evidence:
\"\"\"{evidence}\"\"\"

Question:
\"\"\"{question}\"\"\"

Instructions:
- If the evidence answers the question, respond only with "Yes".
- If the evidence does not answer the question, respond only with "No".
- Do not explain your answer. Just output "Yes" or "No".
"""

# ========== FUNCTIONS ==========

def get_gpt_response(prompt, model="gpt-4o"):
    """Send `prompt` as a single user message and return the cleaned reply text.

    Args:
        prompt: Text sent as the content of the only (user) message.
        model: Chat model name; defaults to "gpt-4o", the value that was
            previously hard-coded.

    Returns:
        The first choice's message content with every "```json" and "```"
        marker removed and surrounding whitespace stripped. Returns an empty
        string when the reply carries no text content.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )
    content = response.choices[0].message.content
    if content is None:
        # The content field is optional in the API response; without this
        # guard the .replace() call below would raise AttributeError.
        return ""
    # Drop Markdown code fences so callers can hand the text to json.loads().
    content = content.replace("```json", "").replace("```", "").strip()
    return content

def generate_questions(claim):
    """Ask the model for a WH-question and a Yes/No question about `claim`.

    Args:
        claim: Claim text appended to the `prompt_qa` instructions.

    Returns:
        A dict holding at least the keys "wh_question", "question_type" and
        "yes_no_question", or None when the reply is not valid JSON or does
        not have that shape. Failures are reported with print().
    """
    combined_prompt = f"{prompt_qa}\n\nClaim:\n{claim}\n"
    raw_output = get_gpt_response(combined_prompt)
    try:
        parsed = json.loads(raw_output)
    except (json.JSONDecodeError, TypeError):
        print("Failed to decode JSON from:", raw_output)
        return None

    # Valid JSON is not necessarily the requested object: a list, a bare
    # string, or an object with missing keys would otherwise surface later as
    # a KeyError/TypeError in the caller.
    required_keys = ("wh_question", "question_type", "yes_no_question")
    if not isinstance(parsed, dict) or any(key not in parsed for key in required_keys):
        print("Unexpected question format from:", raw_output)
        return None
    return parsed

def check_evidence_answers_question(evidence, question):
    """Ask the model whether `evidence` answers `question`.

    Fills the `prompt_verification` template with both texts, sends it through
    get_gpt_response() and returns the reply with surrounding whitespace
    removed.
    """
    filled_prompt = prompt_verification.format(question=question, evidence=evidence)
    return get_gpt_response(filled_prompt).strip()

# ========== MAIN PROCESS ==========

# Load your input data
with open("raw_data/healthver_climate-fever_all_neutral.json", 'r', encoding='utf-8') as f:
    instances = json.load(f)

# How many instances to process in this run. 2 keeps the original quick-test
# behaviour; set to None to process the whole file (instances[:None] is all).
MAX_INSTANCES = 2


def _is_answered_by_any(evidences, question):
    """Return True if at least one evidence is judged to answer `question`.

    Evaluation stops at the first "yes" verdict (compared case-insensitively),
    so the remaining evidences are not sent to the model.
    """
    return any(
        check_evidence_answers_question(evidence, question).lower() == "yes"
        for evidence in evidences
    )


for instance in tqdm(instances[:MAX_INSTANCES]):
    claim = instance['claim']
    evidences = instance['evidences']

    # Phase 1: Generate questions based only on the claim
    questions = generate_questions(claim)
    if not isinstance(questions, dict):
        # Covers a failed generation (None) as well as a reply that parsed as
        # JSON but is not an object.
        instance['generated_questions'] = {}
        continue

    # Only questions that none of the evidences answers are kept.
    valid_questions = {}

    # Phase 2: Check WH-question
    wh_question = questions.get("wh_question")
    if wh_question and not _is_answered_by_any(evidences, wh_question):
        valid_questions["wh_question"] = wh_question
        # None when the model omitted the classification.
        valid_questions["question_type"] = questions.get("question_type")

    # Phase 3: Check Yes/No question
    yes_no_question = questions.get("yes_no_question")
    if yes_no_question and not _is_answered_by_any(evidences, yes_no_question):
        valid_questions["yes_no_question"] = yes_no_question

    # Add valid generated questions to the instance
    instance['generated_questions'] = valid_questions

# Save all results (original data + new field). Instances beyond MAX_INSTANCES
# are written unchanged, without a 'generated_questions' field.
with open("raw_data/healthver_climate-fever_all_neutral_with_questions.json", "w", encoding='utf-8') as f_out:
    json.dump(instances, f_out, indent=2, ensure_ascii=False)

print("✅ Finished updating instances with generated questions!")

100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:20<00:00, 10.10s/it]

✅ Finished updating instances with generated questions!



