# RQ1: Question Quality Check

This notebook parses generated questions and evaluates their quality using LLMs.

In [None]:
import os
import pandas as pd
import glob
import time
import re
import json
from tqdm import tqdm
import google.generativeai as genai
from openai import OpenAI
import anthropic

# API credentials come from the environment; when a variable is unset the
# placeholder string is used instead.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY", "YOUR_GOOGLE_API_KEY_HERE")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "YOUR_OPENAI_API_KEY_HERE")
ANTHROPIC_API_KEY = os.environ.get(
    "ANTHROPIC_API_KEY", "YOUR_ANTHROPIC_API_KEY_HERE"
)

# Hand the Google key to the google.generativeai module.
genai.configure(api_key=GOOGLE_API_KEY)

# Topic keywords; each one is interpolated into dataset paths and file names.
TOPIC_KEYWORDS = [
    "정권 교체",
    "통합 정치",
    "단일화(윤석열-안철수)",
    "부동산, 세금 등 경제문제",
    "여성가족부 폐지",
    "후보(또는 가족)의 비리",
    "대장동 의혹",
]

In [None]:
def call_llm(prompt, model_type="gemini", **kwargs):
    """Send a single-turn prompt to one of the supported LLM backends.

    Args:
        prompt: Text sent as the only user message.
        model_type: ``"gemini"``, ``"gpt"`` or ``"claude"``.
        **kwargs: Extra options. They are forwarded unchanged to the OpenAI
            chat-completions call; for ``"claude"`` only ``max_tokens`` is
            read (default 1000); for ``"gemini"`` they are not used.

    Returns:
        The text of the model's reply.

    Raises:
        ValueError: If ``model_type`` is not one of the supported values.
    """
    # Both chat-style backends take the prompt as a single user message.
    user_turn = [{"role": "user", "content": prompt}]

    if model_type == "gemini":
        gemini = genai.GenerativeModel("gemini-1.5-flash")
        return gemini.generate_content(prompt).text

    if model_type == "gpt":
        openai_client = OpenAI(api_key=OPENAI_API_KEY)
        completion = openai_client.chat.completions.create(
            model="gpt-4o-mini",
            messages=user_turn,
            **kwargs
        )
        return completion.choices[0].message.content

    if model_type == "claude":
        token_limit = kwargs.get("max_tokens", 1000)
        anthropic_client = anthropic.Anthropic(api_key=ANTHROPIC_API_KEY)
        reply = anthropic_client.messages.create(
            model="claude-3-5-haiku-20241022",
            max_tokens=token_limit,
            messages=user_turn,
        )
        return reply.content[0].text

    raise ValueError(f"Unknown model_type: {model_type}")

## Parse Generated Questions

In [None]:
def parse_questions_from_file(filepath):
    """Parse generated questions and per-community answers from a text file.

    Entries are introduced by ``Q<n>: `` markers and run up to the next
    ``Q<n>:`` marker (or the end of the text). Within an entry, the text
    before the first ``"Answer"`` is taken as the question, and one letter
    A-D is extracted after each of the labels ``에펨코리아``, ``MLBPARK`` and
    ``뽐뿌``. Entries that lack any of the three labels are skipped.

    Args:
        filepath: Path to a UTF-8 encoded text file.

    Returns:
        A DataFrame with the columns ``question``, ``fm``, ``mlb`` and
        ``pp``, one row per complete entry. The columns are present even
        when no entry could be parsed.
    """
    columns = ["question", "fm", "mlb", "pp"]
    label_patterns = {
        "fm": r"에펨코리아: ([A-D])",
        "mlb": r"MLBPARK: ([A-D])",
        "pp": r"뽐뿌: ([A-D])",
    }

    with open(filepath, "r", encoding="utf-8") as f:
        text = f.read()

    data = []
    for entry in re.findall(r"Q\d+: (.*?)(?=Q\d+:|$)", text, re.DOTALL):
        matches = {
            key: re.search(pattern, entry)
            for key, pattern in label_patterns.items()
        }
        # Keep only entries that carry an answer for all three communities.
        if any(match is None for match in matches.values()):
            continue

        row = {"question": entry.split("Answer")[0].strip()}
        for key, match in matches.items():
            row[key] = match.group(1)
        data.append(row)

    # Passing the columns explicitly keeps the schema stable for an empty
    # result; a frame built from an empty list alone has no columns, so a CSV
    # written from it would carry no header.
    return pd.DataFrame(data, columns=columns)

# Parse every raw generation output (per topic and generator model) and save
# the parsed questions as one CSV per input file.
# The destination does not depend on the loop variables, so it is set up once.
output_dir = "../dataset/RQ1_questions/parsed_questions"
os.makedirs(output_dir, exist_ok=True)

for topic in TOPIC_KEYWORDS:
    for model_type in ["gemini", "gpt", "claude"]:
        pattern = f"../dataset/RQ1_questions/{topic}/output/{model_type}/output_*.txt"
        # glob gives no ordering guarantee; sort for a reproducible run order.
        files = sorted(glob.glob(pattern))

        for filepath in files:
            df = parse_questions_from_file(filepath)

            # A result without rows gives the evaluation step nothing to
            # score, so report it instead of writing an empty CSV.
            if df.empty:
                print(f"Skipped (no questions parsed): {filepath}")
                continue

            file_name = os.path.basename(filepath).replace(".txt", "")
            output_file = f"{output_dir}/{topic}_{model_type}_{file_name}.csv"
            # utf-8-sig writes a BOM at the start of the file.
            df.to_csv(output_file, index=False, encoding="utf-8-sig")
            print(f"Saved: {output_file}")

## Question Quality Evaluation

In [None]:
# Prompt template for the answer-consistency check. The ``{question}``
# placeholder is filled in with str.format, and the model is instructed to
# reply with only a score of 1, 2 or 3.
ANSWER_CONSISTENCY_PROMPT = """
Evaluate the quality of the generated question.
Generated question and answer: <{question}>

Score 1: The question cannot be answered by the provided answer.
Score 2: The question can be partially answered using the provided answer.
Score 3: The question can be answered directly using the provided answer.

Return only the score (1, 2, or 3).
"""

In [None]:
# Score every parsed question with each evaluator model.
# ``results`` maps evaluator name -> {question text: raw model reply}.
results = {}

for eval_model in ("gemini", "gpt", "claude"):
    print(f"\nEvaluating with {eval_model}...")
    # Keyed by question text, so a repeated question keeps its latest reply.
    answer_consistency_dict = {}

    for topic in TOPIC_KEYWORDS:
        parsed_pattern = f"../dataset/RQ1_questions/parsed_questions/{topic}_*.csv"
        files = glob.glob(parsed_pattern)

        for file in tqdm(files, desc=f"{topic}"):
            df = pd.read_csv(file)

            for _, row in df.iterrows():
                question = row["question"]
                prompt = ANSWER_CONSISTENCY_PROMPT.format(question=question)
                response = call_llm(prompt, model_type=eval_model)
                answer_consistency_dict[question] = response
                # Pause between consecutive API calls.
                time.sleep(5)

    results[eval_model] = answer_consistency_dict
    print(f"Completed {eval_model}: {len(answer_consistency_dict)} evaluations")

## Save Evaluation Results

In [None]:
# Write one JSON file per evaluator model with its question -> reply mapping.
output_dir = "../dataset/RQ1_questions/quality_evaluation"
os.makedirs(output_dir, exist_ok=True)

for model_type, eval_dict in results.items():
    output_file = f"{output_dir}/quality_scores_{model_type}.json"
    with open(output_file, "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps non-ASCII text readable in the output.
        f.write(json.dumps(eval_dict, ensure_ascii=False, indent=2))
    print(f"Saved {model_type} evaluations to {output_file}")