# RQ1: Generate Survey Questions from Community Comments

This notebook generates multiple-choice survey questions from Korean online community comments using LLMs (Gemini, GPT-4o-mini, Claude).

In [None]:
import os
import pandas as pd
import time
from tqdm import tqdm
import google.generativeai as genai
from openai import OpenAI
import anthropic

# Gemini credentials come from the environment; the placeholder is only a
# fallback so the cell still runs when the variable is unset.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY", "YOUR_GOOGLE_API_KEY_HERE")
genai.configure(api_key=GOOGLE_API_KEY)

# Topic keywords. Each one is inserted into the prompt and is also used by
# later cells as part of dataset file and directory names.
TOPIC_KEYWORDS = [
    "정권 교체",
    "통합 정치",
    "단일화(윤석열-안철수)",
    "부동산, 세금 등 경제문제",
    "여성가족부 폐지",
    "후보(또는 가족)의 비리",
    "대장동 의혹",
]

# Community display names, in the order their comments are passed to the
# prompt formatter.
COMMUNITIES = [
    "에펨코리아",
    "MLBPARK",
    "뽐뿌",
]

In [None]:
def _lookup_api_key(name):
    """Return the API key called ``name``.

    A notebook-level global of that name wins; otherwise the environment
    variable of the same name is used. ``None`` is returned when neither is
    set, leaving the resulting error to the provider SDK.
    """
    return globals().get(name) or os.getenv(name)


def call_llm(prompt, model_type="gemini", **kwargs):
    """Send ``prompt`` to one LLM back end and return the reply text.

    Args:
        prompt: Prompt text, sent as a single user message.
        model_type: ``"gemini"`` (gemini-1.5-flash), ``"gpt"`` (gpt-4o-mini)
            or ``"claude"`` (claude-3-5-haiku-20241022).
        **kwargs: Provider options. For ``"gpt"`` they are forwarded verbatim
            to ``chat.completions.create``; for ``"claude"`` only
            ``max_tokens`` is read (default 4096); for ``"gemini"`` they are
            ignored.

    Returns:
        The text of the reply (first choice / first content block).

    Raises:
        ValueError: If ``model_type`` is not one of the supported values.
    """
    if model_type == "gemini":
        # Relies on genai.configure(...) having been called beforehand.
        model = genai.GenerativeModel("gemini-1.5-flash")
        response = model.generate_content(prompt)
        return response.text
    elif model_type == "gpt":
        # The key is resolved at call time so this branch no longer depends
        # on an OPENAI_API_KEY global that may never have been defined.
        client = OpenAI(api_key=_lookup_api_key("OPENAI_API_KEY"))
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}],
            **kwargs
        )
        return response.choices[0].message.content
    elif model_type == "claude":
        # Same call-time resolution as the OpenAI key above.
        client = anthropic.Anthropic(api_key=_lookup_api_key("ANTHROPIC_API_KEY"))
        message = client.messages.create(
            model="claude-3-5-haiku-20241022",
            max_tokens=kwargs.get("max_tokens", 4096),
            messages=[{"role": "user", "content": prompt}]
        )
        return message.content[0].text
    else:
        raise ValueError(f"Unknown model_type: {model_type}")

In [None]:
# Prompt template. ``{topic_keyword}`` and ``{community_comments}`` are filled
# in by format_prompt_with_all_comments via str.format.
QUESTION_GENERATION_PROMPT = """
Below are comments from 3 different Korean online communities related to a topic.
The topic can be represented using these keywords: {topic_keyword}.

{community_comments}

Write five questions (Q1-Q5) on the topic:<{topic_keyword}> that can be answered based on these communities.
For a community, each question should be answered in a way that the members from the community would do, and the answers should echo the comments shown above.
Do NOT rely on your background knowledge about the specific community to answer the questions.
The questions should be low-level, detailed, trigger different responses that differentiate between different communities.
The questions should not be in the style of reading comprehension ones, and they are intended for members in the community to answer.
The questions should not contain "comment" in them.
Each question should be paired with answers from all 3 communities.
The answers should be concise (fewer than 32 tokens), legible, grammatically correct.
For the 5 questions, they are multi-choice questions with four options (A through D).

Format:
Q1: [multi-choice question]
A.xxx  B.xxx  C.xxx  D.xxx
Answer from Community 에펨코리아: A/B/C/D
Answer from Community MLBPARK: A/B/C/D
Answer from Community 뽐뿌: A/B/C/D

(!Note. Always use Korean)
"""


def format_prompt_with_all_comments(communities, comments, topic_keyword):
    """Build the question-generation prompt for one topic.

    Args:
        communities: Community names, one per section of the prompt.
        comments: ``comments[k]`` is the sequence of comment texts belonging
            to ``communities[k]``.
        topic_keyword: Keyword substituted into the template.

    Returns:
        The filled-in template. Each community gets a ``Comments from <name>:``
        header followed by ``Comment <n>: <text>`` lines numbered from 1, and
        the combined section is stripped of surrounding whitespace.

    Raises:
        IndexError: If ``comments`` has fewer entries than ``communities``.
    """
    sections = []
    for idx, name in enumerate(communities):
        numbered_lines = "".join(
            f"Comment {position}: {text}\n"
            for position, text in enumerate(comments[idx], start=1)
        )
        sections.append(f"Comments from {name}:\n{numbered_lines}\n")

    comment_section = "".join(sections).strip()
    return QUESTION_GENERATION_PROMPT.format(
        community_comments=comment_section,
        topic_keyword=topic_keyword,
    )

## Generate Prompts from Community Data

In [None]:
prompts = []
num_iterations = 5
comments_per_community = 50
# File-name prefixes of the per-community sample CSVs. The resulting comment
# lists are passed positionally alongside COMMUNITIES, so the order matters.
sample_prefixes = ["fm", "mlb", "pp"]


def _comment_texts(sample_df):
    """Return the stripped ``Text`` values of ``sample_df`` as strings.

    Missing values become "" instead of the literal string "nan", and a frame
    without a ``Text`` column yields "" for every row.
    """
    if "Text" not in sample_df.columns:
        return [""] * len(sample_df)
    return [
        "" if pd.isna(value) else str(value).strip()
        for value in sample_df["Text"]
    ]


# Read every sample CSV once up front; the same frames are re-sampled with a
# different seed on each iteration, so re-reading them per iteration is wasted
# work.
samples_by_topic = {
    topic: [
        pd.read_csv(f"../dataset/question_generation_samples/{prefix}_{topic}.csv")
        for prefix in sample_prefixes
    ]
    for topic in TOPIC_KEYWORDS
}

for i in range(num_iterations):
    for topic in TOPIC_KEYWORDS:
        output_dir = f"../dataset/prompts/{topic}"
        os.makedirs(output_dir, exist_ok=True)

        # The iteration index doubles as the random seed, so iteration i
        # always draws the same rows from a given CSV.
        comments = [
            _comment_texts(df.sample(n=comments_per_community, random_state=i))
            for df in samples_by_topic[topic]
        ]
        prompt = format_prompt_with_all_comments(COMMUNITIES, comments, topic)

        # One prompt file per (topic, iteration): <topic>/<i>.txt
        with open(os.path.join(output_dir, f"{i}.txt"), "w", encoding="utf-8") as f:
            f.write(prompt)
        prompts.append(prompt)

print(f"Generated {len(prompts)} prompts")

## Generate Questions Using All Three LLMs

In [None]:
def read_txt_files_from_directory(directory):
    """Read every ``.txt`` file directly inside ``directory``.

    Args:
        directory: Path of the directory to scan (not recursive).

    Returns:
        A dict mapping file name to the file's UTF-8 decoded contents, with
        keys inserted in sorted file-name order.
    """
    txt_files = {}
    # os.listdir gives no ordering guarantee; sorting keeps the processing
    # order reproducible from run to run.
    for filename in sorted(os.listdir(directory)):
        path = os.path.join(directory, filename)
        # Skip other extensions, and directories that merely end in ".txt".
        if not filename.endswith(".txt") or not os.path.isfile(path):
            continue
        with open(path, "r", encoding="utf-8") as file:
            txt_files[filename] = file.read()
    return txt_files

# Back ends to query for every prompt. call_llm also has "gpt" and "claude"
# branches; add them here to generate questions with those models too.
model_types = ["gemini"]
# Pause after each request -- presumably to stay under API rate limits.
seconds_between_calls = 5

for topic in TOPIC_KEYWORDS:
    prompts_directory = f"../dataset/prompts/{topic}"
    txt_files = read_txt_files_from_directory(prompts_directory)

    for model_type in model_types:
        print(f"\nProcessing {topic} with {model_type}...")

        # The output location depends only on topic and model, so create it
        # once here rather than once per prompt file.
        output_dir = f"../dataset/RQ1_questions/{topic}/output/{model_type}"
        os.makedirs(output_dir, exist_ok=True)

        results = {}

        for filename, content in tqdm(txt_files.items(), desc=f"{model_type}"):
            response = call_llm(content, model_type=model_type)
            results[filename] = response

            # Drop only the trailing extension; str.replace would also remove
            # any ".txt" occurring elsewhere in the name.
            iteration_num, _ = os.path.splitext(filename)
            output_filename = os.path.join(output_dir, f"output_{iteration_num}.txt")

            # Write each response immediately so finished work survives a
            # failure later in the run.
            with open(output_filename, "w", encoding="utf-8") as output_file:
                output_file.write(response)

            time.sleep(seconds_between_calls)

        print(f"Saved {len(results)} responses for {topic} using {model_type}")