# RQ2: Real Survey Analysis Experiment

This notebook analyzes the voter consciousness survey data from the 20th Presidential Election.
It supports both the jinbo (liberal) and bosu (conservative) perspectives and covers 23 political opinion questions.

## Setup and Configuration

In [None]:
import os
import pandas as pd
import numpy as np
import re
import time
import glob
from datetime import datetime
from tqdm import tqdm
import faiss
from sentence_transformers import SentenceTransformer
from scipy.special import rel_entr
import google.generativeai as genai
from openai import OpenAI
import anthropic

# API credentials come from the environment; the placeholder string is used
# whenever the corresponding variable is not set.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY", "YOUR_GOOGLE_API_KEY_HERE")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "YOUR_OPENAI_API_KEY_HERE")
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "YOUR_ANTHROPIC_API_KEY_HERE")

# The Gemini SDK is configured once at module level; the other clients receive
# their key when they are constructed.
genai.configure(api_key=GOOGLE_API_KEY)

# Which perspective this run analyzes: "jinbo" (liberal) or "bosu" (conservative).
PERSPECTIVE = "jinbo"

In [None]:
def call_llm(prompt, model_type="gemini", **kwargs):
    """Send a single-turn prompt to the selected LLM backend and return its text.

    Args:
        prompt: The user message to send.
        model_type: One of "gemini", "gpt" or "claude".
        **kwargs: Extra request options. They are forwarded unchanged to the
            OpenAI chat-completion call; the Claude branch reads only
            ``max_tokens`` (default 4096); the Gemini branch ignores them.

    Returns:
        The text content of the model's reply.

    Raises:
        ValueError: If ``model_type`` is not one of the supported backends.
    """
    if model_type == "gemini":
        gemini = genai.GenerativeModel("gemini-1.5-flash")
        return gemini.generate_content(prompt).text

    if model_type == "gpt":
        chat_messages = [{"role": "user", "content": prompt}]
        completion = OpenAI(api_key=OPENAI_API_KEY).chat.completions.create(
            model="gpt-4o-mini",
            messages=chat_messages,
            **kwargs
        )
        return completion.choices[0].message.content

    if model_type == "claude":
        token_limit = kwargs.get("max_tokens", 4096)
        reply = anthropic.Anthropic(api_key=ANTHROPIC_API_KEY).messages.create(
            model="claude-3-5-haiku-20241022",
            max_tokens=token_limit,
            messages=[{"role": "user", "content": prompt}]
        )
        return reply.content[0].text

    raise ValueError(f"Unknown model_type: {model_type}")

## Load Data

In [None]:
# Select the community corpus and the matching prebuilt FAISS index for the
# chosen perspective. The variable name `fm_sample` is used for both
# perspectives because the chunking cell reads the corpus under that name.
if PERSPECTIVE == "jinbo":
    fm_sample = pd.read_csv("../dataset/community_samples/pp_sample_50000.csv", encoding="utf-8-sig")
    index_path = "../dataset/faiss/pp_embeddings_faiss_50000.index"
elif PERSPECTIVE == "bosu":
    fm = pd.read_csv("../dataset/community_samples/fm_sample_50000.csv")
    mlb = pd.read_csv("../dataset/community_samples/mlb_sample_50000.csv")
    fm_sample = pd.concat([fm, mlb])
    index_path = "../dataset/faiss/bosu_embeddings_faiss_50000.index"
else:
    # An unrecognised value must not silently fall into one of the two setups:
    # the survey-loading cell would otherwise pick the opposite subgroup.
    raise ValueError(f"Unknown PERSPECTIVE: {PERSPECTIVE!r} (expected 'jinbo' or 'bosu')")

# Check for the index before loading the embedding model so a missing file is
# reported immediately.
if not os.path.exists(index_path):
    raise FileNotFoundError(f"FAISS index not found: {index_path}")

embedding_model = SentenceTransformer("intfloat/multilingual-e5-small", device="cuda")

index = faiss.read_index(index_path)
print(f"Loaded FAISS index: {index.ntotal} vectors")

In [None]:
def chunk_text(text, max_chunk_length=512):
    """Split text into whitespace-delimited chunks of bounded character length.

    Words are packed greedily: a word joins the current chunk while the chunk,
    joined with single spaces, stays within ``max_chunk_length`` characters.
    A single word longer than the limit is never split; it becomes a chunk of
    its own.

    Args:
        text: The string to split. Runs of whitespace are collapsed because
            the text is tokenised with ``str.split()``.
        max_chunk_length: Maximum number of characters per chunk.

    Returns:
        A list of non-empty chunk strings, in order. Empty or whitespace-only
        input yields an empty list.
    """
    # NOTE(review): rows derived from these chunks are looked up by position
    # against a prebuilt FAISS index elsewhere in this file; if the persisted
    # index was built from a different chunking output, rebuild it.
    chunks, current, current_len = [], [], 0
    for word in text.split():
        # Length of " ".join(current + [word]) without rebuilding the string.
        candidate_len = len(word) if not current else current_len + 1 + len(word)
        if candidate_len <= max_chunk_length:
            current.append(word)
            current_len = candidate_len
            continue
        # Only flush a chunk that actually holds words; an over-long first
        # word must not produce an empty chunk.
        if current:
            chunks.append(" ".join(current))
        current = [word]
        current_len = len(word)
    if current:
        chunks.append(" ".join(current))
    return chunks

# Break every post into chunks and keep only the first occurrence of each
# distinct chunk text. All other columns of the source row are carried along
# with the chunk.
seen_chunks = set()
text_chunks_unique = []

for _, row in fm_sample.iterrows():
    row_fields = row.to_dict()
    for chunk in chunk_text(str(row["Text"])):
        if chunk in seen_chunks:
            continue
        seen_chunks.add(chunk)
        # Same columns as the source row, with "Text" replaced by the chunk.
        text_chunks_unique.append({**row_fields, "Text": chunk})

chunked_df = pd.DataFrame(text_chunks_unique)
# Prefix each chunk with "query: " in a separate column.
chunked_df["ProcessedText"] = [f"query: {chunk_value}" for chunk_value in chunked_df["Text"]]
print(f"Chunked DataFrame: {len(chunked_df)} rows")

## Survey Questions (23 Questions)

#### Reference: KSDC survey data (https://www.ksdcdb.kr/data/dataSearchResView.do?surveyId=2825)

In [None]:
# Survey statements grouped by question-group key. The key decides which
# answer scale the experiment loop presents (group "36" gets the 5-point
# options, every other group the 4-point options) and is embedded in the name
# of the result CSV written for that group. Each value is the list of Korean
# statements the agent is asked to rate; the groups total 23 statements.
SURVEY_QUESTIONS = {
    "33": [
        "나 같은 사람에게는 투표만이 정부에 대해 말할 수 있는 유일한 방법이다",
        "나 같은 사람이 정부가 하는 일에 대해 뭐라고 평가할 수 없다",
        "나 같은 사람에게는 정치나 정부가 하는 일이 너무 복잡해서 이해할 수가 없다",
        "투표는 아주 많은 사람들이 하기 때문에 내가 투표하는가 안하는가는 그리 중요하지 않다",
        "만약 내가 지지하는 후보(또는 정당)가 선거에서 이길 확률이 없다면, 내가 투표하는 것은 별로 의미가 없다",
        "어떤 후보에게 표를 던지느냐가 미래의 일에 중요한 영향을 미친다"
    ],
    "34_1": ["한미동맹관계를 더욱 강화해야 한다"],
    "34_2": ["상황에 관계 없이 대북지원은 지속되어야 한다"],
    "34_3": ["소수자에 대한 지원과 보호는 더욱 강화되어야 한다"],
    "34_4": ["난민과 이민자에 대한 문호를 더 개방해야 한다"],
    "34_5": ["복지보다는 경제발전에 더욱 힘을 기울여야 한다"],
    "34_6": ["기업과 고소득자들이 현재보다 세금을 더 많이 내게 해야 한다"],
    "34_7": ["자유가 평등보다 더 중요하다"],
    "34_8": ["자유가 안보보다 더 중요하다"],
    "36": [
        '대부분의 정치인은 국민에게 관심이 없다',
        '대부분의 정치인은 신뢰할 수 없다',
        '정치인들이 우리나라의 가장 큰 문제이다',
        '정치인은 문제해결을 위해 때로는 규칙을 어길수도 있다',
        '가장 중요한 정책결정은 국민보다 정치인한테 맡겨야 한다',
        '대부분의 정치인은 부자와 권력자의 이익을 지킬 뿐이다',
    ],
    "37": [
        "나라가 어려운 상황에 처했을 때 정부는 법을 어기더라도 일을 해내는 것이 중요하다",
        "국회와 언론이 정부를 지나치게 감시하면 정부는 일을 할 수 없다",
        "우리나라의 발전을 위해서 독재시절의 방식도 사용해야 한다"
    ]
}

# Short Korean topic label for each question-group key (same key set as
# SURVEY_QUESTIONS). Not referenced by the code visible in this notebook;
# presumably used for tables or plots — confirm against downstream cells.
QUESTION_LABELS = {
    "33": "투표", "34_1": "동맹", "34_2": "대북지원", "34_3": "소수자",
    "34_4": "난민", "34_5": "경제발전", "34_6": "세금", "34_7": "평등",
    "34_8": "안보", "36": "정치인", "37": "독재"
}

## Retriever and Summarizer Modules

In [None]:
# Query-augmentation template. `{topic}` is filled with a survey statement;
# the LLM's two-line pro/con answer is what gets embedded and used as the
# retrieval query.
AUGMENT_PROMPT = """
Please write exactly two lines in Korean about the topic '{topic}':
1. The first line should briefly state the reason for supporting.
2. The second line should briefly state the reason for opposing.
"""

# Summarisation template. `{question}` is the survey statement and
# `{opinions}` is the retrieved posts joined into one string; the model is
# asked to return its themes inside <answer>...</answer> tags.
SUMMARIZER_PROMPT = """
You are a sophisticated qualitative analysis AI.
Your primary function is to extract and analyze multiple dominant themes from social media posts.
Your goal is to identify the various topics, sentiments, and arguments that emerge in public discussions.

Core Task: Extract Multiple Dominant Themes
Answer the question: "{question}" based on the provided posts.
Rather than finding a single 'center', identify ALL significant themes that appear in the conversation.

Analysis Workflow:
1. Initial Scanning: Read all posts to identify distinct themes and topics being discussed.
2. Theme Extraction: Identify 2-4 major themes based on frequency, engagement, and significance.
3. Sentiment Analysis per Theme: For each identified theme, analyze the associated sentiments and arguments.

Theme Identification Guidelines:
- Each theme should be distinct and substantive enough to stand alone
- Themes can be completely unrelated to each other
- Include both majority and significant minority themes
- For each theme, accurately represent its dominant sentiment without artificial balancing

Posts:
{opinions}

Output Format (Korean):
<answer>
1. [Theme]: [Key points and sentiment summary]
2. [Theme]: [Key points and sentiment summary]
3. [Theme]: [Key points and sentiment summary]
</answer>
"""

# Answer-selection template. `{analysis}` is the summariser output,
# `{question}` the survey statement and `{options}` the answer-scale string.
# The reply format requested here ("<number>. <label>") is what the response
# parser later has to decode.
AGENT_PROMPT = """
Based on the analysis below, select your answer from the options.
{analysis}

Question: {question}
Options: {options}

Return only your choice (e.g., "1. Strongly Disagree").
"""

def run_retriever(question, top_k=50, model_type="gemini"):
    """Retrieve the community-post chunks most similar to an augmented query.

    The survey statement is first expanded by the LLM into a short pro/con
    text (``AUGMENT_PROMPT``); that text is embedded and searched against the
    module-level FAISS ``index``. Hits are mapped to rows of ``chunked_df`` by
    position.

    Args:
        question: The survey statement to retrieve evidence for.
        top_k: Number of nearest neighbours requested from the index.
        model_type: Backend passed through to ``call_llm`` for augmentation.

    Returns:
        A list of chunk texts, at most ``top_k`` long. It is shorter when the
        index returns fewer than ``top_k`` valid neighbours.
    """
    aug_prompt = AUGMENT_PROMPT.format(topic=question)
    aug_response = call_llm(aug_prompt, model_type=model_type)

    query_emb = embedding_model.encode(
        [f"query: {aug_response}"],
        convert_to_numpy=True,
        normalize_embeddings=True,
    ).astype("float32")
    _, indices = index.search(query_emb, top_k)

    # FAISS pads the id array with -1 when it finds fewer than top_k
    # neighbours; passing -1 to iloc would silently return the last row.
    hit_positions = [int(pos) for pos in indices[0] if pos >= 0]
    return chunked_df.iloc[hit_positions]["Text"].tolist()

def run_summarizer(question, texts, model_type="gemini"):
    """Ask the LLM to summarise the dominant themes in the retrieved posts.

    Args:
        question: The survey statement the summary should address.
        texts: Iterable of post strings; they are joined with a "---" line
            between consecutive posts.
        model_type: Backend passed through to ``call_llm``.

    Returns:
        The raw text returned by the LLM.
    """
    joined_posts = "\n---\n".join(texts)
    return call_llm(
        SUMMARIZER_PROMPT.format(question=question, opinions=joined_posts),
        model_type=model_type,
    )

def run_agent(analysis, question, options, model_type="gemini"):
    """Have the LLM pick one answer option given a theme analysis.

    Args:
        analysis: Summary text inserted at the top of the prompt.
        question: The survey statement being answered.
        options: String listing the allowed answer options.
        model_type: Backend passed through to ``call_llm``.

    Returns:
        The raw text returned by the LLM.
    """
    filled_prompt = AGENT_PROMPT.format(
        analysis=analysis,
        question=question,
        options=options,
    )
    return call_llm(filled_prompt, model_type=model_type)

## Run Experiments

In [None]:
# Answer scales shown to the agent, passed into the prompt as plain strings.
OPTIONS_5 = "['1. Strongly Disagree', '2. Generally Disagree', '3. So-so', '4. Generally Agree', '5. Strongly Agree']"
OPTIONS_4 = "['1. Strongly Disagree', '2. Generally Disagree', '3. Generally Agree', '4. Strongly Agree']"

top_k = 50
num_iterations = 100
save_path = f"../results/RQ2/{PERSPECTIVE}"
os.makedirs(save_path, exist_ok=True)

for model_type in ["gemini", "gpt", "claude"]:
    print(f"\n=== Running with {model_type} ===")

    for option_key, questions in SURVEY_QUESTIONS.items():
        # Only question group "36" is presented with the 5-point scale.
        if option_key == "36":
            options = OPTIONS_5
        else:
            options = OPTIONS_4
        # The timestamp is taken before the group runs, so it marks the start
        # of the group rather than the moment the file is written.
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        results = []

        for question in tqdm(questions, desc=f"{option_key}"):
            # `seed` is only a repetition counter stored with each row; it is
            # not passed to the retriever, the summariser or the agent.
            for seed in range(num_iterations):
                retrieved = run_retriever(question, top_k, model_type)
                summary = run_summarizer(question, retrieved, model_type)
                response = run_agent(summary, question, options, model_type)

                results.append(
                    dict(
                        question=question,
                        seed=seed,
                        answer_response=response,
                        method="ours",
                    )
                )
                # Fixed pause between repetitions.
                time.sleep(2)

        # One CSV per (model, question group), written after the group finishes.
        output_file = f"{save_path}/{PERSPECTIVE}_ours_{model_type}_{option_key}_{timestamp}.csv"
        pd.DataFrame(results).to_csv(output_file, index=False, encoding="utf-8-sig")

## Load Real Survey Data

In [None]:
president_20th = pd.read_csv("../dataset/RQ2_ksdc/20th_president.csv", encoding="cp949")
# Rows with a missing value in any column are discarded.
president_20th = president_20th.dropna()

# `q5` answer codes that define each perspective's respondent subgroup.
# NOTE(review): presumably q5 is the ideological self-placement item —
# confirm against the survey codebook.
Q5_CODES_BY_PERSPECTIVE = {"bosu": [1, 2], "jinbo": [4, 5]}

if PERSPECTIVE not in Q5_CODES_BY_PERSPECTIVE:
    # An unrecognised value must not silently select one subgroup: the
    # corpus-loading cell would otherwise pair it with the opposite corpus.
    raise ValueError(f"Unknown PERSPECTIVE: {PERSPECTIVE!r} (expected 'jinbo' or 'bosu')")

survey_data = president_20th[president_20th["q5"].isin(Q5_CODES_BY_PERSPECTIVE[PERSPECTIVE])]

print(f"Survey respondents ({PERSPECTIVE}): {len(survey_data)}")

## Statistical Analysis (KL Divergence)

In [None]:
def calculate_distribution(series, num_categories=5):
    """Turn a series of category codes into a relative-frequency vector.

    Args:
        series: pandas Series of answer codes. Values outside
            ``1..num_categories`` are ignored.
        num_categories: Number of categories; code ``k`` is stored at
            position ``k - 1``.

    Returns:
        A NumPy array of length ``num_categories`` that sums to 1, or an
        all-zero array when no value falls inside the valid range.
    """
    tallies = np.zeros(num_categories)
    for category, n_obs in series.value_counts().items():
        if not (1 <= category <= num_categories):
            continue
        tallies[int(category) - 1] = n_obs

    total = tallies.sum()
    if total > 0:
        return tallies / total
    return tallies

def kl_divergence(P, Q, epsilon=1e-10):
    """Compute the KL divergence D(P || Q) between two discrete distributions.

    Both inputs are shifted by ``epsilon`` and renormalised to sum to 1 before
    the divergence is evaluated, so zero entries do not yield infinities.

    Args:
        P: Array-like of non-negative weights (the reference distribution).
        Q: Array-like of non-negative weights, same length as ``P``.
        epsilon: Additive smoothing constant applied to every entry.

    Returns:
        The summed element-wise relative entropy of the smoothed distributions.
    """
    def _smoothed(weights):
        shifted = np.asarray(weights, dtype=np.float64) + epsilon
        return shifted / np.sum(shifted)

    return np.sum(rel_entr(_smoothed(P), _smoothed(Q)))

def compare_distributions(ground_truth_df, predictions_dict, questions, num_categories=5):
    """Score each prediction set against the ground truth, question by question.

    Args:
        ground_truth_df: DataFrame with one column of answer codes per question.
        predictions_dict: Mapping of method name to a DataFrame with the same
            question columns.
        questions: Column names to evaluate.
        num_categories: Number of answer categories used to build each
            distribution.

    Returns:
        A DataFrame with one row per question: a "question" column followed by
        one column per method holding KL(ground truth || prediction).
    """
    def _score_question(question):
        reference = calculate_distribution(ground_truth_df[question], num_categories)
        per_method = {
            method_name: kl_divergence(
                reference,
                calculate_distribution(pred_df[question], num_categories),
            )
            for method_name, pred_df in predictions_dict.items()
        }
        return {"question": question, **per_method}

    return pd.DataFrame([_score_question(question) for question in questions])

## Parse Results and Calculate KL Divergence

In [None]:
# Lookup from answer text to numeric code. Bare labels carry their 4-point
# meaning ("Generally Agree" -> 3, "Strongly Agree" -> 4); numbered labels
# carry the number they start with, which covers the 5-point scale as well.
CHOICE_MAP = {
    "Strongly Disagree": 1, "1. Strongly Disagree": 1, "1": 1,
    "Generally Disagree": 2, "2. Generally Disagree": 2, "2": 2,
    "So-so": 3, "3. So-so": 3, "3": 3,
    "Generally Agree": 3, "3. Generally Agree": 3,
    "4. Generally Agree": 4,
    "Strongly Agree": 4, "4. Strongly Agree": 4, "4": 4,
    "5. Strongly Agree": 5, "5": 5
}

# An option number directly followed by one of the known labels,
# e.g. "5. Strongly Agree", "4) generally agree".
_NUMBERED_CHOICE_RE = re.compile(
    r"(?<!\d)([1-5])\s*[.):]?\s*"
    r"(?:Strongly Disagree|Generally Disagree|So-so|Generally Agree|Strongly Agree)",
    re.IGNORECASE,
)


def parse_response(x):
    """Convert a raw agent reply into its numeric answer code.

    Matching is done in decreasing order of reliability:

    1. An explicit option number followed by a known label. The number wins,
       so "5. Strongly Agree" is 5 and "4. Generally Agree" is 4 rather than
       the bare-label codes 4 and 3.
    2. A bare label without a number, decoded through ``CHOICE_MAP`` (4-point
       meaning, because the scale cannot be inferred from the label alone).
    3. The first digit found anywhere in the reply.

    Args:
        x: The raw reply; may be a string, a number or a missing value.

    Returns:
        The answer code as an int, or None when the value is missing or
        nothing recognisable is found.
    """
    if pd.isna(x):
        return None
    x_str = str(x).strip()

    numbered = _NUMBERED_CHOICE_RE.search(x_str)
    if numbered:
        return int(numbered.group(1))

    # Digit-only keys are skipped here; the digit fallback below handles them
    # by position in the reply instead of by dictionary order.
    for key, val in CHOICE_MAP.items():
        if not key.isdigit() and key in x_str:
            return val

    match = re.search(r"(\d)", x_str)
    if match:
        return int(match.group(1))
    return None

# Gather every result CSV written for this perspective and decode the raw
# agent replies into numeric codes. Nothing is defined when no file matches.
result_files = glob.glob(f"{save_path}/{PERSPECTIVE}_ours_*.csv")
if result_files:
    result_frames = [pd.read_csv(f, encoding="utf-8-sig") for f in result_files]
    all_results = pd.concat(result_frames)
    all_results["parsed_response"] = all_results["answer_response"].apply(parse_response)
    total_count = len(all_results)
    valid_count = all_results["parsed_response"].count()
    print(f"Total responses: {total_count}")
    print(f"Valid responses: {valid_count}")