In [None]:
# Import the display submodule explicitly, then clear this cell's output.
import IPython.display

IPython.display.clear_output()

In [None]:
import sys
import os

# Put the parent of the working directory on the module search path BEFORE
# importing from `components`. Presumably the `components` package lives in
# that parent directory (TODO confirm); if so, importing first would fail on
# a fresh kernel.
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
sys.path.insert(0, parent_dir)

from components import rag
from components.rag import query_answering_system

# News CSV location, relative to the notebook's working directory.
document_dataset = "../data/1K_news.csv"

In [None]:
import pandas as pd
import ollama
import random

# Configuration
# Paths are relative to the notebook's working directory, matching the other
# cells, which read "../data/1K_news.csv" and "./TestQuestion20.csv".
INPUT_CSV = "../data/1K_news.csv"
OUTPUT_CSV = "./TestQuestion20.csv"
MODEL_NAME = "gemma"  # name of the local Ollama model to use

def generate_question(news, label):
    """Ask the Ollama model for a single question about a news item.

    Args:
        news: Mapping whose 'text' entry is the news text embedded in the prompt.
        label: Question type; one of "good", "bad" or "general". It selects
            the persona sentence that opens the prompt.

    Returns:
        The model reply truncated at its first "?", stripped of surrounding
        whitespace, with a single "?" appended.

    Raises:
        KeyError: If ``label`` is not a known type or ``news`` has no 'text'.
        ValueError: If the reply holds no text before its first "?"; without
            this check a bare "?" would be returned as a question.
    """
    # Persona sentence for each question type
    role_map = {
        "good": "You are a rigorous journalism professor, ask a short question that requires deeper understanding based on the following news item:",
        "bad": "You're an AI who likes to play pranks, ask a completely short unrelated and ridiculous question based on the following news item:",
        "general": "You are a news reporter, ask a simple short factual question based on the following:"
    }

    prompt = f"{role_map[label]}\n\n{news['text']}\n\nPlease output the question directly without explanation"

    response = ollama.generate(
        model=MODEL_NAME,
        prompt=prompt,
        # "bad" questions are sampled at a higher temperature than the others.
        options={'temperature': 0.8 if label == "bad" else 0.4}
    )

    # Keep only the text up to the first question mark.
    question = response['response'].split("?")[0].strip()
    if not question:
        raise ValueError(f"Model returned no question text for label {label!r}")
    return question + "?"

# Load the news articles
df = pd.read_csv(INPUT_CSV)
results = []

NUM_QUESTIONS = 20
QUESTION_LABELS = ["good", "bad", "general"]
# Largest multiple of the label count that fits in NUM_QUESTIONS (18 of 20):
# these iterations cycle through the labels, the remainder pick one at random.
BALANCED_COUNT = NUM_QUESTIONS - NUM_QUESTIONS % len(QUESTION_LABELS)

# Generate the questions.
# The loop index is named `i` rather than `_`: in IPython `_` holds the last
# cell output, and assigning to it disables that feature for the session.
for i in range(NUM_QUESTIONS):
    # Randomly pick 1-3 news articles and merge their text
    selected = df.sample(random.randint(1, 3))
    combined_text = " ".join(selected['text'])
    # Only the first sampled article's category is recorded, even when
    # several articles are combined.
    category = selected.iloc[0]['category']
    ids = ",".join(selected['content_id'].astype(str))

    # Keep the three question types balanced
    if i < BALANCED_COUNT:
        label = QUESTION_LABELS[i % len(QUESTION_LABELS)]
    else:
        label = random.choice(QUESTION_LABELS)

    try:
        question = generate_question({"text": combined_text}, label)
        results.append({
            "question": question,
            "label": label,
            "id": ids,
            "category": category
        })
    except Exception as e:
        # A failed generation is logged and skipped, so fewer than
        # NUM_QUESTIONS rows may be saved.
        print(f"Build failure: {str(e)}")

# Save the results
pd.DataFrame(results).to_csv(OUTPUT_CSV, index=False, encoding='utf-8')
print(f"Generated {len(results)} of {NUM_QUESTIONS} questions.")

In [None]:
# Repeat the search-path setup so the parent directory is importable here too.
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
sys.path.insert(0, parent_dir)

document_dataset = "../data/1K_news.csv"

df = pd.read_csv("./TestQuestion20.csv")
df_1K = pd.read_csv(document_dataset)

if 'question' not in df.columns:
    raise ValueError("The CSV file must contain a 'question' column.")

answers = []
retrieved_docs_id = []
retrieved_docs1 = []
retrieved_docs2 = []
retrieved_docs3 = []
# One list per retrieval rank, so a single loop fills all three.
retrieved_docs_by_rank = (retrieved_docs1, retrieved_docs2, retrieved_docs3)


def _retrieved_text(doc_ids, rank):
    """Return the text of the document at position ``rank`` in ``doc_ids``.

    Returns None when fewer than ``rank + 1`` ids were retrieved or when the
    id has no matching ``content_id`` row in ``df_1K``.
    """
    if rank >= len(doc_ids):
        return None
    matches = df_1K.loc[df_1K['content_id'] == doc_ids[rank], 'text']
    if matches.empty:
        return None
    return matches.values[0]


for question in df['question']:

    output = query_answering_system(question, document_dataset)
    print(output)

    answers.append(output['answer'])
    doc_ids = output['retrieved_docs_id']
    retrieved_docs_id.append(str(doc_ids))

    # Every rank list receives exactly one entry per question so all columns
    # stay aligned with `df`, even when a document is missing.
    for rank, rank_texts in enumerate(retrieved_docs_by_rank):
        text = _retrieved_text(doc_ids, rank)
        if text is None:
            print(f"Warning: no text for retrieved document {rank + 1} of question {question!r}")
        rank_texts.append(text)

In [None]:
# Attach the collected answers and retrieved documents to the question table.
# Dict order sets the order in which the new columns are added.
collected_columns = {
    'answer': answers,
    'retrieved_docs_id': retrieved_docs_id,
    'retrieved document1': retrieved_docs1,
    'retrieved document2': retrieved_docs2,
    'retrieved document3': retrieved_docs3,
}
for column_name, column_values in collected_columns.items():
    df[column_name] = column_values

# Write the enriched table without the index column.
df.to_csv("TestQuestion20_with_answers.csv", index=False)

print("Processing complete. The output file is saved as 'TestQuestion20_with_answers.csv'.")