In [30]:
import sys
import os
from components import rag
from components.rag import query_answering_system
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
sys.path.insert(0, parent_dir)

query = "What option do civil servants in Malaysia have for their working hours during Ramadan, according to Communications Minister Fahmi Fadzil?"
document_dataset = "./data/1K_news.csv"
output = rag.query_answering_system(query, document_dataset)
print('Answer: {}'.format(output['answer']))
print('Reference1: {}'.format(output['title'][0]))
print('Reference1: {}'.format(output['title'][1]))
print('Reference1: {}'.format(output['title'][2]))


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Answer: Civil servants in Malaysia have the option to choose flexible working hours during Ramadan, according to Communications Minister Fahmi Fadzil. They can either maintain their existing working hours or shorten their lunch break by 30 minutes, allowing them to leave work 30 minutes earlier. This is in line with discussions with Chief Secretary to the Government Tan Sri Shamsul Azri Abu Bakar and Public Service director-general Tan Sri Wan Ahmad Dahlan Abdul Aziz. The state government will implement flexible working hours, which shortens rest periods by 30 minutes to allow staff to go home earlier. The flexible working hours' implementation for federal department staff in Perlis will depend on the decisions made by their respective department heads. The state government remains committed to ensuring the welfare of its civil servants without compromising service delivery efficiency, as part of its efforts to improve public service quality for the people.
Reference1: Civil servants c

In [13]:
import ollama
import pandas as pd

# 配置参数
INPUT_CSV = "./data/1K_news.csv"
OUTPUT_CSV = "./data/Q3/TestQuestion20.csv"
N_NEWS = 5  # 处理前5条新闻
X_GOOD = 2  # 每条生成2个好问题
Y_BAD = 1   # 每条生成1个坏问题
Z_NORMAL = 1 # 每条生成1个普通问题

# 读取数据时直接获取所需字段
df = pd.read_csv(INPUT_CSV)
news_data = df[["content_id", "title", "text", "category"]].values.tolist()[:N_NEWS]

def generate_questions(news, question_type, num):
    """核心生成函数"""
    # 直接从输入数据解包（这里固定获取原始数据）
    content_id, title, text, category = news
    
    # 简化后的prompt（只让AI生成问题和原因）
    prompt = f"""Generate {num} {question_type} questions from this news:
    
    News Content: {text[:500]}
    
    Output Format（只需生成前两列）:
    [Question] || [Reason]
    
    Example:
    How do mergers affect market competition? || Analyzes business strategy impact"""
    
    # 调用模型
    response = ollama.chat(
        model="gemma",
        messages=[{"role": "user", "content": prompt}]
    )
    
    # 解析响应时直接使用原始数据（关键修改点）
    questions = []
    for line in response["message"]["content"].split("\n"):
        if "||" in line:
            parts = line.split("||")
            if len(parts) >= 2:  # 只取问题和原因
                questions.append({
                    "question": parts[0].strip(),
                    "reason": parts[1].strip(),
                    # 以下字段直接使用原始数据
                    "source_id": content_id,
                    "source_title": title,
                    "source_category": category,
                    "label": question_type
                })
    return questions[:num]

# 批量生成问题
all_questions = []
for news in news_data:
    all_questions.extend(generate_questions(news, "Good", X_GOOD))
    all_questions.extend(generate_questions(news, "Bad", Y_BAD))
    all_questions.extend(generate_questions(news, "General", Z_NORMAL))

# 保存结果
pd.DataFrame(all_questions).to_csv(OUTPUT_CSV, index=False, encoding="utf-8-sig")

In [12]:
import IPython
IPython.display.clear_output()

In [40]:
df = pd.read_csv("TestQuestion20.csv")
df_1K = pd.read_csv("./data/1K_news.csv")

if 'Question' not in df.columns:
    raise ValueError("The CSV file must contain a 'Question' column.")

answers = []
retrieved_docs_id = []
retrieved_docs1 = []
retrieved_docs2 = []
retrieved_docs3 = []

document_dataset = "../data/1K_news.csv"

for question in df['Question']:

    output = query_answering_system(question, document_dataset)

    answers.append(output['answer'])
    retrieved_docs_id.append(str(output['retrieved_docs_id']))

    retrieved_docs1.append(
        df_1K.loc[df_1K['content_id'] == output['retrieved_docs_id'][0], 'text'].values[0]
    )
    retrieved_docs2.append(
        df_1K.loc[df_1K['content_id'] == output['retrieved_docs_id'][1], 'text'].values[0]
    )
    retrieved_docs3.append(
        df_1K.loc[df_1K['content_id'] == output['retrieved_docs_id'][2], 'text'].values[0]
    )

ValueError: The CSV file must contain a 'Question' column.

In [41]:
df['answer'] = answers
df['retrieved_docs_id'] = retrieved_docs_id
df['retrieved document1'] = retrieved_docs1
df['retrieved document2'] = retrieved_docs2
df['retrieved document3'] = retrieved_docs3

df.to_csv("TestQuestion20_with_answers.csv", index=False)

print("Processing complete. The output file is saved as 'TestQuestion20_with_answers.csv'.")

NameError: name 'answers' is not defined