In [1]:
from components import rag

# Run one sample question through the RAG pipeline and print the generated
# answer followed by the titles returned under output['title'].
query = "What option do civil servants in Malaysia have for their working hours during Ramadan, according to Communications Minister Fahmi Fadzil?"
document_dataset = "./data/1K_news.csv"
output = rag.query_answering_system(query, document_dataset)
print('Answer: {}'.format(output['answer']))
# Number each reference (every line used to be labelled "Reference1") and
# iterate instead of hard-indexing so fewer than three titles cannot raise.
for rank, title in enumerate(output['title'][:3], start=1):
    print('Reference{}: {}'.format(rank, title))


Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Answer: Civil servants in Malaysia can opt for flexible working hours during Ramadan, according to Communications Minister Fahmi Fadzil. They can either maintain their existing working hours or shorten their lunch break by 30 minutes, allowing them to leave work 30 minutes earlier. This follows discussions with Chief Secretary to the Government Tan Sri Shamsul Azri Abu Bakar and Public Service director-general Tan Sri Wan Ahmad Dahlan Abdul Aziz, he said.
Reference1: Civil servants can opt for flexible working hours during Ramadan, says Fahmi.
Reference1: Mixed rice issue: State fatwa committees decide on zakat fitrah rate, says Religious Affairs Minister.
Reference1: Penang extends exemption from land, parcel tax penalty arrears until May 31.


In [13]:
from pathlib import Path

import ollama
import pandas as pd

# --- Configuration ---------------------------------------------------------
INPUT_CSV = "./data/1K_news.csv"
OUTPUT_CSV = "./data/Q3/TestQuestion20.csv"
N_NEWS = 5    # only the first 5 news rows are processed
X_GOOD = 2    # "Good" questions requested per news item
Y_BAD = 1     # "Bad" questions requested per news item
Z_NORMAL = 1  # "General" questions requested per news item

# --- Data ------------------------------------------------------------------
# Read the dataset and keep just the fields the generator unpacks, as one
# [content_id, title, text, category] list per news item.
df = pd.read_csv(INPUT_CSV)
news_data = df.loc[:, ["content_id", "title", "text", "category"]].values.tolist()[:N_NEWS]

def generate_questions(news, question_type, num, model="gemma", max_chars=500):
    """Ask an Ollama chat model for questions about a single news item.

    Args:
        news: Sequence ``(content_id, title, text, category)`` for one news row.
        question_type: Kind of question to request (the notebook passes
            "Good", "Bad" and "General"). It is interpolated into the prompt
            and stored in each result's ``label`` field.
        num: Number of questions requested; the result is cut to ``[:num]``.
        model: Name of the Ollama model to query.
        max_chars: How many leading characters of the news text go into the
            prompt.

    Returns:
        A list of dicts with the keys ``question``, ``reason``, ``source_id``,
        ``source_title``, ``source_category`` and ``label``. The list is empty
        when the news text is missing/blank or the reply has no usable line.
    """
    content_id, title, text, category = news

    # pandas yields NaN (a float) for an empty CSV cell; slicing it would
    # raise TypeError, and there is nothing to ask about a blank article.
    if not isinstance(text, str) or not text.strip():
        return []

    # The model is only asked for the question and the reason; every other
    # output field is copied from the source row below.
    prompt = f"""Generate {num} {question_type} questions from this news:
    
    News Content: {text[:max_chars]}
    
    Output Format（只需生成前两列）:
    [Question] || [Reason]
    
    Example:
    How do mergers affect market competition? || Analyzes business strategy impact"""

    response = ollama.chat(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )

    questions = []
    for line in response["message"]["content"].split("\n"):
        if "||" not in line:
            continue  # chatter around the answer, not a "question || reason" row
        # Only the first two fields are used; anything after a second "||" is dropped.
        question, reason = (part.strip() for part in line.split("||")[:2])
        # Skip rows without a question and a verbatim echo of the format template.
        if not question or question == "[Question]":
            continue
        questions.append({
            "question": question,
            "reason": reason,
            # The remaining fields come straight from the source row.
            "source_id": content_id,
            "source_title": title,
            "source_category": category,
            "label": question_type,
        })
    return questions[:num]

# Generate every question type for each news item. The label is what
# generate_questions puts into the prompt and into the "label" column.
QUESTION_PLAN = [("Good", X_GOOD), ("Bad", Y_BAD), ("General", Z_NORMAL)]
QUESTION_COLUMNS = [
    "question", "reason", "source_id", "source_title", "source_category", "label",
]

all_questions = []
for news in news_data:
    for question_type, count in QUESTION_PLAN:
        all_questions.extend(generate_questions(news, question_type, count))

# Save the results. Create the target folder first: to_csv raises OSError when
# the parent directory is missing, which would throw away every generated
# question. Explicit columns keep a header row even if nothing was generated,
# so the file can still be read back.
Path(OUTPUT_CSV).parent.mkdir(parents=True, exist_ok=True)
pd.DataFrame(all_questions, columns=QUESTION_COLUMNS).to_csv(
    OUTPUT_CSV, index=False, encoding="utf-8-sig"
)

In [12]:
# Housekeeping cell: clears the output displayed for this cell.
# NOTE(review): presumably left over from silencing noisy logs during
# development; confirm whether it is still needed in the final notebook.
import IPython
IPython.display.clear_output()

In [30]:
import pandas as pd
from components import rag  # Importing the RAG retrieval system

# 1. Load the generated test questions.
input_csv = "./data/Q3/TestQuestion20.csv"
df_questions = pd.read_csv(input_csv, encoding="utf-8")

# The generator cell writes a lowercase "question" column while this cell was
# written against "Question"; resolve the column case-insensitively so both
# spellings work.
question_col = next(
    (col for col in df_questions.columns if str(col).strip().lower() == "question"),
    None,
)
if question_col is None:
    raise KeyError(
        f"No 'Question' column in {input_csv}; found columns: {list(df_questions.columns)}"
    )

# 2. Document dataset searched by the RAG system.
document_dataset = "./data/1K_news.csv"

# 3. Answer each question and record the retrieved document titles.
N_RETRIEVED = 3  # number of Retrieved_Document columns written per question
retrieved_data = []
for idx, row in df_questions.iterrows():
    query = row[question_col]
    output = rag.query_answering_system(query, document_dataset)

    # Pad with "N/A" so there really are always three entries, even when the
    # "title" key is absent or fewer than three documents were returned.
    titles = output.get("title")
    titles = [] if titles is None else list(titles)
    retrieved_docs = (titles + ["N/A"] * N_RETRIEVED)[:N_RETRIEVED]

    retrieved_data.append({
        "Generated_Answer": output["answer"],
        # Running 1-based IDs: three consecutive numbers per question row.
        "Retrieved_Docs_ID1": idx * 3 + 1,
        "Retrieved_Docs_ID2": idx * 3 + 2,
        "Retrieved_Docs_ID3": idx * 3 + 3,
        "Retrieved_Document1": retrieved_docs[0],
        "Retrieved_Document2": retrieved_docs[1],
        "Retrieved_Document3": retrieved_docs[2]
    })

# 4. Convert the collected records to a DataFrame in one go.
df_retrieved = pd.DataFrame(retrieved_data)

# 5. Attach the answers side by side with the original question rows.
df_enriched = pd.concat([df_questions, df_retrieved], axis=1)

# 6. Save the enriched dataset to a new CSV.
output_csv = "./data/Q3/TestQuestion20_with_answers.csv"
df_enriched.to_csv(output_csv, index=False, encoding="utf-8")

print(f"✅ Enriched dataset saved to {output_csv}")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


✅ Enriched dataset saved to ./data/Q3/TestQuestion20_with_answers.csv
