In [None]:
import os.path
from llama_index.core import (
    VectorStoreIndex,
    SimpleDirectoryReader,
    StorageContext,
    load_index_from_storage,
    Settings,
)
from llama_parse import LlamaParse

from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.node_parser import SentenceSplitter
from dotenv import load_dotenv
import pickle

# NESTED ASYNCIO LOOP NEEDED TO RUN ASYNC IN A NOTEBOOK
# (later cells call asyncio.run() and use async retrieval from inside the notebook)
import nest_asyncio

nest_asyncio.apply()

# load the environment variables
# (presumably supplies the API key LlamaParse needs — confirm against the .env file)
load_dotenv()

# Directory where the vector index is persisted; whether it exists decides below
# if the index is rebuilt from the source files or reloaded from disk.
PERSIST_DIR = "./storage"

# bge-large embedding model, registered globally through Settings
# (the bge-base English model below is kept as a commented-out alternative)
# Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-base-en-v1.5")
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-large-zh-v1.5")


# create the parser (results requested as markdown)
parser = LlamaParse(result_type="markdown")

# route every .pdf file found by the directory reader through the parser
file_extractor = {".pdf": parser}

# chunking: sentence splitter with chunk_size=1024 and chunk_overlap=20
# (units are presumably tokens — confirm); reused later by the dataset generator
text_splitter = SentenceSplitter(chunk_size=1024, chunk_overlap=20)

if not os.path.exists(PERSIST_DIR):
    # load the documents from the "data" directory
    documents = SimpleDirectoryReader("data",file_extractor=file_extractor).load_data()
    # Save the documents as a binary (pickle) file; the dataset-generation cell
    # reads documents.pkl back. Note it is only written in this branch.
    with open("documents.pkl", "wb") as f:
        pickle.dump(documents, f)
    # create the index, chunking the documents with text_splitter
    index = VectorStoreIndex.from_documents(documents, transformations=[text_splitter])
    
    # store it for later
    index.storage_context.persist(persist_dir=PERSIST_DIR)
else:
    # load the existing index from PERSIST_DIR instead of re-parsing the documents
    storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
    index = load_index_from_storage(storage_context)

In [None]:
from llama_index.llms.ollama import Ollama

# Register an Ollama-backed llama3.1 model as the global LLM through Settings.
# request_timeout is 60.0 (presumably seconds — confirm); temperature is 0.3.
Settings.llm = Ollama(model="llama3.1:latest", request_timeout=60.0,temperature=0.3)

In [None]:
# Sample question in Traditional Chinese (roughly: "What activities are on the campus
# calendar? Please list them in detail"); used by the retriever.retrieve(query) cell.
# NOTE(review): a later cell reassigns `query`, so its value depends on which cells have run.
query = "校園行事曆有甚麼活動？請詳細列出"

In [None]:
# Each will retrieve the top-2 most similar nodes
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.retrievers.bm25 import BM25Retriever

# configure retriever
# Vector (embedding-similarity) retriever over the index, limited to 2 results.
vector_retriever = index.as_retriever(similarity_top_k=2, verbose=True)
# vector_retriever = VectorIndexRetriever(index=index, similarity_top_k=3, verbose=True)

# BM25 retriever built from the same index's docstore, also limited to 2 results.
# NOTE(review): the indexed text appears to be Chinese — confirm that the BM25
# retriever's default tokenization handles it.
bm25_retriever = BM25Retriever.from_defaults(
    docstore=index.docstore, similarity_top_k=2
)

In [None]:
from llama_index.core.retrievers import QueryFusionRetriever

# Combine the vector and BM25 retrievers into one fusion retriever.
# Results are merged with mode="reciprocal_rerank"; similarity_top_k=3 presumably caps
# the fused result list — confirm. num_queries=4 presumably means the original query
# plus three generated ones (the recorded output shows three generated queries).
retriever = QueryFusionRetriever(
    [vector_retriever, bm25_retriever],
    similarity_top_k=3,
    num_queries=4,  # set this to 1 to disable query generation
    mode="reciprocal_rerank",
    use_async=True,
    verbose=True,
    # query_gen_prompt="...",  # we could override the query generation prompt here
)

In [None]:
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import SimilarityPostprocessor
from llama_index.core import get_response_synthesizer

# configure response synthesizer
# (called without arguments, so the library defaults apply)
response_synthesizer = get_response_synthesizer()

# assemble query engine: the fusion retriever feeds the response synthesizer;
# the similarity-cutoff postprocessor below is left disabled
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=response_synthesizer,
    # node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=0.7)],
    
)
# Ask a fixed question (hard-coded here, independent of the `query` variable)
# and print the synthesized answer.
response = query_engine.query("一月有甚麼活動？")
print(response)

# file_names = [os.path.basename(path) for path in response.references]
# print(
#     f"response: \n answer_content: {response.answer}\n Reference_file_path: {file_names}"
# )
print("-----------------")
# Dump every source node attached to the response: a 1-based node number,
# each metadata key/value pair, then the node content.
for i, node in enumerate(response.source_nodes):
    print(f"Node {i + 1}:")
    for key, value in node.metadata.items():
        print(f"{key}: {value}")
    print(f"content: {node.get_content()}")
    print("-----------------")

In [None]:
# Print every prompt the query engine exposes: the prompt key followed by its template text.
prompts_dict = query_engine.get_prompts()
for k, p in prompts_dict.items():
    print(f"Prompt Key: {k}\nText:\n{p.get_template()}\n")


In [None]:
# apply nested async to run in a notebook
# (nest_asyncio is already imported and applied in the first cell; it is repeated
# here, presumably so this cell can be run on its own — confirm)
import nest_asyncio

nest_asyncio.apply()

# Run only the retrieval step for `query` (no answer synthesis) through the fusion retriever.
nodes_with_scores = retriever.retrieve(query)

# Print each retrieved node's score (two decimals) followed by its text.
for node in nodes_with_scores:
    print(f"Score: {node.score:.2f} - {node.text}...\n-----\n")

In [None]:
# prompt
# Custom QA template with {context_str} and {query_str} placeholders that asks for an
# answer in the style of a Shakespeare play.
# NOTE(review): no active code in this notebook passes this string to a synthesizer;
# the only uses are in the commented-out lines below.
qa_prompt_tmpl_str = (
    "Context information is below.\n"
    "---------------------\n"
    "{context_str}\n"
    "---------------------\n"
    "Given the context information and not prior knowledge, "
    "answer the query in the style of a Shakespeare play.\n"
    "Query: {query_str}\n"
    "Answer: "
)

# prompt_tmpl = PromptTemplate(
#     qa_prompt_tmpl_str,
#     context_str="\n\n".join([n.get_content() for n in retrieved_nodes]),
#     query_str=query,
# )
# # configure response synthesizer
# response_synthesizer = get_response_synthesizer(text_qa_template=qa_prompt_tmpl_str)


# NOTE(review): this overwrites the `query` defined in an earlier cell, so re-running
# the retrieval cell after this one uses this question instead.
query = "What is the capital of France?"

In [None]:
# Filter out the fixed, useless leading entry of each generated question group
def clean_examples(examples, num_questions_per_chunk):
    """Return the examples whose index is not a multiple of ``num_questions_per_chunk``.

    Examples at indices 0, n, 2n, ... (n = ``num_questions_per_chunk``) are
    dropped; every other example is kept in its original order. With n == 2
    this keeps indices 1, 3, 5, 7, ...

    Args:
        examples: Iterable of generated examples, in generation order.
        num_questions_per_chunk: Group size used when the examples were generated.

    Returns:
        A new list containing only the retained examples.
    """
    return [
        item
        for position, item in enumerate(examples)
        if position % num_questions_per_chunk
    ]

In [22]:
from llama_index.core.llama_dataset.generator import RagDatasetGenerator
from types import SimpleNamespace
# check if dataset already exists
PERSIST_DATASET_FILE = "./rag_dataset.json"

if not os.path.exists(PERSIST_DATASET_FILE):
    # 讀取 documents
    with open("documents.pkl", "rb") as f:
        documents = pickle.load(f)

    QUESTION_GEN_QUERY = "你是一名教師/教授。你的任務是為即將到來的測驗/考試設計 {num_questions_per_chunk} 道題目。這些題目應該涵蓋文件中的多樣內容，並且限定在所提供的上下文資訊內。"
    QUESTION_GENERATION_PROMPT = """\
    以下是背景資訊：
    ---------------------
    {context_str}
    ---------------------
    根據上述背景資訊，而非先前知識(prior knowledge)，  
    僅基於以下查詢生成問題：  
    {query_str}
    """
    NUM_QUESTIONS_PER_CHUNK = 2

    # define generator, generate questions
    dataset_generator = RagDatasetGenerator.from_documents(
        documents=documents,
        num_questions_per_chunk= NUM_QUESTIONS_PER_CHUNK,  # set the number of questions per nodes
        transformations=[text_splitter],
        question_gen_query=QUESTION_GEN_QUERY,
    )

    # rag_dataset = dataset_generator.generate_questions_from_nodes()
    rag_dataset = await dataset_generator.agenerate_dataset_from_nodes()

    # clean the examples
    rag_dataset.examples = clean_examples(rag_dataset.examples, NUM_QUESTIONS_PER_CHUNK)

    # save this dataset as it is required for the submission
    rag_dataset.save_json("rag_dataset.json")
else:
    import json
    with open(PERSIST_DATASET_FILE, "r") as f:
        # 將字典轉換為支持屬性訪問的結構
        rag_dataset = json.load(f, object_hook=lambda d: SimpleNamespace(**d))

In [23]:
from llama_index.core.evaluation import (
    EvaluationResult,
    RelevancyEvaluator,
    CorrectnessEvaluator,
    FaithfulnessEvaluator,
)
from llama_index.core import Response
import pandas as pd
import asyncio
from llama_index.core.llama_dataset import (
    LabelledRagDataExample,
)
# DataFrame that accumulates the evaluation results, one row per evaluated query
eval_results_df = pd.DataFrame()


# Append one query's evaluation outcome to the accumulated results
def display_eval_df(
    response: Response,
    dataset_example: LabelledRagDataExample,
    relevancy_eval_result: EvaluationResult,
    correctness_eval_result: EvaluationResult,
) -> None:
    """Add a row describing one evaluated query to the global ``eval_results_df``.

    Args:
        response: Query-engine response; its string form and the text of its
            first source node are recorded.
        dataset_example: Supplies the ``query`` and ``reference_answer`` columns.
        relevancy_eval_result: Relevancy outcome (``passing``, ``score``, ``feedback``).
        correctness_eval_result: Correctness outcome (``passing``, ``score``, ``feedback``).

    Returns:
        None. When the response has no source nodes, "no response!" is printed
        and no row is added.
    """
    global eval_results_df

    # Guard: without source nodes there is nothing to show in the "Source" column.
    if not response.source_nodes:
        print("no response!")
        return

    relevancy_verdict = "Pass" if relevancy_eval_result.passing else "Fail"
    correctness_verdict = "Pass" if correctness_eval_result.passing else "Fail"
    # Only the first source node is shown, truncated to 1000 characters.
    source_preview = response.source_nodes[0].node.text[:1000] + "..."

    record = {
        "Query": dataset_example.query,
        "Response": str(response),
        "Reference Answer": dataset_example.reference_answer,
        "Source": source_preview,
        "Relevancy Eval Result": f"{relevancy_verdict} \nscore: {relevancy_eval_result.score}",
        "Relevancy Reasoning": relevancy_eval_result.feedback,
        "Correctness Eval Result": f"{correctness_verdict} \n\nscore: {correctness_eval_result.score}",
        "Correctness Reasoning": correctness_eval_result.feedback,
    }

    # Rebind the module-level frame with the new row appended.
    eval_results_df = pd.concat(
        [eval_results_df, pd.DataFrame([record])], ignore_index=True
    )


# Evaluate the query engine on every dataset example and tally the passing results
async def evaluate_chat_engine(chat_engine, dataset_examples: "list[LabelledRagDataExample]"):
    """Query the engine with every example and score relevancy and correctness.

    All queries are issued concurrently through ``chat_engine.aquery``. Each
    response is then scored, one after another, by a ``RelevancyEvaluator`` and
    a ``CorrectnessEvaluator``, and recorded through ``display_eval_df``.

    Args:
        chat_engine: Object exposing an awaitable ``aquery(question)``.
        dataset_examples: Collection of examples; only ``.query`` and
            ``.reference_answer`` are read from each one.

    Returns:
        Tuple ``(relevancy_total_correct, correctness_total_correct, total)``:
        the number of responses passing each evaluator and the number of
        responses evaluated.
    """
    # Materialize once: the examples are traversed twice below (to build the
    # questions and again in zip), which a single-pass iterable would not survive.
    dataset_examples = list(dataset_examples)
    questions = [e.query for e in dataset_examples]

    # Start every query and wait for all of them; gather keeps the results in
    # the same order as `questions`, which the zip below relies on.
    c = [chat_engine.aquery(q) for q in questions]
    results = await asyncio.gather(*c)
    print("finished query")
    print('------------------------------')

    # Define the evaluators (constructed with no arguments; presumably they
    # fall back to the globally configured LLM — confirm).
    relevancy_evaluator = RelevancyEvaluator()
    correctness_evaluator = CorrectnessEvaluator()

    relevancy_total_correct = 0
    correctness_total_correct = 0
    for response, dataset_example in zip(results, dataset_examples):
        # These evaluator calls are synchronous, so examples are scored sequentially.
        relevancy_eval_result = relevancy_evaluator.evaluate_response(
            query=dataset_example.query, response=response
        )
        correctness_eval_result = correctness_evaluator.evaluate_response(
            query=dataset_example.query,
            response=response,
            reference=dataset_example.reference_answer,
        )

        # Accumulate the evaluation result of each query
        display_eval_df(
            response, dataset_example, relevancy_eval_result, correctness_eval_result
        )

        if relevancy_eval_result.passing:
            relevancy_total_correct += 1
        if correctness_eval_result.passing:
            correctness_total_correct += 1

    return relevancy_total_correct, correctness_total_correct, len(results)


# Run the evaluation and show the final accumulated results
def run_evaluate_chat_engine(chat_engine, dataset_examples):
    """Evaluate ``chat_engine`` on ``dataset_examples`` and display the outcome.

    Drives ``evaluate_chat_engine`` to completion with ``asyncio.run``, renders
    the accumulated ``eval_results_df`` with wrapped cell text, and prints how
    many responses passed each evaluator.

    Args:
        chat_engine: Engine forwarded to ``evaluate_chat_engine``.
        dataset_examples: Examples forwarded to ``evaluate_chat_engine``.

    Returns:
        None; the results are displayed and printed.
    """
    global eval_results_df

    totals = asyncio.run(evaluate_chat_engine(chat_engine, dataset_examples))
    relevancy_passed, correctness_passed, question_count = totals

    # "pre-wrap" keeps the line breaks embedded in the result strings visible.
    wrap_css = {"white-space": "pre-wrap"}
    # `display` is not imported in this notebook; presumably the IPython builtin.
    display(eval_results_df.style.set_properties(**wrap_css))

    # Print the summary totals
    print(f"Total Relevancy correct: {relevancy_passed} out of {question_count}")
    print(f"Total Correctness correct: {correctness_passed} out of {question_count}")

In [24]:
# Run the query engine over the dataset examples and evaluate the results
run_evaluate_chat_engine(query_engine, rag_dataset.examples)

Generated queries:
Here are three search queries related to your input:
1. 未來大學校園活動安排
2. 大學行事曆和重要日期
3. 學校生活和未來大學校園文化介紹
Generated queries:
Here are three search queries related to the input:
1. 九月份學校開學典禮
2. 學校新學期入學安排
3. 九月份教育局活動清單
finished query
------------------------------


Unnamed: 0,Query,Response,Reference Answer,Source,Relevancy Eval Result,Relevancy Reasoning,Correctness Eval Result,Correctness Reasoning
0,什麼是未來大學校園行事曆?,未來大學校園行事曆是一份綜合了各項校園活動的日曆，涵蓋了學生的權益、環保義工、創業中心開放日等多個方面。,未來大學校園行事曆是一份包含各月活動安排的清單，涵蓋健康日、文化節、環保義工日、創業中心開放日等多種活動。,# 九月 # 9月5日：新學年開學典禮 地點：大禮堂 內容：校長致辭、新生介紹、校園生活指導 # 9月20日：學生權益講座 地點：學生權益保護機構 內容：學生權益保護講座、法律援助介紹 # 十月 # 10月10日：校園節能日 地點：校園各處 內容：節能設備展示、節能比賽 # 10月24日：國際交流活動 地點：國際合作中心 內容：留學生交流、國際文化展示 # 十一月 # 11月8日：科技創新大會 地點：科研與創新區 內容：科技創新展示、專題研討會 # 十二月 # 12月21日：聖誕慶祝活動 地點：校園廣場 內容：聖誕樹點燈、音樂表演、慈善義賣...,Pass score: 1.0,YES,Pass score: 4.0,"The generated answer is highly relevant to the user query, and most of the information is correct. The only minor difference is that the reference answer mentions ""健康日"" (health day) which is not explicitly mentioned in the generated answer, but it's implied by ""學生的權益"" (students' rights). Overall, the generated answer provides a comprehensive overview of the university calendar, and its correctness score is 4.0."
1,什麼是九月份學校的首要活動？,新學年開學典禮。,新學年開學典禮。,# 九月 # 9月5日：新學年開學典禮 地點：大禮堂 內容：校長致辭、新生介紹、校園生活指導 # 9月20日：學生權益講座 地點：學生權益保護機構 內容：學生權益保護講座、法律援助介紹 # 十月 # 10月10日：校園節能日 地點：校園各處 內容：節能設備展示、節能比賽 # 10月24日：國際交流活動 地點：國際合作中心 內容：留學生交流、國際文化展示 # 十一月 # 11月8日：科技創新大會 地點：科研與創新區 內容：科技創新展示、專題研討會 # 十二月 # 12月21日：聖誕慶祝活動 地點：校園廣場 內容：聖誕樹點燈、音樂表演、慈善義賣...,Pass score: 1.0,"YES. The response ""新學年開學典禮"" is in line with the context information provided, which lists a series of events happening throughout the year, and September 5th being the day for the new school year opening ceremony.",Pass score: 5.0,"The generated answer is identical to the reference answer, indicating that it is fully correct and relevant to the user query. The fact that it provides the same information as the reference answer suggests a high level of accuracy and relevance."


Total Relevancy correct: 2 out of 2
Total Correctness correct: 2 out of 2
