In [1]:
# https://docs.arize.com/phoenix/evaluation/evals

In [1]:
from phoenix.otel import register

# Settings handed to phoenix.otel.register for the tracer provider used by this notebook.
register_kwargs = {
    "project_name": "default",
    "endpoint": "http://phoenix:6006/v1/traces",
    # "auto_instrument": True,  # Auto-instrument your app based on installed dependencies
}
tracer_provider = register(**register_kwargs)

🔭 OpenTelemetry Tracing Details 🔭
|  Phoenix Project: default
|  Span Processor: SimpleSpanProcessor
|  Collector Endpoint: http://phoenix:6006/v1/traces
|  Transport: HTTP + protobuf
|  Transport Headers: {}
|  
|  Using a default SpanProcessor. `add_span_processor` will overwrite this default.
|  
|  `register` has set this TracerProvider as the global OpenTelemetry default.
|  To disable this behavior, call `register` with `set_global_tracer_provider=False`.



In [2]:
from openinference.instrumentation.openai import OpenAIInstrumentor

# Instrument the OpenAI integration with this notebook's tracer provider.
openai_instrumentor = OpenAIInstrumentor()
openai_instrumentor.instrument(tracer_provider=tracer_provider)

In [3]:
import pandas as pd

# Display options: show full strings, every column, and let the output width auto-adjust.
for display_option in ("display.max_colwidth", "display.max_columns", "display.width"):
    pd.set_option(display_option, None)

# To display more rows, adjust as needed:
# pd.set_option("display.max_rows", 200)

# One (reference, query, response) triple per example row.
qa_examples = [
    (
        "The Eiffel Tower is located in Paris, France. It was constructed in 1889 as the entrance arch to the 1889 World's Fair.",
        "Where is the Eiffel Tower located?",
        "The Eiffel Tower is located in Paris, France.",
    ),
    (
        "The Great Wall of China is over 13,000 miles long. It was built over many centuries by various Chinese dynasties to protect against nomadic invasions.",
        "How long is the Great Wall of China?",
        "The Great Wall of China is approximately 13,171 miles (21,196 kilometers) long.",
    ),
    (
        "The Amazon rainforest is the largest tropical rainforest in the world. It covers much of northwestern Brazil and extends into Colombia, Peru and other South American countries.",
        "What is the largest tropical rainforest?",
        "The Amazon rainforest is the largest tropical rainforest in the world. It is home to the largest number of plant and animal species in the world.",
    ),
    (
        "Mount Everest is the highest mountain on Earth. It is located in the Mahalangur Himal sub-range of the Himalayas, straddling the border between Nepal and Tibet.",
        "Which is the highest mountain on Earth?",
        "Mount Everest, standing at 29,029 feet (8,848 meters), is the highest mountain on Earth.",
    ),
    (
        "The Nile is the longest river in the world. It flows northward through northeastern Africa for approximately 6,650 km (4,132 miles) from its most distant source in Burundi to the Mediterranean Sea.",
        "What is the longest river in the world?",
        "The Nile River, at 6,650 kilometers (4,132 miles), is the longest river in the world.",
    ),
    (
        "The Mona Lisa was painted by Leonardo da Vinci. It is considered an archetypal masterpiece of the Italian Renaissance and has been described as 'the best known, the most visited, the most written about, the most sung about, the most parodied work of art in the world'.",
        "Who painted the Mona Lisa?",
        "The Mona Lisa was painted by the Italian Renaissance artist Leonardo da Vinci.",
    ),
    (
        "The human body has 206 bones. These bones provide structure, protect organs, anchor muscles, and store calcium.",
        "How many bones are in the human body?",
        "The adult human body typically has 256 bones.",
    ),
    (
        "Jupiter is the largest planet in our solar system. It is a gas giant with a mass more than two and a half times that of all the other planets in the solar system combined.",
        "Which planet is the largest in our solar system?",
        "Jupiter is the largest planet in our solar system.",
    ),
    (
        "William Shakespeare wrote 'Romeo and Juliet'. It is a tragedy about two young star-crossed lovers whose deaths ultimately reconcile their feuding families.",
        "Who wrote 'Romeo and Juliet'?",
        "The play 'Romeo and Juliet' was written by William Shakespeare.",
    ),
    (
        "The first moon landing occurred in 1969. On July 20, 1969, American astronauts Neil Armstrong and Edwin 'Buzz' Aldrin became the first humans to land on the moon as part of the Apollo 11 mission.",
        "When did the first moon landing occur?",
        "The first moon landing took place on July 20, 1969.",
    ),
]

df = pd.DataFrame(qa_examples, columns=["reference", "query", "response"])
df.head()

Unnamed: 0,reference,query,response
0,"The Eiffel Tower is located in Paris, France. It was constructed in 1889 as the entrance arch to the 1889 World's Fair.",Where is the Eiffel Tower located?,"The Eiffel Tower is located in Paris, France."
1,"The Great Wall of China is over 13,000 miles long. It was built over many centuries by various Chinese dynasties to protect against nomadic invasions.",How long is the Great Wall of China?,"The Great Wall of China is approximately 13,171 miles (21,196 kilometers) long."
2,"The Amazon rainforest is the largest tropical rainforest in the world. It covers much of northwestern Brazil and extends into Colombia, Peru and other South American countries.",What is the largest tropical rainforest?,The Amazon rainforest is the largest tropical rainforest in the world. It is home to the largest number of plant and animal species in the world.
3,"Mount Everest is the highest mountain on Earth. It is located in the Mahalangur Himal sub-range of the Himalayas, straddling the border between Nepal and Tibet.",Which is the highest mountain on Earth?,"Mount Everest, standing at 29,029 feet (8,848 meters), is the highest mountain on Earth."
4,"The Nile is the longest river in the world. It flows northward through northeastern Africa for approximately 6,650 km (4,132 miles) from its most distant source in Burundi to the Mediterranean Sea.",What is the longest river in the world?,"The Nile River, at 6,650 kilometers (4,132 miles), is the longest river in the world."


In [4]:
import nest_asyncio
import openai
import os

from phoenix.evals import HallucinationEvaluator, OpenAIModel, QAEvaluator, run_evals

# Needed for concurrency in notebook environments.
nest_asyncio.apply()

# Evaluation model; the API key and base URL are read from environment variables.
eval_model = OpenAIModel(
    model="o1",
    api_key=os.getenv("OPENAI_API_KEY"),
    base_url=os.getenv("OPENAI_API_BASE"),
)

# Evaluators that share the same evaluation model.
hallucination_evaluator = HallucinationEvaluator(eval_model)
qa_evaluator = QAEvaluator(eval_model)

# The evaluators expect specific column names in the input dataframe:
#   hallucination_evaluator -> 'output', 'input', 'context'
#   qa_evaluator            -> 'output', 'input', 'reference'
# so `reference` is duplicated as `context` and query/response are renamed in place.
df["context"] = df["reference"]
column_renames = {"query": "input", "response": "output"}
df.rename(columns=column_renames, inplace=True)
required_columns = {"output", "input", "context", "reference"}
assert required_columns.issubset(df.columns)

# Each evaluator yields one dataframe of evaluation results, in the order the evaluators are listed.
# The evaluation results are uploaded to Phoenix in the next step.
eval_frames = run_evals(
    dataframe=df,
    evaluators=[hallucination_evaluator, qa_evaluator],
    provide_explanation=True,
)
hallucination_eval_df, qa_eval_df = eval_frames

run_evals |          | 0/20 (0.0%) | ⏳ 00:00<? | ?it/s

Exception in worker on attempt 1: raised APIConnectionError('Connection error.')
Requeuing...
Exception in worker on attempt 1: raised APIConnectionError('Connection error.')
Requeuing...
Exception in worker on attempt 1: raised APIConnectionError('Connection error.')
Requeuing...
Exception in worker on attempt 1: raised APIConnectionError('Connection error.')
Requeuing...
Exception in worker on attempt 2: raised APIConnectionError('Connection error.')
Requeuing...
Exception in worker on attempt 3: raised APIConnectionError('Connection error.')
Requeuing...
Exception in worker on attempt 1: raised APIConnectionError('Connection error.')
Requeuing...
Exception in worker on attempt 2: raised APIConnectionError('Connection error.')
Requeuing...
Exception in worker on attempt 1: raised APIConnectionError('Connection error.')
Requeuing...
Exception in worker on attempt 2: raised APIConnectionError('Connection error.')
Requeuing...
Exception in worker on attempt 1: raised APIConnectionError(

In [5]:
import pandas as pd

# Build a new frame holding the evaluated rows plus each evaluator's label and explanation.
results_df = df.assign(
    **{
        "hallucination_eval": hallucination_eval_df["label"],
        "hallucination_explanation": hallucination_eval_df["explanation"],
        "qa_eval": qa_eval_df["label"],
        "qa_explanation": qa_eval_df["explanation"],
    }
)

# Show the first 5 rows, making sure column contents are not truncated.
results_df.head()


Unnamed: 0,reference,input,output,context,hallucination_eval,hallucination_explanation,qa_eval,qa_explanation
0,"The Eiffel Tower is located in Paris, France. It was constructed in 1889 as the entrance arch to the 1889 World's Fair.",Where is the Eiffel Tower located?,"The Eiffel Tower is located in Paris, France.","The Eiffel Tower is located in Paris, France. It was constructed in 1889 as the entrance arch to the 1889 World's Fair.",,,,
1,"The Great Wall of China is over 13,000 miles long. It was built over many centuries by various Chinese dynasties to protect against nomadic invasions.",How long is the Great Wall of China?,"The Great Wall of China is approximately 13,171 miles (21,196 kilometers) long.","The Great Wall of China is over 13,000 miles long. It was built over many centuries by various Chinese dynasties to protect against nomadic invasions.",,,,
2,"The Amazon rainforest is the largest tropical rainforest in the world. It covers much of northwestern Brazil and extends into Colombia, Peru and other South American countries.",What is the largest tropical rainforest?,The Amazon rainforest is the largest tropical rainforest in the world. It is home to the largest number of plant and animal species in the world.,"The Amazon rainforest is the largest tropical rainforest in the world. It covers much of northwestern Brazil and extends into Colombia, Peru and other South American countries.",,,,
3,"Mount Everest is the highest mountain on Earth. It is located in the Mahalangur Himal sub-range of the Himalayas, straddling the border between Nepal and Tibet.",Which is the highest mountain on Earth?,"Mount Everest, standing at 29,029 feet (8,848 meters), is the highest mountain on Earth.","Mount Everest is the highest mountain on Earth. It is located in the Mahalangur Himal sub-range of the Himalayas, straddling the border between Nepal and Tibet.",,,,
4,"The Nile is the longest river in the world. It flows northward through northeastern Africa for approximately 6,650 km (4,132 miles) from its most distant source in Burundi to the Mediterranean Sea.",What is the longest river in the world?,"The Nile River, at 6,650 kilometers (4,132 miles), is the longest river in the world.","The Nile is the longest river in the world. It flows northward through northeastern Africa for approximately 6,650 km (4,132 miles) from its most distant source in Burundi to the Mediterranean Sea.",,,,


In [5]:
import nest_asyncio
import openai
import os

from phoenix.evals import (
    HallucinationEvaluator,
    OpenAIModel,
    QAEvaluator,
    SummarizationEvaluator,
    RelevanceEvaluator,
    run_evals
)

# FactualityEvaluator is optional: if the import fails, bind the name to None so
# later steps can skip this evaluator.
try:
    from phoenix.evals import FactualityEvaluator
except ImportError:
    print("FactualityEvaluator 不存在，將跳過此評估器。")
    FactualityEvaluator = None

# Needed to handle asynchronous operations in a notebook environment.
nest_asyncio.apply()

# OpenAI model instance used by every evaluator; key and base URL come from the environment.
eval_model = OpenAIModel(model="o1", api_key=os.getenv("OPENAI_API_KEY"), base_url=os.getenv("OPENAI_API_BASE"))

# Evaluators, constructed in the same order they are later passed to run_evals.
hallucination_evaluator, qa_evaluator, summarization_evaluator = (
    evaluator_cls(eval_model)
    for evaluator_cls in (HallucinationEvaluator, QAEvaluator, SummarizationEvaluator)
)
if FactualityEvaluator is not None:
    factuality_evaluator = FactualityEvaluator(eval_model)
relevance_evaluator = RelevanceEvaluator(eval_model)

# Reshape the dataframe: copy `reference` into `context`, and rename `query` / `response`
# to `input` / `output`.
df["context"] = df["reference"]
column_renames = {"query": "input", "response": "output"}
df.rename(columns=column_renames, inplace=True)
required_columns = {"output", "input", "context", "reference"}
assert required_columns.issubset(df.columns)

# Evaluator list; the factuality evaluator is included only when FactualityEvaluator is available.
evaluators = [
    hallucination_evaluator,
    qa_evaluator,
    summarization_evaluator,
    *([factuality_evaluator] if FactualityEvaluator is not None else []),
    relevance_evaluator,
]

# Run the evaluation; the returned dataframes follow the order of the evaluator list.
results = run_evals(dataframe=df, evaluators=evaluators, provide_explanation=True)

# Unpack the results according to whether the factuality evaluator took part.
if FactualityEvaluator is None:
    hallucination_eval_df, qa_eval_df, summarization_eval_df, relevance_eval_df = results
else:
    (
        hallucination_eval_df,
        qa_eval_df,
        summarization_eval_df,
        factuality_eval_df,
        relevance_eval_df,
    ) = results

# Assemble the results dataframe: one (label column, explanation column, source frame)
# entry per evaluator, in the order the columns should appear.
eval_outputs = [
    ("hallucination_eval", "hallucination_explanation", hallucination_eval_df),
    ("qa_eval", "qa_explanation", qa_eval_df),
    ("summarization_eval", "summarization_explanation", summarization_eval_df),
]
if FactualityEvaluator is not None:
    eval_outputs.append(("factuality_eval", "factuality_explanation", factuality_eval_df))
eval_outputs.append(("relevance_eval", "relevance_explanation", relevance_eval_df))

results_df = df.copy()
for label_column, explanation_column, eval_frame in eval_outputs:
    results_df[label_column] = eval_frame["label"]
    results_df[explanation_column] = eval_frame["explanation"]

# Show the first 5 rows.
results_df.head()



FactualityEvaluator 不存在，將跳過此評估器。


run_evals |          | 0/40 (0.0%) | ⏳ 00:00<? | ?it/s

Exception in worker on attempt 1: raised InternalServerError('<!DOCTYPE html>\n<!--[if lt IE 7]> <html class="no-js ie6 oldie" lang="en-US"> <![endif]-->\n<!--[if IE 7]>    <html class="no-js ie7 oldie" lang="en-US"> <![endif]-->\n<!--[if IE 8]>    <html class="no-js ie8 oldie" lang="en-US"> <![endif]-->\n<!--[if gt IE 8]><!--> <html class="no-js" lang="en-US"> <!--<![endif]-->\n<head>\n\n\n<title>farm-bottom-mentor-hostels.trycloudflare.com | 524: A timeout occurred</title>\n<meta charset="UTF-8" />\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />\n<meta http-equiv="X-UA-Compatible" content="IE=Edge" />\n<meta name="robots" content="noindex, nofollow" />\n<meta name="viewport" content="width=device-width,initial-scale=1" />\n<link rel="stylesheet" id="cf_styles-css" href="/cdn-cgi/styles/main.css" />\n\n\n</head>\n<body>\n<div id="cf-wrapper">\n    <div id="cf-error-details" class="p-0">\n        <header class="mx-auto pt-10 lg:pt-6 lg:px-8 w-240 lg:w-full mb-8">

Unnamed: 0,reference,input,output,context,hallucination_eval,hallucination_explanation,qa_eval,qa_explanation,summarization_eval,summarization_explanation,relevance_eval,relevance_explanation
0,"The Eiffel Tower is located in Paris, France. It was constructed in 1889 as the entrance arch to the 1889 World's Fair.",Where is the Eiffel Tower located?,"The Eiffel Tower is located in Paris, France.","The Eiffel Tower is located in Paris, France. It was constructed in 1889 as the entrance arch to the 1889 World's Fair.",factual,"EXPLANATION: The query asks for the location of the Eiffel Tower. The reference text states that the Eiffel Tower is located in Paris, France. The answer provided is ""The Eiffel Tower is located in Paris, France."" This statement directly matches the information given in the reference text. Therefore, the answer is not a hallucination and is factually correct based on the provided reference text.\n\nLABEL: factual",correct,"The question asks for the location of the Eiffel Tower. The reference text states, ""The Eiffel Tower is located in Paris, France."" The answer provided is, ""The Eiffel Tower is located in Paris, France."" The answer exactly matches the information provided in the reference text and fully answers the question. Therefore, the answer is correct.\n\nLABEL: correct\n************",good,"Here's a step-by-step explanation to evaluate the summary:\n\n1. **Comprehensiveness:** Does the summary capture the *essential* information from the original document? The original document is a question *asking* where the Eiffel Tower is. The summary *answers* that question. It provides the location – Paris, France. Therefore, it *does* capture the essential information.\n\n2. **Conciseness:** Is the summary as short as it can be while still conveying the essential information? The summary is a single sentence and directly answers the question posed in the original document. There isn’t any extraneous information. It’s quite concise.\n\n3. **Coherence:** Does the summary make sense on its own? Is it logically structured? The summary is a simple statement of fact. 
It’s perfectly coherent; it doesn't require the original question to understand it.\n\n4. **Independence:** Does the summary stand alone, or does it heavily rely on the original document for context? While the summary *originates* from the question, it doesn't *require* the question to be understood. Someone reading only the summary would understand that the Eiffel Tower is in Paris, France. It's independent in the sense that it’s a complete thought.\n\nConsidering all these points, the summary effectively answers the question posed in the original document in a clear, concise, and self-contained manner.\n\nLABEL: good",relevant,"The question asks for the location of the Eiffel Tower. The reference text explicitly states, ""The Eiffel Tower is located in Paris, France."" This directly answers the question. Therefore, the reference text contains information relevant to answering the question.\n\nLABEL: relevant"
1,"The Great Wall of China is over 13,000 miles long. It was built over many centuries by various Chinese dynasties to protect against nomadic invasions.",How long is the Great Wall of China?,"The Great Wall of China is approximately 13,171 miles (21,196 kilometers) long.","The Great Wall of China is over 13,000 miles long. It was built over many centuries by various Chinese dynasties to protect against nomadic invasions.",factual,"The query asks for the length of the Great Wall of China. The reference text states the Great Wall is ""over 13,000 miles long"". The answer states the Great Wall is approximately 13,171 miles (21,196 kilometers) long.\n\nThe answer provides a more precise length than the reference text, but it does *not* contradict the reference text. The reference text says ""over 13,000 miles"", and 13,171 miles is indeed over 13,000 miles. The answer doesn't introduce any new information that wasn't implied or reasonably inferred from the reference text. Therefore, the answer is factual, even though it provides a more specific number.\n\nLABEL: factual",correct,"The question asks for the length of the Great Wall of China. The reference text states the Great Wall is ""over 13,000 miles long"". The answer provides a more specific length of ""approximately 13,171 miles"". Since 13,171 miles is greater than 13,000 miles and provides a more precise answer to the question of length, the answer is consistent with and expands upon the information in the reference text. The answer fully addresses the question.\n\nLABEL: correct",good,"Here's a step-by-step explanation of how to evaluate the summary against the original document, focusing on comprehensiveness, conciseness, coherence, and independence:\n\n1. **Comprehensiveness Check:** Does the summary answer the question posed in the original document? The original document asks ""How long is the Great Wall of China?"". The summary *does* provide a length. 
We need to consider if it provides *enough* information to satisfy the question. A single, specific length is a reasonable answer.\n\n2. **Conciseness Check:** Is the summary unnecessarily verbose? The summary is a single sentence stating the length. Given the directness of the original question, this is appropriately concise. There's no extraneous information.\n\n3. **Coherence Check:** Does the summary make sense on its own, and is the information presented logically? Yes, the statement ""The Great Wall of China is approximately 13,171 miles (21,196 kilometers) long"" is a clear and logical statement.\n\n4. **Independence Check:** Does the summary require the original document to be understood? No. The summary is self-contained; a reader doesn't need the original question to understand the information presented. It stands alone as a factual statement.\n\nConsidering all four criteria – comprehensiveness, conciseness, coherence, and independence – the summary effectively addresses the original document's question in a clear, brief, and standalone manner.\n\nLABEL: good",relevant,"The question asks for the length of the Great Wall of China. The reference text explicitly states the length of the Great Wall of China: ""over 13,000 miles long."" Therefore, the reference text directly provides information to answer the question.\n\nLABEL: relevant"
2,"The Amazon rainforest is the largest tropical rainforest in the world. It covers much of northwestern Brazil and extends into Colombia, Peru and other South American countries.",What is the largest tropical rainforest?,The Amazon rainforest is the largest tropical rainforest in the world. It is home to the largest number of plant and animal species in the world.,"The Amazon rainforest is the largest tropical rainforest in the world. It covers much of northwestern Brazil and extends into Colombia, Peru and other South American countries.",factual,"The query asks for the largest tropical rainforest. The reference text states that the Amazon rainforest is the largest tropical rainforest. The answer states the Amazon rainforest is the largest tropical rainforest *and* that it is home to the largest number of plant and animal species. While the first part of the answer is directly supported by the reference text, the claim about plant and animal species is *not* mentioned in the reference text. Therefore, that portion of the answer is an assumption not found within the provided text. However, the core answer to the question is correct as supported by the reference text. Since the answer isn't entirely based on the reference text, it contains information not explicitly stated in the reference text.\n\nLABEL: factual",correct,"The question asks for the largest tropical rainforest. The reference text states that the Amazon rainforest is the largest tropical rainforest. The answer also states that the Amazon rainforest is the largest tropical rainforest. Additionally, the answer provides extra information about the Amazon rainforest having the largest number of plant and animal species. While this extra information isn't *required* to answer the question, it doesn't detract from the correctness of the answer. 
The core of the question is answered accurately and completely.\n\nLABEL: correct",bad,"Here's how we can evaluate the summary against the original document, step-by-step:\n\n1. **Comprehensiveness:** Does the summary cover the key information presented (or implied) in the original document? The original document *asks a question* about the largest tropical rainforest. A comprehensive summary should *answer* that question. The summary *does* answer the question, identifying the Amazon rainforest. It also *adds* information about biodiversity, which, while related, isn't directly *asked* for. We need to consider if that addition is helpful or detracts from directness.\n\n2. **Conciseness:** Is the summary as short as possible while still conveying the essential information? The summary is two sentences long. Given that the original document is a single question, two sentences isn’t excessively long, but we should evaluate if both sentences are *necessary* to answer the question.\n\n3. **Coherence:** Does the summary make sense on its own? Is it logically organized? The summary flows reasonably well – it identifies the rainforest and then adds a supporting fact.\n\n4. **Independence:** Can the summary be understood without needing to refer back to the original document? Yes, the summary is understandable on its own. It doesn’t rely on the original question for context.\n\nConsidering these points, the summary *answers* the question, but it *also* adds extra information that wasn't directly requested. While not *incorrect*, it slightly reduces the directness of answering the original query.\n\nLABEL: bad",relevant,"The question asks for the largest tropical rainforest. The reference text states, ""The Amazon rainforest is the largest tropical rainforest in the world."" This directly answers the question. Therefore, the reference text contains information relevant to answering the question.\n\nLABEL: relevant"
3,"Mount Everest is the highest mountain on Earth. It is located in the Mahalangur Himal sub-range of the Himalayas, straddling the border between Nepal and Tibet.",Which is the highest mountain on Earth?,"Mount Everest, standing at 29,029 feet (8,848 meters), is the highest mountain on Earth.","Mount Everest is the highest mountain on Earth. It is located in the Mahalangur Himal sub-range of the Himalayas, straddling the border between Nepal and Tibet.",hallucinated,"The query asks for the highest mountain on Earth. The reference text states Mount Everest is the highest mountain on Earth and provides its location. The answer also states Mount Everest is the highest mountain on Earth and *adds* the height: 29,029 feet (8,848 meters). While the statement that Mount Everest is the highest mountain is supported by the reference text, the *specific height* is not mentioned in the reference text. Therefore, the added height information is not explicitly supported by the provided text, even if it is a generally known fact. Since the answer includes information not present in the reference text, it can be considered a slight hallucination.\n\nLABEL: hallucinated",correct,"The question asks for the highest mountain on Earth. The reference text states that Mount Everest is the highest mountain on Earth. The answer also states that Mount Everest is the highest mountain on Earth, and additionally provides its height. Since the answer correctly identifies the highest mountain and provides additional relevant information, it fully and correctly answers the question.\n\nLABEL: correct\n************",good,"Here's how we can evaluate the summary against the original document:\n\n1. **Comprehensiveness:** Does the summary cover all the *important* information present (or strongly implied) in the original document? The original document poses a question *about* the highest mountain. The summary *answers* that question. It provides the name of the highest mountain (Mount Everest). 
Therefore, it appears to be comprehensive *for the purpose of answering the question*.\n\n2. **Conciseness:** Is the summary as short as it can be while still conveying the necessary information? The summary is a single sentence and directly answers the question posed in the original document. It doesn't include extraneous details. It is reasonably concise.\n\n3. **Coherence:** Does the summary make sense on its own, and is the information presented logically? The summary is a clear and logical statement. It directly states a fact about Mount Everest. It's easy to understand even without the original question.\n\n4. **Independence:** Does the summary stand alone, or does it rely heavily on the original document for context? While the summary *originates* from the question in the original document, it doesn't *require* the original document to be understood. A reader encountering the summary alone would understand it.\n\nConsidering these four points, the summary effectively addresses the original document’s question in a clear, concise, and independent manner.\n\nLABEL: good",relevant,"The question asks for the highest mountain on Earth. The reference text directly states ""Mount Everest is the highest mountain on Earth."" This is a direct answer to the question. Therefore, the reference text contains information relevant to answering the question.\n\nLABEL: relevant"
4,"The Nile is the longest river in the world. It flows northward through northeastern Africa for approximately 6,650 km (4,132 miles) from its most distant source in Burundi to the Mediterranean Sea.",What is the longest river in the world?,"The Nile River, at 6,650 kilometers (4,132 miles), is the longest river in the world.","The Nile is the longest river in the world. It flows northward through northeastern Africa for approximately 6,650 km (4,132 miles) from its most distant source in Burundi to the Mediterranean Sea.",factual,"EXPLANATION: The query asks for the longest river in the world. The reference text states that the Nile is the longest river in the world and provides its length as approximately 6,650 km (4,132 miles). The answer states that the Nile River, at 6,650 kilometers (4,132 miles), is the longest river in the world. The answer directly reflects the information provided in the reference text, without adding any new or contradictory information. Therefore, the answer is based on the reference text and is not a hallucination.\n\nLABEL: factual",correct,"The question asks for the longest river in the world. The reference text states that the Nile is the longest river in the world and provides its length. The answer also states that the Nile is the longest river in the world and includes the same length provided in the reference text. Therefore, the answer accurately and completely answers the question based on the provided reference.\n\nLABEL: correct",good,"Here's how we can evaluate the summary against the original document, step-by-step:\n\n1. **Comprehensiveness:** Does the summary cover the key information requested in the original document? The original document *asks a question* about the longest river. The summary *answers* that question. It provides the name of the longest river (the Nile) and its length. Therefore, it appears to be comprehensive *in answering the question*.\n\n2. 
**Conciseness:** Is the summary as short as it can be while still conveying the essential information? The summary is a single sentence. Given the simple question, a shorter summary would likely omit crucial details (like the river's length). It appears concise.\n\n3. **Coherence:** Does the summary make sense on its own? Is it logically structured? The summary presents a clear statement of fact. It's easy to understand without needing the original question for context. It is coherent.\n\n4. **Independence:** Does the summary stand alone, or does it heavily rely on the original document for meaning? The summary *could* stand alone as a factual statement, even without knowing the original question. While the question prompted the information, the summary itself isn't *dependent* on it for understanding.\n\nConsidering all these points, the summary effectively and efficiently answers the question posed in the original document.\n\nLABEL: good",relevant,"The question asks for the longest river in the world. The reference text explicitly states ""The Nile is the longest river in the world."" This directly answers the question. Therefore, the reference text contains information relevant to answering the question.\n\nLABEL: relevant"


In [None]:
import pandas as pd

# Collect every evaluator's label and explanation next to a copy of the evaluated rows.
results_df = df.copy()
results_df["hallucination_eval"] = hallucination_eval_df["label"]
results_df["hallucination_explanation"] = hallucination_eval_df["explanation"]
results_df["qa_eval"] = qa_eval_df["label"]
results_df["qa_explanation"] = qa_eval_df["explanation"]

results_df["summarization_eval"] = summarization_eval_df["label"]
results_df["summarization_explanation"] = summarization_eval_df["explanation"]
# `factuality_eval_df` is only bound when FactualityEvaluator could be imported (the
# evaluation cell sets the name to None otherwise). Reading it unconditionally raised
# NameError whenever the evaluator was unavailable, so guard these columns the same way
# the evaluation cell does.
if FactualityEvaluator is not None:
    results_df["factuality_eval"] = factuality_eval_df["label"]
    results_df["factuality_explanation"] = factuality_eval_df["explanation"]
results_df["relevance_eval"] = relevance_eval_df["label"]
results_df["relevance_explanation"] = relevance_eval_df["explanation"]

# Show the first 5 rows, making sure column contents are not truncated.
results_df.head()
