In [None]:
import numpy as np
import pandas as pd
import phoenix as px
from phoenix.experimental.evals.functions import llm_classify
from phoenix.experimental.evals.models import OpenAIModel
from phoenix.experimental.evals.templates.default_templates import (
    HALLUCINATION_PROMPT_RAILS_MAP,
    HALLUCINATION_PROMPT_TEMPLATE,
    QA_PROMPT_RAILS_MAP,
    QA_PROMPT_TEMPLATE,
    RAG_RELEVANCY_PROMPT_RAILS_MAP,
    RAG_RELEVANCY_PROMPT_TEMPLATE,
)
from phoenix.session.evaluation import add_evaluations, get_retrieved_documents
from phoenix.trace.exporter import HttpExporter
from sklearn.metrics import ndcg_score

# Start Phoenix

In [None]:
# Load the "llama_index_rag" example traces and launch the Phoenix app with them.
ds = px.load_example_traces("llama_index_rag")
px.launch_app(trace=ds)

# Extract Retrieved Documents

In [None]:
# Pull the retrieved documents out of the active Phoenix session; this frame is
# the input to the relevance evaluation and the ranking metrics further down.
retrieved_documents = get_retrieved_documents(px.active_session())
retrieved_documents

# Set Up OpenAI

In [None]:
import os
from getpass import getpass

import openai

# Prefer a key already present in the environment; prompt (without echo) only
# when it is unset or empty.
openai_api_key = os.getenv("OPENAI_API_KEY")
if not openai_api_key:
    openai_api_key = getpass("🔑 Enter your OpenAI API key: ")

# Publish the key both through the environment and on the openai module.
os.environ["OPENAI_API_KEY"] = openai_api_key
openai.api_key = openai_api_key

In [None]:
# Model wrapper handed to every llm_classify call in this notebook.
model = OpenAIModel(model_name="gpt-4-1106-preview")
# Call the model once with a trivial prompt — presumably a smoke test of the API key; confirm.
model("hi")

# Evaluate Document Relevance

In [None]:
# Classify every retrieved document with the RAG relevancy template, asking the
# model for an explanation alongside each label.
relevance_rails = list(RAG_RELEVANCY_PROMPT_RAILS_MAP.values())
retrieved_documents_eval = llm_classify(
    retrieved_documents,
    model,
    RAG_RELEVANCY_PROMPT_TEMPLATE,
    relevance_rails,
    provide_explanation=True,
)

# Score labeled rows as 1 ("relevant") or 0. Rows with a missing label are
# dropped first, so index alignment on assignment leaves their score as NaN.
relevance_labels = retrieved_documents_eval.label.dropna()
retrieved_documents_eval["score"] = relevance_labels.eq("relevant").astype(int)

# Persist the evaluations so they can be reloaded without re-running the LLM.
retrieved_documents_eval.to_parquet("llama_index_rag_with_rerank.documents_eval.parquet")

In [None]:
# Reload the document relevance evaluations from the parquet file and display them.
retrieved_documents_eval = pd.read_parquet("llama_index_rag_with_rerank.documents_eval.parquet")
retrieved_documents_eval

# Merge Data to Compute Ranking Metrics

In [None]:
# Put the evaluation columns (renamed with an "eval_" prefix) side by side with
# the retrieved documents; pd.concat aligns the two frames on their index.
prefixed_documents_eval = retrieved_documents_eval.add_prefix("eval_")
combined = pd.concat([retrieved_documents, prefixed_documents_eval], axis="columns")
combined

# Compute NDCG@2

In [None]:
def _compute_ndcg(df: pd.DataFrame, k: int):
    """Compute NDCG@k for one group of documents, tolerating missing values.

    Missing values can occur e.g. when the evaluation run was cut short by a
    keyboard interrupt.

    Args:
        df: Rows of a single group (presumably already in ranked order — confirm
            against the caller). Must have an ``eval_score`` column, used as the
            true relevance, and a ``document_score`` column, used as the
            predicted score.
        k: Number of leading rows to score.

    Returns:
        ``ndcg_score`` over the first ``k`` rows, or ``np.nan`` whenever
        ``ndcg_score`` raises ``ValueError``. Groups with fewer than ``k`` rows
        are padded with NaN before scoring.
    """
    # Select rows by position. `Series[i]` with an integer key is a label lookup
    # on integer indexes (KeyError for groups whose labels are not 0..k-1) and a
    # deprecated positional fallback on other indexes.
    top_k = df.iloc[: max(k, 0)]
    padding = [np.nan] * (k - len(top_k))
    eval_scores = top_k["eval_score"].tolist() + padding
    pred_scores = top_k["document_score"].tolist() + padding
    # NOTE(review): the ideal ranking is derived from these k rows only, not from
    # every document in the group — confirm this is the intended NDCG@k definition.
    try:
        return ndcg_score([eval_scores], [pred_scores])
    except ValueError:
        # Unscorable groups yield NaN rather than aborting the whole groupby.
        return np.nan


# One NDCG@2 value per retrieval span, stored in a single "score" column.
documents_by_span = combined.groupby("context.span_id")
ndcg_at_2 = pd.DataFrame({"score": documents_by_span.apply(_compute_ndcg, k=2)})
ndcg_at_2.to_parquet("llama_index_rag_with_rerank.ndcg_at_2.parquet")

In [None]:
# Reload the NDCG@2 scores from the parquet file and display them.
ndcg_at_2 = pd.read_parquet("llama_index_rag_with_rerank.ndcg_at_2.parquet")
ndcg_at_2

# Compute Precision@3

In [None]:
# Precision@3 per retrieval span: the sum of the first three relevance scores
# divided by 3. skipna=False makes the result NaN whenever one of those scores
# is missing, instead of silently counting it as zero.
documents_by_span = combined.groupby("context.span_id")
precision_at_3 = pd.DataFrame(
    {
        "score": documents_by_span.apply(
            lambda group: group["eval_score"].head(3).sum(skipna=False) / 3
        )
    }
)
precision_at_3.to_parquet("llama_index_rag_with_rerank.precision_at_3.parquet")

In [None]:
# Reload the Precision@3 scores from the parquet file and display them.
precision_at_3 = pd.read_parquet("llama_index_rag_with_rerank.precision_at_3.parquet")
precision_at_3

# Merge Documents from Retrieval Spans to Q&A Spans (to Compute Q&A Correctness)

In [None]:
# Root spans that produced an output: these hold the question (input) and the
# final answer (output) of each trace.
root_spans = px.active_session().get_spans_dataframe(
    "output.value is not None", root_spans_only=True
)

# Keep only the columns needed for the Q&A evals, keyed by trace id for now so
# the retrieved documents of the same trace can be attached.
qa_columns = ["attributes.input.value", "attributes.output.value", "context.span_id"]
qa_df = root_spans.set_index("context.trace_id")[qa_columns].rename(
    columns={"attributes.input.value": "input", "attributes.output.value": "output"}
)

# Join all documents retrieved within a trace into one reference text; the
# assignment aligns on the trace id index.
references_by_trace = retrieved_documents.groupby("context.trace_id").apply(
    lambda docs: "\n\n".join(docs.reference)
)
qa_df["reference"] = references_by_trace

# Re-key by span id, which replaces the trace id index.
qa_df = qa_df.set_index("context.span_id")
qa_df

# Evaluate Q&A Correctness

In [None]:
# Classify each question/answer/reference row with the Q&A template, asking
# the model for an explanation alongside each label.
qa_rails = list(QA_PROMPT_RAILS_MAP.values())
qa_correctness_eval = llm_classify(
    qa_df,
    model,
    QA_PROMPT_TEMPLATE,
    qa_rails,
    provide_explanation=True,
)

# Score labeled rows as 1 ("correct") or 0. Rows with a missing label are
# dropped first, so index alignment on assignment leaves their score as NaN.
qa_labels = qa_correctness_eval.label.dropna()
qa_correctness_eval["score"] = qa_labels.eq("correct").astype(int)

# Persist the evaluations so they can be reloaded without re-running the LLM.
qa_correctness_eval.to_parquet("llama_index_rag_with_rerank.qa_correctness_eval.parquet")

In [None]:
# Reload the Q&A correctness evaluations from the parquet file and display them.
qa_correctness_eval = pd.read_parquet("llama_index_rag_with_rerank.qa_correctness_eval.parquet")
qa_correctness_eval

# Evaluate Hallucination

In [None]:
# Classify each question/answer/reference row with the hallucination template,
# asking the model for an explanation alongside each label.
hallucination_rails = list(HALLUCINATION_PROMPT_RAILS_MAP.values())
hallucination_eval = llm_classify(
    qa_df,
    model,
    HALLUCINATION_PROMPT_TEMPLATE,
    hallucination_rails,
    provide_explanation=True,
)

# Score labeled rows as 1 ("factual") or 0. Rows with a missing label are
# dropped first, so index alignment on assignment leaves their score as NaN.
hallucination_labels = hallucination_eval.label.dropna()
hallucination_eval["score"] = hallucination_labels.eq("factual").astype(int)

# Persist the evaluations so they can be reloaded without re-running the LLM.
hallucination_eval.to_parquet("llama_index_rag_with_rerank.hallucination_eval.parquet")

In [None]:
# Reload the hallucination evaluations from the parquet file and display them.
hallucination_eval = pd.read_parquet("llama_index_rag_with_rerank.hallucination_eval.parquet")
hallucination_eval

# Ingest Evaluations

In [None]:
# Exporter shared by all add_evaluations calls in this section.
exporter = HttpExporter()

In [None]:
# Ingest the per-document relevance evaluations under the name "Relevance".
add_evaluations(exporter, retrieved_documents_eval, "Relevance")

In [None]:
# Ingest the per-span NDCG scores under the name "NDCG@2".
add_evaluations(exporter, ndcg_at_2, "NDCG@2")

In [None]:
# Ingest the per-span precision scores. The evaluation name matches the k=3
# cutoff that precision_at_3 is computed with.
add_evaluations(exporter, precision_at_3, "Precision@3")

In [None]:
# Ingest the Q&A correctness evaluations under the name "Q&A Correctness".
add_evaluations(exporter, qa_correctness_eval, "Q&A Correctness")

In [None]:
# Ingest the hallucination evaluations under the name "Hallucination".
add_evaluations(exporter, hallucination_eval, "Hallucination")

# End Session

In [None]:
# px.active_session().end()