## 1. Install Dependencies and Import Libraries

Install LlamaIndex and other dependencies.

In [None]:
!pip install -q "arize-phoenix[experimental]" gcsfs llama-index tqdm

Import libraries.

In [None]:
import json
import os
from urllib.request import urlopen

import openai
import pandas as pd
import phoenix as px
from gcsfs import GCSFileSystem
from langchain.chat_models import ChatOpenAI
from llama_index import LLMPredictor, ServiceContext, StorageContext, load_index_from_storage
from llama_index.callbacks import CallbackManager
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.graph_stores.simple import SimpleGraphStore
from phoenix.experimental.callbacks.llama_index_trace_callback_handler import (
    OpenInferenceTraceCallbackHandler,
)
from tqdm import tqdm

# Raise the per-column display width to 1000 characters so long text values
# are not cut off with an ellipsis when dataframes are rendered.
pd.options.display.max_colwidth = 1000

## 2. Configure Your OpenAI API Key

In [None]:
from getpass import getpass

# Reuse a key that is already exported in the environment; otherwise ask for
# one interactively (getpass does not echo the typed value).
openai_api_key = os.getenv("OPENAI_API_KEY")
if not openai_api_key:
    openai_api_key = getpass("🔑 Enter your OpenAI API key: ")

# Publish the key both through the environment variable and on the openai module.
os.environ["OPENAI_API_KEY"] = openai_api_key
openai.api_key = openai_api_key

## 3. Download Your Knowledge Base

Download your pre-built index from cloud storage and instantiate your storage context.

In [None]:
# Path of the pre-built index inside the GCS bucket.
index_path = "arize-assets/phoenix/datasets/unstructured/llm/llama-index/arize-docs/index/"
file_system = GCSFileSystem(project="public-assets-275721")

# Storage context that reads the persisted index through the GCS file system.
storage_context = StorageContext.from_defaults(
    persist_dir=index_path,
    fs=file_system,
    # An explicit graph store prevents an unauthorized request to GCS.
    graph_store=SimpleGraphStore(),
)

The index loaded above is a pre-built knowledge base consisting of chunks of the Arize documentation.

## 4. Run Your Question-Answering Service

💭 Start a LlamaIndex application from your downloaded index. Use the `OpenInferenceTraceCallbackHandler` to store your data in [OpenInference format](https://github.com/Arize-ai/open-inference-spec), an open standard for capturing and storing AI model inferences that enables production LLM app servers to seamlessly integrate with LLM observability solutions such as Arize and Phoenix.

In [None]:
# Handler attached to the callback manager below; it is what records the trace data.
callback_handler = OpenInferenceTraceCallbackHandler()

# Build each component of the service context separately, then assemble them.
llm_predictor = LLMPredictor(llm=ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0))
embed_model = OpenAIEmbedding(model="text-embedding-ada-002")
callback_manager = CallbackManager(handlers=[callback_handler])
service_context = ServiceContext.from_defaults(
    llm_predictor=llm_predictor,
    embed_model=embed_model,
    callback_manager=callback_manager,
)

# Load the persisted index with that service context and expose it as a query engine.
index = load_index_from_storage(storage_context, service_context=service_context)
query_engine = index.as_query_engine()

💭 Load the questions you will ask your question-answering service and view them.

In [None]:
# Load queries from GCS - these are commonly asked questions about Arize.
# The file is fetched over HTTPS rather than plain HTTP so the content cannot be
# altered in transit.
queries_url = "https://storage.googleapis.com/arize-assets/phoenix/datasets/unstructured/llm/context-retrieval/arize_docs_queries.jsonl"
with urlopen(queries_url) as response:
    # The payload is parsed line by line as JSON, keeping each object's "query" field.
    # Blank lines (e.g. a trailing newline at the end of the file) are skipped
    # instead of crashing json.loads.
    queries = [
        json.loads(line)["query"]
        for line in (raw_line.decode("utf-8").strip() for raw_line in response)
        if line
    ]
queries

## 5. Launch Phoenix

Phoenix will run in the background and collect trace data emitted by the `OpenInferenceTraceCallbackHandler` that you attached to your LlamaIndex query engine.

In [None]:
# Start the Phoenix app here so that it is already up when the queries are run
# in the next section.
px.launch_app()

## 6. Run Your Query Engine

In [None]:
# Send every loaded query through the query engine, showing a tqdm progress bar.
# The responses are not collected: after the loop `response` holds only the last one.
for query in tqdm(queries):
    response = query_engine.query(query)

## 7. Export Your Trace Data

In [None]:
# Export the spans whose kind is "retrieve" from Phoenix as a dataframe and preview it.
# NOTE(review): presumably these are the retrieval spans recorded while the queries
# above were running — confirm against the Phoenix API.
trace_df = px.export_trace_dataframe(span_kind="retrieve")  # dataframe must be indexed by span id
trace_df.head()

## 8. Run Evaluations

In [None]:
# Compute one precision score per row of the trace dataframe, then average them.
# NOTE(review): `run_eval` is neither defined nor imported in the visible cells —
# confirm where it comes from before running this notebook top to bottom.
precisions_at_k = run_eval(trace_df, "precision_at_k")  # e.g., [ 0.2, 0.4, ...]
mean_precision_at_k = precisions_at_k.mean()  # a single number
mean_precision_at_k

In [None]:
# Store the eval result on the trace dataframe as a new column and display the frame.
# NOTE(review): the column is named "precision_at_2" but no value of k is passed to
# `run_eval` — confirm that the eval really uses k == 2.
# NOTE(review): `run_eval` is neither defined nor imported in the visible cells.
trace_df["precision_at_2"] = run_eval(trace_df, "precision_at_k")  # e.g., [ 0.2, 0.4, ...]
trace_df

End of M0. In M1, we will implement the ability to import metrics into Phoenix.

In [None]:
# NOTE(review): this cell appears to be a design sketch for the M1 work and is not
# runnable as written: `<context relevancy>` is a placeholder rather than valid Python,
# and neither `df` nor `run_binary_eval` is defined in the visible cells (the earlier
# cells name the dataframe `trace_df`) — confirm the intended names.
df["llm_assisted_relevance"] = run_binary_eval(df, <context relevancy>)

# The two options below describe alternative return shapes being considered for the eval.

# Option 1
# num_spans x k
# returns [
#     ("relevant", "irrelevant"),
#     ("relevant", "relevant"),
# ]

# Option 2
# num_spans x 1
# return [ 0, 1 ]  # 0 if the top document is irrelevant, 1 if the top document is relevant

# f1 = sklearn.metrics.f1(df["llm_assisted_relevance"], ["relevant", "relevant"])
# precision_at_k = ...

# NOTE(review): "eval_column_name" looks like a placeholder — presumably it should be
# "llm_assisted_relevance"; confirm.
px.import_trace_dataframe(df, eval_column_names=["eval_column_name"])
# NOTE(review): both means below are taken over the same column, so they are always equal.
mean_precision_at_1 = df["llm_assisted_relevance"].mean()
mean_precision_at_k = df["llm_assisted_relevance"].mean()