<a href="https://colab.research.google.com/github/arize-ai/phoenix/blob/main/tutorials/experiments/run_experiments_with_llama_index.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


In [None]:
# Install or upgrade Phoenix, sentence-transformers, LlamaIndex and the OpenInference instrumentors.
# NOTE(review): torch is pinned below 2.7; the reason is not stated here — confirm before relaxing the pin.
!uv pip install -Uqq arize-phoenix "torch<2.7" sentence-transformers openinference-instrumentation-llama_index openinference-instrumentation-openai llama-index

In [None]:
import json
import os
import tempfile
from datetime import datetime, timezone
from functools import partial
from getpass import getpass
from time import sleep
from typing import Any
from urllib.request import urlretrieve

import nest_asyncio
import pandas as pd
from llama_index.core import Settings, SimpleDirectoryReader, VectorStoreIndex
from llama_index.core.chat_engine import ContextChatEngine
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.postprocessor import SentenceTransformerRerank
from llama_index.core.response_synthesizers import get_response_synthesizer
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from openinference.instrumentation.llama_index import LlamaIndexInstrumentor
from openinference.instrumentation.openai import OpenAIInstrumentor

import phoenix as px
from phoenix.client import Client
from phoenix.evals import (
    OpenAIModel,
)
from phoenix.otel import register

# Patch asyncio so that event loops can be nested; a notebook kernel already
# runs one, and code below may need to start another.
nest_asyncio.apply()

# Never truncate long cell text when DataFrames are displayed.
pd.set_option("display.max_colwidth", None)

In [None]:
# Prompt for the OpenAI API key only when the environment does not already
# provide a non-empty one; getpass keeps the key out of the notebook output.
if not os.getenv("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("🔑 Enter your OpenAI API key: ")

# Set Up Instrumentation


In [None]:
# Launch the Phoenix app from this notebook session.
px.launch_app()

In [None]:
# Create one tracer provider and hand it to both instrumentors so LlamaIndex
# and OpenAI spans go through the same provider.
# NOTE(review): skip_dep_check=True presumably bypasses the instrumentors'
# installed-package version check — confirm.
tracer_provider = register()
LlamaIndexInstrumentor().instrument(skip_dep_check=True, tracer_provider=tracer_provider)
OpenAIInstrumentor().instrument(skip_dep_check=True, tracer_provider=tracer_provider)

# Create Dataset


In [None]:
# Each pair is (user question, expected assistant answer) for one example.
_EXAMPLES = (
    (
        "Which grad schools did the author apply for and why?",
        "The author applied to three grad schools: MIT and Yale, which were renowned for AI at the time, and Harvard, which the author had visited because a friend went there and it was also home to Bill Woods, who had invented the type of parser the author used in his SHRDLU clone. The author chose these schools because he wanted to learn about AI and Lisp, and these schools were known for their expertise in these areas.",
    ),
    (
        "What did the author do growing up?",
        "The author took a painting class at Harvard with Idelle Weber and later became her de facto studio assistant. Additionally, the author worked on several different projects, including writing essays, developing spam filters, and painting.",
    ),
)

# One row per example: the input is a single-turn message list, the output is
# the single assistant message.
df = pd.DataFrame(
    {
        "input_messages": [
            [{"role": "user", "content": question}] for question, _ in _EXAMPLES
        ],
        "output_message": [
            {"role": "assistant", "content": answer} for _, answer in _EXAMPLES
        ],
    }
)

# Store every cell as a JSON string (pandas "string" dtype) rather than as a
# nested Python object.
for column in ("input_messages", "output_message"):
    df[column] = df[column].map(json.dumps).astype("string")
df

## Upload Dataset


In [None]:
# Name the dataset after the current UTC timestamp so each run uploads under
# a fresh name.
dataset_name = datetime.now(timezone.utc).isoformat()
phoenix_client = Client()

# Upload the DataFrame; input_keys / output_keys name the columns used as each
# example's input and expected output.
phoenix_client.datasets.create_dataset(
    name=dataset_name,
    dataframe=df,
    input_keys=("input_messages",),
    output_keys=("output_message",),
)
# NOTE(review): presumably a short pause so the dataset is available before it
# is fetched in the next cell — confirm whether this is still required.
sleep(1)

## Download Dataset


In [None]:
# Fetch the dataset back by the name it was uploaded under.
ds = phoenix_client.datasets.get_dataset(dataset=dataset_name)

# Set Up LlamaIndex


In [None]:
# Configure models. The model names live in one dict so the same values can be
# recorded as experiment metadata.
experiment_metadata = {
    "llm": "gpt-4",
    "embed_model": "text-embedding-3-small",
    "reranker": "cross-encoder/ms-marco-MiniLM-L-2-v2",
}
Settings.llm = OpenAI(model=experiment_metadata["llm"])
Settings.embed_model = OpenAIEmbedding(model=experiment_metadata["embed_model"])
reranker = SentenceTransformerRerank(model=experiment_metadata["reranker"], top_n=2)

# Load and chunk document
print("📚 Loading and chunking document...")
essay = "https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt"

# Download into a temporary directory instead of an open NamedTemporaryFile:
# a file that is still open cannot be reopened by name on Windows, and both
# urlretrieve and SimpleDirectoryReader open the path themselves.
with tempfile.TemporaryDirectory() as tmp_dir:
    essay_path = os.path.join(tmp_dir, "paul_graham_essay.txt")
    urlretrieve(essay, essay_path)

    # Load document
    documents = SimpleDirectoryReader(input_files=[essay_path]).load_data()
    print(f"📄 Loaded {len(documents)} document(s)")

    # Create text chunks using sentence splitter
    parser = SentenceSplitter(chunk_size=512, chunk_overlap=50)
    nodes = parser.get_nodes_from_documents(documents)
    print(f"📑 Created {len(nodes)} chunks")

# Create index (the nodes are already in memory, so the downloaded file is no
# longer needed at this point)
print("\n🔍 Creating index...")
index = VectorStoreIndex(nodes)
print("✅ Index created")

# Create Task


In [None]:
def rag_with_reranker(input) -> str:
    """Answer the last message of a dataset example with a reranked RAG chat engine.

    Args:
        input: Mapping with an ``"input_messages"`` entry that holds a
            JSON-encoded list of chat messages. The ``"content"`` of the last
            message is used as the query.
            NOTE(review): the name shadows the builtin; it presumably has to
            stay ``input`` so the experiment runner can bind the example input
            to it — confirm before renaming.

    Returns:
        The chat engine's response as a string. On any failure the error is
        printed with its traceback and ``"Error: <message>"`` is returned
        instead of raising.
    """
    try:
        # Parse query
        messages = json.loads(input["input_messages"])
        query = messages[-1]["content"]

        # Retrieve a wider candidate set; the reranker narrows it down later.
        retriever = index.as_retriever(similarity_top_k=5)

        # Create response synthesizer
        response_synthesizer = get_response_synthesizer(response_mode="compact")

        # Create context chat engine explicitly. The reranker is attached here
        # via node_postprocessors: passing it to as_retriever() has no effect
        # because the retriever does not consume that keyword.
        chat_engine = ContextChatEngine.from_defaults(
            retriever=retriever,
            node_postprocessors=[reranker],
            response_synthesizer=response_synthesizer,
            system_prompt=(
                "You are a helpful assistant. Base your response ONLY on the provided context. "
                "If you cannot find the answer in the context, say 'I cannot find that information "
                "in the provided context.' Include specific details from the context in your response."
            ),
        )

        # Get response
        response = chat_engine.chat(query)
        return str(response)

    except Exception as e:
        # Report the failure as the task output rather than raising.
        print(f"❌ Error: {e}")
        import traceback

        traceback.print_exc()
        return f"Error: {e}"

In [None]:
# Test the RAG pipeline directly
test_input = {
    "input_messages": json.dumps(
        [{"role": "user", "content": "Which grad schools did the author apply for?"}]
    )
}
print("🧪 Testing RAG pipeline...")
result = rag_with_reranker(test_input)
print(f"\n🎯 Final result: {result}")

# Define Evaluators


In [None]:
def contains_substring(output, substring: str) -> dict[str, Any]:
    """Score whether ``substring`` occurs in a task output.

    Args:
        output: The task output to inspect. Anything that is not a ``str``
            is scored as a miss.
        substring: The text to look for (case-sensitive).

    Returns:
        A dict with ``"score"`` (1 on a hit, 0 on a miss) and an
        ``"explanation"`` sentence that matches the score.
    """
    found = isinstance(output, str) and substring in output
    # The explanation must follow the outcome; a fixed "was in the output"
    # message would contradict a score of 0.
    verdict = "was" if found else "was not"
    return {
        "score": int(found),
        "explanation": f"the substring `{substring}` {verdict} in the output",
    }

# Run Experiment


In [None]:
# NOTE(review): `model` is not referenced by anything in this cell; presumably
# it is kept for LLM-based evaluators — confirm or remove.
model = OpenAIModel(model="gpt-4o")

# Run the RAG task over every example in the dataset and score each output
# with the substring evaluator; partial() fixes the substring to "school".
# The model names from experiment_metadata are attached to the experiment.
experiment = phoenix_client.experiments.run_experiment(
    dataset=ds,
    task=rag_with_reranker,
    experiment_metadata=experiment_metadata,
    evaluators=[partial(contains_substring, substring="school")],
)