In [None]:
# Uncomment to install this notebook's dependencies into the kernel's environment.
# %pip install "arize-phoenix[llama-index]" llama-index-embeddings-ollama llama-index-llms-ollama llama-index-llms-openai llama-index-agent-openai

In [None]:
import tempfile
from datetime import datetime, timezone
from time import sleep
from urllib.request import urlretrieve

import nest_asyncio
import pandas as pd
import phoenix as px
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
from llama_index.core.postprocessor import SentenceTransformerRerank
from llama_index.core.postprocessor.types import BaseNodePostprocessor
from llama_index.core.settings import Settings
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.llms.ollama import Ollama
from openinference.instrumentation.llama_index import LlamaIndexInstrumentor
from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
from opentelemetry.sdk import trace as trace_sdk
from opentelemetry.sdk.trace.export import SimpleSpanProcessor
from phoenix.datasets.decorators import monkey_patch
from phoenix.datasets.experiments import run_experiment
from phoenix.datasets.types import Example

# Patch asyncio so an event loop can be re-entered. Presumably needed because the
# notebook kernel already runs a loop while the experiment runs — TODO confirm.
nest_asyncio.apply()

# Optional: Instrument LlamaIndex

In [None]:
# Export every LlamaIndex span over OTLP/HTTP to the collector at `endpoint`
# (presumably a locally running Phoenix server — confirm the host/port).
endpoint = "http://127.0.0.1:6006/v1/traces"
span_exporter = OTLPSpanExporter(endpoint=endpoint)
tracer_provider = trace_sdk.TracerProvider()
# SimpleSpanProcessor is attached directly; no batching processor is configured.
tracer_provider.add_span_processor(SimpleSpanProcessor(span_exporter))
LlamaIndexInstrumentor().instrument(tracer_provider=tracer_provider)

# Create Dataset

In [None]:
# Two-example chat dataset: each row pairs a list of input messages (a single
# user question) with the reference assistant message for that question.
user_questions = [
    "Which grad schools did the author apply for and why?",
    "What did the author do growing up?",
]
reference_answers = [
    "The author applied to three grad schools: MIT and Yale, which were renowned for AI at the time, and Harvard, which the author had visited because a friend went there and it was also home to Bill Woods, who had invented the type of parser the author used in his SHRDLU clone. The author chose these schools because he wanted to learn about AI and Lisp, and these schools were known for their expertise in these areas.",
    "The author took a painting class at Harvard with Idelle Weber and later became her de facto studio assistant. Additionally, the author worked on several different projects, including writing essays, developing spam filters, and painting.",
]
df = pd.DataFrame(
    {
        "input_messages": [
            [{"role": "user", "content": question}] for question in user_questions
        ],
        "output_message": [
            {"role": "assistant", "content": answer} for answer in reference_answers
        ],
    }
)
df

## Upload Dataset

In [None]:
# Name the dataset after the current UTC timestamp so each run uploads a fresh,
# uniquely named dataset. The name must be a plain string: wrapping the value in
# parentheses with a trailing comma would turn it into a one-element tuple.
dataset_name = datetime.now(timezone.utc).isoformat()
px.Client().upload_dataset_examples(
    df,
    input_keys=("input_messages",),
    output_keys=("output_message",),
    name=dataset_name,
)
# Short pause before the dataset is fetched back by name — presumably gives the
# server time to finish persisting the upload; TODO confirm it is still needed.
sleep(1)

## Download Dataset

In [None]:
# Fetch the dataset back by the name it was uploaded under, then show what kind
# of object the client returns.
phoenix_client = px.Client()
ds = phoenix_client.get_dataset(name=dataset_name)
type(ds)

# Set Up Experiment Metadata

In [None]:
# The components under test. The same objects are later used to configure
# LlamaIndex and are passed to run_experiment as its experiment metadata.
experiment_metadata = dict(
    llm=Ollama(model="llama3"),
    embed_model=OllamaEmbedding(model_name="mxbai-embed-large"),
    reranker=SentenceTransformerRerank(
        model="cross-encoder/ms-marco-MiniLM-L-2-v2",
        top_n=2,
    ),
)

# Set Up LlamaIndex

In [None]:
# Configure LlamaIndex with the exact model objects recorded in experiment_metadata.
Settings.llm = experiment_metadata["llm"]
Settings.embed_model = experiment_metadata["embed_model"]
reranker = experiment_metadata["reranker"]

essay = "https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt"
# Download into a temporary directory instead of an open NamedTemporaryFile:
# writing to a still-open named temp file by name is not portable (it fails on
# Windows). The documents are loaded before the directory is cleaned up.
with tempfile.TemporaryDirectory() as tmp_dir:
    essay_path = f"{tmp_dir}/paul_graham_essay.txt"
    urlretrieve(essay, essay_path)
    documents = SimpleDirectoryReader(input_files=[essay_path]).load_data()
index = VectorStoreIndex.from_documents(documents)

# Set Up Capture of Retrieved Documents

In [None]:
# Maps the method to patch to its capture options. Presumably monkey_patch records
# the method's (transformed) output under the given identifier — confirm against
# phoenix.datasets.decorators.
patches = {
    BaseNodePostprocessor.postprocess_nodes: {
        "identifier": "documents",
        # Keep only the text of each node returned by the postprocessor.
        "transform_output": lambda nodes: [n.text for n in nodes],
    },
}

# Create Task

In [None]:
def rag_with_reranker(example: Example) -> str:
    """Answer the example's last chat message using a reranked RAG chat engine.

    A new chat engine is created from the module-level ``index`` on every call
    (presumably so no chat history carries over between examples), retrieving
    with ``similarity_top_k=10`` and post-processing with the module-level
    ``reranker``.

    Args:
        example: Dataset example whose ``input["input_messages"]`` is a sequence
            of message dicts; the ``"content"`` of the last one is the query.

    Returns:
        The chat engine's response converted with ``str``.
    """
    engine = index.as_chat_engine(
        node_postprocessors=[reranker],
        similarity_top_k=10,
    )
    last_message = example.input["input_messages"][-1]
    answer = engine.chat(last_message["content"])
    return str(answer)

# Run Experiment

In [None]:
# Run the task once per dataset example while the patches are active, so the
# patched postprocessor output is captured during each task call.
experiment_kwargs = dict(
    dataset=ds,
    fn=rag_with_reranker,
    experiment_metadata=experiment_metadata,
    repetitions=1,
)
with monkey_patch(patches):
    run_experiment(**experiment_kwargs)