<center>
    <p style="text-align:center">
        <img alt="phoenix logo" src="https://storage.googleapis.com/arize-assets/phoenix/assets/phoenix-logo-light.svg" width="200"/>
        <br>
        <a href="https://docs.arize.com/phoenix/">Docs</a>
        |
        <a href="https://github.com/Arize-ai/phoenix">GitHub</a>
        |
        <a href="https://join.slack.com/t/arize-ai/shared_invite/zt-1px8dcmlf-fmThhDFD_V_48oU7ALan4Q">Community</a>
    </p>
</center>
<h1 align="center">Evaluating and Improving Semantic Retrieval Systems</h1>

In [None]:
!pip install -q arize-phoenix llama-index openai

In [None]:
import hashlib
import json
import os
import tempfile
import textwrap
from tqdm import tqdm
from typing import Dict, List
import urllib
import zipfile

from langchain import OpenAI
from llama_index import StorageContext, load_index_from_storage
from llama_index.response.schema import Response
import numpy as np
import openai
import pandas as pd
import phoenix as px
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)

pd.set_option("display.max_colwidth", None)

In [None]:
# Prefer a key that is already exported as OPENAI_API_KEY so the secret does not
# have to be pasted into (and risk being committed with) the notebook; otherwise
# replace the placeholder below with your key.
openai_api_key = os.environ.get("OPENAI_API_KEY") or "copy paste your api key here"
assert (
    openai_api_key != "copy paste your api key here"
), "❌ Please set your OpenAI API key"
os.environ["OPENAI_API_KEY"] = openai_api_key
# `openai` was imported before this cell ran, and the pre-1.0 openai package reads
# OPENAI_API_KEY only when it is imported, so set the key on the module as well.
openai.api_key = openai_api_key

In [None]:
def download_file(url: str, output_path: str) -> None:
    """
    Downloads a file from the specified URL and saves to a local path.

    Args:
        url: Location of the file to fetch.
        output_path: Local filesystem path the downloaded content is written to.
    """
    # A bare `import urllib` does not load the `request` submodule; import it
    # explicitly so this function does not rely on some other package having
    # imported it first.
    import urllib.request

    urllib.request.urlretrieve(url, output_path)


def unzip_directory(zip_path: str, output_path: str) -> None:
    """
    Extracts every member of the zip archive at zip_path into output_path.
    """
    archive = zipfile.ZipFile(zip_path, mode="r")
    try:
        archive.extractall(path=output_path)
    finally:
        # Mirror the context-manager behaviour: always release the file handle.
        archive.close()


# Fetch the pre-built index archive into the system temp directory, then unpack
# it into an "index" folder next to the downloaded zip file.
print("⏳ Downloading knowledge base...")
data_dir = tempfile.gettempdir()
zip_file_path = os.path.join(data_dir, "index.zip")
download_file(
    output_path=zip_file_path,
    url="http://storage.googleapis.com/arize-assets/phoenix/datasets/unstructured/llm/llama-index/arize-docs/index.zip",
)

print("⏳ Unzipping knowledge base...")
index_dir = os.path.join(data_dir, "index")
unzip_directory(zip_path=zip_file_path, output_path=index_dir)

print("✅ Done")

In [None]:
# Optional: uncomment and edit the line below to use an index that is already
# persisted on your machine instead of the one downloaded above.
# index_dir = "/path/to/llama-index-arize-docs/index/"

In [None]:
# Load the index persisted under index_dir and expose it through a query engine.
storage_context = StorageContext.from_defaults(persist_dir=index_dir)
llm = OpenAI(temperature=0, model_name="gpt-4")
index = load_index_from_storage(storage_context, llm=llm)
query_engine = index.as_query_engine()

In [None]:
def display_llama_index_response(response: Response) -> None:
    """
    Prints the response text followed by every source node (retrieved context),
    wrapping all long text at 80 columns.
    """

    def print_wrapped(text: str) -> None:
        # Emit the text one wrapped line at a time; prints nothing for empty text.
        for wrapped_line in textwrap.wrap(text, width=80):
            print(wrapped_line)

    print("Response")
    print("========")
    print_wrapped(response.response.strip())
    print()

    print("Source Nodes")
    print("============")
    print()

    for retrieved in response.source_nodes:
        print(f"doc_id: {retrieved.node.doc_id}")
        print(f"score: {retrieved.score}")
        print()
        print_wrapped(retrieved.node.text)
        print()

In [None]:
# Alternative example queries; uncomment one to try it instead.
# query = "What's the difference between primary and baseline datasets?"
# query = "How do I send in extra metadata with each record?"
query = "How does Arize's surrogate explainability model work?"
# Run the query through the query engine and print the answer with its source nodes.
response = query_engine.query(query)
display_llama_index_response(response)

## Load Data into Pandas Dataframes

In [None]:
def load_llama_index_database_into_dataframe(docstore, vector_store) -> pd.DataFrame:
    """
    Builds a dataframe with one row per document in the docstore.

    The "text" column holds the document text read from
    docstore["docstore/data"][doc_id]["__data__"]["text"], and the
    "text_vector" column holds the matching entry of
    vector_store["embedding_dict"] converted to a NumPy array.
    """
    documents = docstore["docstore/data"]
    records = [
        (
            documents[doc_id]["__data__"]["text"],
            np.array(vector_store["embedding_dict"][doc_id]),
        )
        for doc_id in documents
    ]
    texts = [text for text, _ in records]
    vectors = [vector for _, vector in records]
    return pd.DataFrame({"text": texts, "text_vector": vectors})


# Read the persisted docstore and vector store, then keep one row per unique text.
with open(os.path.join(index_dir, "docstore.json")) as docstore_file:
    docstore = json.load(docstore_file)
with open(os.path.join(index_dir, "vector_store.json")) as vector_store_file:
    vector_store = json.load(vector_store_file)

database_df = load_llama_index_database_into_dataframe(docstore, vector_store)
database_df = database_df.drop_duplicates(subset=["text"])
database_df.head()

In [None]:
# Load the pre-computed queries with their retrievals and rename the query
# columns to the same "text" / "text_vector" names used by the database dataframe.
query_df = pd.read_parquet(
    "http://storage.googleapis.com/arize-assets/phoenix/datasets/unstructured/llm/llama-index/arize-docs/retrievals.parquet"
).rename(columns={"query_text": "text", "query_embedding": "text_vector"})
query_df.head()

Add an MD5 hash of each text so that retrieved context can be matched to rows of the database dataframe.

In [None]:
def hash_string(string: str) -> str:
    """
    Returns the hexadecimal MD5 digest of the UTF-8 encoding of the input string.
    """
    return hashlib.md5(string.encode("utf-8")).hexdigest()

database_df["text_hash"] = database_df["text"].map(hash_string)
# Hash both retrieved context texts stored alongside each query.
for context_position in (0, 1):
    query_df[f"context_text_hash_{context_position}"] = query_df[
        f"context_text_{context_position}"
    ].map(hash_string)
# Each query row's text_hash is the hash of its first retrieved context.
query_df["text_hash"] = query_df["context_text_hash_0"]

Optionally run the following cell if you want to filter the database dataframe to only consider retrieved context.

In [None]:
# retrieved_context_hashes = set(query_df["context_text_hash_0"].tolist() + query_df["context_text_hash_1"].tolist())
# database_df = database_df[database_df["text_hash"].isin(retrieved_context_hashes)]
# database_df.head()

## Examine and Correct Bias

Correct for bias between the query and database datasets.

In [None]:
# The bias vector of each dataset is the mean of its embeddings; subtracting it
# from every embedding yields the bias-corrected vectors.
database_bias_vector = database_df["text_vector"].mean()
database_df["bias_corrected_text_vector"] = database_df["text_vector"].apply(
    lambda vector: vector - database_bias_vector
)
query_bias_vector = query_df["text_vector"].mean()
query_df["bias_corrected_text_vector"] = query_df["text_vector"].apply(
    lambda vector: vector - query_bias_vector
)

Examine bias of query and database vectors.

In [None]:
def compute_cosine_similarity(a, b):
    """
    Returns the dot product of a and b divided by the product of their norms.
    """
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))


def compute_euclidean_distance(a, b):
    """
    Returns the norm of the difference between a and b.
    """
    difference = a - b
    return np.linalg.norm(difference)

In [None]:
# Compare the query bias vector with the database bias vector.
bias_cosine_similarity = compute_cosine_similarity(query_bias_vector, database_bias_vector)
print(f"Cosine similarity: {bias_cosine_similarity}")
bias_euclidean_distance = compute_euclidean_distance(query_bias_vector, database_bias_vector)
print(f"Euclidean distance: {bias_euclidean_distance}")

In [None]:
question_text = "?"
# question_text = "This is a test question?"
# question_text = "This is a question?"
# Take the embedding of the first query whose text equals question_text and
# compare it with the query bias vector.
question_vector = query_df.loc[query_df["text"] == question_text, "text_vector"].iloc[0]
question_cosine_similarity = compute_cosine_similarity(question_vector, query_bias_vector)
print(f"Cosine similarity: {question_cosine_similarity}")
question_euclidean_distance = compute_euclidean_distance(question_vector, query_bias_vector)
print(f"Euclidean distance: {question_euclidean_distance}")

In [None]:
paragraph_text = "My service is a hosting service designed for hosting your website. You can put your website on our service and host it with accelerated CDN delivery, tracking of usage data for running your website. Our service is one of the best on the internet in terms of delivery and experience."
# Take the embedding of the first query whose text equals paragraph_text and
# compare it with the database bias vector.
paragraph_vector = query_df.loc[query_df["text"] == paragraph_text, "text_vector"].iloc[0]
paragraph_cosine_similarity = compute_cosine_similarity(paragraph_vector, database_bias_vector)
print(f"Cosine similarity: {paragraph_cosine_similarity}")
paragraph_euclidean_distance = compute_euclidean_distance(paragraph_vector, database_bias_vector)
print(f"Euclidean distance: {paragraph_euclidean_distance}")

Compare the query bias vector of our documentation dataset against the query and database bias vectors of the Wikipedia dataset (also encoded using "text-embedding-ada-002" from OpenAI).

In [None]:
# Mean embedding of the Wikipedia queries, compared with our query bias vector.
wikipedia_query_df = pd.read_parquet(
    "http://storage.googleapis.com/arize-assets/phoenix/datasets/unstructured/llm/llama-index/query.parquet"
)
wikipedia_query_bias_vector = wikipedia_query_df["text_vector"].mean()
wikipedia_query_cosine_similarity = compute_cosine_similarity(
    wikipedia_query_bias_vector, query_bias_vector
)
print(f"Cosine similarity: {wikipedia_query_cosine_similarity}")
wikipedia_query_euclidean_distance = compute_euclidean_distance(
    wikipedia_query_bias_vector, query_bias_vector
)
print(f"Euclidean distance: {wikipedia_query_euclidean_distance}")

In [None]:
# Mean embedding of the Wikipedia database, compared with our query bias vector.
wikipedia_database_df = pd.read_parquet(
    "http://storage.googleapis.com/arize-assets/phoenix/datasets/unstructured/llm/llama-index/database.parquet"
)
wikipedia_database_bias_vector = wikipedia_database_df["text_vector"].mean()
wikipedia_database_cosine_similarity = compute_cosine_similarity(
    wikipedia_database_bias_vector, query_bias_vector
)
print(f"Cosine similarity: {wikipedia_database_cosine_similarity}")
wikipedia_database_euclidean_distance = compute_euclidean_distance(
    wikipedia_database_bias_vector, query_bias_vector
)
print(f"Euclidean distance: {wikipedia_database_euclidean_distance}")

Compute Euclidean distance to retrieved context.

In [None]:
num_retrieved_documents = 2

# Build the hash -> embedding lookup once instead of filtering the whole
# database dataframe for every query row. `setdefault` keeps the first row for
# a given hash, matching the previous `.iloc[0]` selection.
database_embedding_by_hash = {}
for text_hash, database_embedding in zip(database_df["text_hash"], database_df["text_vector"]):
    database_embedding_by_hash.setdefault(text_hash, database_embedding)

for context_index in range(num_retrieved_documents):
    # One distance per query row, in row order; a context hash that is missing
    # from the database raises KeyError.
    query_df[f"euclidean_distance_{context_index}"] = [
        compute_euclidean_distance(query_embedding, database_embedding_by_hash[context_text_hash])
        for query_embedding, context_text_hash in zip(
            query_df["text_vector"], query_df[f"context_text_hash_{context_index}"]
        )
    ]
query_df.head()

In [None]:
# Show the ten rows with the largest context_similarity_0 values.
query_df.sort_values(by="context_similarity_0", ascending=False).head(10)

## Run Evaluations With OpenAI

In [None]:
# Prompt used to judge one (query, retrieved context) pair; the `{query}` and
# `{reference}` placeholders are filled in with `str.format`.
evaluation_prompt_template = """You will be given a query and a reference text. You must determine whether the reference text contains an answer to the input query. Your response must be binary (0 or 1) and should not contain any text or characters aside from 0 or 1. 0 means that the reference text does not contain an answer to the query. 1 means the reference text contains an answer to the query.

# Query: {query}

# Reference: {reference}

# Binary: """

In [None]:
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
def complete_batch_of_prompts(prompts: List[str], model_name: str) -> List[str]:
    """
    Completes a list of prompts using the OpenAI completion API and the
    specified model. As of June 2023, OpenAI supports a maximum of 20 prompts
    per completion request. This function is wrapped in a retry decorator in
    order to avoid rate-limiting. Retry settings were copied from
    https://github.com/openai/openai-cookbook/blob/main/examples/How_to_handle_rate_limits.ipynb.

    Args:
        prompts: The prompts to send in a single completion request.
        model_name: Name of the completion model to use.

    Returns:
        The completion text for each prompt, in the same order as `prompts`.
    """
    response = openai.Completion.create(
        model=model_name,
        prompt=prompts,
    )
    # Choices of a batched request are not guaranteed to come back in prompt
    # order, so match them to their prompts through the `index` field.
    choices_in_prompt_order = sorted(response["choices"], key=lambda choice: choice["index"])
    return [choice["text"] for choice in choices_in_prompt_order]


def complete_prompts(
    prompts: List[str],
    model_name: str,
    batch_size: int = 20,  # the max number of prompts per completion request as of June 2023
) -> List[str]:
    """
    Completes a list of prompts using the OpenAI completion API. The list may be
    of arbitrary length and will be batched using the batch_size parameter.

    Args:
        prompts: The prompts to complete.
        model_name: Name of the completion model to use.
        batch_size: Maximum number of prompts sent per request.

    Returns:
        One completion per prompt, in the order the batches were processed.

    Raises:
        ValueError: If batch_size is less than 1.
    """
    if batch_size < 1:
        # A negative step would make the batching range empty and silently
        # return no completions at all.
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
    completions = []
    # The context manager closes the progress bar even if a request fails.
    with tqdm(total=len(prompts)) as progress_bar:
        for start in range(0, len(prompts), batch_size):
            batch_of_prompts = prompts[start : start + batch_size]
            completions.extend(complete_batch_of_prompts(batch_of_prompts, model_name))
            progress_bar.update(len(batch_of_prompts))
    return completions


def process_completions(
    raw_completions: List[str], binary_to_string_map: Dict[int, str]
) -> List[str]:
    """
    Converts raw completion strings into labels. Each completion is stripped,
    parsed as an integer and looked up in binary_to_string_map (e.g.
    {0: "irrelevant", 1: "relevant"}). Completions that are not integers or
    whose value is not a key of the map become None.
    """

    def to_label(raw_completion: str):
        # None marks a completion that could not be parsed or mapped.
        try:
            return binary_to_string_map[int(raw_completion.strip())]
        except (ValueError, KeyError):
            return None

    return [to_label(raw_completion) for raw_completion in raw_completions]

In [None]:
model_name = "text-davinci-003"  # this is the most powerful model available for the completion API as of June 2023
# The loop variable is deliberately not named `index`: that name holds the
# LlamaIndex index loaded earlier in the notebook, and rebinding it here would
# break re-running the cell that builds the query engine.
for context_index in range(num_retrieved_documents):
    # One evaluation prompt per query row, pairing the query with this retrieved context.
    evaluation_prompts = query_df.apply(
        lambda row: evaluation_prompt_template.format(
            query=row["text"], reference=row[f"context_text_{context_index}"]
        ),
        axis=1,
    ).to_list()
    raw_completions = complete_prompts(evaluation_prompts, model_name)
    processed_completions = process_completions(raw_completions, {0: "irrelevant", 1: "relevant"})
    query_df[f"relevance_{context_index}"] = processed_completions
query_df

View a few evaluations.

In [None]:
# Display each query next to its two retrieved contexts, their relevance labels and the response.
query_df[["text", "context_text_0", "relevance_0", "context_text_1", "relevance_1", "response"]]

Compute precision@k for $k = 1, 2$.

In [None]:
# Running count, per query, of relevant documents among the top-k retrieved.
num_relevant_documents_array = np.zeros(len(query_df))
for retrieved_context_index in range(num_retrieved_documents):
    # k lives in its own variable so the loop does not rebind the notebook-wide
    # `num_retrieved_documents` setting.
    k = retrieved_context_index + 1
    num_relevant_documents_array += (
        query_df[f"relevance_{retrieved_context_index}"]
        .map(lambda x: int(x == "relevant"))
        .to_numpy()
    )
    # Assign the raw array (positional) instead of wrapping it in pd.Series: a
    # new Series gets a 0..n-1 index and is aligned against query_df's index on
    # assignment, producing NaN for any row label outside that range.
    query_df[f"precision@{k}"] = num_relevant_documents_array / k
query_df[["relevance_0", "relevance_1", "precision@1", "precision@2"]]

## Launch Phoenix

In [None]:
# Both schemas expose the bias-corrected vectors as the "text" embedding; the
# query schema additionally carries retrieval and evaluation columns as tags.
query_schema = px.Schema(
    embedding_feature_column_names={
        "text": px.EmbeddingColumnNames(
            raw_data_column_name="text",
            vector_column_name="bias_corrected_text_vector",
        )
    },
    tag_column_names=[
        "response",
        "context_doc_id_0",
        "context_text_0",
        "context_similarity_0",
        "context_doc_id_1",
        "context_text_1",
        "context_similarity_1",
        "euclidean_distance_0",
        "euclidean_distance_1",
        "relevance_0",
        "relevance_1",
        "precision@1",
        "precision@2",
    ],
)
database_schema = px.Schema(
    embedding_feature_column_names={
        "text": px.EmbeddingColumnNames(
            raw_data_column_name="text",
            vector_column_name="bias_corrected_text_vector",
        )
    }
)

In [None]:
# Pair each dataframe with its schema.
database_ds = px.Dataset(database_df, database_schema)
query_ds = px.Dataset(query_df, query_schema)

In [None]:
# Launch Phoenix with the query dataset first and the database dataset second.
px.launch_app(query_ds, database_ds)