feat!: Remove experimental module (#2945)
* Remove experimental module

* Update chunk eval script

* Remove evals references from notebooks

* Remove references to `experimental` from notebooks

* Delete experimental module tests

* Remove remaining references to experimental module

* Clean up notebooks
anticorrelator committed Apr 22, 2024
1 parent b8d9d44 commit 01758cf
Showing 50 changed files with 575 additions and 6,728 deletions.
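
At a glance, the breaking change is an import-path move: everything that previously lived under phoenix.experimental.evals is now imported from phoenix.evals, and the experimental extra disappears from the notebook install commands. A minimal before/after sketch of the migration, using only names that appear in the hunks below (OpenAIModel, llm_classify, RAG_RELEVANCY_PROMPT_TEMPLATE):

# Install: the [experimental] extra is gone
# pip install "arize-phoenix[llama-index]"

# Before this commit
from phoenix.experimental.evals.functions import llm_classify
from phoenix.experimental.evals.models.openai import OpenAIModel
from phoenix.experimental.evals.templates.default_templates import (
    RAG_RELEVANCY_PROMPT_TEMPLATE,
)

# After this commit
from phoenix.evals import OpenAIModel, llm_classify
from phoenix.evals.default_templates import RAG_RELEVANCY_PROMPT_TEMPLATE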
2 changes: 1 addition & 1 deletion examples/using_llamaindex_with_huggingface_models.ipynb
@@ -17,7 +17,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-    "!pip install \"arize-phoenix[experimental,llama-index]\" \"openai>=1\" gcsfs accelerate"
+    "!pip install \"arize-phoenix[llama-index]\" \"openai>=1\" gcsfs accelerate"
 ]
 },
 {
@@ -1,7 +1,6 @@
 import pandas as pd
-from phoenix.experimental.evals.functions import llm_classify
-from phoenix.experimental.evals.models.anthropic import AnthropicModel
-from phoenix.experimental.evals.templates.default_templates import (
+from phoenix.evals import AnthropicModel, llm_classify
+from phoenix.evals.default_templates import (
     RAG_RELEVANCY_PROMPT_TEMPLATE,
 )

@@ -1,7 +1,6 @@
 import pandas as pd
-from phoenix.experimental.evals.functions import llm_classify
-from phoenix.experimental.evals.models.openai import OpenAIModel
-from phoenix.experimental.evals.templates.default_templates import (
+from phoenix.evals import OpenAIModel, llm_classify
+from phoenix.evals.default_templates import (
     RAG_RELEVANCY_PROMPT_TEMPLATE,
 )

@@ -1,7 +1,6 @@
 import pandas as pd
-from phoenix.experimental.evals.functions import llm_classify
-from phoenix.experimental.evals.models.vertex import GeminiModel
-from phoenix.experimental.evals.templates.default_templates import (
+from phoenix.evals import GeminiModel, llm_classify
+from phoenix.evals.default_templates import (
     RAG_RELEVANCY_PROMPT_TEMPLATE,
 )

2 changes: 1 addition & 1 deletion scripts/data/build_llama_index_rag_data.py
@@ -16,7 +16,7 @@
 from llama_index.callbacks.open_inference_callback import as_dataframe
 from llama_index.embeddings.openai import OpenAIEmbedding
 from llama_index.llms import OpenAI
-from phoenix.experimental.evals.retrievals import (
+from phoenix.evals.retrievals import (
     classify_relevance,
     compute_precisions_at_k,
 )
25 changes: 13 additions & 12 deletions scripts/rag/llama_index_w_evals_and_qa.py
@@ -14,7 +14,7 @@
 import numpy as np
 import pandas as pd
 import phoenix as px
-import phoenix.experimental.evals.templates.default_templates as templates
+import phoenix.evals.default_templates as templates
 import requests
 import tiktoken
 from bs4 import BeautifulSoup
@@ -34,17 +34,15 @@
 from llama_index.legacy import (
     LLMPredictor,
 )
-from llama_index.legacy.postprocessor.cohere_rerank import CohereRerank
 from llama_index.legacy.readers.web import BeautifulSoupWebReader
 from llama_index.llms.openai import OpenAI
+from llama_index.postprocessor.cohere_rerank import CohereRerank
 from openinference.semconv.trace import DocumentAttributes, SpanAttributes
 from phoenix.evals import (
     OpenAIModel,
     llm_classify,
 )
 from phoenix.evals.models import BaseModel, set_verbosity
-
-# from phoenix.experimental.evals.templates import NOT_PARSABLE
 from plotresults import (
     plot_latency_graphs,
     plot_mean_average_precision_graphs,
@@ -332,6 +330,7 @@ def run_experiments(
 logger.info(f"K : {k}")

 time_start = time.time()
+# return engine, query
 response = engine.query(query)
 time_end = time.time()
 response_latency = time_end - time_start
@@ -414,7 +413,9 @@ def df_evals(
 lambda chunks: concatenate_and_truncate_chunks(chunks=chunks, model=model, token_buffer=700)
 )

-df = df.rename(columns={"query": "question", "response": "sampled_answer"})
+df = df.rename(
+    columns={"query": "input", "response": "output", "retrieved_context_list": "reference"}
+)
 # Q&A Eval: Did the LLM get the answer right? Checking the LLM
 Q_and_A_classifications = llm_classify(
     dataframe=df,
@@ -426,22 +427,22 @@
 # Retreival Eval: Did I have the relevant data to even answer the question?
 # Checking retrieval system

-df = df.rename(columns={"question": "query", "retrieved_context_list": "reference"})
+df = df.rename(columns={"question": "input", "retrieved_context_list": "reference"})
 # query_column_name needs to also adjust the template to uncomment the
 # 2 fields in the function call below and delete the line above
 df[formatted_evals_column] = run_relevance_eval(
     dataframe=df,
     model=model,
     template=templates.RAG_RELEVANCY_PROMPT_TEMPLATE,
     rails=list(templates.RAG_RELEVANCY_PROMPT_RAILS_MAP.values()),
-    query_column_name="query",
-    # document_column_name="retrieved_context_list",
+    query_column_name="input",
+    document_column_name="reference",
 )

 # We want 0, 1 values for the metrics
-value_map = {"relevant": 1, "irrelevant": 0, "UNPARSABLE": 0}
+value_map = {"relevant": 1, "unrelated": 0, "UNPARSABLE": 0}
 df[formatted_evals_column] = df[formatted_evals_column].apply(
-    lambda values: [value_map.get(value) for value in values]
+    lambda values: [value_map.get(value, 0) for value in values]
 )
 return df

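Two behavioral details in the hunk above are worth noting: the rail label changes from "irrelevant" to "unrelated", presumably to match the values in templates.RAG_RELEVANCY_PROMPT_RAILS_MAP passed as rails, and value_map.get(value) becomes value_map.get(value, 0), so labels missing from the map, such as an unparsed eval output, now score 0 instead of producing None. A small illustration of the second point, with hypothetical label values:

value_map = {"relevant": 1, "unrelated": 0, "UNPARSABLE": 0}
labels = ["relevant", "unrelated", None]  # None stands in for an eval output that could not be parsed
old_scores = [value_map.get(v) for v in labels]     # [1, 0, None] -- None leaks into the metrics
new_scores = [value_map.get(v, 0) for v in labels]  # [1, 0, 0]    -- unknown labels default to 0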
@@ -617,10 +618,10 @@ def run_relevance_eval(
     model,
     template,
     rails,
-    system_instruction,
     query_column_name,
     document_column_name,
-    verbose,
+    verbose=False,
+    system_instruction=None,
 ):
     """
     Given a pandas dataframe containing queries and retrieved documents, classifies the relevance of
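The reorder in the last hunk follows from Python's rule that parameters with defaults must come after parameters without them: verbose and system_instruction move to the end and gain defaults, so call sites such as the updated df_evals above can omit them. A sketch of the resulting signature, reconstructed from the hunk and the call site (the leading dataframe parameter is inferred from the dataframe=df keyword used there):

def run_relevance_eval(
    dataframe,
    model,
    template,
    rails,
    query_column_name,
    document_column_name,
    verbose=False,
    system_instruction=None,
):
    ...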
