In [1]:
import json
import pandas as pd
from llama_index import Document
from llama_index import VectorStoreIndex
from llama_index import ServiceContext
from llama_index.llms import OpenAI
from trulens_eval import Tru
from utils import get_prebuilt_trulens_recorder

✅ In Answer Relevance, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In Answer Relevance, input response will be set to __record__.main_output or `Select.RecordOutput` .
✅ In Context Relevance, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In Context Relevance, input response will be set to __record__.app.query.rets.source_nodes[:].node.text .
✅ In Groundedness, input source will be set to __record__.app.query.rets.source_nodes[:].node.text .
✅ In Groundedness, input statement will be set to __record__.main_output or `Select.RecordOutput` .


In [2]:
def get_results(answer_relevance, context_relevance, groundedness):
    """Return the mean of each of the three feedback score columns.

    Each argument only has to expose a ``.mean()`` method; the notebook
    passes columns of the TruLens records table. Nothing is printed here.

    Returns:
        A 3-tuple ``(answer relevance mean, context relevance mean,
        groundedness mean)``, in the same order as the parameters.
    """
    score_columns = (answer_relevance, context_relevance, groundedness)
    return tuple(column.mean() for column in score_columns)

In [3]:
file_path = "echr_train.json"

# Read the JSON file. Decode it as UTF-8 explicitly instead of relying on the
# platform's default locale encoding, which differs between machines.
with open(file_path, "r", encoding="utf-8") as file:
    data = json.load(file)

# Keep the 'text' value of every item that has one
text_values = [item["text"] for item in data if "text" in item]

# Build a one-column DataFrame and keep only the first 20 texts
tab_df = pd.DataFrame(text_values, columns=["sentence"]).head(20)

In [3]:
# Single document: every sentence of tab_df, separated by a blank line.
document = Document(text="\n\n".join(tab_df["sentence"]))

# LLM and embedding model are both selected by the string identifiers below.
llm = OpenAI(model="gpt-4-1106-preview", temperature=0.1)
service_context = ServiceContext.from_defaults(
    embed_model="local:BAAI/bge-small-en-v1.5",
    llm=llm,
)
index = VectorStoreIndex.from_documents(
    [document],
    service_context=service_context,
)
query_engine = index.as_query_engine()

# Reset the TruLens database before wrapping the query engine in a recorder.
tru = Tru()
tru.reset_database()
tru_recorder = get_prebuilt_trulens_recorder(query_engine, app_id="Unsanitized")

# One evaluation question per line, stripped of surrounding whitespace
# (the questions stay strings).
with open("tab_eval_questions.txt", "r") as file:
    eval_questions = [line.strip() for line in file]

🦑 Tru initialized with db url sqlite:///default.sqlite .
🛑 Secret keys may be written to the database. See the `database_redact_keys` option of `Tru` to prevent this.


In [4]:
# Send every evaluation question through the query engine while the TruLens
# recorder context is active.
with tru_recorder as recording:
    for question in eval_questions:
        response = query_engine.query(question)

# Fetch the records table and average the three feedback columns.
records, feedback = tru.get_records_and_feedback(app_ids=[])
(
    average_answer_relevance,
    average_context_relevance,
    average_groundedness,
) = get_results(
    *(
        records[column]
        for column in ("Answer Relevance", "Context Relevance", "Groundedness")
    )
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [5]:
# Print the averages
print("Average Answer Relevance:", average_answer_relevance)
print("Average Context Relevance:", average_context_relevance)
print("Average Groundedness:", average_groundedness)

Average Answer Relevance: 0.9645569620253165
Average Context Relevance: 0.12784810126582277
Average Groundedness: 0.8549484642341786


## SanText

In [15]:
# Load only the sanitized text column and rename it to "sentence".
santext_sanitized = pd.read_csv(
    "SanText.csv", usecols=["sanitized sentence"]
).rename(columns={"sanitized sentence": "sentence"})

In [17]:
# Single document: every SanText-sanitized sentence, separated by a blank line.
document = Document(text="\n\n".join(santext_sanitized["sentence"]))

llm = OpenAI(model="gpt-4-1106-preview", temperature=0.1)
service_context = ServiceContext.from_defaults(
    llm=llm, embed_model="local:BAAI/bge-small-en-v1.5"
)
index = VectorStoreIndex.from_documents([document], service_context=service_context)
query_engine = index.as_query_engine()
tru = Tru()

# Reset the TruLens database before recording (presumably so records from an
# earlier experiment do not end up in the table fetched later).
tru.reset_database()
# The app_id labels this experiment's records, so it must name the data that
# is indexed here (SanText output), not the unsanitized baseline.
tru_recorder = get_prebuilt_trulens_recorder(query_engine, app_id="SanText")

# One evaluation question per line, stripped of surrounding whitespace
# (the questions stay strings).
with open("tab_eval_questions.txt", "r") as file:
    eval_questions = [line.strip() for line in file]

🦑 Tru initialized with db url sqlite:///default.sqlite .
🛑 Secret keys may be written to the database. See the `database_redact_keys` option of `Tru` to prevent this.


In [18]:
# Send every evaluation question through the query engine while the TruLens
# recorder context is active.
with tru_recorder as recording:
    for question in eval_questions:
        response = query_engine.query(question)

# Fetch the records table and average the three feedback columns.
records, feedback = tru.get_records_and_feedback(app_ids=[])
(
    average_answer_relevance,
    average_context_relevance,
    average_groundedness,
) = get_results(
    *(
        records[column]
        for column in ("Answer Relevance", "Context Relevance", "Groundedness")
    )
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [19]:
# Print the averages
print("Average Answer Relevance:", average_answer_relevance)
print("Average Context Relevance:", average_context_relevance)
print("Average Groundedness:", average_groundedness)

Average Answer Relevance: 0.9227848101265823
Average Context Relevance: 0.0
Average Groundedness: 0.25665445665445663


## CusText

In [20]:
# Load only the sanitized text column and rename it to "sentence".
custext_sanitized = pd.read_csv("CusText.csv", usecols=["sanitized sentence"])
custext_sanitized = custext_sanitized.rename(columns={"sanitized sentence": "sentence"})
# Single document: every CusText-sanitized sentence, separated by a blank line.
document = Document(text="\n\n".join(custext_sanitized["sentence"]))

llm = OpenAI(model="gpt-4-1106-preview", temperature=0.1)
service_context = ServiceContext.from_defaults(
    llm=llm, embed_model="local:BAAI/bge-small-en-v1.5"
)
index = VectorStoreIndex.from_documents([document], service_context=service_context)
query_engine = index.as_query_engine()
tru = Tru()

# Reset the TruLens database before recording (presumably so records from an
# earlier experiment do not end up in the table fetched later).
tru.reset_database()
# The app_id labels this experiment's records, so it must name the data that
# is indexed here (CusText output), not the unsanitized baseline.
tru_recorder = get_prebuilt_trulens_recorder(query_engine, app_id="CusText")

# One evaluation question per line, stripped of surrounding whitespace
# (the questions stay strings).
with open("tab_eval_questions.txt", "r") as file:
    eval_questions = [line.strip() for line in file]

In [21]:
# Send every evaluation question through the query engine while the TruLens
# recorder context is active.
with tru_recorder as recording:
    for question in eval_questions:
        response = query_engine.query(question)

# Fetch the records table and average the three feedback columns.
records, feedback = tru.get_records_and_feedback(app_ids=[])
(
    average_answer_relevance,
    average_context_relevance,
    average_groundedness,
) = get_results(
    *(
        records[column]
        for column in ("Answer Relevance", "Context Relevance", "Groundedness")
    )
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [22]:
# Print the averages
print("Average Answer Relevance:", average_answer_relevance)
print("Average Context Relevance:", average_context_relevance)
print("Average Groundedness:", average_groundedness)

Average Answer Relevance: 0.9253164556962025
Average Context Relevance: 0.0
Average Groundedness: 0.3099780701754386


## SanText + Presidio

In [4]:
# Load only the sanitized text column and rename it to "sentence".
santext_presidio_sanitized = pd.read_csv("SanText_Presidio.csv", usecols=["sanitized sentence"])
santext_presidio_sanitized = santext_presidio_sanitized.rename(columns={"sanitized sentence": "sentence"})
# Single document: every sanitized sentence, separated by a blank line.
document = Document(text="\n\n".join(santext_presidio_sanitized["sentence"]))

llm = OpenAI(model="gpt-4-1106-preview", temperature=0.1)
service_context = ServiceContext.from_defaults(
    llm=llm, embed_model="local:BAAI/bge-small-en-v1.5"
)
index = VectorStoreIndex.from_documents([document], service_context=service_context)
query_engine = index.as_query_engine()
tru = Tru()

# Reset the TruLens database before recording (presumably so records from an
# earlier experiment do not end up in the table fetched later).
tru.reset_database()
# The app_id labels this experiment's records, so it must name the data that
# is indexed here (SanText + Presidio output), not the unsanitized baseline.
tru_recorder = get_prebuilt_trulens_recorder(query_engine, app_id="SanText_Presidio")

# One evaluation question per line, stripped of surrounding whitespace
# (the questions stay strings).
with open("tab_eval_questions.txt", "r") as file:
    eval_questions = [line.strip() for line in file]

🦑 Tru initialized with db url sqlite:///default.sqlite .
🛑 Secret keys may be written to the database. See the `database_redact_keys` option of `Tru` to prevent this.


In [5]:
# Send every evaluation question through the query engine while the TruLens
# recorder context is active.
with tru_recorder as recording:
    for question in eval_questions:
        response = query_engine.query(question)

# Fetch the records table and average the three feedback columns.
records, feedback = tru.get_records_and_feedback(app_ids=[])
(
    average_answer_relevance,
    average_context_relevance,
    average_groundedness,
) = get_results(
    *(
        records[column]
        for column in ("Answer Relevance", "Context Relevance", "Groundedness")
    )
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [6]:
# Print the averages
print("Average Answer Relevance:", average_answer_relevance)
print("Average Context Relevance:", average_context_relevance)
print("Average Groundedness:", average_groundedness)

Average Answer Relevance: 0.929113924050633
Average Context Relevance: 0.00949367088607595
Average Groundedness: 0.32742616033755273


In [8]:
# Save the records table of this experiment as a CSV file.
records.to_csv(path_or_buf="SanText_Presidio_Eval.csv")

## CusText + Presidio

In [9]:
# Load only the sanitized text column and rename it to "sentence".
custext_presidio_sanitized = pd.read_csv("CusText_Presidio.csv", usecols=["sanitized sentence"])
custext_presidio_sanitized = custext_presidio_sanitized.rename(columns={"sanitized sentence": "sentence"})
# Single document: every sanitized sentence, separated by a blank line.
document = Document(text="\n\n".join(custext_presidio_sanitized["sentence"]))

llm = OpenAI(model="gpt-4-1106-preview", temperature=0.1)
service_context = ServiceContext.from_defaults(
    llm=llm, embed_model="local:BAAI/bge-small-en-v1.5"
)
index = VectorStoreIndex.from_documents([document], service_context=service_context)
query_engine = index.as_query_engine()
tru = Tru()

# Reset the TruLens database before recording (presumably so records from an
# earlier experiment do not end up in the table fetched later).
tru.reset_database()
# The app_id labels this experiment's records, so it must name the data that
# is indexed here (CusText + Presidio output), not the unsanitized baseline.
tru_recorder = get_prebuilt_trulens_recorder(query_engine, app_id="CusText_Presidio")

# One evaluation question per line, stripped of surrounding whitespace
# (the questions stay strings).
with open("tab_eval_questions.txt", "r") as file:
    eval_questions = [line.strip() for line in file]

In [10]:
# Send every evaluation question through the query engine while the TruLens
# recorder context is active.
with tru_recorder as recording:
    for question in eval_questions:
        response = query_engine.query(question)

# Fetch the records table and average the three feedback columns.
records, feedback = tru.get_records_and_feedback(app_ids=[])
(
    average_answer_relevance,
    average_context_relevance,
    average_groundedness,
) = get_results(
    *(
        records[column]
        for column in ("Answer Relevance", "Context Relevance", "Groundedness")
    )
)

A new object of type <class 'llama_index.query_engine.retriever_query_engine.RetrieverQueryEngine'> at 0x15c1d8990 is calling an instrumented method <function BaseQueryEngine.query at 0x13c30cb80>. The path of this call may be incorrect.
Guessing path of new object is app based on other object (0x15c0ce050) using this function.
A new object of type <class 'llama_index.query_engine.retriever_query_engine.RetrieverQueryEngine'> at 0x15c1d8990 is calling an instrumented method <function RetrieverQueryEngine.retrieve at 0x169b585e0>. The path of this call may be incorrect.
Guessing path of new object is app based on other object (0x15c0ce050) using this function.
A new object of type <class 'llama_index.indices.vector_store.retrievers.retriever.VectorIndexRetriever'> at 0x15c169d50 is calling an instrumented method <function BaseRetriever.retrieve at 0x13c0f77e0>. The path of this call may be incorrect.
Guessing path of new object is app.retriever based on other object (0x15c167610) using 

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


A new object of type <class 'llama_index.query_engine.retriever_query_engine.RetrieverQueryEngine'> at 0x15c1d8990 is calling an instrumented method <function RetrieverQueryEngine.retrieve at 0x169b585e0>. The path of this call may be incorrect.
Guessing path of new object is app based on other object (0x15c0ce050) using this function.
A new object of type <class 'llama_index.response_synthesizers.compact_and_refine.CompactAndRefine'> at 0x15834d050 is calling an instrumented method <function Refine.get_response at 0x13eae8a40>. The path of this call may be incorrect.
Guessing path of new object is app._response_synthesizer based on other object (0x15c3e9710) using this function.
A new object of type <class 'llama_index.query_engine.retriever_query_engine.RetrieverQueryEngine'> at 0x15c1d8990 is calling an instrumented method <function RetrieverQueryEngine.retrieve at 0x169b585e0>. The path of this call may be incorrect.
Guessing path of new object is app based on other object (0x15c0c

A new object of type <class 'llama_index.query_engine.retriever_query_engine.RetrieverQueryEngine'> at 0x15c1d8990 is calling an instrumented method <function RetrieverQueryEngine.retrieve at 0x169b585e0>. The path of this call may be incorrect.
Guessing path of new object is app based on other object (0x15c0ce050) using this function.
A new object of type <class 'llama_index.response_synthesizers.compact_and_refine.CompactAndRefine'> at 0x15834d050 is calling an instrumented method <function Refine.get_response at 0x13eae8a40>. The path of this call may be incorrect.
Guessing path of new object is app._response_synthesizer based on other object (0x15c3e9710) using this function.
A new object of type <class 'llama_index.query_engine.retriever_query_engine.RetrieverQueryEngine'> at 0x15c1d8990 is calling an instrumented method <function RetrieverQueryEngine.retrieve at 0x169b585e0>. The path of this call may be incorrect.
Guessing path of new object is app based on other object (0x15c0c

A new object of type <class 'llama_index.query_engine.retriever_query_engine.RetrieverQueryEngine'> at 0x15c1d8990 is calling an instrumented method <function RetrieverQueryEngine.retrieve at 0x169b585e0>. The path of this call may be incorrect.
Guessing path of new object is app based on other object (0x15c0ce050) using this function.
A new object of type <class 'llama_index.response_synthesizers.compact_and_refine.CompactAndRefine'> at 0x15834d050 is calling an instrumented method <function Refine.get_response at 0x13eae8a40>. The path of this call may be incorrect.
Guessing path of new object is app._response_synthesizer based on other object (0x15c3e9710) using this function.
A new object of type <class 'llama_index.query_engine.retriever_query_engine.RetrieverQueryEngine'> at 0x15c1d8990 is calling an instrumented method <function RetrieverQueryEngine.retrieve at 0x169b585e0>. The path of this call may be incorrect.
Guessing path of new object is app based on other object (0x15c0c

A new object of type <class 'llama_index.query_engine.retriever_query_engine.RetrieverQueryEngine'> at 0x15c1d8990 is calling an instrumented method <function RetrieverQueryEngine.retrieve at 0x169b585e0>. The path of this call may be incorrect.
Guessing path of new object is app based on other object (0x15c0ce050) using this function.
A new object of type <class 'llama_index.response_synthesizers.compact_and_refine.CompactAndRefine'> at 0x15834d050 is calling an instrumented method <function Refine.get_response at 0x13eae8a40>. The path of this call may be incorrect.
Guessing path of new object is app._response_synthesizer based on other object (0x15c3e9710) using this function.
A new object of type <class 'llama_index.query_engine.retriever_query_engine.RetrieverQueryEngine'> at 0x15c1d8990 is calling an instrumented method <function RetrieverQueryEngine.retrieve at 0x169b585e0>. The path of this call may be incorrect.
Guessing path of new object is app based on other object (0x15c0c

A new object of type <class 'llama_index.query_engine.retriever_query_engine.RetrieverQueryEngine'> at 0x15c1d8990 is calling an instrumented method <function RetrieverQueryEngine.retrieve at 0x169b585e0>. The path of this call may be incorrect.
Guessing path of new object is app based on other object (0x15c0ce050) using this function.
A new object of type <class 'llama_index.response_synthesizers.compact_and_refine.CompactAndRefine'> at 0x15834d050 is calling an instrumented method <function Refine.get_response at 0x13eae8a40>. The path of this call may be incorrect.
Guessing path of new object is app._response_synthesizer based on other object (0x15c3e9710) using this function.
A new object of type <class 'llama_index.query_engine.retriever_query_engine.RetrieverQueryEngine'> at 0x15c1d8990 is calling an instrumented method <function RetrieverQueryEngine.retrieve at 0x169b585e0>. The path of this call may be incorrect.
Guessing path of new object is app based on other object (0x15c0c

A new object of type <class 'llama_index.query_engine.retriever_query_engine.RetrieverQueryEngine'> at 0x15c1d8990 is calling an instrumented method <function RetrieverQueryEngine.retrieve at 0x169b585e0>. The path of this call may be incorrect.
Guessing path of new object is app based on other object (0x15c0ce050) using this function.
A new object of type <class 'llama_index.response_synthesizers.compact_and_refine.CompactAndRefine'> at 0x15834d050 is calling an instrumented method <function Refine.get_response at 0x13eae8a40>. The path of this call may be incorrect.
Guessing path of new object is app._response_synthesizer based on other object (0x15c3e9710) using this function.
A new object of type <class 'llama_index.query_engine.retriever_query_engine.RetrieverQueryEngine'> at 0x15c1d8990 is calling an instrumented method <function RetrieverQueryEngine.retrieve at 0x169b585e0>. The path of this call may be incorrect.
Guessing path of new object is app based on other object (0x15c0c

A new object of type <class 'llama_index.query_engine.retriever_query_engine.RetrieverQueryEngine'> at 0x15c1d8990 is calling an instrumented method <function RetrieverQueryEngine.retrieve at 0x169b585e0>. The path of this call may be incorrect.
Guessing path of new object is app based on other object (0x15c0ce050) using this function.
A new object of type <class 'llama_index.response_synthesizers.compact_and_refine.CompactAndRefine'> at 0x15834d050 is calling an instrumented method <function Refine.get_response at 0x13eae8a40>. The path of this call may be incorrect.
Guessing path of new object is app._response_synthesizer based on other object (0x15c3e9710) using this function.
A new object of type <class 'llama_index.query_engine.retriever_query_engine.RetrieverQueryEngine'> at 0x15c1d8990 is calling an instrumented method <function RetrieverQueryEngine.retrieve at 0x169b585e0>. The path of this call may be incorrect.
Guessing path of new object is app based on other object (0x15c0c

In [11]:
# Save the records table of this experiment as a CSV file.
records.to_csv(path_or_buf="CusText_Presidio_Eval.csv")

In [12]:
# Print the averages
print("Average Answer Relevance:", average_answer_relevance)
print("Average Context Relevance:", average_context_relevance)
print("Average Groundedness:", average_groundedness)

Average Answer Relevance: 0.9569620253164556
Average Context Relevance: 0.2858974358974359
Average Groundedness: 0.621769568151147
