# Test LlamaIndex evaluation modules

In [1]:
import nest_asyncio
nest_asyncio.apply()

In [40]:
import asyncio

import pandas as pd
from llama_index import Response
from llama_index.embeddings import SimilarityMode
from llama_index.evaluation import (
    DatasetGenerator,
    EvaluationResult,
    RelevancyEvaluator,
    CorrectnessEvaluator,
    FaithfulnessEvaluator,
    BatchEvalRunner,
    SemanticSimilarityEvaluator,
    RetrieverEvaluator
)

from rag_chat.evaluation.dataset import generate_dataset
from rag_chat.query.query import load_query_engine, load_async_query_engine

In [55]:
query_engine = load_query_engine()

INFO:httpx:HTTP Request: GET http://localhost:6333/collections/vector_store "HTTP/1.1 200 OK"
HTTP Request: GET http://localhost:6333/collections/vector_store "HTTP/1.1 200 OK"


In [3]:
queries, answers = generate_dataset()

Reading data...
Build generator


  from .autonotebook import tqdm as notebook_tqdm
Parsing nodes: 100%|██████████| 3/3 [00:00<00:00, 1047.18it/s]


Build data...


100%|██████████| 3/3 [00:02<00:00,  1.03it/s]
100%|██████████| 2/2 [00:04<00:00,  2.27s/it]
100%|██████████| 2/2 [00:08<00:00,  4.05s/it]
100%|██████████| 2/2 [00:10<00:00,  5.47s/it]


In [6]:
response_vector = query_engine.query(queries[2])

In [61]:
response_vector

Response(response='The FUEL Pureformance Heavy Bag Stand contributes to improving boxing and MMA performance by providing a convenient and sturdy platform for hanging heavy bags. It holds the heavy bag in the optimum position for hand and foot work, allowing athletes to practice their punches, kicks, and other techniques effectively. The stand is made from heavy-duty steel tubing with a scratch-resistant powder-coated finish, ensuring durability and withstanding intense workouts. It also has weight pegs with foam stops for increased stability and two bottom tube hooks for optional bag stabilization. Overall, the FUEL Pureformance Heavy Bag Stand offers a reliable and accessible training tool for boxers and MMA fighters to enhance their skills and optimize their performance.', source_nodes=[NodeWithScore(node=TextNode(id_='15b2656d-74dd-43f5-9ee6-3a2ec8e158dd', embedding=None, metadata={'list_price': 129.99, 'category': 'Health > Sports Medicine & Injury Recovery Solution > Optimize Per

## RelevancyEvaluator

In [29]:
evaluator = RelevancyEvaluator()

In [37]:
eval_result = evaluator.evaluate_response(
    query=queries[2], response=response_vector
)

In [31]:
def display_eval_df(
    query: str, response: "Response", eval_result: "EvaluationResult"
) -> None:
    """Render one query / response / evaluation triple as a styled one-row table.

    Args:
        query: The question that was sent to the query engine.
        response: The engine's response. ``str(response)`` fills the
            "Response" column and the content of its first source node,
            truncated to 800 characters, fills the "Source" column.
        eval_result: Evaluator output; its ``passing`` and ``score``
            attributes fill the two evaluation columns.

    Returns:
        None. The table is shown through the notebook's ``display``; when the
        response carries no source nodes a short notice is printed instead.
    """
    # Guard: indexing source_nodes[0] on an empty retrieval would raise IndexError.
    if not response.source_nodes:
        print("no response!")
        return

    source_excerpt = response.source_nodes[0].node.get_content()[:800] + "..."
    eval_df = pd.DataFrame(
        {
            "Query": query,
            "Response": str(response),
            "Source": source_excerpt,
            "Evaluation Passing": eval_result.passing,
            "Evaluation Score": eval_result.score,
        },
        index=[0],
    )
    # Constrain the two long text columns so they wrap instead of stretching the table.
    styled = eval_df.style.set_properties(
        **{
            "inline-size": "600px",
            "overflow-wrap": "break-word",
        },
        subset=["Response", "Source"],
    )
    display(styled)


In [38]:
display_eval_df(queries[2], response_vector, eval_result)

Unnamed: 0,Query,Response,Source,Evaluation Passing,Evaluation Score
0,How does the FUEL Pureformance Heavy Bag Stand contribute to improving boxing and MMA performance?,"The FUEL Pureformance Heavy Bag Stand contributes to improving boxing and MMA performance by providing a convenient and sturdy stand for hanging a heavy bag. This allows individuals to practice hand and foot work, which is essential for both boxing and MMA. The stand is made from heavy-duty steel tubing with a scratch-resistant powder-coated finish, making it durable and able to withstand intense workouts. It also has weight pegs with foam stops for increased stability and two bottom tube hooks for optional bag stabilization. Overall, the FUEL Pureformance Heavy Bag Stand provides a reliable and optimal position for training, helping individuals improve their cardio, strength, and overall performance in boxing and MMA.","product_url: https://www.walmart.com/ip/FUEL-Pureformance-Heavy-Bag-Stand-Black/55505439 product_name: FUEL Pureformance Heavy Bag Stand, Black brand: Fuel Pureformance description: The FUEL Pureformance Heavy Bag Stand is the perfect stand for hanging your punching bag and upping your boxing and MMA game. Standing just over seven feet tall, the FUEL Pureformance stand holds your heavy bag in the optimum position for both hand and foot work. With the convenience of hanging a heavy bag at home or the office, you can get a quick cardio and strength workout, relieve some stress or just get some blood pumping easily without having to go anywhere. A good boxing workout uses most of your muscle groups from your legs to your arms, strengthening your core and improving your balance. So, whet...",True,1.0


## FaithfulnessEvaluator

Evaluates whether a response is faithful to the retrieved contexts,
i.e. whether the response is supported by those contexts or hallucinated.

This evaluator only considers the response string and the list of context strings.

https://github.com/run-llama/llama_index/blob/60b75cb014bccf60a153d5dc5295a91c7cdcd9f6/llama_index/evaluation/faithfulness.py

In [91]:
evaluator = FaithfulnessEvaluator()

In [92]:
eval_result = evaluator.evaluate_response(response=response_vector)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


In [94]:
response_vector

Response(response='The FUEL Pureformance Heavy Bag Stand contributes to improving boxing and MMA performance by providing a convenient and sturdy platform for hanging heavy bags. It holds the heavy bag in the optimum position for hand and foot work, allowing athletes to practice their punches, kicks, and other techniques effectively. The stand is made from heavy-duty steel tubing with a scratch-resistant powder-coated finish, ensuring durability and withstanding intense workouts. It also has weight pegs with foam stops for increased stability and two bottom tube hooks for optional bag stabilization. Overall, the FUEL Pureformance Heavy Bag Stand offers a reliable and accessible training tool for boxers and MMA fighters to enhance their skills and optimize their performance.', source_nodes=[NodeWithScore(node=TextNode(id_='15b2656d-74dd-43f5-9ee6-3a2ec8e158dd', embedding=None, metadata={'list_price': 129.99, 'category': 'Health > Sports Medicine & Injury Recovery Solution > Optimize Per

In [93]:
eval_result.feedback

'NO'

In [9]:
def display_eval_df(response: "Response", eval_result: "EvaluationResult") -> None:
    """Render a response and its pass/fail verdict as a styled one-row table.

    Args:
        response: The engine's response. ``str(response)`` fills the
            "Response" column and the text of its first source node,
            truncated to 1000 characters, fills the "Source" column.
        eval_result: Evaluator output; its ``passing`` attribute is shown as
            "Pass" or "Fail".

    Returns:
        None. Prints "no response!" and returns early when the response has
        no source nodes.
    """
    # NOTE(review): this redefines the three-argument display_eval_df from the
    # RelevancyEvaluator section; after this cell runs, the earlier call form
    # (query, response, eval_result) no longer works.

    # Truthiness check also covers None and empty tuples, not just `[]`.
    if not response.source_nodes:
        print("no response!")
        return

    eval_df = pd.DataFrame(
        {
            "Response": str(response),
            "Source": response.source_nodes[0].node.text[:1000] + "...",
            "Evaluation Result": "Pass" if eval_result.passing else "Fail",
        },
        index=[0],
    )
    # Constrain the two long text columns so they wrap instead of stretching the table.
    styled = eval_df.style.set_properties(
        **{
            "inline-size": "600px",
            "overflow-wrap": "break-word",
        },
        subset=["Response", "Source"],
    )
    display(styled)

In [13]:
queries[1]

"What are the features of the Toytainer Shoe Box Play-N-Store that make it suitable for kids' storage?"

In [10]:
display_eval_df(response_vector, eval_result)

Unnamed: 0,Response,Source,Evaluation Result
0,"The Toytainer Shoe Box Play-N-Store is designed to teach children to put away their toys while having fun playing. It features a fold-out play mat, a top carry handle, and a fun city-theme. Additionally, it is collapsible or can be stacked for modularity, making it suitable for kids' storage.","product_url: https://www.walmart.com/ip/Toytainer-Shoe-Box-Play-N-Store-Boy/40718635 product_name: Toytainer Shoe Box Play-N-Store, Boy brand: Toytainer description: Toytainer Shoe Box Play-N-Store, Boy: Teach your children to put away their toys while having fun playing Shoe box features fold-out play mat Top carry handle Fun city-theme Collapsible or stacks for modularity Play and put away! Teach your children to put away their toys while having fun playing! Shoebox features fold-out play mat, top-carry handle and a fun city-theme. It is collapsible or stacks for modularity.|Teach your children to put away their toys while having fun playing|Shoe box features fold-out play mat|Top carry handle|Fun city-theme|Collapsible or stacks for modularity available: True sale_price: 11.52 discount: False...",Fail


In [16]:
import logging
import sys

# basicConfig(stream=...) already installs a stdout handler on the root logger.
# Adding a second StreamHandler on top of it made every record print twice
# (once as "INFO:httpx:..." and once as the bare message), so only
# basicConfig is kept.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

In [17]:
def evaluate_query_engine(query_engine, queries):
    """Run every query through the engine and count the passing responses.

    All queries are answered first; only afterwards is each response handed
    to the module-level ``evaluator`` (whichever evaluator cell was executed
    last; the LLM behind it depends on how that evaluator was constructed).

    Args:
        query_engine: Object exposing ``query(question)``.
        queries: Iterable of questions to ask.

    Returns:
        A ``(passed, total)`` tuple: number of responses whose evaluation
        result is passing, and number of responses evaluated.
    """
    responses = []
    for question in queries:
        responses.append(query_engine.query(question))
    print("finished queries")

    passed = sum(
        1
        for answer in responses
        if evaluator.evaluate_response(response=answer).passing
    )
    return passed, len(responses)

In [18]:
correct, total = evaluate_query_engine(query_engine, queries)

print(f"score: {correct}/{total}")

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:6333/collections/vector_store/points/search "HTTP/1.1 200 OK"
HTTP Request: POST http://localhost:6333/collections/vector_store/points/search "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:6333/collections/vector_store/points/search "HTTP/1.1 200 OK"
HTTP Request: POST http://localhost:6333/collections/vector_store/points/search "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP R

## CorrectnessEvaluator

In [62]:
evaluator = CorrectnessEvaluator()

In [63]:
response_vector = query_engine.query(queries[2])

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:6333/collections/vector_store/points/search "HTTP/1.1 200 OK"
HTTP Request: POST http://localhost:6333/collections/vector_store/points/search "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


In [89]:
result = evaluator.evaluate_response(
    query=queries[2],
    response=response_vector,
    reference=answers[2],
)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


In [86]:
result.passing

True

In [87]:
result.score

4.5

In [90]:
result.feedback

'The generated answer is highly relevant to the user query and contains accurate information about the features and benefits of the FUEL Pureformance Heavy Bag Stand. The answer accurately describes how the stand contributes to improving boxing and MMA performance by providing a convenient and sturdy platform for hanging heavy bags and allowing athletes to practice their techniques effectively. The answer also mentions the stand\'s durable construction and additional features for stability. The only minor difference from the reference answer is the use of the word "optimize" instead of "enhance" in describing the stand\'s impact on performance. Overall, the generated answer is relevant and correct, with only a slight difference in wording.'

## SemanticSimilarityEvaluator

In [95]:
evaluator = SemanticSimilarityEvaluator()

Available `SimilarityMode` values:

- DEFAULT = "cosine"
- DOT_PRODUCT = "dot_product"
- EUCLIDEAN = "euclidean"

In [98]:
result = evaluator.evaluate_response(
    response=response_vector,
    reference=answers[2],
    similarity_mode=SimilarityMode.DEFAULT,
    similarity_threshold=0.8, # Default 0.8
)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


In [99]:
result.score

0.986772607409376

In [100]:
result.passing

True

# Inspect Dataset Generator module

In [44]:
from llama_index.llama_dataset.generator import RagDatasetGenerator
from llama_index.prompts.base import PromptTemplate
from llama_index.prompts.prompt_type import PromptType

from rag_chat.storage.mongo.reader import CustomMongoReader
from rag_chat.storage.mongo import mongodb_uri
from rag_chat.storage.config import mongo_reader_config

reader = CustomMongoReader(uri=mongodb_uri)
documents = reader.load_data(**mongo_reader_config) # TODO: mongo_reader_eval_config


In [45]:
dataset_generator = RagDatasetGenerator.from_documents(
    documents[:3], # TODO: remove number 3
    num_questions_per_chunk=2,  # set the number of questions per nodes
    show_progress=True,
    # text_question_template=PromptTemplate(QUESTION_GENERATION_PROMPT),
    # text_qa_template=PromptTemplate(
    #     TEXT_QA_PROMPT, prompt_type=PromptType.QUESTION_ANSWER
    # )
)

chunk_size_limit is deprecated, please specify chunk_size instead


Parsing nodes: 100%|██████████| 3/3 [00:00<00:00, 580.50it/s]


In [49]:
rag_dataset = dataset_generator.generate_dataset_from_nodes()

  0%|          | 0/3 [00:00<?, ?it/s]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


 33%|███▎      | 1/3 [00:02<00:04,  2.24s/it]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


 67%|██████▋   | 2/3 [00:02<00:01,  1.06s/it]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


100%|██████████| 3/3 [00:02<00:00,  1.18it/s]
  0%|          | 0/2 [00:00<?, ?it/s]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


 50%|█████     | 1/2 [00:02<00:02,  2.43s/it]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


100%|██████████| 2/2 [00:03<00:00,  1.59s/it]
  0%|          | 0/2 [00:00<?, ?it/s]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


 50%|█████     | 1/2 [00:07<00:07,  7.50s/it]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


100%|██████████| 2/2 [00:07<00:00,  3.91s/it]
  0%|          | 0/2 [00:00<?, ?it/s]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


 50%|█████     | 1/2 [00:08<00:08,  8.44s/it]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


100%|██████████| 2/2 [00:10<00:00,  5.46s/it]


In [50]:
df = rag_dataset.to_pandas()

In [51]:
df.head()

Unnamed: 0,query,reference_contexts,reference_answer,reference_answer_by,query_by
0,How does the Toytainer Shoe Box Play-N-Store h...,[product_url: https://www.walmart.com/ip/Toyta...,The Toytainer Shoe Box Play-N-Store helps teac...,ai (gpt-3.5-turbo),ai (gpt-3.5-turbo)
1,What features does the Toytainer Shoe Box Play...,[product_url: https://www.walmart.com/ip/Toyta...,The Toytainer Shoe Box Play-N-Store has severa...,ai (gpt-3.5-turbo),ai (gpt-3.5-turbo)
2,How does the FUEL Pureformance Heavy Bag Stand...,[product_url: https://www.walmart.com/ip/FUEL-...,The FUEL Pureformance Heavy Bag Stand contribu...,ai (gpt-3.5-turbo),ai (gpt-3.5-turbo)
3,What are the key features of the FUEL Pureform...,[product_url: https://www.walmart.com/ip/FUEL-...,The key features of the FUEL Pureformance Heav...,ai (gpt-3.5-turbo),ai (gpt-3.5-turbo)
4,How does the composition of the arm sleeves co...,[product_url: https://www.walmart.com/ip/Compr...,"The composition of the arm sleeves, which is 8...",ai (gpt-3.5-turbo),ai (gpt-3.5-turbo)


## Retrieval Evaluation

In [28]:
from llama_index.evaluation import RetrieverEvaluator
from rag_chat.query.query import load_retriever, load_query_engine
from rag_chat.storage.load import load_storage
from rag_chat.storage.qdrant.vector_store import load_vector_store
from rag_chat.agent.chat import load_chat_engine

retriever = load_retriever()
metrics = ["mrr", "hit_rate"]

retriever_evaluator = RetrieverEvaluator.from_metric_names(
    metrics, retriever=retriever
)

In [None]:
load_chat_engine()

In [30]:
query_engine = load_query_engine()

In [32]:
response = query_engine.query("Hi, do you have popcorn?")

In [34]:
response.source_nodes # List of nodes

[NodeWithScore(node=TextNode(id_='a4e392f4-637e-427a-93dc-98e3bb35f867', embedding=None, metadata={'section_summary': 'The Weight Watchers Popcorn (White Cheddar) is non-GMO and gluten-free with no artificial ingredients. Each bag is only 2 SmartPoints value. The product comes in a box with 6 bags.', 'excerpt_keywords': 'Weight Watchers, Popcorn, White Cheddar, Non-GMO, Gluten-free, SmartPoints, Snacks, Box, Artificial Ingredients, Walmart'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='7ff5f1285c377369e8cc5ea71459f676', node_type=<ObjectType.DOCUMENT: '4'>, metadata={}, hash='508a14d28f91c837c970e11cc9302a5443c9265940cf2440371f2325b5e8db22'), <NodeRelationship.PREVIOUS: '2'>: RelatedNodeInfo(node_id='72bc3767-e088-481a-9873-c1140f4883cf', node_type=<ObjectType.TEXT: '1'>, metadata={}, hash='a42b2b908cac1b8af76cdd536d29476597faa1aa6d57409a744f878964315a9a'), <NodeRelationship.NEXT: '3'>: RelatedN

In [35]:
response.source_nodes[0].id_

'a4e392f4-637e-427a-93dc-98e3bb35f867'

In [39]:
retrieved_ids = [node.id_ for node in response.source_nodes]

In [40]:
retrieved_ids

['a4e392f4-637e-427a-93dc-98e3bb35f867',
 '545ec0f8-d6f5-4349-b2f3-3dcce33978ce',
 '68fcac0b-74f1-4a91-8ebe-c85892fd3f25',
 '423b60e7-ff1c-4db0-a8dc-1c7134fac9c3',
 '59fb3dd0-88b3-4a78-ac1e-249beda1b6a4']

In [43]:
from llama_index.evaluation.retrieval.metrics import HitRate, MRR

In [41]:
# HitRate checks whether any expected id is among the retrieved ids (1 or 0).
hit_rate = HitRate()
# Take the expected id from the retrieval result itself rather than pasting a
# node UUID, which is presumably tied to one particular build of the index.
hit_rate.compute(
    expected_ids=[retrieved_ids[0]],
    retrieved_ids=retrieved_ids,
)

RetrievalMetricResult(score=1.0, metadata={})

In [61]:
# Rotate the ranking left by one: the former top hit becomes the last entry.
first = retrieved_ids[0]
custom = [*retrieved_ids[1:], first]

In [63]:
# MRR scores by the position of the expected id within the retrieved ids.
mrr = MRR()
# `first` is the original top hit, which `custom` moved to the last position;
# reuse it instead of repeating the node UUID as a literal.
mrr.compute(
    expected_ids=[first],
    retrieved_ids=custom,
)

RetrievalMetricResult(score=0.2, metadata={})