In [None]:
%%capture
!pip install qdrant-client llama-index==0.10.25 llama-index-embeddings-openai  llama-index-vector-stores-qdrant llama-index-llms-openai

In [None]:
import os
import sys
from getpass import getpass
import nest_asyncio

from IPython.display import Markdown, display

from dotenv import load_dotenv

# Patch asyncio so event loops can be nested; the notebook kernel already runs
# its own loop, which async library calls would otherwise collide with.
nest_asyncio.apply()

# Load variables from the .env file one directory up into the process environment.
# The cells below read OPENAI_API_KEY, QDRANT_URL and QDRANT_API_KEY from os.environ.
load_dotenv("../.env")

# Make ../helpers importable. This must run before the `utils` import below
# (presumably the local `utils` module lives in that directory — verify).
sys.path.append('../helpers')

from utils import setup_llm, setup_embed_model, setup_vector_store

In [None]:
# Take the key from the environment when available; prompt only if it is missing
# or empty. `os.environ.get` is required here: indexing with `os.environ[...]`
# raises KeyError for an unset variable, so the getpass fallback would never run.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY') or getpass("Enter your OpenAI API key: ")

In [None]:
# Take the Qdrant endpoint from the environment when available; prompt only if it
# is missing or empty. `os.environ.get` avoids the KeyError that `os.environ[...]`
# raises for an unset variable, which would make the prompt unreachable.
QDRANT_URL = os.environ.get('QDRANT_URL') or getpass("Enter your Qdrant URL:")

In [None]:
# Take the Qdrant API key from the environment when available; prompt only if it
# is missing or empty. `os.environ.get` avoids the KeyError that `os.environ[...]`
# raises for an unset variable, which would make the prompt unreachable.
QDRANT_API_KEY = os.environ.get('QDRANT_API_KEY') or getpass("Enter your Qdrant API Key:")

In [None]:
from llama_index.core.settings import Settings
from llama_index.llms.openai import OpenAI
from utils import setup_llm, setup_embed_model

# Configure the chat model and the embedding model through the project helpers,
# both authenticated with the OpenAI key collected above.
setup_llm(
    provider="openai",
    model="gpt-3.5-turbo",
    api_key=OPENAI_API_KEY,
)
setup_embed_model(
    provider="openai",
    api_key=OPENAI_API_KEY,
)

In [None]:
from datasets import load_dataset

def _has_top_groundedness(example):
    """Return True when the row has a groundedness score that is present and >= 5."""
    score = example['question_groundedness_score']
    return score is not None and score >= 5


# Load the train split of the evaluation set and keep only the rows that pass
# the groundedness threshold.
eval_dataset = load_dataset(
    "harpreetsahota/LI_Learning_RAG_Eval_Set",
    split='train',
).filter(_has_top_groundedness)

In [None]:
from llama_index.core.settings import Settings
from utils import setup_llm, setup_embed_model

# System prompt that tells the model to answer strictly from the retrieved context.
GROUNDED_SYSTEM_PROMPT = """Use ONLY the provided context and generate a complete, coherent answer to the user's query. 
    Your response must be grounded in the provided context and relevant to the essence of the user's query.
    """

# Re-configure the chat model, this time with a sampling temperature and the
# grounding system prompt, then configure the embedding model.
setup_llm(
    provider="openai",
    model="gpt-3.5-turbo",
    temperature=0.75,
    system_prompt=GROUNDED_SYSTEM_PROMPT,
    api_key=OPENAI_API_KEY,
)
setup_embed_model(
    provider="openai",
    api_key=OPENAI_API_KEY,
)

In [None]:
from llama_index.core import StorageContext
from llama_index.core.settings import Settings

from utils import create_index, create_query_engine, ingest, setup_vector_store

# Qdrant collection that backs the index used throughout this notebook.
COLLECTION_NAME = "words-of-the-senpai-semantic-nodes"

# Vector store handle for that collection, built by the project helper.
semantic_nodes_vector_store = setup_vector_store(
    QDRANT_URL,
    QDRANT_API_KEY,
    COLLECTION_NAME,
)

# Storage context wrapping the vector store.
semantic_nodes_storage_context = StorageContext.from_defaults(
    vector_store=semantic_nodes_vector_store,
)

# Index created from the vector store through the project helper.
semantic_nodes_index = create_index(
    from_where="vector_store",
    vector_store=semantic_nodes_vector_store,
    storage_context=semantic_nodes_storage_context,
)

# Plain query engine with default settings; the retry engines below wrap it.
base_query_engine = semantic_nodes_index.as_query_engine()

# Self-Correcting Query Engines

Self-correcting query engines in LlamaIndex evaluate their own output and then self-correct to provide better responses. They are designed to improve the quality of responses from a base query engine.

There are a few types of self-correcting query engines:

- **Retry Query Engine:** This engine uses an evaluator to improve the response from a base query engine. It first queries the base query engine, then uses the evaluator to decide if the response passes. If the response passes, it returns the response. Otherwise, it transforms the original query with the evaluation result into a new query and repeats the process up to a maximum number of retries.

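- **Retry Source Query Engine:** This engine retries with a narrower set of source nodes. When a response fails evaluation, it uses the LLM-based evaluator to check each source node individually, keeps only the nodes that pass, and re-runs the query over that filtered set.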

- **Retry Guideline Query Engine:** This engine uses guidelines to direct the evaluator's behavior. It can be customized with your own guidelines. The engine evaluates the response against the guidelines, and if the response doesn't meet the guidelines, it transforms the query and retries.


# 🔃 [RetryQueryEngine](https://github.com/run-llama/llama_index/blob/main/llama-index-core/llama_index/core/query_engine/retry_query_engine.py)

The `RetryQueryEngine` improves the quality of query responses by retrying the query if the initial response fails to meet certain evaluation criteria. It provides a way to automatically handle situations where the first attempt at answering a query may not produce a satisfactory result.

The intuition behind this class is that sometimes, the initial response generated by a query engine might not be accurate or comprehensive enough. By evaluating the response against predefined criteria and retrying the query with modifications if necessary, the `RetryQueryEngine` aims to iteratively improve the response until it meets the desired quality or until the maximum number of retries is reached.

#### **Arguments you need to know**

1. `query_engine`: The underlying query engine that will be used to execute the actual queries. You need to provide an initialized query engine object that is capable of handling the specific type of queries you want to perform.

2. `evaluator`: Responsible for evaluating the quality of the query responses. The evaluator should have a method called `evaluate_response` that takes the query string and the response as input and returns an evaluation result indicating whether the response meets the desired criteria.

3. `max_retries`: This is an integer value specifying the maximum number of retries allowed. It determines how many times the `RetryQueryEngine` will attempt to retry the query if the previous response fails evaluation. By default, it is set to 3.

4. `callback_manager`: Manages callbacks during the query execution process. If you have any specific callbacks you want to register or invoke during the query lifecycle, you can provide a callback manager object.


#### **Under the hood**

It iteratively queries the base query engine, evaluates the response, and if the response fails the evaluation, it transforms the query based on the feedback and retries the process until a satisfactory response is obtained or the maximum number of retries is reached.


1. The `RetryQueryEngine` first queries the base query engine (`query_engine`) with the original `QueryBundle` to obtain an initial response.

2. It then uses the `evaluator` to determine if the response passes the evaluation criteria. The `evaluate_response` method of the evaluator is called with the query string and the response.

3. If the response passes the evaluation (i.e., `eval.passing` is `True`), the `RetryQueryEngine` immediately returns the response without any further retries.

4. If the response fails the evaluation (i.e., `eval.passing` is `False`), the `RetryQueryEngine` transforms the original query using the `FeedbackQueryTransformation` class. The transformation takes into account the original query bundle, the response, and the evaluation feedback to create a new, modified query bundle.

5. The `RetryQueryEngine` then creates a new instance of itself with the same `query_engine`, `evaluator`, and decremented `max_retries`.

6. The new, modified query bundle is passed to the `query` method of the newly created `RetryQueryEngine` instance, and the process repeats from step 1.

7. The retries continue until either a satisfactory response is obtained (i.e., the evaluation passes) or the maximum number of retries (`max_retries`) is reached.

8. The final response, whether it passed evaluation or reached the maximum retries, is returned.


In [None]:
from llama_index.core.query_engine import RetryQueryEngine
from llama_index.core.evaluation import RelevancyEvaluator
from utils import create_query_engine

# Evaluator that decides whether a response passes; the retry engine uses its
# verdict to choose between returning the response and retrying.
query_response_evaluator = RelevancyEvaluator()

# Wrap the base engine with evaluate-and-retry behaviour (default retry budget).
retry_query_engine = RetryQueryEngine(
    query_engine=base_query_engine,
    evaluator=query_response_evaluator,
)


In [None]:
from utils import create_query_pipeline

# Pipeline stages in order: the configured LLM, then the retry query engine.
retry_chain = [
    Settings.llm,
    retry_query_engine,
]

# Build the pipeline from that chain using the project helper.
retry_query_pipeline = create_query_pipeline(retry_chain)

# 🔄 [RetrySourceQueryEngine](https://github.com/run-llama/llama_index/blob/b767f274f8563298b607229e4598b4ed92056394/llama-index-core/llama_index/core/query_engine/retry_source_query_engine.py#L25)

This is a query engine that retries a query with a subset of source nodes if the initial response fails evaluation. It improves response quality by keeping only the source nodes that pass evaluation, building a new index from those nodes, and retrying the query against that refined index.


### Arguments you need to know

- `query_engine`: The base query engine to execute queries.

- `evaluator`: Evaluates the quality of responses and source nodes.

- `max_retries`: Maximum number of retries allowed.

### Under the hood

1. Query the base query engine (`query_engine`) to get an initial response.

2. Evaluate the response using the `evaluator`.

3. If the response passes evaluation, return it.

4. If the response fails evaluation:
   - Evaluate each source node used in the response individually.
   - Create a new index using only the source nodes that pass evaluation.
   - Create a new `RetrieverQueryEngine` with the new index.
   - Create a new `RetrySourceQueryEngine` with the new `RetrieverQueryEngine` and decremented `max_retries`.
   - Retry the query with the new `RetrySourceQueryEngine`.

5. Repeat steps 1-4 until a satisfactory response is obtained or `max_retries` is reached.


In [None]:
from llama_index.core.query_engine import RetrySourceQueryEngine

# Wrap the base engine so that a response failing evaluation is retried against
# only the source nodes that pass the evaluator.
retry_source_query_engine = RetrySourceQueryEngine(
    base_query_engine,
    query_response_evaluator
    )

# The question to ask. It must be defined before use: no earlier cell assigns
# `query`, so without this line a fresh "Restart & Run All" fails with NameError.
# Placeholder text — replace it with a question that fits your collection.
query = "What is the most important advice about learning a new skill?"

retry_source_response = retry_source_query_engine.query(query)

In [None]:
from utils import create_query_pipeline

# Pipeline stages in order: the configured LLM, then the retry-source query engine.
retry_source_chain = [
    Settings.llm,
    retry_source_query_engine,
]

# Build the pipeline from that chain using the project helper.
retry_source_query_pipeline = create_query_pipeline(retry_source_chain)

# 🔁 [`RetryGuidelineQueryEngine`](https://github.com/run-llama/llama_index/blob/b767f274f8563298b607229e4598b4ed92056394/llama-index-core/llama_index/core/query_engine/retry_query_engine.py#L71)

The `RetryGuidelineQueryEngine` aims to improve response quality by iteratively evaluating the response against predefined guidelines, transforming the query based on evaluation feedback, and retrying the query with the modified query bundle until a satisfactory response is obtained or the maximum number of retries is reached.

#### **Arguments you need to know**

- `query_engine`: The base query engine to execute queries.

- `guideline_evaluator`: Evaluates the quality of responses based on predefined guidelines.

- `resynthesize_query`: Flag indicating whether to resynthesize the query based on evaluation feedback.

- `max_retries`: Maximum number of retries allowed.

- `query_transformer`: Transforms the query bundle based on evaluation feedback (default: `FeedbackQueryTransformation`).

#### **Under the hood**

1. When the `query` method is called with a `QueryBundle`, it first queries the base query engine to get an initial response.

2. If `max_retries` is 0 or less, the initial response is returned without evaluation or retries.

3. If `max_retries` is greater than 0, the response is evaluated using the `evaluate_response` method of the guideline evaluator.

4. If the response passes evaluation, it is returned as is.

5. If the response fails evaluation:

   - A new instance of `RetryGuidelineQueryEngine` is created with the same base query engine, guideline evaluator, `resynthesize_query` flag, decremented `max_retries`, and callback manager.

   - The `query_transformer` (default: `FeedbackQueryTransformation`) is used to transform the original query bundle based on the evaluation feedback.

   - The transformed query bundle is passed to the `query` method of the new `RetryGuidelineQueryEngine` instance.

6. The process repeats from step 2 until a satisfactory response is obtained or `max_retries` is reached.

In [None]:
from llama_index.core.evaluation import GuidelineEvaluator
from llama_index.core.evaluation.guideline import DEFAULT_GUIDELINES
from llama_index.core import Response
from llama_index.core.indices.query.query_transform.feedback_transform import FeedbackQueryTransformation
from llama_index.core.query_engine import RetryGuidelineQueryEngine

# Two extra rules appended to LlamaIndex's default guidelines: keep the answer
# short and summarise where possible.
extra_guidelines = (
    "\nThe response should not be overly long.\n"
    "The response should try to summarize where possible.\n"
)

# Evaluator that grades a response against the combined guideline text.
guideline_eval = GuidelineEvaluator(
    guidelines=DEFAULT_GUIDELINES + extra_guidelines,
)

In [None]:
# Produce the response to be graded by running the question through the base
# (non-retrying) engine. No earlier cell assigns `response`, so it has to be
# created here. NOTE: `query` (the question string) must already be defined.
response = base_query_engine.query(query)

# The evaluator is given a plain Response; any other response object is
# converted with get_response().
typed_response = (
    response if isinstance(response, Response) else response.get_response()
)

# Stored as `guideline_eval_result` rather than `eval` so the built-in eval()
# is not shadowed for the rest of the notebook session.
guideline_eval_result = guideline_eval.evaluate_response(query, typed_response)
print(f"Guideline eval evaluation result: {guideline_eval_result.feedback}")

# Show how the evaluation feedback is folded into a rewritten query; this is the
# same transformation class the markdown above names as the retry engine's default.
feedback_query_transform = FeedbackQueryTransformation(resynthesize_query=True)
transformed_query = feedback_query_transform.run(query, {"evaluation": guideline_eval_result})
print(f"Transformed query: {transformed_query.query_str}")

In [None]:
# Wrap the base engine with guideline-driven retries; resynthesize_query=True
# asks for the query to be rewritten from the evaluator's feedback on a retry.
retry_guideline_query_engine = RetryGuidelineQueryEngine(
    query_engine=base_query_engine,
    guideline_evaluator=guideline_eval,
    resynthesize_query=True,
)

In [None]:
from utils import create_query_pipeline

# Pipeline stages in order: the configured LLM, then the retry-guideline query engine.
retry_guideline_chain = [
    Settings.llm,
    retry_guideline_query_engine,
]

# Build the pipeline from that chain using the project helper.
retry_guideline_query_pipeline = create_query_pipeline(retry_guideline_chain)