In [None]:
%%capture
!pip install llama-index==0.10.37 llama-index-embeddings-openai==0.1.9 qdrant-client==1.9.1 llama-index-vector-stores-qdrant==0.2.8 llama-index-llms-openai==0.1.19

In [None]:
import os
import sys
from getpass import getpass
import nest_asyncio

from IPython.display import Markdown, display

from dotenv import load_dotenv

nest_asyncio.apply()

load_dotenv("../.env")

sys.path.append('../helpers')

from utils import setup_llm, setup_embed_model, setup_vector_store

In [None]:
# .get() yields None for a missing variable, so the getpass prompt is actually
# reached; indexing os.environ directly raised KeyError before the fallback ran.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY') or getpass("Enter your OpenAI API key: ")

In [None]:
# .get() yields None for a missing variable, so the getpass prompt is actually
# reached; indexing os.environ directly raised KeyError before the fallback ran.
QDRANT_URL = os.environ.get('QDRANT_URL') or getpass("Enter your Qdrant URL:")

In [None]:
# .get() yields None for a missing variable, so the getpass prompt is actually
# reached; indexing os.environ directly raised KeyError before the fallback ran.
QDRANT_API_KEY = os.environ.get('QDRANT_API_KEY') or getpass("Enter your Qdrant API Key:")

In [None]:
from llama_index.core.settings import Settings
from llama_index.llms.openai import OpenAI
from utils import setup_llm, setup_embed_model

# Configure the chat model and the embedding model through the project helpers;
# both calls authenticate with the same OpenAI key.
# NOTE(review): presumably these populate Settings.llm / Settings.embed_model,
# which later cells read -- confirm in ../helpers/utils.
setup_llm(provider="openai", model="gpt-4o", api_key=OPENAI_API_KEY)
setup_embed_model(provider="openai", api_key=OPENAI_API_KEY)

In [None]:
from datasets import load_dataset

def _is_well_grounded(row):
    """Return True when the row has a groundedness score and it is at least 4."""
    score = row['question_groundedness_score']
    return score is not None and score >= 4


# Load the train split, keep only well-grounded questions, then draw a fixed
# 10-row sample (the shuffle is seeded, so the sample is repeatable).
eval_dataset = load_dataset("harpreetsahota/LI_Learning_RAG_Eval_Set", split='train')
eval_dataset = eval_dataset.filter(_is_well_grounded)
smol_eval_set = eval_dataset.shuffle(seed=2022).select(range(10))

In [None]:
from utils import get_documents_from_docstore

# Load the source documents via the project helper from the given docstore directory.
senpai_documents = get_documents_from_docstore("../data/words-of-the-senpais")

## Setup Qdrant Vector Store

In [None]:
from llama_index.core import StorageContext
from llama_index.core.storage.docstore import SimpleDocumentStore
from llama_index.core.storage.index_store.simple_index_store import SimpleIndexStore
from llama_index.core.settings import Settings
from utils import setup_vector_store

COLLECTION_NAME = "words-of-the-senpai-rr-fusion"

# Vector store for this notebook's collection, created by the project helper
# with hybrid search switched on.
rr_fusion_vector_store = setup_vector_store(
    QDRANT_URL,
    QDRANT_API_KEY,
    COLLECTION_NAME,
    enable_hybrid=True,
)

# The docstore is reloaded from disk; the index store starts out empty.
persisted_docstore = SimpleDocumentStore.from_persist_dir(
    persist_dir="../data/words-of-the-senpais"
)
rr_fusion_storage_context = StorageContext.from_defaults(
    docstore=persisted_docstore,
    index_store=SimpleIndexStore(),
    vector_store=rr_fusion_vector_store,
)

### Ingest with a docstore

In [None]:
from llama_index.core import VectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.storage.docstore import SimpleDocumentStore

from utils import ingest 

# Split documents into chunks of size 256 with an overlap of 16 before indexing.
sentence_splitter = SentenceSplitter(chunk_overlap=16, chunk_size=256)

# Build the index on top of the Qdrant-backed storage context, running the
# splitter as the only transformation.
index = VectorStoreIndex.from_documents(
    documents=senpai_documents,
    storage_context=rr_fusion_storage_context,
    transformations=[sentence_splitter],
    embed_model=Settings.embed_model,
)

# A brief word on vector store query modes

The vector_store_query_mode in LlamaIndex determines the type of search to be performed. Here's a brief description of each mode:

 - `default`: This mode performs a vector search. It retrieves the most similar vectors based on the query vector. Dense embedding models create a numerical representation of a piece of text, represented as a long list of numbers. These dense vectors can capture rich semantics across the entire piece of text. `alpha=0.75` is used by default.

 - `hybrid`: This mode performs a hybrid search. It combines vector search with traditional search methods. `alpha` parameter determines weighting (`alpha = 0` -> bm25, `alpha = 1` -> vector search). 

 - `semantic_hybrid`: Semantic hybrid search combines text search with vector embeddings. Text search provides keyword matching and lexical retrieval. Vector embeddings allow finding documents with similar meaning, even if they don't contain exact keyword matches. This mode incorporates semantic reranking to hybrid search results to improve search relevance.

 - `sparse`: Most of the elements in a sparse vector are zero, with only a few key values being non-zero. These sparse vectors are great at capturing specific keywords and similar small details. You need to use a specialized embedding model to create sparse vectors. 
   - `FastEmbed` has a few choices for sparse text embedding models, for example you can pass in `prithvida/Splade_PP_en_v1` as the model name when you run `setup_embed_model` if you want to use them. 
    - We didn't use a sparse vector here, so we won't see this in action.  
    - Note, if you try this you'll need to set the `sparse_top_k` argument, which represents how many nodes will be retrieved from each dense and sparse query. For example, if `sparse_top_k=5` is set, that means I will retrieve 5 nodes using sparse vectors and 5 nodes using dense vectors.

 - `text_search`: Text search looks for exact keyword matches between the query and documents.

 - `similarity_top_k`: controls the final number of returned nodes. A fusion algorithm is applied to rank and order the nodes from different vector spaces, `similarity_top_k=2` means the top two nodes after fusion are returned.

 - `hybrid_top_k`: return top k results from `hybrid` search. `similarity_top_k` is used for dense search top k

In [None]:
QUERY_STRING = "How can I create my own luck?"

def test_retrievers(query=QUERY_STRING, index=index, **kwargs):
    """Retrieve nodes for ``query`` from ``index`` and print each hit.

    Args:
        query: Text to search for. Defaults to ``QUERY_STRING``.
        index: Object whose ``as_retriever`` builds the retriever. The default
            is bound once, when this cell runs, so re-run the cell after
            rebuilding ``index``.
        **kwargs: Forwarded unchanged to ``index.as_retriever`` (for example
            ``vector_store_query_mode``, ``similarity_top_k``, ``alpha``).

    Returns:
        None. The hit count and every hit's score and text are printed.
    """
    retriever_engine = index.as_retriever(**kwargs)
    retrieved_docs = retriever_engine.retrieve(query)
    print(f"Retrieved {len(retrieved_docs)} nodes.")
    print("\n")
    for node in retrieved_docs:
        # A retrieved node's score is optional; formatting None with ':.2f'
        # raises TypeError and would abort the whole mode comparison.
        score = "n/a" if node.score is None else f"{node.score:.2f}"
        print(f"Score: {score} - {node.text}...\n-----\n")
    
# Retriever settings to compare, keyed by a human-readable label.
mode_kwargs = dict(
    default=dict(vector_store_query_mode='default', similarity_top_k=3),
    bm25=dict(vector_store_query_mode='hybrid', alpha=0.0, hybrid_top_k=3),
    hybrid=dict(vector_store_query_mode='hybrid', alpha=0.25, hybrid_top_k=3),
    semantic_hybrid=dict(vector_store_query_mode='semantic_hybrid', alpha=0.75, hybrid_top_k=3),
    # 'sparse': {"sparse_top_k":5},
    text_search=dict(vector_store_query_mode='text_search', similarity_top_k=3),
)

# Run the same query once per configuration so the outputs can be compared.
for label, retriever_kwargs in mode_kwargs.items():
    print(f"Retrieving nodes using: {label} retrieval")
    test_retrievers(**retriever_kwargs)
    print(f"Retrieval with {label} complete...")
    print("\n")

# Query Transformation

When handling user queries in a RAG system, agent, or any other pipeline, there are various ways to transform and decompose the queries before executing them.

One way is query rewriting. This involves rewriting the original query in multiple ways; the rewritten queries are then sent for retrieval and generation.

LlamaIndex implements various query transformations, [check the source code for details](https://github.com/run-llama/llama_index/blob/f116d75557d6867ed2cc61811a1c2f0b0c4d4ddb/llama-index-legacy/llama_index/legacy/indices/query/query_transform/base.py).


In [None]:
from llama_index.core import PromptTemplate

# Few-shot prompt for query expansion. `{num_queries}` and `{query}` are the
# only placeholders. Every example is fenced with exactly three backticks, as
# the prompt text itself announces.
QUERY_GEN_PROMPT = """Users aren't always the best at articulating what they're looking for. Your task is to understand the 
essence of the user query and generate {num_queries} alternate queries to expand the users query so it's more robust. This way the user will
receive the most relevant information. 

Examples are delimited by triple backticks (```) below

```
User Query: How can I find the positive in situations that seem negative?

Alternate Queries:

1. How can I cultivate optimism and positive thinking in my daily life?
2. Is it possible to find meaning and purpose in challenging or difficult times?
3. What are some effective strategies for reframing negative thoughts into positive ones?
```

```
User Query: How do I deal with setbacks, failures, delays, defeat, or other disasters?

Alternate Queries:

1. How can I build resilience and learn to cope with adversity effectively?
2. What are some practical tips for overcoming challenges and obstacles that I face?
3. How can I develop a growth mindset and view setbacks as opportunities for learning?
4. What are healthy ways to process and learn from failures and mistakes?
```

```
User Query: How can I overcome defeat and suffering by changing my mindset?

Alternate Queries:

1. What is the power of positive thinking and affirmations, and how can they benefit me?
2. Can mindfulness and meditation practices improve my mental well-being and outlook?
3. How can I develop self-compassion and acceptance, especially during difficult times?
```

Generate {num_queries} alternate queries, one on each line, for the following user query:\n
--------------------
User Query: {query}\n
--------------------

Alternate Queries:\n
"""


# Wrap the raw prompt string in a PromptTemplate; `num_queries` and `query` are supplied when it is used.
QUERY_GEN_PROMPT_TEMPLATE = PromptTemplate(QUERY_GEN_PROMPT)

In [None]:
def generate_queries(query= QUERY_STRING, llm=Settings.llm, num_queries  = 4):
    """Ask the LLM for alternate phrasings of ``query`` and return them.

    Args:
        query: The user query to expand. Defaults to ``QUERY_STRING``.
        llm: Object exposing ``predict(prompt, **kwargs)``. The default is the
            value of ``Settings.llm`` at the time this cell runs.
        num_queries: How many alternate queries to request in the prompt.

    Returns:
        list[str]: One entry per non-empty line of the model's response, with
        surrounding whitespace removed. The list is also printed.
    """
    response = llm.predict(
        QUERY_GEN_PROMPT_TEMPLATE,
        num_queries=num_queries,
        query=query,
    )
    # The response may contain blank or whitespace-only lines; returning those
    # as "queries" would send empty searches downstream, so drop them.
    queries = [line.strip() for line in response.splitlines() if line.strip()]
    queries_str = "\n".join(queries)
    print(f"Generated queries:\n{queries_str}")
    return queries

generate_queries()

# Hybrid Fusion Retriever

The Hybrid Fusion Retriever combines semantic and keyword-based approaches. It pairs a [BM25-based retriever](https://en.wikipedia.org/wiki/Okapi_BM25) with a semantic index. BM25 is a ranking function used by search engines to estimate the relevance of documents to a given search query.

#### How it works

The system follows a three-step process:

- **Query Generation/Rewriting**: It creates multiple queries from the original user query to better match the user's intent and improve the precision and recall of the retrieved results.

- **Retrieval**: It performs the retrieval for each query over an ensemble of retrievers.

- **Reranking/Fusion**: It combines the results from all queries and applies a reranking step to fuse the top relevant results.

#### ℹ️ Useful knowledge to have as a RAG practitioner

##### Index Fusion Mode

We set the mode to `reciprocal_rerank`. The system merges its index with a BM25-based retriever. This allows it to understand both the semantic relationships (meaningful connections between words) and keywords in the input queries. Other modes are `relative_score`, `dist_based_score`, and `simple`.

  - [`reciprocal_rerank`](https://github.com/run-llama/llama_index/blob/f116d75557d6867ed2cc61811a1c2f0b0c4d4ddb/llama-index-core/llama_index/core/retrievers/fusion_retriever.py#L99): Reciprocal rank is a measure of how early a relevant item appears in a ranked list. Lower ranks correspond to higher relevance. This mode fuses the results from multiple sources by giving higher importance to nodes that appear earlier in the rankings across those sources.

  - [`relative_score`](https://github.com/run-llama/llama_index/blob/f116d75557d6867ed2cc61811a1c2f0b0c4d4ddb/llama-index-core/llama_index/core/retrievers/fusion_retriever.py#L135): It scales each score to a range from 0 to 1 using min-max scaling. Then it multiplies each scaled score by a retriever-specific weight. After that, it divides each score by the total number of queries. Basically, it scales, weights, and combines scores from multiple retrieval sources.

  - `dist_based_score`: Same as `relative_score`, but, instead of using the minimum and maximum scores directly, the function calculates them based on the mean and standard deviation of the scores. This reduces the impact of outliers on the scaling process.

  - `simple`: re-orders results based on original scores


##### **Reciprocal Rerank Algorithm**

 Since both retrievers calculate a score for the relevance of results, the system uses the reciprocal rerank algorithm to reshuffle the results. This is done without employing additional models or excessive computation, making the process more efficient.
 
  - 🧮 **Rank Calculation**: For each unique node, calculate its reciprocal rank from each list where it appears. The reciprocal rank of a node in a list is defined as 1 divided by its position in that list (e.g., a node at rank 3 has a reciprocal rank of 1/3).

  - 📊 **Score Aggregation**: Sum up the reciprocal ranks for each node across all lists in which it appears. This aggregated score represents the overall relevance of the node, taking into account its performance across multiple retrieval scenarios.

  - 🥇🥈🥉 **Reordering**: Finally, reorder all nodes based on their aggregated scores, from highest to lowest. This re-ranking step prioritizes nodes that consistently appear in higher ranks across multiple lists, thus likely to be more relevant to the query.

In [None]:
from llama_index.retrievers.bm25 import BM25Retriever

from llama_index.core.retrievers import QueryFusionRetriever

# Two retrievers, each returning its top 5 hits: the index's own retriever and
# a BM25 retriever built over the index's docstore.
vector_retriever = index.as_retriever(similarity_top_k=5)
bm25_retriever = BM25Retriever.from_defaults(
    similarity_top_k=5,
    docstore=index.docstore,
)

from llama_index.core.query_engine import RetrieverQueryEngine

# Fuse the vector and BM25 results with reciprocal reranking, using the custom
# query-generation prompt for the rewritten queries.
retriever = QueryFusionRetriever(
    [vector_retriever, bm25_retriever],
    mode="reciprocal_rerank",
    query_gen_prompt=QUERY_GEN_PROMPT_TEMPLATE,
    num_queries=3,  # set this to 1 to disable query generation
    similarity_top_k=5,
    verbose=True,
    use_async=True,
)

In [None]:
# Run the fusion retriever on one sample question and keep the scored nodes.
nodes_with_scores = retriever.retrieve("How can I stop wasting energy on projecting a facade and focus on expanding my potential as a human being?")

In [None]:
# Print every fused hit with its score, followed by a separator.
for scored_node in nodes_with_scores:
    print(f"Score: {scored_node.score:.2f} - {scored_node.text}...\n-----\n")

In [None]:
from llama_index.core.response_synthesizers import ResponseMode

from utils import create_query_pipeline
from utils import run_generations_on_eval_set

from prompts import HYPE_ANSWER_GEN_PROMPT

HYPE_ANSWER_GEN_PROMPT_TEMPLATE = PromptTemplate(HYPE_ANSWER_GEN_PROMPT)

# Query engine on top of the fusion retriever, answering with the custom QA prompt.
rr_fusion_query_engine = RetrieverQueryEngine.from_args(
    retriever,
    text_qa_template=HYPE_ANSWER_GEN_PROMPT_TEMPLATE,
    response_mode=ResponseMode.COMPACT_ACCUMULATE,
    use_async=True,
)

# Chain handed to the project helper that assembles the query pipeline.
rr_fusion_chain = [Settings.llm, rr_fusion_query_engine]
rr_fusion_query_pipeline = create_query_pipeline(rr_fusion_chain)

# Generate an answer per eval row; results are stored under "rr-fusion-answer".
smol_eval_set = run_generations_on_eval_set(
    eval_dataset=smol_eval_set,
    col_name="rr-fusion-answer",
    query_pipeline=rr_fusion_query_pipeline,
    time_out=False,
)

In [None]:
# Print each question next to the answer produced by the fusion pipeline.
for row in smol_eval_set.select(range(10)):
    print("💬\n")
    print(f"""🙋🏽‍♂️ Question: {row["question"]}""")
    # Exactly three opening quotes: a fourth one is part of the string and
    # prints a stray '"' before the label.
    print(f"""RR Fusion Response: {row["rr-fusion-answer"]}""")

# `SubQuestionQueryEngine`

The `SubQuestionQueryEngine` works by breaking down a complex query into simpler sub-questions (with each potentially targeting a specific data source).

#### Here's how it works:

 - The `SubQuestionQueryEngine` receives a complex query.

- It then decomposes this query into several sub-questions. Each sub-question is designed to extract specific information from a particular data source.

- The engine then sends these sub-questions to their respective data sources and gathers the responses.

- Finally, it synthesizes all the intermediate responses to form a final comprehensive answer to the original complex query.

This process makes the `SubQuestionQueryEngine` particularly useful for handling compare/contrast queries across documents, as well as queries pertaining to a specific document. It's also well-suited for multi-document queries and can execute any number of sub-queries against any subset of query engine tools before synthesizing the final answer.

In [None]:
from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.core.query_engine import SubQuestionQueryEngine

# A single tool: the default query engine over the index, described for the
# sub-question generator.
senpai_tool = QueryEngineTool(
    query_engine=index.as_query_engine(),
    metadata=ToolMetadata(
        name="the senpais",
        description="The collective thoughts and writings of all my virtual mentors",
    ),
)
query_engine_tools = [senpai_tool]

sub_question_query_engine = SubQuestionQueryEngine.from_defaults(
    use_async=True,
    query_engine_tools=query_engine_tools,
)

# Swap the synthesizer's QA prompt for the custom answer-generation template.
sub_question_query_engine.update_prompts(
    {'response_synthesizer:text_qa_template': HYPE_ANSWER_GEN_PROMPT_TEMPLATE}
)

In [None]:
from utils import display_prompt_dict

# Fetch the prompts currently attached to the sub-question engine.
sub_q_prompts = sub_question_query_engine.get_prompts()

# Render them with the project helper.
display_prompt_dict(sub_q_prompts)

In [None]:
# A multi-part question; the cell's output is the value returned by query().
sub_question_query_engine.query("How can I build my own luck, what are the types of luck I should pursue, and how can I hack luck and minimize my exposure to downside while maintaining skin in the game?")

# Hypothetical Document Embeddings (HyDE)

At a high level, [HyDE](https://arxiv.org/pdf/2212.10496.pdf) is an embedding technique that takes a query, generates a hypothetical answer to it, embeds that generated document, and then uses the resulting embedding to run the search.

- 🧐 **Problem Tackled**: Addresses the struggle of creating fully zero-shot dense retrieval systems without relevance labels.

- 📚 **Traditional Methods**: Rely on relevance labels for document retrieval based on semantic similarities.

- 🚫 **Zero-Shot Challenge**: Especially tough without a large dataset for training.

### What is HyDE?

Given a query, `HyDE` instructs a language model to generate a hypothetical document.

This document captures relevance patterns but might contain inaccuracies or false details.

After generating the hypothetical document, an unsupervised contrastively learned encoder encodes the document into an embedding vector.

This vector identifies a neighborhood in the corpus embedding space, where similar real documents are retrieved based on vector similarity.

### How Does HyDE Work?

The process starts by feeding a query to a generative model with the instruction to "write a document that answers the question". This generates a hypothetical document that captures the essence of relevance.

 - Generates an embedding vector for a "fake" document

- It does not generate any actual text content for the document

- The embedding is solely for reserving space in the vectorstore index

- There is no full hypothetical document text you can access later

This vector is used to search against the corpus embeddings, and the most similar real documents are retrieved. The idea is that a hypothetical answer to a question is more semantically similar to the real answer than the question is. 

**In practice this means that your search would use GPT to generate a hypothetical answer, then embed that and use it for search**.

Key advantages of HyDE:

- Zero-shot, no labeled data or fine-tuning needed

- Performs comparably to fine-tuned retrievers across tasks/languages

- Grounds the query in real data via generated hypothetical documents

In [None]:
from llama_index.core.indices.query.query_transform import HyDEQueryTransform

from llama_index.core.query_engine import TransformQueryEngine

In [None]:
# HyDE query transform, configured with include_original=True.
hyde = HyDEQueryTransform(include_original=True)

# Wrap the index's default query engine so each query passes through HyDE first.
hyde_query_engine = TransformQueryEngine(
    query_transform=hyde,
    query_engine=index.as_query_engine(),
)

In [None]:
# Render the prompts attached to the HyDE query engine with the project helper.
display_prompt_dict(hyde_query_engine.get_prompts())

In [None]:
# Answer the sample question through the HyDE engine and render it in bold.
response = hyde_query_engine.query(QUERY_STRING)
hyde_answer_html = f"<b>{response}</b>"
display(Markdown(hyde_answer_html))

In [None]:
# Chain handed to the project helper that assembles the query pipeline.
hyde_chain = [Settings.llm, hyde_query_engine]
hyde_query_pipeline = create_query_pipeline(hyde_chain)

# Generate an answer per eval row; results are stored under "hyde-answer".
smol_eval_set = run_generations_on_eval_set(
    query_pipeline=hyde_query_pipeline,
    col_name="hyde-answer",
    eval_dataset=smol_eval_set,
    time_out=False,
)

In [None]:
# Print each question with both answers side by side for comparison.
for row in smol_eval_set.select(range(10)):
    print("💬\n")
    print(f"""🙋🏽‍♂️ Question: {row["question"]}""")
    # Exactly three opening quotes: a fourth one is part of the string and
    # prints a stray '"' before each label.
    print(f"""RR Fusion Response: {row["rr-fusion-answer"]}""")
    print(f"""HyDE Response: {row["hyde-answer"]}""")