In [None]:
%%capture
!pip install llama-index==0.10.25 llama-index-embeddings-fastembed qdrant-client llama-index-vector-stores-qdrant llama-index-llms-cohere

In [None]:
# Notebook setup: imports, environment loading, and import-path configuration.
import os
import sys
from dotenv import load_dotenv
from getpass import getpass
import nest_asyncio
from IPython.display import Markdown, display

# Patch asyncio through nest_asyncio; presumably needed so async library calls
# can run inside the notebook's already-running event loop — TODO confirm.
nest_asyncio.apply()

# Load variables from ../.env (path is relative to the kernel's working
# directory); later cells read CO_API_KEY, QDRANT_URL and QDRANT_API_KEY
# from os.environ.
load_dotenv("../.env")

# Put the helpers directory on the import path so the `utils` import below
# resolves. This must run before that import.
sys.path.append('../helpers')

from utils import setup_llm, setup_embed_model, create_vector_store_index

In [None]:
# Cohere API key: taken from the environment when available, otherwise prompted for.
# .get() returns None for an unset variable (indexing would raise KeyError), so the
# getpass prompt acts as the fallback for both a missing and an empty value.
CO_API_KEY = os.environ.get('CO_API_KEY') or getpass("Enter your Cohere API key: ")

In [None]:
# Qdrant URL: taken from the environment when available, otherwise prompted for.
# .get() returns None for an unset variable (indexing would raise KeyError), so the
# getpass prompt acts as the fallback for both a missing and an empty value.
QDRANT_URL = os.environ.get('QDRANT_URL') or getpass("Enter your Qdrant URL:")

In [None]:
# Qdrant API key: taken from the environment when available, otherwise prompted for.
# .get() returns None for an unset variable (indexing would raise KeyError), so the
# getpass prompt acts as the fallback for both a missing and an empty value.
QDRANT_API_KEY = os.environ.get('QDRANT_API_KEY') or getpass("Enter your Qdrant API Key:")

In [None]:
from llama_index.core.settings import Settings
from utils import setup_llm, setup_embed_model, create_vector_store_index

# Collection name handed to the vector-store helper below.
COLLECTION_NAME = "it_can_be_done"

# Configure the LLM (with the Cohere key) and the embedding model through the
# project helpers; presumably these populate the global Settings — confirm in utils.
setup_llm(api_key=CO_API_KEY)
setup_embed_model()

# Create the vector store index from the Qdrant connection details and collection name.
index = create_vector_store_index(
    QDRANT_URL,
    QDRANT_API_KEY,
    COLLECTION_NAME,
)

# Default Prompt Templates

In [None]:
from utils import display_prompt_dict, create_query_engine

In [None]:
# Build a query engine over the index, asking for the top 3 similar results
# and for sources to be returned.
query_engine = create_query_engine(
    index,
    similarity_top_k=3,
    return_sources=True,
)

# Show the prompts the freshly created engine reports.
display_prompt_dict(query_engine.get_prompts())

In [None]:
from llama_index.core.prompts import PromptTemplate

# Custom question-answering prompt that asks for a Haiku-style reply.
# {context_str} and {query_str} are template placeholders; presumably they are
# filled with the retrieved context and the user's query at synthesis time.
# The explicit \n escapes come on top of the literal line breaks, so those
# lines are each followed by a blank line in the rendered prompt.
custom_prompt = """You are an assistant for question-answering tasks related to \
motivational poetry. You must respond with an original Haiku style poem.

Use the following pieces of retrieved context to answer the user's query:

---------------------\n
{context_str}\n
---------------------\n

Query: {query_str}
"""

# Wrap the raw string in a PromptTemplate.
custom_prompt_template = PromptTemplate(custom_prompt)

In [None]:
# Replace the response synthesizer's text-QA prompt with the custom template.
query_engine.update_prompts({"response_synthesizer:text_qa_template": custom_prompt_template})

In [None]:
# Display the engine's prompts again to check the result of the prompt update.
display_prompt_dict(query_engine.get_prompts())

In [None]:
from utils import create_query_pipeline

# Components for the pipeline, in order: the LLM held in Settings, then the query engine.
# NOTE(review): how create_query_pipeline wires these components together is
# defined in the helpers module and not visible here — confirm there.
chain = [Settings.llm, query_engine]

query_pipeline = create_query_pipeline(chain)

In [None]:
# Run an example query through the pipeline and keep the result for inspection.
response = query_pipeline.run("I want to turn a new chapter in my life.")

In [None]:
# Print the `response` attribute of the pipeline result (presumably the generated answer text).
print(response.response)

# Response Synthesizers

The LlamaIndex [documentation](https://docs.llamaindex.ai/en/stable/module_guides/querying/response_synthesizers/) has a lot of detail regarding each of the response synthesizers.

In [None]:

from llama_index.core.response_synthesizers import ResponseMode

In [None]:
# List the attribute names of ResponseMode to see which response modes are available
# (dir() output also includes non-member attributes such as dunder names).
dir(ResponseMode)

#### ⚗️ Refine 

Refine is an iterative method to generate a response. 

Initially, we use the context in the first node and the query to create a basic answer. Then, we refine this answer by inputting it, along with the query and context of the second node, into a "refine prompt" to generate an improved answer. 

This refinement process continues through N-1 nodes, with N being the total number of nodes.

#### 🤏 Compact

Compact and refine mode first combines text chunks into larger consolidated chunks that more fully utilize the available context window, then refines answers across them. This mode is faster than refine since we make fewer calls to the LLM.


#### 📝 Simple summarize

Merge all text chunks into one and make a single LLM call. The call will fail if the merged text chunk exceeds the context window size.

#### 🌴 Tree summarize

Construct a tree index for the candidate nodes in a bottom-up manner then use a summary prompt based on the query. Return the root node as the final response.

#### 🤖 Generation

Ignore context, just use LLM to generate a response.

#### ❌ No text
Return the retrieved context nodes, without synthesizing a final response.

#### 📏 Accumulate

Synthesize a response for each text chunk, and then return the concatenation.

#### Compact accumulate

In the compact and accumulate mode, text chunks are combined into larger chunks to utilize the context window better. Answers are then accumulated for each chunk and returned as a concatenation. This mode is faster than accumulate as it reduces calls to the LLM.

In [None]:
from llama_index.core import get_response_synthesizer

# Build a response synthesizer that uses the "compact" response mode.
response_synthesizer = get_response_synthesizer(response_mode="compact")

# Create a new query engine on the same index that uses this synthesizer;
# it replaces the earlier `query_engine`.
query_engine = create_query_engine(index, response_synthesizer=response_synthesizer)

# Rebuild the pipeline around the new engine.
chain = [Settings.llm, query_engine]
query_pipeline = create_query_pipeline(chain)

# Last expression of the cell, so the notebook displays the result.
query_pipeline.run("I want to turn a new chapter in my life.")