# GPT3.5 + MongoDB Atlas

In [1]:
from rag_chat.storage.mongo import mongodb_client, mongodb_uri
from llama_index import SummaryIndex
from llama_index.readers import SimpleMongoReader
import openai
import logging
import sys
import os
from dotenv import load_dotenv

# Send log records (e.g. the httpx request lines emitted by the OpenAI client)
# to the notebook's stdout at INFO level.
# basicConfig already installs a stdout StreamHandler on the root logger, so no
# second handler is attached: the extra one made every record print twice.
# force=True replaces handlers left over from a previous run of this cell, so
# re-running it never accumulates duplicates.
logging.basicConfig(stream=sys.stdout, level=logging.INFO, force=True)

In [2]:
# load_dotenv() (python-dotenv) is expected to populate the process environment
# from a local .env file; the key is then handed to the openai module.
load_dotenv()
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
if not OPENAI_API_KEY:
    # os.getenv returns None for a missing variable. Fail here with an
    # actionable message instead of at the first embedding/chat request
    # further down the notebook.
    raise RuntimeError(
        "OPENAI_API_KEY is not set; define it in the environment or in a .env file."
    )
openai.api_key = OPENAI_API_KEY

## 0. Simple Test

In [None]:
# Smoke test: read only the product names from the "data_xs" collection.
reader = SimpleMongoReader(uri=mongodb_uri)

db_name, collection_name = "products", "data_xs"
field_names = ["product_name"]
query_dict = {}  # passed into db.collection.find()

documents = reader.load_data(
    db_name,
    collection_name,
    field_names,
    query_dict=query_dict,
)

In [None]:
# Build a SummaryIndex over the documents loaded by the smoke-test cell above.
index = SummaryIndex.from_documents(documents)

In [None]:
# Ask a single question through the index's default query engine and print the answer.
query_engine = index.as_query_engine()
response = query_engine.query("Do you have any light bulbs?")
print(response)

## 1. Load data

Source: https://docs.llamaindex.ai/en/latest/module_guides/loading/connector/root.html

In [3]:
# Settings for the MongoDB load in the next cell; all of them are passed to
# SimpleMongoReader.load_data.
DB_NAME = "products"
COLLECTION_NAME = "data"
# Fields whose values are joined (using SEPARATOR) into each Document's text.
FIELD_NAMES = ["product_url", "product_name", "description"]
SEPARATOR = " \n"
# Passed into db.collection.find(): keeps only records whose description is a string.
# TODO: check data types up front (or clean the data) to prevent errors.
QUERY_DICT = {"description": { "$type": "string" }}
# Limit on the number of documents loaded.
MAX_DOCS = 50
# Fields that end up in each Document's metadata rather than in its text.
METADATA_NAMES = ["sale_price", "brand", "category", "available"]

In [4]:
# Reader sources, pinned to a specific commit:
#   BaseReader: https://github.com/run-llama/llama_index/blob/08caf8119c66b4dfb5899da0efce09bb2c62bf96/llama_index/readers/base.py#L24
#   SimpleMongoReader: https://github.com/run-llama/llama_index/blob/08caf8119c66b4dfb5899da0efce09bb2c62bf96/llama_index/readers/mongo.py
reader = SimpleMongoReader(uri=mongodb_uri)

# Load the product records using the settings defined in the previous cell.
documents = reader.load_data(
    DB_NAME,
    COLLECTION_NAME,
    FIELD_NAMES,
    separator=SEPARATOR,
    query_dict=QUERY_DICT,
    max_docs=MAX_DOCS,
    metadata_names=METADATA_NAMES,
)

In [5]:
# Peek at the first five loaded Documents (text, metadata and default templates).
documents[:5]

[Document(id_='73d35005-ca59-48ae-97ed-c2101114edf3', embedding=None, metadata={'sale_price': 31.93, 'brand': 'La Costeï¿½ï¿½a', 'category': 'Food | Meal Solutions, Grains & Pasta | Canned Goods | Canned Vegetables', 'available': True}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='c689b341afa6f28007625c316c7f15a89f51519df8b8e3b2ecb2bfc0b9f07618', text='https://www.walmart.com/ip/La-Costena-Chipotle-Peppers-7-OZ-Pack-of-12/139941530 \nLa Costena Chipotle Peppers, 7 OZ (Pack of 12) \n We aim to show you accurate product information. Manufacturers, suppliers and others provide what you see here, and we have not verified it. See our disclaimer |La Costena Chipotle Peppers, 7 OZ (Pack of 12) Easy open. Ready to serve! Product of Mexico.', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n'),
 Document(id_='6ed53629-fe11-4a07-b6e7-9bd896351eb6', embedding=No

___
NOT TODO: We might need to extend or redo the SimpleMongoReader to add information on the `relationships`, maybe by linking similar products (by category or brand, or other).

This might not be necessary, as relationships denote information on the order of the Nodes or on Parent/Child nodes; with Documents, it seems that only the SOURCE can be specified as a relationship (https://github.com/search?q=repo%3Arun-llama%2Fllama_index+DocumentRelationship&type=code). Therefore, maybe we don't need relationships in this case, and categories can be added as a list in the metadata.

Relationships can be set like this (as per Node info https://docs.llamaindex.ai/en/stable/module_guides/loading/documents_and_nodes/usage_nodes.html):
```
node[0].relationships[NodeRelationship.NEXT] = RelatedNodeInfo(
    node_id=node[1].node_id
)
```
___

TODO: we could even set the id of the document to the URL, for example by setting `documents[0].id_`.

Also, we might want to add information to the metadata that improves retrieval but is not added to the prompt, i.e. information that is relevant for ranking but not for the response: https://docs.llamaindex.ai/en/stable/module_guides/loading/documents_and_nodes/usage_documents.html#customizing-llm-metadata-text

This is what will be fed to the LLM (for the response) and to the embedding model (for ranking):

In [6]:
from llama_index.schema import MetadataMode

# Text the LLM receives when it writes the response.
print(documents[0].get_content(metadata_mode=MetadataMode.LLM))
print("------------")
# Text the embedding model receives, i.e. what retrieval ranks on.
# Both views are identical here because the Documents above list no excluded
# metadata keys for either mode.
print(documents[0].get_content(metadata_mode=MetadataMode.EMBED))


sale_price: 31.93
brand: La Costeï¿½ï¿½a
category: Food | Meal Solutions, Grains & Pasta | Canned Goods | Canned Vegetables
available: True

https://www.walmart.com/ip/La-Costena-Chipotle-Peppers-7-OZ-Pack-of-12/139941530 
La Costena Chipotle Peppers, 7 OZ (Pack of 12) 
 We aim to show you accurate product information. Manufacturers, suppliers and others provide what you see here, and we have not verified it. See our disclaimer |La Costena Chipotle Peppers, 7 OZ (Pack of 12) Easy open. Ready to serve! Product of Mexico.
------------
sale_price: 31.93
brand: La Costeï¿½ï¿½a
category: Food | Meal Solutions, Grains & Pasta | Canned Goods | Canned Vegetables
available: True

https://www.walmart.com/ip/La-Costena-Chipotle-Peppers-7-OZ-Pack-of-12/139941530 
La Costena Chipotle Peppers, 7 OZ (Pack of 12) 
 We aim to show you accurate product information. Manufacturers, suppliers and others provide what you see here, and we have not verified it. See our disclaimer |La Costena Chipotle Peppers,

DONE: I might have to rewrite a custom Reader in order to load the data from MongoDB, for example, I'd like some fields to be both in the FIELD_NAMES and metadata parts, but in the text I can only have strings, therefore I might need to make it custom so the fields that are not strings get transformed when added to the FIELD_NAMES. Also personalizing a bit more the FIELD_NAMES field.

## 2. Nodes (check)

Source: https://docs.llamaindex.ai/en/latest/module_guides/loading/documents_and_nodes/root.html#

- Document: container around a data source (PDF, API, data from DB).
- Node: represents a “chunk” of a source Document. Nodes have metadata that relate them to the document they are in and to other nodes.

We might not need to split Documents into Nodes, as each document holds little information; instead, we could try to have one Node per Document or, if possible, use the Documents directly.

In [None]:
# We might want to transform the Documents into Nodes and do certain transformations by using an Ingestion Pipeline. We can also use it to store the embeddings into a Vector Store.
# https://docs.llamaindex.ai/en/stable/module_guides/loading/ingestion_pipeline/root.html 
# Here are the possible transformations: https://docs.llamaindex.ai/en/stable/module_guides/loading/ingestion_pipeline/transformations.html
# For starters, we might want to use the SimpleFileNodeParser, and later on see if we can improve the Node parsing or only use the Documents: https://docs.llamaindex.ai/en/stable/module_guides/loading/ingestion_pipeline/transformations.html

In [None]:
# CAUTION: Do not run!!
# Sketch of an ingestion pipeline: split the Documents, extract titles, embed.
# NOTE(review): IngestionCache and asyncio are imported but not used in this cell.
# The Qdrant ingestion cell further down uses IngestionPipeline, TitleExtractor
# and OpenAIEmbedding without importing them, so it relies on these imports.
from llama_index.embeddings import OpenAIEmbedding
from llama_index.text_splitter import SentenceSplitter
from llama_index.extractors import TitleExtractor
from llama_index.ingestion import IngestionPipeline, IngestionCache
import asyncio

pipeline = IngestionPipeline(
    transformations=[
        SentenceSplitter(chunk_size=512, chunk_overlap=20),
        TitleExtractor(),
        OpenAIEmbedding(), # Embedding stage: the embeddings are what a vector index is later built from
    ]
)

# Run the pipeline over the loaded Documents to obtain Nodes.
nodes = pipeline.run(documents=documents) 
# NOTE: This fails in a Jupyter notebook, reportedly because an asyncio loop is
# already running there (see the nest_asyncio workaround in the next cell).

In [None]:
# The error about Notebooks can be avoided by running this:
# import nest_asyncio
# nest_asyncio.apply()

In [8]:
from llama_index.text_splitter import SentenceSplitter

# The ingestion pipeline fails in the notebook, so the splitting step is run on its own.
# Increasing chunk_size moves towards one Node per Document.
splitter = SentenceSplitter(chunk_size=1024, chunk_overlap=20)
nodes = splitter.get_nodes_from_documents(documents)

In [9]:
# Number of Nodes the splitter produced from the loaded Documents.
len(nodes)

52

A transformation is something that takes a list of nodes as an input, and returns a list of nodes. Currently, the following components are Transformation objects:
- [TextSplitter](https://docs.llamaindex.ai/en/stable/module_guides/loading/node_parsers/modules.html#text-splitters): In our case the only one that should work is SentenceSplitter, which attempts to split text while respecting the boundaries of sentences.
- [NodeParser](https://docs.llamaindex.ai/en/stable/module_guides/loading/node_parsers/modules.html): create nodes based on the type of content that is being parsed. Not suited for our case.
- [MetadataExtractor](https://docs.llamaindex.ai/en/stable/module_guides/loading/documents_and_nodes/usage_metadata_extractor.html): For now it might not be necessary, but **TODO: we can test some of these in the future**!!
    - `SummaryExtractor` - automatically extracts a summary over a set of Nodes.
    - `QuestionsAnsweredExtractor` - extracts a set of questions that each Node can answer.
    - `TitleExtractor` - extracts a title over the context of each Node.
    - `EntityExtractor` - extracts entities (i.e. names of places, people, things) mentioned in the content of each Node.
- [Embeddings model](https://docs.llamaindex.ai/en/stable/module_guides/models/embeddings.html#list-of-embeddings): Various integrations supported, **TODO: This will need to be changed depending on the model being used!!**

https://docs.llamaindex.ai/en/stable/module_guides/loading/ingestion_pipeline/transformations.html

In [None]:
# CAUTION: Do not run!!
# Ingestion pipeline that writes the nodes and their embeddings straight into a
# Qdrant collection, then builds an index on top of that vector store.
# Every name used below is imported here, so the cell no longer depends on the
# earlier (also "do not run") ingestion cell having been executed.
from qdrant_client import QdrantClient
from llama_index import VectorStoreIndex
from llama_index.embeddings import OpenAIEmbedding
from llama_index.extractors import TitleExtractor
from llama_index.ingestion import IngestionPipeline
from llama_index.text_splitter import SentenceSplitter
from llama_index.vector_stores import QdrantVectorStore

client = QdrantClient(location=":memory:")
# client = QdrantClient(path="path/to/db") # -> Path to Docker for persistent storage

vector_store = QdrantVectorStore(client=client, collection_name="test_store")

pipeline = IngestionPipeline(
    transformations=[
        SentenceSplitter(chunk_size=512, chunk_overlap=20),
        TitleExtractor(),
        # Embeddings must be a pipeline stage when a vector store is attached;
        # otherwise building the index from the store afterwards fails.
        OpenAIEmbedding(),
    ],
    vector_store=vector_store,
)

# Ingest directly into a vector db
# NOTE(review): the earlier ingestion cell reports that pipeline.run fails inside
# Jupyter; presumably the same applies here — confirm before relying on it.
nodes = pipeline.run(documents=documents)

# It also supports async operations
# nodes = await pipeline.arun(documents=documents)

# Create your index
index = VectorStoreIndex.from_vector_store(vector_store)

Note that in the above example, embeddings are calculated as part of the pipeline. If you are connecting your pipeline to a vector store, embeddings must be a stage of your pipeline or your later instantiation of the index will fail.

You can omit embeddings from your pipeline if you are not connecting to a vector store, i.e. just producing a list of nodes.

## 3. Index

https://docs.llamaindex.ai/en/latest/understanding/indexing/indexing.html#

An Index is a data structure composed of Document objects, designed to enable querying by an LLM. Your Index is designed to be complementary to your querying strategy.

A Vector Store Index turns all of your text into embeddings using an API from your LLM, this is what is meant when we say it “embeds your text”. If you have a lot of text, generating embeddings can take a long time since it involves many round-trip API calls. When you want to search your embeddings, your query is itself turned into a vector embedding, and then a mathematical operation is carried out by VectorStoreIndex to rank all the embeddings by how semantically similar they are to your query.

Once the ranking is complete, VectorStoreIndex returns the most-similar embeddings as their corresponding chunks of text. The number of embeddings it returns is known as k, so the parameter controlling how many embeddings to return is known as top_k. This whole type of search is often referred to as “top-k semantic retrieval” for this reason.

In [None]:
# index = SummaryIndex.from_documents(documents)
# This can be used with documents, but if we want to use embeddings for faster search, we need Nodes, as the Nodes have a fixed chunk size, I think...
# Source: https://docs.llamaindex.ai/en/latest/understanding/indexing/indexing.html#vector-store-index

In [11]:
from llama_index import VectorStoreIndex

# Build a vector index directly from the Documents, with progress reporting enabled.
index_from_docs = VectorStoreIndex.from_documents(documents, show_progress=True)

Parsing nodes: 100%|██████████| 50/50 [00:00<00:00, 1130.38it/s]
Generating embeddings:   0%|          | 0/52 [00:00<?, ?it/s]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Generating embeddings: 100%|██████████| 52/52 [00:01<00:00, 40.21it/s]


We can see that even though we have 50 documents, `from_documents` created 52 embeddings. This is because, when you use `from_documents`, your Documents are split into chunks and parsed into Node objects. We can control the Node creation settings through the ServiceContext:

In [19]:
from llama_index import ServiceContext
from llama_index.text_splitter import SentenceSplitter

# Control how Documents are chunked into Nodes by handing a custom splitter
# (chunk_size=512, chunk_overlap=20) to the ServiceContext.
text_splitter = SentenceSplitter(chunk_overlap=20, chunk_size=512)
service_context_custom = ServiceContext.from_defaults(text_splitter=text_splitter)

# Same index build as before, but using the custom service context.
index_from_docs_custom = VectorStoreIndex.from_documents(
    documents,
    show_progress=True,
    service_context=service_context_custom,
)

Parsing nodes: 100%|██████████| 50/50 [00:00<00:00, 975.97it/s]
Generating embeddings:   0%|          | 0/63 [00:00<?, ?it/s]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Generating embeddings: 100%|██████████| 63/63 [00:01<00:00, 50.29it/s]


And we can see that since we set the SentenceSplitter method (which is a NodeParser) we now have 63 embeddings.

In [18]:
# Display the index object built from the Documents (only its repr is shown).
index_from_docs

<llama_index.indices.vector_store.base.VectorStoreIndex at 0x14cfeb340>

In [12]:
# Build a vector index from the pre-split `nodes` instead of from the Documents.
index_from_nodes = VectorStoreIndex(nodes)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


## 4. Storing

Check best database for vector store: https://docs.llamaindex.ai/en/latest/module_guides/storing/vector_stores.html#

Qdrant allows storing the vectors together with their metadata, which it calls the Payload: https://qdrant.tech/documentation/concepts/payload/ 

When running Qdrant on Docker, we can access the UI, where we can visualize the vectors, see that the default configuration is set to cosine distance and the size of the vectors.

In [20]:
# https://docs.llamaindex.ai/en/latest/examples/vector_stores/QdrantIndexDemo.html#
from qdrant_client import QdrantClient
from llama_index.storage.storage_context import StorageContext
from llama_index.vector_stores.qdrant import QdrantVectorStore

# In-memory Qdrant instance: suitable for testing and CI/CD.
qdrant = QdrantClient(location=":memory:")
# OR, against a Qdrant server on localhost (persists changes to disk, fast prototyping):
# qdrant = QdrantClient("localhost", port=6333)

In [21]:
# Use the Qdrant client created above as the vector store behind the index:
# the store is wrapped in a StorageContext and passed to the index build.
vector_store = QdrantVectorStore(collection_name="products", client=qdrant)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
service_context = ServiceContext.from_defaults()

index = VectorStoreIndex.from_documents(
    documents,
    service_context=service_context,
    storage_context=storage_context,
)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


In [None]:
# We can also build the vector store asynchronously: https://docs.llamaindex.ai/en/latest/examples/vector_stores/QdrantIndexDemo.html#build-the-vectorstoreindex-asynchronously

## 5. Querying

https://docs.llamaindex.ai/en/latest/understanding/querying/querying.html#

Querying consists of three distinct stages:
1. **Retrieval** is when you find and return the most relevant documents for your query from your Index. As previously discussed in indexing, the most common type of retrieval is “top-k” semantic retrieval, but there are many other retrieval strategies.
2. **Postprocessing** is when the Nodes retrieved are optionally reranked, transformed, or filtered, for instance by requiring that they have specific metadata such as keywords attached or require that the retrieved nodes reach a minimum similarity score to be included.
3. **Response synthesis** is when your query, your most-relevant data and your prompt are combined and sent to your LLM to return a response.

In [30]:
# THIS IS THE VANILLA IMPLEMENTATION
query_engine = index.as_query_engine()
response = query_engine.query("What are the cheepest products?")

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


In [31]:
from IPython.display import Markdown, display

# Render the response text in bold in the cell output.
display(Markdown(f"<b>{response}</b>"))

<b>The cheapest products are not mentioned in the given context information.</b>

In [None]:
# We can also do the query async: https://docs.llamaindex.ai/en/latest/examples/vector_stores/QdrantIndexDemo.html#async-query-index

Here's the granular implementation, where we can configure each of the steps:

#### Retriever

TODO: we need to look into this for further customization: https://docs.llamaindex.ai/en/latest/module_guides/querying/retriever/root.html#

In [32]:
from llama_index.retrievers import VectorIndexRetriever

# Retriever over `index` that returns the 10 most similar results per query.
retriever = VectorIndexRetriever(similarity_top_k=10, index=index)

#### Postprocessing & Response synthesis

##### Postprocessing:
- Node postprocessors can provide filters and augmentation that can improve the relevancy of the retrieved Node objects and help reduce the time/number of LLM calls/cost.
- Here's a list of all Node Postprocessors: https://docs.llamaindex.ai/en/latest/api_reference/node_postprocessor.html# (each `pydantic model` is a postprocessor).

##### Response Synthesizer:
Here are the options supported: https://docs.llamaindex.ai/en/stable/module_guides/deploying/query_engine/response_modes.html ; REFINE seems like the best option for us, but we need to check what "create and refine" means in the documentation. We can also consider creating our own synthesizer: https://docs.llamaindex.ai/en/stable/module_guides/querying/response_synthesizers/root.html#custom-response-synthesizers

Then, we can even ensure a structured response with Pydantic: https://docs.llamaindex.ai/en/latest/module_guides/querying/structured_outputs/query_engine.html# 
This could be useful to extract a structure for analysis purposes and to provide links, prices, etc.


In [41]:
from llama_index import get_response_synthesizer
from llama_index.query_engine import RetrieverQueryEngine
from llama_index.postprocessor import SimilarityPostprocessor
from llama_index.response_synthesizers import ResponseMode

# Response synthesizer with default settings.
# response_mode=ResponseMode.REFINE was tried but did not work well; the
# default is best for now.
response_synthesizer = get_response_synthesizer()

# Query engine = retriever + similarity filter (cutoff 0.7) + synthesizer.
query_engine = RetrieverQueryEngine(
    node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=0.7)],
    response_synthesizer=response_synthesizer,
    retriever=retriever,
)

In [42]:
# Run a query through the hand-assembled engine (retriever + postprocessor + synthesizer).
response = query_engine.query("What light bulbs do you have?")
print(response)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
We have a variety of light bulbs available, including the following options:
- 03102 Light Bulb, Vanity Globe, Clear, 720 Lumens, 60-Watts - Quantity 1
- AduroSmart ERIA Soft White Smart A19 Light Bulb CRI 90+, 60W Equivalent, Hub Required


Now we try with the Pydantic pattern

In [48]:
from typing import List
from pydantic import BaseModel

# NOTE: This structure seems to be too complicated!!

# One product entry of the structured response: its name and its URL.
# NOTE(review): the docstring is left unchanged because pydantic exposes it as
# the schema description, which may be sent to the LLM — confirm before editing.
class Product(BaseModel):
    """Data model for a product"""
    name: str
    product_url: str

# First attempt at a structured response: a list of Product entries plus free text.
# NOTE(review): the next cell defines another class with the same name, which
# replaces this one; only that later definition is passed as output_cls.
class Response(BaseModel):
    """Data model for a response."""
    products: List[Product]
    response: str

In [52]:
# Simplified structured response: just a list of product strings.
# NOTE(review): reuses the name of the Response class from the previous cell and
# therefore replaces it; this is the class later passed as output_cls.
class Response(BaseModel):
    """Data model for a response."""
    products: List[str]

In [53]:
# Default synthesizer again, now with the Response model as output_cls so the
# answer comes back in that structure.
response_synthesizer = get_response_synthesizer(
    # response_mode=ResponseMode.REFINE -> REFINE did not work well; the default is best for now
    output_cls = Response
)

In [54]:
# Rebuild the query engine with the structured-output synthesizer; the retriever
# and the similarity filter (cutoff 0.7) are the same as in the earlier engine.
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=response_synthesizer,
    node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=0.7)],
)

In [55]:
# Same question as before; the printed response is now the structured Response output.
response = query_engine.query("What light bulbs do you have?")
print(response)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
{"products":["03102 Light Bulb, Vanity Globe, Clear, 720 Lumens, 60-Watts - Quantity 1","AduroSmart ERIA Soft White Smart A19 Light Bulb CRI 90+, 60W Equivalent, Hub Required"]}


## 6. Prompts

- First check this: https://docs.llamaindex.ai/en/stable/module_guides/deploying/chat_engines/usage_pattern.html
- https://docs.llamaindex.ai/en/latest/module_guides/models/prompts.html# 
- https://docs.llamaindex.ai/en/stable/examples/customization/prompts/chat_prompts.html
- Investigate LangChain: I think that LangChain should be used to identify when RAG is needed and maybe to add ReAct on top, but the RAG prompting can be made with Llama-Index.

## 7. Evaluating

https://docs.llamaindex.ai/en/latest/understanding/evaluating/evaluating.html

There are some open-source tools (e.g., DeepEval) that help us debug and evaluate the results: https://docs.llamaindex.ai/en/latest/module_guides/observability/observability.html#deepeval

DeepEval has different metrics: latency, hallucination, faithfulness, summarization, ...

## 8. All together

https://docs.llamaindex.ai/en/latest/understanding/putting_it_all_together/putting_it_all_together.html#