# test SentenceSpliter

In [1]:
from llama_index.schema import MetadataMode
from llama_index.ingestion import IngestionPipeline
from llama_index.embeddings import OpenAIEmbedding
from llama_index.text_splitter import SentenceSplitter
from llama_index.extractors import (
    SummaryExtractor,
    KeywordExtractor,
    QuestionsAnsweredExtractor,
)

from rag_chat.data.mongo import mongodb_uri
from rag_chat.data.mongo.document_reader import CustomMongoReader

In [2]:
DB_NAME = "products"
COLLECTION_NAME = "data"
FIELD_NAMES = ["product_url", "product_name", "brand", "description", "available", 
                "sale_price", "discount"]
SEPARATOR = " \n\n"
QUERY_DICT = {"description": { "$type": "string" }}
MAX_DOCS = 10
METADATA_NAMES = ["uniq_id", "list_price", "category"]
EXCLUDED_LLM_METADATA_KEYS = ["uniq_id"]

NODE_CHUNK_SIZE = 512 # Token chunk size
NODE_CHUNK_OVERLAP = 128

In [3]:
reader = CustomMongoReader(uri=mongodb_uri)
documents = reader.load_data(
    DB_NAME, 
    COLLECTION_NAME, 
    FIELD_NAMES, 
    separator = SEPARATOR, 
    query_dict=QUERY_DICT,
    max_docs = MAX_DOCS,
    metadata_names = METADATA_NAMES,
    metadata_seperator = SEPARATOR,
    excluded_llm_metadata_keys = EXCLUDED_LLM_METADATA_KEYS
)

In [4]:
pipeline = IngestionPipeline(
    transformations=[
        SentenceSplitter( # Node Parser
            chunk_size=NODE_CHUNK_SIZE, 
            chunk_overlap=NODE_CHUNK_OVERLAP,
            # include_prev_next_rel=False
            ),
    ]
)

In [5]:
len(documents)

10

In [6]:
nodes = pipeline.run(documents=documents)

In [7]:
len(nodes)

12

In [8]:
for i, node in enumerate(nodes):
    print(len(node.text))
    print(len(documents[i].text))

818
818
1835
1835
1179
1179
1809
3108
1701
563
563
2243
910
977
1698
529
977
1521
529
1090
1521


IndexError: list index out of range

In [9]:
print("Docuemnt 0:", documents[3:4])
print("Node 0:", nodes[3:4])
print("Document 0:", documents[4:5])
print("Node 0:", nodes[4:5])

Docuemnt 0: [Document(id_='5791457e-81af-43e4-86e4-9c873091edf3', embedding=None, metadata={'uniq_id': 'd32c8b6f2ead69d851e336f5397ec527', 'list_price': 17.44, 'category': 'Health > Caregiver > Multivitamins'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=['uniq_id'], relationships={}, hash='dfff3ca943f668b39c7637c4927408a5f688950e211cd0c0c44095c6b1a48a29', text='product_url: https://www.walmart.com/ip/Centrum-Silver-Men-200-Count-Complete-Multivitamin-Multimineral-Supplement-Tablet-Vitamin-D3-B-Vitamins-Zinc-Age-50/12081664 \n\nproduct_name: Centrum Silver Men (200 Count) Complete Multivitamin / Multimineral Supplement Tablet, Vitamin D3, B Vitamins, Zinc, Age 50+ \n\nbrand: Centrum \n\ndescription: Centrum Silver Men Multivitamin and Multimineral Supplement is the most complete supplement to help support the health of men age 50 and older. As men age, its important that they get enough Magnesium, Vitamin D and Vitamin B6 to help maintain healthy muscles. Centrum Silver

In [10]:
pipeline_2 = IngestionPipeline(
    transformations=[
        SentenceSplitter( # Node Parser
            chunk_size=NODE_CHUNK_SIZE, 
            chunk_overlap=NODE_CHUNK_OVERLAP,
            include_prev_next_rel=False
            ),
    ]
)

In [11]:
nodes = pipeline_2.run(documents=documents)

In [12]:
len(nodes)

12

In [13]:
print("Docuemnt 0:", documents[3:4])
print("Node 0:", nodes[3:4])
print("Document 0:", documents[4:5])
print("Node 0:", nodes[4:5])

Docuemnt 0: [Document(id_='5791457e-81af-43e4-86e4-9c873091edf3', embedding=None, metadata={'uniq_id': 'd32c8b6f2ead69d851e336f5397ec527', 'list_price': 17.44, 'category': 'Health > Caregiver > Multivitamins'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=['uniq_id'], relationships={}, hash='dfff3ca943f668b39c7637c4927408a5f688950e211cd0c0c44095c6b1a48a29', text='product_url: https://www.walmart.com/ip/Centrum-Silver-Men-200-Count-Complete-Multivitamin-Multimineral-Supplement-Tablet-Vitamin-D3-B-Vitamins-Zinc-Age-50/12081664 \n\nproduct_name: Centrum Silver Men (200 Count) Complete Multivitamin / Multimineral Supplement Tablet, Vitamin D3, B Vitamins, Zinc, Age 50+ \n\nbrand: Centrum \n\ndescription: Centrum Silver Men Multivitamin and Multimineral Supplement is the most complete supplement to help support the health of men age 50 and older. As men age, its important that they get enough Magnesium, Vitamin D and Vitamin B6 to help maintain healthy muscles. Centrum Silver

# Test Metadata Extractors

In [42]:
from llama_index.schema import MetadataMode
from llama_index.ingestion import IngestionPipeline
from llama_index.embeddings import OpenAIEmbedding
from llama_index.text_splitter import SentenceSplitter
from llama_index.extractors import (
    SummaryExtractor,
    KeywordExtractor,
    EntityExtractor,
    TitleExtractor,
    QuestionsAnsweredExtractor,
)

from rag_chat.data.mongo import mongodb_uri
from rag_chat.data.mongo.document_reader import CustomMongoReader

In [3]:
DB_NAME = "products"
COLLECTION_NAME = "data"
FIELD_NAMES = ["product_url", "product_name", "brand", "description", "available", 
                "sale_price", "discount"]
SEPARATOR = " \n\n"
QUERY_DICT = {"description": { "$type": "string" }}
MAX_DOCS = 10
METADATA_NAMES = ["uniq_id", "list_price", "category"]
EXCLUDED_LLM_METADATA_KEYS = ["uniq_id"]

NODE_CHUNK_SIZE = 512 # Token chunk size
NODE_CHUNK_OVERLAP = 128

In [4]:
reader = CustomMongoReader(uri=mongodb_uri)
documents = reader.load_data(
    DB_NAME, 
    COLLECTION_NAME, 
    FIELD_NAMES, 
    separator = SEPARATOR, 
    query_dict=QUERY_DICT,
    max_docs = MAX_DOCS,
    metadata_names = METADATA_NAMES,
    metadata_seperator = SEPARATOR,
    excluded_llm_metadata_keys = EXCLUDED_LLM_METADATA_KEYS
)

In [30]:
DEFAULT_SUMMARY_EXTRACT_TEMPLATE = """\
Here is the content of the product:
{context_str}

Summarize the key features of the product. \
Do not mention the category, URL, price nor discount information. \

Summary: """

In [31]:
metadata_extractors = [
    SummaryExtractor(
        summaries=["self"],
        prompt_template=DEFAULT_SUMMARY_EXTRACT_TEMPLATE
    ),
    KeywordExtractor(keywords=10),
]

In [32]:
pipeline = IngestionPipeline(
    transformations=[
        SentenceSplitter( # Node Parser
            chunk_size=NODE_CHUNK_SIZE, 
            chunk_overlap=NODE_CHUNK_OVERLAP
            ),
        *metadata_extractors,
    ]
)

In [36]:
# To run async operations on Jupyter Notebook
import nest_asyncio
nest_asyncio.apply()

# Show logging
import logging
import sys
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

In [33]:
nodes = pipeline.run(documents=documents)

  0%|          | 0/12 [00:00<?, ?it/s]

100%|██████████| 12/12 [00:12<00:00,  1.05s/it]
100%|██████████| 12/12 [00:10<00:00,  1.10it/s]


In [34]:
nodes[:1]

[TextNode(id_='604366f8-8b80-4580-9c2f-e88b04cf8c90', embedding=None, metadata={'uniq_id': 'f1aee81991548da1c5085395ea033a84', 'list_price': 11.52, 'category': 'Home > Kids Rooms > Kids Storage', 'section_summary': 'The key features of the product include a shoe box with a fold-out play mat, a top carry handle, and a fun city-theme. It is collapsible or can be stacked for modularity. The product is designed to teach children to put away their toys while having fun playing.', 'excerpt_keywords': 'Toytainer, Shoe Box Play-N-Store, Boy, teach, children, toys, play, put away, fold-out play mat, city-theme'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=['uniq_id'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='bb29d17c-e70b-4d62-8325-2efc337135a8', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'uniq_id': 'f1aee81991548da1c5085395ea033a84', 'list_price': 11.52, 'category': 'Home > Kids Rooms > Kids Storage'}, hash='d0530c10c91a260a36f7db237310444b9

In [38]:
type(documents[0])

llama_index.schema.Document

In [45]:
type(OpenAIEmbedding)

pydantic.v1.main.ModelMetaclass

In [46]:
type(nodes[0])

llama_index.schema.TextNode

In [47]:
from llama_index import VectorStoreIndex
index = VectorStoreIndex(nodes)
type(index)

  from . import loading


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


llama_index.indices.vector_store.base.VectorStoreIndex

In [58]:
from llama_index.schema import Document
import uuid

doc = Document(text="hola", id_=str(uuid.uuid4()))

In [59]:
doc

Document(id_='061c3988-481d-42f4-974d-54b89e97b3f7', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='7b7bfe7a832e493804ecfa1fa87848c2dfc9320ac22301b8a420a264cbaa8e28', text='hola', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n')