In [1]:
from tqdm import tqdm
import os
import json
import uuid
from tqdm import tqdm
from langchain_chroma import Chroma
from langchain.storage import InMemoryStore
from langchain.schema.document import Document
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain_huggingface import HuggingFaceEmbeddings
from transformers import pipeline
from chromadb import PersistentClient
import time

In [4]:
embedding_function = HuggingFaceEmbeddings(model_name="sentence-transformers/msmarco-roberta-base-v3")

In [5]:
def load_existing_summaries(output_file):
    existing_summaries = {}
    
    if os.path.exists(output_file):
        with open(output_file, "r", encoding="utf-8") as f:
            for line in f:
                try:
                    entry = json.loads(line.strip())
                    existing_summaries[entry["text"]] = entry["summary"]  # Store text-content as key
                except json.JSONDecodeError:
                    print(f"⚠️ Skipping corrupted line in {output_file}")
    
    return existing_summaries

In [6]:
text_summaries = load_existing_summaries("text_summaries.jsonl")
table_summaries = load_existing_summaries("table_summaries.jsonl")

In [7]:
len(text_summaries)

1011

In [41]:
texts = list(text_summaries.keys())
tables = list(table_summaries.keys())

In [44]:
text_summaries = list(text_summaries.values())
table_summaries = list(table_summaries.values())

In [43]:
# The vectorstore to use to index the child chunks
client = PersistentClient(path="./chroma_db")

vectorstore = Chroma(client=client,collection_name="multi_modal_rag", embedding_function=embedding_function)

# The storage layer for the parent documents
store = InMemoryStore()
id_key = "doc_id"

# The retriever (empty to start)
retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    docstore=store,
    id_key=id_key,
)

In [45]:
# Generate document IDs with progress bar
print("Generating document IDs...")
doc_ids = [str(uuid.uuid4()) for _ in tqdm(texts, desc="Generating Text IDs")]
table_ids = [str(uuid.uuid4()) for _ in tqdm(tables, desc="Generating Table IDs")]

# Create summary document objects with progress bar
print("Creating text document objects...")
summary_texts = [Document(page_content=summary, metadata={id_key: doc_ids[i]}) 
                 for i, summary in tqdm(enumerate(text_summaries), desc="Creating Text Docs", total=len(text_summaries))]

print("Creating table document objects...")
summary_tables = [Document(page_content=summary, metadata={id_key: table_ids[i]}) 
                  for i, summary in tqdm(enumerate(table_summaries), desc="Creating Table Docs", total=len(table_summaries))]

# Add texts to vectorstore with progress bar
print("Adding text summaries to vectorstore...")
for doc in tqdm(summary_texts, desc="Storing Texts in Vectorstore"):
    retriever.vectorstore.add_documents([doc])

print("Adding text summaries to docstore...")
for doc_id, text in tqdm(zip(doc_ids, texts), desc="Storing Texts in Docstore", total=len(texts)):
    retriever.docstore.mset([(doc_id, text)])

# Add tables to vectorstore with progress bar
print("Adding table summaries to vectorstore...")
for doc in tqdm(summary_tables, desc="Storing Tables in Vectorstore"):
    retriever.vectorstore.add_documents([doc])

print("Adding table summaries to docstore...")
for table_id, table in tqdm(zip(table_ids, tables), desc="Storing Tables in Docstore", total=len(tables)):
    retriever.docstore.mset([(table_id, table)])

# Persist the database
# print("Saving vectorstore...")
# client.persist()
print("✅ Database saved successfully!")


Generating document IDs...


Generating Text IDs: 100%|██████████████████████████| 1011/1011 [00:00<00:00, 108975.16it/s]
Generating Table IDs: 100%|███████████████████████████| 394/394 [00:00<00:00, 117477.48it/s]


Creating text document objects...


Creating Text Docs: 100%|████████████████████████████| 1011/1011 [00:00<00:00, 73704.51it/s]


Creating table document objects...


Creating Table Docs: 100%|████████████████████████████| 394/394 [00:00<00:00, 332172.02it/s]


Adding text summaries to vectorstore...


Storing Texts in Vectorstore: 100%|█████████████████████| 1011/1011 [01:55<00:00,  8.76it/s]


Adding text summaries to docstore...


Storing Texts in Docstore: 100%|████████████████████| 1011/1011 [00:00<00:00, 400532.86it/s]


Adding table summaries to vectorstore...


Storing Tables in Vectorstore: 100%|██████████████████████| 394/394 [00:28<00:00, 14.04it/s]


Adding table summaries to docstore...


Storing Tables in Docstore: 100%|█████████████████████| 394/394 [00:00<00:00, 547929.63it/s]

✅ Database saved successfully!





In [46]:
# Function to load and query the database
def query_database(query_text, top_k=3):
    vectorstore = Chroma(persist_directory="chroma_db", collection_name="multi_modal_rag", embedding_function=embedding_function)
    retriever = vectorstore.as_retriever(search_kwargs={"k": top_k})
    results = retriever.get_relevant_documents(query_text)
    return results


In [50]:
query_text = "What is the maturity amount for jeevan lakshya give table"
ans = query_database(query_text)
ans

[Document(id='a0a4e6c8-748e-4b62-abbb-eb1a2591f944', metadata={'doc_id': '48772028-983b-4ba5-83c6-b53605f80746'}, page_content='LIC’s Jeevan Lakshya is a participating, non-linked, individual life insurance savings plan that combines protection and savings. It provides an annual income benefit to support the family, particularly children, in case of the policyholder\'s death before maturity, along with a lump sum maturity amount regardless of the policyholder\'s survival. The plan is available offline through agents, brokers, and other intermediaries. Key features include protection and savings, installment payment options, optional rider benefits for enhanced coverage, high sum assured rebates, and loan facilities. Eligibility requires a minimum entry age of 18 years and a maximum entry age of 50 years. The minimum policy term is 13 years, and the maximum is 25 years, with a premium paying term of policy term minus 3 years. The minimum basic sum assured is Rs. 2,00,000, with no maximu

In [52]:
def query_document(query_text, retriever, top_k=3):
    """
    Retrieve both the original text and its summary from the database.
    
    Parameters:
    - query_text (str): The input query.
    - retriever (MultiVectorRetriever): The retriever object.
    - top_k (int): Number of top relevant documents to retrieve.

    Returns:
    - results (list of dict): A list of retrieved documents with text and summary.
    """
    # Retrieve relevant document summaries
    retrieved_docs = retriever.vectorstore.similarity_search(query_text, k=top_k)

    results = []
    for doc in retrieved_docs:
        doc_id = doc.metadata["doc_id"]  # Extract doc_id
        original_text = retriever.docstore.mget([doc_id])[0]  # Fetch full text from docstore
        results.append({
            "summary": doc.page_content,  # Retrieved summary
            "original_text": original_text  # Full document text
        })
    
    return results

# Example usage:
query_text = "jeevan anand"
retrieved_docs = query_document(query_text, retriever)

for idx, doc in enumerate(retrieved_docs):
    print(f"\n🔹 Result {idx + 1}:")
    print(f"📌 **Summary**: {doc['summary']}")
    print(f"📖 **Original Text**: {doc['original_text'][:500]}...")  # Show first 500 chars



🔹 Result 1:
📌 **Summary**: The table provides information for the "Lic’s Jeevan Umany B" insurance plan, which is a non-linked, individual, savings, whole life insurance plan. Its unique identification number is 512N312V03. The GST rate is 4.50% for the first year and 2.25% from the second year onwards.

📖 **Original Text**: <table><thead><tr><th colspan="2">Proposal No:</th></tr><tr><th>Remit Product:</th><th>Lic’s Jeevan Umany B</th></tr></thead><tbody><tr><td>Tag Line: |</td><td>(A Par, Non-Linked, Individual, Savings, Whole Life Insurance Plan)</td></tr><tr><td>Unique Identification No:</td><td>512N312V03</td></tr><tr><td>GST Rate (1st Year):</td><td>4.50%</td></tr><tr><td>GST Rate (2nd Year onwards):</td><td>225%</td></tr></tbody></table>...

🔹 Result 2:
📌 **Summary**: LIC’s New Jeevan Anand (UIN:512N279V03) is a non-linked insurance policy. Key details include instalment premium, mode of premium payment, premium payment term, policy term, basic sum assured, sum assured on death 