In [20]:
from tqdm import tqdm
import os
import json
import uuid
from tqdm import tqdm
from langchain_chroma import Chroma
from langchain.storage import InMemoryStore
from langchain.schema.document import Document
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain_huggingface import HuggingFaceEmbeddings
from transformers import pipeline

In [3]:
CHUNKS_FOLDER = "processed_chunks"

In [4]:
def data_info(folder_path = "processed_chunks"):
    json_files = [f for f in os.listdir(folder_path) if f.endswith(".json")]
    total_text = 0
    total_table = 0
    for json_file in tqdm(json_files, desc="Loading chunks", unit="file"):
        file_path = os.path.join(folder_path, json_file)
        composite_count = 0
        table_count = 0
        with open(file_path, "r", encoding="utf-8") as f:
            print(file_path)
            chunk_data = json.load(f)
            # Convert dictionaries back into unstructured objects
            for chunk in chunk_data:
                
                
                element_type = chunk.get("type")
                if element_type == "Table":
                    table_count+=1
                    total_table += 1
                    # print(chunk)
                elif element_type == "CompositeElement":
                    composite_count+=1
                    total_text += 1
            print("Number of Text Elements in this pdf: ",composite_count , "\nNumber of broad tables: ",table_count)
            print("\n")
    print("Total Text chunks: ", total_text, "\nTotal Tables: ", total_table)

In [5]:
data_info("processed_chunks")

Loading chunks:  31%|███████▍                | 24/78 [00:00<00:00, 234.43file/s]

processed_chunks/LIC’s Digi Credit Life - Policy Document .pdf.json
Number of Text Elements in this pdf:  17 
Number of broad tables:  0


processed_chunks/New Money Back Plan-25 years - CIS  New Money Back Plan-25 years .pdf.json
Number of Text Elements in this pdf:  5 
Number of broad tables:  0


processed_chunks/LIC’s Yuva Credit Life - Policy Document .pdf.json
Number of Text Elements in this pdf:  17 
Number of broad tables:  0


processed_chunks/New Jeevan Amar - Policy Document .pdf.json
Number of Text Elements in this pdf:  17 
Number of broad tables:  0


processed_chunks/New Jeevan Anand - Policy Document .pdf.json
Number of Text Elements in this pdf:  25 
Number of broad tables:  0


processed_chunks/LIC’s Digi Term - CIS LIC’s Digi Term .pdf.json
Number of Text Elements in this pdf:  4 
Number of broad tables:  0


processed_chunks/Accident Benefit Rider - Policy Document .pdf.json
Number of Text Elements in this pdf:  6 
Number of broad tables:  0


processed_chunks/New T

Loading chunks: 100%|████████████████████████| 78/78 [00:00<00:00, 262.23file/s]

processed_chunks/Jeevan Tarun - CIS  Jeevan Tarun .pdf.json
Number of Text Elements in this pdf:  5 
Number of broad tables:  0


processed_chunks/LIC’s Bima Shree - CIS LIC’s Bima Shree .pdf.json
Number of Text Elements in this pdf:  4 
Number of broad tables:  0


processed_chunks/New Money Back Plan- 20 Years - CIS  New Money Back Plan-20 years .pdf.json
Number of Text Elements in this pdf:  5 
Number of broad tables:  0


processed_chunks/Single Premium Endowment Plan - CIS  Single Premium Endowment Plan .pdf.json
Number of Text Elements in this pdf:  4 
Number of broad tables:  0


processed_chunks/Bima Ratna - CIS  Bima Ratna .pdf.json
Number of Text Elements in this pdf:  5 
Number of broad tables:  0


processed_chunks/LIC’s Yuva Term - CIS LIC’s Yuva Term .pdf.json
Number of Text Elements in this pdf:  5 
Number of broad tables:  0


processed_chunks/LIC’s Digi Term - Policy Document .pdf.json
Number of Text Elements in this pdf:  17 
Number of broad tables:  0


processed_chu




In [6]:
def prepare_lists(folder_path = "processed_chunks"):
    json_files = [f for f in os.listdir(folder_path) if f.endswith(".json")]
    tables = []
    texts = []
    orig_tables = 0
    total_text = 0
    total_table = 0
    for json_file in tqdm(json_files, desc="Loading chunks", unit="file"):
        file_path = os.path.join(folder_path, json_file)
        composite_count = 0
        table_count = 0
        with open(file_path, "r", encoding="utf-8") as f:
            print(file_path)
            chunks = json.load(f)
            # Convert dictionaries back into unstructured objects
            for chunk in chunks:
                element_type = chunk.get("type")
                if element_type == "Table":
                    table_count+=1
                    total_table += 1
                    # print(chunk)
                elif element_type == "CompositeElement":
                    composite_count+=1
                    total_text += 1
                    texts.append(chunk["text"])
                for element in chunk["metadata"]["orig_elements"]:
                    if "text_as_html" in element["metadata"] and element["type"] == "Table":
                        tables.append(element["metadata"]["text_as_html"])
                        orig_tables += 1
                        
            print("Number of Text Elements in this pdf: ",composite_count , "\nNumber of broad tables: ",table_count)
            print("Number of original tables: ", orig_tables)
            print("\n")
    print("Total Text chunks: ", total_text, "\nTotal Tables: ", total_table)
    return tables , texts

In [7]:
tables , texts = prepare_lists("processed_chunks")

Loading chunks:   0%|                                  | 0/78 [00:00<?, ?file/s]

processed_chunks/LIC’s Digi Credit Life - Policy Document .pdf.json
Number of Text Elements in this pdf:  17 
Number of broad tables:  0
Number of original tables:  2


processed_chunks/New Money Back Plan-25 years - CIS  New Money Back Plan-25 years .pdf.json
Number of Text Elements in this pdf:  5 
Number of broad tables:  0
Number of original tables:  4


processed_chunks/LIC’s Yuva Credit Life - Policy Document .pdf.json
Number of Text Elements in this pdf:  17 
Number of broad tables:  0
Number of original tables:  8


processed_chunks/New Jeevan Amar - Policy Document .pdf.json
Number of Text Elements in this pdf:  17 
Number of broad tables:  0
Number of original tables:  11


processed_chunks/New Jeevan Anand - Policy Document .pdf.json
Number of Text Elements in this pdf:  25 
Number of broad tables:  0
Number of original tables:  19


processed_chunks/LIC’s Digi Term - CIS LIC’s Digi Term .pdf.json
Number of Text Elements in this pdf:  4 
Number of broad tables:  0
Number of 

Loading chunks:  78%|██████████████████▊     | 61/78 [00:00<00:00, 307.40file/s]

processed_chunks/Saral Jeevan Bima - Sales Brochure .pdf.json
Number of Text Elements in this pdf:  9 
Number of broad tables:  0
Number of original tables:  160


processed_chunks/LIC’s Digi Term - Sales Brochure .pdf.json
Number of Text Elements in this pdf:  13 
Number of broad tables:  0
Number of original tables:  171


processed_chunks/New Children's Money Back Plan - CIS  New Children's Money Back Plan .pdf.json
Number of Text Elements in this pdf:  5 
Number of broad tables:  0
Number of original tables:  173


processed_chunks/Jeevan Azad - Sales Brochure .pdf.json
Number of Text Elements in this pdf:  17 
Number of broad tables:  0
Number of original tables:  185


processed_chunks/Premium Waiver Benefit Rider - Policy Document .pdf.json
Number of Text Elements in this pdf:  7 
Number of broad tables:  0
Number of original tables:  186


processed_chunks/Jeevan Utsav - Sales Brochure .pdf.json
Number of Text Elements in this pdf:  18 
Number of broad tables:  0
Number of orig

Loading chunks: 100%|████████████████████████| 78/78 [00:00<00:00, 315.81file/s]

processed_chunks/Amritbaal - CIS  Amritbaal .pdf.json
Number of Text Elements in this pdf:  5 
Number of broad tables:  0
Number of original tables:  361


processed_chunks/Bima Ratna - Sales Brochure .pdf.json
Number of Text Elements in this pdf:  19 
Number of broad tables:  0
Number of original tables:  377


processed_chunks/Accidental Death & Disability Benefit Rider - Policy Document .pdf.json
Number of Text Elements in this pdf:  8 
Number of broad tables:  0
Number of original tables:  378


processed_chunks/LIC’s Yuva Credit Life - Sales Brochure .pdf.json
Number of Text Elements in this pdf:  10 
Number of broad tables:  0
Number of original tables:  385


processed_chunks/New Children's Money Back Plan - Sales Brochure .pdf.json
Number of Text Elements in this pdf:  18 
Number of broad tables:  0
Number of original tables:  396


processed_chunks/Jeevan Tarun - Sales Brochure .pdf.json
Number of Text Elements in this pdf:  18 
Number of broad tables:  0
Number of original ta




In [14]:
# Initialize the summarization model (BART-large-cnn)
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

Device set to use mps:0


In [13]:
embedding_function = HuggingFaceEmbeddings(model_name="sentence-transformers/msmarco-roberta-base-v3")

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/678 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.27k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [17]:
# The vectorstore to use to index the child chunks
vectorstore = Chroma(collection_name="multi_modal_rag", embedding_function=embedding_function)

# The storage layer for the parent documents
store = InMemoryStore()
id_key = "doc_id"

# The retriever (empty to start)
retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    docstore=store,
    id_key=id_key,
)

In [18]:
# Function to summarize texts
def summarize_texts(text_chunks):
    return [summarizer(text, max_length=200, min_length=50, do_sample=False)[0]['summary_text'] for text in text_chunks]

In [19]:
# Generate summaries
text_summaries = summarize_texts(texts)
table_summaries = summarize_texts(tables)

# Add texts
doc_ids = [str(uuid.uuid4()) for _ in texts]
summary_texts = [
    Document(page_content=summary, metadata={id_key: doc_ids[i]}) for i, summary in enumerate(text_summaries)
]
retriever.vectorstore.add_documents(summary_texts)
retriever.docstore.mset(list(zip(doc_ids, texts)))

# Add tables
table_ids = [str(uuid.uuid4()) for _ in tables]
summary_tables = [
    Document(page_content=summary, metadata={id_key: table_ids[i]}) for i, summary in enumerate(table_summaries)
]
retriever.vectorstore.add_documents(summary_tables)
retriever.docstore.mset(list(zip(table_ids, tables)))

# Persist the database
vectorstore.persist()


Your max_length is set to 200, but your input_length is only 24. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=12)
Your max_length is set to 200, but your input_length is only 94. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=47)


KeyboardInterrupt: 

In [24]:
def dynamic_summarize(text, prompt_template):
    """Adjust max_length dynamically and prepend a custom prompt for better summarization."""
    input_length = len(text.split())  # Count words
    max_len = min(150, max(30, int(input_length * 0.7)))  
    min_len = min(50, max(10, int(input_length * 0.3)))  
    
    # Prepend the custom prompt
    full_prompt = f"{prompt_template}\n{text}"
    
    return summarizer(full_prompt, max_length=max_len, min_length=min_len, do_sample=False)[0]['summary_text']

In [33]:
def dynamic_summarize(text, prompt_template, target_length=500):
    """Adjust max_length dynamically and prepend a custom prompt for better summarization.
       Allows for a target summary length, dynamically adjusts if input is shorter.
    """
    input_length = len(text.split())  # Count words

    # Adaptive max_len, but capped at target_length
    max_len = min(target_length, max(30, int(input_length * 0.7)))
    min_len = min(50, max(10, int(input_length * 0.3)))

    # Prepend the custom prompt
    full_prompt = f"{prompt_template}\n{text}"

    try:  # Wrap in a try-except to catch potential summarization errors
        summary = summarizer(full_prompt, max_length=max_len, min_length=min_len, do_sample=False)[0]['summary_text']
        print(summary,"\n\n\n")
        return summary
    except Exception as e:
        print(f"Error during summarization: {e}")  # Handle or log the error
        return "Error generating summary." # Or return a default summary, or raise the exception if you want to stop execution



In [26]:
for t in texts :
    print(len(t))

2990
3324
7120
4406
3091
2930
2071
2008
2252
2175
3319
2469
2583
4883
5454
5625
54
3465
2645
2068
2364
337
3367
3144
7016
4500
3088
2930
2068
2012
2249
2230
3262
2467
2630
4883
5454
5625
54
2538
4876
6535
4844
2193
2637
2267
3888
3562
3478
2153
2192
2480
2685
5711
5745
4366
2682
3059
2002
2833
3891
6040
2674
2077
2456
2020
5530
2025
2599
2350
3304
3907
2653
2147
2500
5866
5722
4357
4000
6103
58
3074
3141
2753
817
3495
3581
2947
2101
2264
943
3026
3169
2497
1170
2251
2671
2030
2663
2789
2044
867
2436
2105
2047
2183
2114
4720
2302
2038
2239
4646
2088
216
2593
2129
2152
2520
5379
2113
152
2579
3046
7905
6885
2362
2354
2412
2398
2071
6260
3519
3819
3710
2384
3278
2312
7647
5700
4320
1373
2919
2122
2009
2119
1461
3048
2005
2255
2036
284
2639
4348
6716
6015
4699
2032
2395
2019
6291
4176
3837
2194
4555
2156
2083
2760
6024
5413
4672
10
2902
2040
2561
2155
237
2883
4013
2641
2148
2179
2057
4418
2122
153
2597
3038
8011
6482
4486
2316
2397
2068
5331
4668
3821
2247
4030
3692
2153
2624
5679
5699
43

In [34]:
prompt_template = """
You are an assistant tasked with summarizing tables and text.
Give a concise summary of the table or text.

Respond only with the summary, no additionnal comment.
Do not start your message by saying "Here is a summary" or anything like that.
Just give the summary as it is.

Table or text chunk: {element}

"""
# Generate summaries with progress bar
print("Summarizing text chunks...")
text_summaries = [dynamic_summarize(text, prompt_template) 
                  for text in tqdm(texts, desc="Summarizing Texts")]

print("Summarizing table chunks...")
table_summaries = [dynamic_summarize(table, prompt_template) 
                   for table in tqdm(tables, desc="Summarizing Tables")]

# Generate document IDs with progress bar
print("Generating document IDs...")
doc_ids = [str(uuid.uuid4()) for _ in tqdm(texts, desc="Generating Text IDs")]
table_ids = [str(uuid.uuid4()) for _ in tqdm(tables, desc="Generating Table IDs")]

# Create summary document objects with progress bar
print("Creating text document objects...")
summary_texts = [Document(page_content=summary, metadata={id_key: doc_ids[i]}) 
                 for i, summary in tqdm(enumerate(text_summaries), desc="Creating Text Docs", total=len(text_summaries))]

print("Creating table document objects...")
summary_tables = [Document(page_content=summary, metadata={id_key: table_ids[i]}) 
                  for i, summary in tqdm(enumerate(table_summaries), desc="Creating Table Docs", total=len(table_summaries))]

# Add texts to vectorstore with progress bar
print("Adding text summaries to vectorstore...")
for doc in tqdm(summary_texts, desc="Storing Texts in Vectorstore"):
    retriever.vectorstore.add_documents([doc])

print("Adding text summaries to docstore...")
for doc_id, text in tqdm(zip(doc_ids, texts), desc="Storing Texts in Docstore", total=len(texts)):
    retriever.docstore.mset([(doc_id, text)])

# Add tables to vectorstore with progress bar
print("Adding table summaries to vectorstore...")
for doc in tqdm(summary_tables, desc="Storing Tables in Vectorstore"):
    retriever.vectorstore.add_documents([doc])

print("Adding table summaries to docstore...")
for table_id, table in tqdm(zip(table_ids, tables), desc="Storing Tables in Docstore", total=len(tables)):
    retriever.docstore.mset([(table_id, table)])

# Persist the database
print("Saving vectorstore...")
vectorstore.persist()
print("✅ Database saved successfully!")


Summarizing text chunks...


Summarizing Texts:   0%|                     | 1/1024 [00:04<1:16:09,  4.47s/it]

You are an assistant tasked with summarizing tables and text. Give a concise summary of the table or text.Respond only with the summary, no additionnal comment. Do not start your message by saying "Here is a summary" or anything like that. Just give the summary as it is. 





Summarizing Texts:   0%|                     | 2/1024 [00:08<1:07:56,  3.99s/it]

You are an assistant tasked with summarizing tables and text. Give a concise summary of the table or text.Respond only with the summary, no additionnal comment. The approved version of Policy Document in respect of this plan is available on our website: www.licindia.in. 





Summarizing Texts:   0%|                     | 3/1024 [00:13<1:18:11,  4.59s/it]

You are an assistant tasked with summarizing tables and text. Give a concise summary of the table or text.Respond only with the summary, no additionnal comment. Do not start your message by saying "Here is a summary" or anything like that. Give the summary as it is. 





Summarizing Texts:   0%|                     | 4/1024 [00:18<1:22:58,  4.88s/it]

LIC’s Digi Credit Life (UIN: 512N358V01) Page 6 of 18: Nomination is the process of nominating a person(s) in the proposal form or subsequently included/ changed by an endorsement. Proposer is a person who proposes the life insurance proposal. 





Summarizing Texts:   0%|                     | 4/1024 [00:19<1:21:56,  4.82s/it]


KeyboardInterrupt: 

In [89]:
# Function to load and query the database
def query_database(query_text, top_k=3):
    vectorstore = Chroma(persist_directory="db", collection_name="multi_modal_rag", embedding_function=embedding_function)
    retriever = vectorstore.as_retriever(search_kwargs={"k": top_k})
    results = retriever.get_relevant_documents(query_text)
    return results
