In [2]:
from tqdm import tqdm
import os
import json
import uuid
from tqdm import tqdm
from langchain_chroma import Chroma
from langchain.storage import InMemoryStore
from langchain.schema.document import Document
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain_huggingface import HuggingFaceEmbeddings
from transformers import pipeline
from chromadb import PersistentClient
import time
from google.api_core.exceptions import ResourceExhausted  # Import exception for handling

In [3]:
CHUNKS_FOLDER = "processed_chunks"

In [4]:
def data_info(folder_path = "processed_chunks"):
    json_files = [f for f in os.listdir(folder_path) if f.endswith(".json")]
    total_text = 0
    total_table = 0
    for json_file in tqdm(json_files, desc="Loading chunks", unit="file"):
        file_path = os.path.join(folder_path, json_file)
        composite_count = 0
        table_count = 0
        with open(file_path, "r", encoding="utf-8") as f:
            print(file_path)
            chunk_data = json.load(f)
            # Convert dictionaries back into unstructured objects
            for chunk in chunk_data:
                
                
                element_type = chunk.get("type")
                if element_type == "Table":
                    table_count+=1
                    total_table += 1
                    # print(chunk)
                elif element_type == "CompositeElement":
                    composite_count+=1
                    total_text += 1
            print("Number of Text Elements in this pdf: ",composite_count , "\nNumber of broad tables: ",table_count)
            print("\n")
    print("Total Text chunks: ", total_text, "\nTotal Tables: ", total_table)

In [4]:
data_info("processed_chunks")

Loading chunks:  31%|███████▍                | 24/78 [00:00<00:00, 233.06file/s]

processed_chunks/LIC’s Digi Credit Life - Policy Document .pdf.json
Number of Text Elements in this pdf:  17 
Number of broad tables:  0


processed_chunks/New Money Back Plan-25 years - CIS  New Money Back Plan-25 years .pdf.json
Number of Text Elements in this pdf:  5 
Number of broad tables:  0


processed_chunks/LIC’s Yuva Credit Life - Policy Document .pdf.json
Number of Text Elements in this pdf:  17 
Number of broad tables:  0


processed_chunks/New Jeevan Amar - Policy Document .pdf.json
Number of Text Elements in this pdf:  17 
Number of broad tables:  0


processed_chunks/New Jeevan Anand - Policy Document .pdf.json
Number of Text Elements in this pdf:  25 
Number of broad tables:  0


processed_chunks/LIC’s Digi Term - CIS LIC’s Digi Term .pdf.json
Number of Text Elements in this pdf:  4 
Number of broad tables:  0


processed_chunks/Accident Benefit Rider - Policy Document .pdf.json
Number of Text Elements in this pdf:  6 
Number of broad tables:  0


processed_chunks/New T

Loading chunks:  62%|██████████████▊         | 48/78 [00:00<00:00, 231.36file/s]

Number of Text Elements in this pdf:  19 
Number of broad tables:  0


processed_chunks/Jeevan Tarun - CIS  Jeevan Tarun .pdf.json
Number of Text Elements in this pdf:  5 
Number of broad tables:  0


processed_chunks/LIC’s Bima Shree - CIS LIC’s Bima Shree .pdf.json
Number of Text Elements in this pdf:  4 
Number of broad tables:  0


processed_chunks/New Money Back Plan- 20 Years - CIS  New Money Back Plan-20 years .pdf.json
Number of Text Elements in this pdf:  5 
Number of broad tables:  0


processed_chunks/Single Premium Endowment Plan - CIS  Single Premium Endowment Plan .pdf.json
Number of Text Elements in this pdf:  4 
Number of broad tables:  0


processed_chunks/Bima Ratna - CIS  Bima Ratna .pdf.json
Number of Text Elements in this pdf:  5 
Number of broad tables:  0


processed_chunks/LIC’s Yuva Term - CIS LIC’s Yuva Term .pdf.json
Number of Text Elements in this pdf:  5 
Number of broad tables:  0


processed_chunks/LIC’s Digi Term - Policy Document .pdf.json
Number of Tex

Loading chunks: 100%|████████████████████████| 78/78 [00:00<00:00, 157.05file/s]

Number of Text Elements in this pdf:  5 
Number of broad tables:  0


processed_chunks/Single Premium Endowment Plan - Policy Document .pdf.json
Number of Text Elements in this pdf:  19 
Number of broad tables:  0


processed_chunks/New Tech-Term - Policy Document .pdf.json
Number of Text Elements in this pdf:  19 
Number of broad tables:  0


processed_chunks/Jeevan Umang - CIS  Jeevan Umang .pdf.json
Number of Text Elements in this pdf:  5 
Number of broad tables:  0


processed_chunks/LIC’s Yuva Credit Life - CIS LIC’s Yuva Credit Life .pdf.json
Number of Text Elements in this pdf:  4 
Number of broad tables:  0


processed_chunks/Amritbaal - Policy Document .pdf.json
Number of Text Elements in this pdf:  24 
Number of broad tables:  0


processed_chunks/Jeevan Utsav - CIS  Jeevan Utsav .pdf.json
Number of Text Elements in this pdf:  4 
Number of broad tables:  0


Total Text chunks:  1024 
Total Tables:  2





In [11]:
def prepare_lists(folder_path = "processed_chunks"):
    json_files = [f for f in os.listdir(folder_path) if f.endswith(".json")]
    tables = []
    texts = []
    orig_tables = 0
    total_text = 0
    total_table = 0
    for json_file in tqdm(json_files, desc="Loading chunks", unit="file"):
        file_path = os.path.join(folder_path, json_file)
        composite_count = 0
        table_count = 0
        with open(file_path, "r", encoding="utf-8") as f:
            print(file_path)
            chunks = json.load(f)
            # Convert dictionaries back into unstructured objects
            for chunk in chunks:
                element_type = chunk.get("type")
                if element_type == "Table":
                    table_count+=1
                    total_table += 1
                    # print(chunk)
                elif element_type == "CompositeElement":
                    composite_count+=1
                    total_text += 1
                    texts.append(chunk["text"])
                for element in chunk["metadata"]["orig_elements"]:
                    if "text_as_html" in element["metadata"] and element["type"] == "Table":
                        tables.append(element["metadata"]["text_as_html"])
                        orig_tables += 1
                        
            print("Number of Text Elements in this pdf: ",composite_count , "\nNumber of broad tables: ",table_count)
            print("Number of original tables: ", orig_tables)
            print("\n")
    print("Total Text chunks: ", total_text, "\nTotal Tables: ", total_table)
    return tables , texts

In [12]:
tables , texts = prepare_lists("processed_chunks")

Loading chunks:  51%|██████████████████▍                 | 40/78 [00:00<00:00, 197.09file/s]

processed_chunks/LIC’s Digi Credit Life - Policy Document .pdf.json
Number of Text Elements in this pdf:  17 
Number of broad tables:  0
Number of original tables:  2


processed_chunks/New Money Back Plan-25 years - CIS  New Money Back Plan-25 years .pdf.json
Number of Text Elements in this pdf:  5 
Number of broad tables:  0
Number of original tables:  4


processed_chunks/LIC’s Yuva Credit Life - Policy Document .pdf.json
Number of Text Elements in this pdf:  17 
Number of broad tables:  0
Number of original tables:  8


processed_chunks/New Jeevan Amar - Policy Document .pdf.json
Number of Text Elements in this pdf:  17 
Number of broad tables:  0
Number of original tables:  11


processed_chunks/New Jeevan Anand - Policy Document .pdf.json
Number of Text Elements in this pdf:  25 
Number of broad tables:  0
Number of original tables:  19


processed_chunks/LIC’s Digi Term - CIS LIC’s Digi Term .pdf.json
Number of Text Elements in this pdf:  4 
Number of broad tables:  0
Number of 

Loading chunks: 100%|████████████████████████████████████| 78/78 [00:00<00:00, 237.68file/s]

processed_chunks/Saral Jeevan Bima - CIS  Saral Jeevan Bima .pdf.json
Number of Text Elements in this pdf:  5 
Number of broad tables:  0
Number of original tables:  246


processed_chunks/New Jeevan Amar - Sales Brochure .pdf.json
Number of Text Elements in this pdf:  12 
Number of broad tables:  0
Number of original tables:  255


processed_chunks/Jeevan Lakshya - Sales Brochure .pdf.json
Number of Text Elements in this pdf:  18 
Number of broad tables:  0
Number of original tables:  267


processed_chunks/Amritbaal - Sales Brochure .pdf.json
Number of Text Elements in this pdf:  17 
Number of broad tables:  0
Number of original tables:  283


processed_chunks/LIC’s Digi Credit Life - Sales Brochures .pdf.json
Number of Text Elements in this pdf:  11 
Number of broad tables:  0
Number of original tables:  290


processed_chunks/New Money Back Plan- 20 Years - Sales Brochure .pdf.json
Number of Text Elements in this pdf:  17 
Number of broad tables:  0
Number of original tables:  300





In [23]:
average_len = 0;
text_length = []
for i in texts:
    text_length.append(len(i))

print("average text length: ", sum(text_length)/len(texts))

average text length:  3073.787109375


In [13]:
texts[0]

'OLic UTEINSURANCE CORPORATION OF NDIA\n\nLIFE INSURANCE CORPORATION OF INDIA (Established by the Life Insurance Corporation Act, 1956) Registration Number: 512\n\nLIC’s Digi\n\nDigi Credit Life (UIN:512N358V01)\n\n( A Non-Par, Non-Linked\n\nLinked , Life , Individual, Pure Risk Plan)\n\nPART-A\n\nRef: NB\n\n(Address and e-mail id of Branch Office)\n\nSpace for Name and Address of Policyholder\n\nSpace for Address and e-mail id of Branch Office\n\nDate:\n\nDear Policyholder,\n\nRe: Your Policy No. _______________\n\nWe have pleasure in forwarding herewith the above Policy Document (comprising of Part A to Part G We have pleasure in forwarding herewith the above comprising of Part A to Part G) alongwith Customer Information Sheet (CIS) and Customized and Customized Benefit Illustration, if any. .\n\nalso like to draw your kind attention to the information mentioned in the Schedule of the Policy and the benefits available under the Policy.\n\nSome of our plans have certain options availa

In [7]:
import google.generativeai as genai

# Configure the Google Generative AI API key
genai.configure(api_key="AIzaSyDOHLSnm5NF1_6mlya19Yf2XAb0MdvHTLQ")

# Create a function to generate content using the Gemini model
def summarizer(text, prompt: str):
    full_prompt = prompt.format(element=text)
    model = genai.GenerativeModel("gemini-2.0-flash")
    response = model.generate_content(full_prompt)
    return response.text



In [10]:
prompt_template = """
You are an assistant tasked with summarizing tables and text.
Give a concise summary of the table or text.

Respond only with the summary, no additionnal comment.
Do not start your message by saying "Here is a summary" or anything like that.
Just give the summary as it is. Max 500 words. Retain all the important points.

Table or text chunk: {element}

"""

In [14]:
# Function to load previously saved summaries from a JSONL file
def load_existing_summaries(output_file):
    existing_summaries = {}
    
    if os.path.exists(output_file):
        with open(output_file, "r", encoding="utf-8") as f:
            for line in f:
                try:
                    entry = json.loads(line.strip())
                    existing_summaries[entry["text"]] = entry["summary"]  # Store text-content as key
                except json.JSONDecodeError:
                    print(f"⚠️ Skipping corrupted line in {output_file}")
    
    return existing_summaries

# Define a function to rate-limit and save summaries, resuming from the last index
def rate_limit_summarization(summarizer, texts, prompt_template, output_file, max_calls_per_minute=10):
    start_time = time.time()
    calls_this_minute = 0

    # Load already processed summaries
    existing_summaries = load_existing_summaries(output_file)
    
    summaries = []
    
    for text in tqdm(texts, desc="Summarizing Texts"):
        if text in existing_summaries:
            print(f"✅ Skipping already processed text: {text[:50]}...")  # Show first 50 chars
            continue  # Skip already summarized texts
        
        if calls_this_minute >= max_calls_per_minute:
            elapsed_time = time.time() - start_time
            sleep_time = 60 - elapsed_time  # wait for the remainder of the minute
            if sleep_time > 0:
                print(f"⏳ Rate limit reached! Sleeping for {int(sleep_time)} seconds...")
                time.sleep(sleep_time)  # Sleep for the remaining time
            start_time = time.time()  # Reset the timer
            calls_this_minute = 0
        
        # Try to get the summary with error handling
        while True:
            try:
                summary = summarizer(text, prompt_template)
                summaries.append(summary)

                # Save the result immediately
                with open(output_file, "a", encoding="utf-8") as f:
                    json.dump({"id": str(uuid.uuid4()), "text": text, "summary": summary}, f)
                    f.write("\n")  # Newline for JSONL format
                
                calls_this_minute += 1
                break  # Exit retry loop on success
            
            except ResourceExhausted:
                print("🚨 API limit exceeded! Waiting 1 minute before retrying...")
                time.sleep(60)  # Wait for 1 minute before retrying
            
            except Exception as e:
                print(f"⚠️ Unexpected error: {e}")
                break  # Continue to the next text in case of other errors
    
    return summaries

# Define output files
text_summary_file = "text_summaries.jsonl"
table_summary_file = "table_summaries.jsonl"

# Generate and save summaries with progress bar
print("Summarizing text chunks...")
text_summaries = rate_limit_summarization(summarizer, texts, prompt_template, text_summary_file)

print("Summarizing table chunks...")
table_summaries = rate_limit_summarization(summarizer, tables, prompt_template, table_summary_file)

print("✅ Summaries saved successfully and resume functionality enabled!")


Summarizing text chunks...


Summarizing Texts: 100%|████████████████████████████| 1024/1024 [00:00<00:00, 124092.55it/s]


✅ Skipping already processed text: OLic UTEINSURANCE CORPORATION OF NDIA

LIFE INSURA...
✅ Skipping already processed text: Address of Grievance Redressal O

Officer:

Addres...
✅ Skipping already processed text: LIC’s Digi Credit Life (UIN: 512N358V01) Page 3 of...
✅ Skipping already processed text: LIC’s Digi Credit Life (UIN: 512N358V01) Page 6 of...
✅ Skipping already processed text: PART– C: BENEFITS

1. The following benefits are p...
✅ Skipping already processed text: PART – D: CONDITIONS RELATED TO SERVICING ASPECTS
...
✅ Skipping already processed text: Non –forfeiture Regulations:

Not applicable as th...
✅ Skipping already processed text: 4. Surrender:

A. No surrender value will be avail...
✅ Skipping already processed text: After Premium Paying Term

i) If all due premiums ...
✅ Skipping already processed text: 7. Free look period:

During the Free Look Period ...
✅ Skipping already processed text: 2. Suicide Exclusion:

Notwithstanding the provisi...
✅ Skipping already pr

Summarizing Texts: 100%|██████████████████████████████| 455/455 [00:00<00:00, 169908.15it/s]

✅ Skipping already processed text: <table><tbody><tr><td colspan="2">UIN: Plan Number...
✅ Skipping already processed text: <table><thead><tr><th>Policy Year</th><th>Sum Assu...
✅ Skipping already processed text: <table><thead><tr><th rowspan="2">SI. no.</th><th ...
✅ Skipping already processed text: <table><tbody><tr><td>through our Customer Portal ...
✅ Skipping already processed text: <table><tbody><tr><td>Address and contact details ...
✅ Skipping already processed text: <table><thead><tr><th>Agent’s/ Intermediary’s Code...
✅ Skipping already processed text: <table><tbody><tr><td colspan="2">Policy Number:</...
✅ Skipping already processed text: <table><thead><tr><th>1</th></tr></thead><tbody><t...
✅ Skipping already processed text: <table><thead><tr><th>Agent’s/ Intermediary’s Code...
✅ Skipping already processed text: <table><thead><tr><th colspan="3">Policy Number:</...
✅ Skipping already processed text: <table><thead><tr><th>Mode of Instalment payment</...
✅ Skipping already pr


