In [1]:
import os
from unstructured.partition.pdf import partition_pdf
import pandas as pd
from tqdm import tqdm
import json

In [2]:
def rename_policy_files(folder_path):
    """
    Renames PDF files in the specified folder by removing "LIC's" from their filenames.
    
    Args:
        folder_path (str): Path to the folder containing PDF files.
    """
    for filename in os.listdir(folder_path):
        if filename.endswith(".pdf") and "LIC's" in filename:
            new_filename = filename.replace("LIC's", "").strip()
            old_path = os.path.join(folder_path, filename)
            new_path = os.path.join(folder_path, new_filename)
            os.rename(old_path, new_path)
            print(f'Renamed: {filename} -> {new_filename}')

rename_policy_files("../../policy_documents")

In [76]:
def extract_chunks_from_pdfs(folder_path, output_folder="processed_chunks"):
    """
    Extracts chunks from all PDFs in the folder, saves them as JSON files, and preserves `orig_elements`.

    Args:
        folder_path (str): Path to the folder containing PDF files.
        output_folder (str): Path to save processed chunks.
    """
    os.makedirs(output_folder, exist_ok=True)
    pdf_files = [f for f in os.listdir(folder_path) if f.endswith(".pdf")]
    
    for file in tqdm(pdf_files, desc="Processing PDFs", unit="file", leave = False):
        output_file = os.path.join(output_folder, f"{file}.json")
        
        # Skip processing if the file is already saved
        if os.path.exists(output_file):
            print(f"Skipping {file}, already processed.")
            continue
        
        file_path = os.path.join(folder_path, file)
        chunks = partition_pdf(
            filename=file_path,
            infer_table_structure=True,
            strategy="hi_res",
            chunking_strategy="by_title",
            max_characters=10000,
            combine_text_under_n_chars=2000,
            new_after_n_chars=6000,
        )
        
        # Convert chunks to JSON serializable format
        chunk_data = []
        for chunk in chunks:
            chunk_dict = chunk.to_dict()
            
            # Preserve original elements if available
            if hasattr(chunk.metadata, "orig_elements") and chunk.metadata.orig_elements:
                chunk_dict["metadata"]["orig_elements"] = [elem.to_dict() for elem in chunk.metadata.orig_elements]
            
            chunk_data.append(chunk_dict)
        
        # Save to JSON file
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(chunk_data, f, ensure_ascii=False, indent=4)
        
        print(f"Processed and saved {file}.")


In [77]:
all_chunks = extract_chunks_from_pdfs("../../policy_documents")

Processing PDFs:   0%|                                             | 0/77 [00:00<?, ?file/s]

Skipping New Children's Money Back Plan - Sales Brochure .pdf, already processed.
Skipping Single Premium Endowment Plan - Sales Brochure .pdf, already processed.
Skipping New Money Back Plan- 20 Years - CIS  New Money Back Plan-20 years .pdf, already processed.
Skipping New Endowment Plan - Sales brochure .pdf, already processed.


Processing PDFs:   6%|██▏                               | 5/77 [35:35<8:32:31, 427.10s/file]

Processed and saved Jeevan Umang - Sales Brochures .pdf.


Processing PDFs:   8%|██▋                               | 6/77 [36:46<6:51:49, 348.02s/file]

Processed and saved Jeevan Utsav - Sales Brochure .pdf.


Processing PDFs:   9%|███                               | 7/77 [37:06<5:13:41, 268.87s/file]

Processed and saved LIC’s Bima Shree - CIS LIC’s Bima Shree .pdf.


Processing PDFs:  10%|███▌                              | 8/77 [37:22<3:54:39, 204.05s/file]

Processed and saved New Money Back Plan-25 years - CIS  New Money Back Plan-25 years .pdf.


Processing PDFs:  12%|███▉                              | 9/77 [38:25<3:08:41, 166.50s/file]

Processed and saved Bima Ratna - Sales Brochure .pdf.


Processing PDFs:  13%|████▎                            | 10/77 [39:55<2:42:10, 145.24s/file]

Processed and saved Jeevan Utsav - Policy Document .pdf.


Processing PDFs:  14%|████▋                            | 11/77 [41:00<2:14:40, 122.42s/file]

Processed and saved Jeevan Tarun - Sales Brochure .pdf.


Processing PDFs:  16%|█████▎                            | 12/77 [41:17<1:39:55, 92.24s/file]

Processed and saved Amritbaal - CIS  Amritbaal .pdf.


Processing PDFs:  17%|█████▋                            | 13/77 [41:35<1:15:04, 70.38s/file]

Processed and saved Jeevan Lakshya - CIS  Jeevan Lakshya .pdf.


Processing PDFs:  18%|██████▌                             | 14/77 [41:51<57:07, 54.41s/file]

Processed and saved LIC’s Digi Term - CIS LIC’s Digi Term .pdf.


Processing PDFs:  19%|███████                             | 15/77 [42:45<56:08, 54.32s/file]

Processed and saved New Tech-Term - Policy Document .pdf.


Processing PDFs:  21%|███████▍                            | 16/77 [42:58<42:37, 41.93s/file]

Processed and saved Premium Waiver Benefit Rider - Policy Document .pdf.


Processing PDFs:  22%|███████▉                            | 17/77 [44:33<58:01, 58.03s/file]

Processed and saved New Jeevan Anand - Sales brochure .pdf.


Processing PDFs:  23%|████████▍                           | 18/77 [44:53<45:50, 46.63s/file]

Processed and saved Linked Accidental Death Benefit Rider - Policy Document .pdf.


Processing PDFs:  25%|████████▍                         | 19/77 [47:04<1:09:30, 71.90s/file]

Processed and saved New Money Back Plan-25 years - Policy Document .pdf.


Processing PDFs:  26%|████████▊                         | 20/77 [49:12<1:24:01, 88.45s/file]

Processed and saved Jeevan Azad - Sales Brochure .pdf.


Processing PDFs:  27%|█████████▎                        | 21/77 [50:17<1:15:57, 81.39s/file]

Processed and saved LIC’s Digi Term - Sales Brochure .pdf.


Processing PDFs:  29%|█████████▍                       | 22/77 [53:28<1:44:47, 114.31s/file]

Processed and saved New Endowment Plan - Policy Document .pdf.


Processing PDFs:  30%|█████████▊                       | 23/77 [55:07<1:38:46, 109.76s/file]

Processed and saved New Jeevan Amar - Policy Document .pdf.


Processing PDFs:  31%|██████████▌                       | 24/77 [55:44<1:17:43, 87.99s/file]

Processed and saved New Endowment Plan - CIS  New Endowment Plan .pdf.


Processing PDFs:  32%|███████████                       | 25/77 [57:50<1:26:11, 99.46s/file]

Processed and saved Bima Ratna - Policy Document .pdf.


Processing PDFs:  34%|███████████▏                     | 26/77 [59:44<1:28:08, 103.69s/file]

Processed and saved LIC’s Yuva Credit Life - Policy Document .pdf.


Processing PDFs:  35%|███████████▏                    | 27/77 [1:00:18<1:09:02, 82.85s/file]

Processed and saved New Jeevan Amar - CIS  New Jeevan Amar .pdf.


Processing PDFs:  36%|███████████▎                   | 28/77 [1:02:47<1:23:53, 102.73s/file]

Processed and saved Jeevan Azad - Policy Document .pdf.


Processing PDFs:  38%|███████████▋                   | 29/77 [1:05:00<1:29:24, 111.77s/file]

Processed and saved Amritbaal - Sales Brochure .pdf.


Processing PDFs:  39%|████████████▍                   | 30/77 [1:05:28<1:07:44, 86.47s/file]

Processed and saved Linked Accidental Death Benefit Rider - Sales Brochures .pdf.


Processing PDFs:  40%|████████████▉                   | 31/77 [1:06:32<1:01:13, 79.87s/file]

Processed and saved New Jeevan Amar - Sales Brochure .pdf.


Processing PDFs:  42%|█████████████▎                  | 32/77 [1:08:41<1:11:02, 94.73s/file]

Processed and saved Jeevan Lakshya - Policy Document .pdf.


Processing PDFs:  43%|██████████████▌                   | 33/77 [1:09:20<57:06, 77.86s/file]

Processed and saved Premium Waiver Benefit Rider - Sales Brochures .pdf.


Processing PDFs:  44%|███████████████                   | 34/77 [1:09:53<46:06, 64.33s/file]

Processed and saved Bima Jyoti - CIS  Bima Jyoti .pdf.


Processing PDFs:  45%|██████████████                 | 35/77 [1:16:52<1:59:37, 170.90s/file]

Processed and saved Jeevan Umang - Policy Document .pdf.


Processing PDFs:  47%|██████████████▍                | 36/77 [1:18:43<1:44:22, 152.75s/file]

Processed and saved Jeevan Labh Plan - Sales brochure .pdf.


Processing PDFs:  48%|██████████████▉                | 37/77 [1:19:15<1:17:48, 116.71s/file]

Processed and saved LIC’s Yuva Credit Life - CIS LIC’s Yuva Credit Life .pdf.


Processing PDFs:  49%|████████████████▊                 | 38/77 [1:19:45<58:58, 90.72s/file]

Processed and saved New Tech-Term - CIS  New Tech-Term .pdf.


Processing PDFs:  51%|███████████████▋               | 39/77 [1:21:49<1:03:38, 100.48s/file]

Processed and saved Jeevan Labh Plan - Policy Document .pdf.


Processing PDFs:  52%|█████████████████▋                | 40/77 [1:23:13<59:02, 95.74s/file]

Processed and saved Saral Jeevan Bima - Policy Document .pdf.


Processing PDFs:  53%|██████████████████                | 41/77 [1:25:00<59:29, 99.16s/file]

Processed and saved LIC’s Digi Term - Policy Document .pdf.


Processing PDFs:  55%|██████████████████▌               | 42/77 [1:25:35<46:29, 79.69s/file]

Processed and saved Bima Ratna - CIS  Bima Ratna .pdf.


Processing PDFs:  56%|██████████████████▉               | 43/77 [1:27:48<54:16, 95.79s/file]

Processed and saved New Children's Money Back Plan - Policy Document .pdf.


Processing PDFs:  57%|███████████████████▍              | 44/77 [1:28:19<41:55, 76.23s/file]

Processed and saved Accident Benefit Rider - Policy Document .pdf.


Processing PDFs:  58%|███████████████████▊              | 45/77 [1:29:05<35:53, 67.30s/file]

Processed and saved Saral Jeevan Bima - Sales Brochure .pdf.


Processing PDFs:  60%|████████████████████▎             | 46/77 [1:29:35<28:56, 56.02s/file]

Processed and saved Single Premium Endowment Plan - CIS  Single Premium Endowment Plan .pdf.


Processing PDFs:  61%|████████████████████▊             | 47/77 [1:32:20<44:20, 88.69s/file]

Processed and saved New Jeevan Anand - Policy Document .pdf.


Processing PDFs:  62%|█████████████████████▏            | 48/77 [1:32:58<35:32, 73.53s/file]

Processed and saved New Term Assurance Rider - Sales Brochures .pdf.


Processing PDFs:  64%|█████████████████████▋            | 49/77 [1:33:46<30:48, 66.01s/file]

Processed and saved Accidental Death & Disability Benefit Rider - Sales Brochures .pdf.


Processing PDFs:  65%|██████████████████████            | 50/77 [1:34:15<24:37, 54.71s/file]

Processed and saved LIC’s Yuva Term - CIS LIC’s Yuva Term .pdf.


Processing PDFs:  66%|██████████████████████▌           | 51/77 [1:34:47<20:45, 47.91s/file]

Processed and saved Jeevan Umang - CIS  Jeevan Umang .pdf.


Processing PDFs:  68%|██████████████████████▉           | 52/77 [1:37:09<31:46, 76.27s/file]

Processed and saved Jeevan Tarun - Policy Document .pdf.


Processing PDFs:  69%|███████████████████████▍          | 53/77 [1:38:52<33:43, 84.32s/file]

Processed and saved LIC’s Yuva Term - Policy Document .pdf.


Processing PDFs:  70%|███████████████████████▊          | 54/77 [1:40:58<37:04, 96.71s/file]

Processed and saved Bima Jyoti - Policy Document .pdf.


Processing PDFs:  71%|████████████████████████▎         | 55/77 [1:41:28<28:06, 76.66s/file]

Processed and saved New Jeevan Anand - CIS  New Jeevan Anand .pdf.


Processing PDFs:  73%|████████████████████████▋         | 56/77 [1:41:58<22:00, 62.87s/file]

Processed and saved Saral Jeevan Bima - CIS  Saral Jeevan Bima .pdf.


Processing PDFs:  74%|█████████████████████████▏        | 57/77 [1:42:48<19:38, 58.90s/file]

Processed and saved LIC’s Digi Credit Life - Sales Brochures .pdf.


Processing PDFs:  75%|█████████████████████████▌        | 58/77 [1:43:15<15:39, 49.46s/file]

Processed and saved Jeevan Labh Plan - CIS  Jeevan Labh Plan .pdf.


Processing PDFs:  77%|██████████████████████████        | 59/77 [1:45:24<21:57, 73.20s/file]

Processed and saved Jeevan Lakshya - Sales Brochure .pdf.


Processing PDFs:  78%|██████████████████████████▍       | 60/77 [1:46:23<19:31, 68.91s/file]

Processed and saved LIC’s Yuva Term - Sales Brochures .pdf.


                                                                                            

KeyboardInterrupt: 

1. Extract tables from each json file's metadata.orig_elements separately.
2. Extract text simply from the chunks json text
3. Summarise the text_list and the tables_list
4. Create the database where both parent_chunks and the summaries have the same ID
5. we query the embeddings of the summaries and retrieve the original docs
6. Original docs are then used to give the answer.

BART-large-cnn for summarising the chunks
Roberta base from huggingface for embeddings
langchain chroma as vector store