In [1]:
import os
from unstructured.partition.pdf import partition_pdf
import pandas as pd
from tqdm import tqdm
import json

In [2]:
def rename_policy_files(folder_path):
    """
    Renames PDF files in the specified folder by removing "LIC's" from their filenames.

    Every occurrence of "LIC's" is removed from the name and surrounding
    whitespace is stripped. A file is left untouched (and a message is printed)
    when the cleaned name already exists in the folder: ``os.rename`` would
    otherwise silently replace that file on POSIX systems or raise on Windows.

    Args:
        folder_path (str): Path to the folder containing PDF files.
    """
    # Sorted so the rename order (and the printed log) is reproducible.
    for filename in sorted(os.listdir(folder_path)):
        if not (filename.endswith(".pdf") and "LIC's" in filename):
            continue

        new_filename = filename.replace("LIC's", "").strip()
        old_path = os.path.join(folder_path, filename)
        new_path = os.path.join(folder_path, new_filename)

        # Never clobber a file that already carries the cleaned name.
        if os.path.exists(new_path):
            print(f'Skipped: {filename} -> {new_filename} (target already exists)')
            continue

        os.rename(old_path, new_path)
        print(f'Renamed: {filename} -> {new_filename}')

# Strip "LIC's" from the policy PDF filenames. The path is relative, so it is
# resolved against the kernel's current working directory.
rename_policy_files("../../policy_documents")

In [76]:
def extract_chunks_from_pdfs(folder_path, output_folder="processed_chunks"):
    """
    Extracts chunks from all PDFs in the folder, saves them as JSON files, and preserves `orig_elements`.

    Each PDF ``<name>.pdf`` is partitioned with ``partition_pdf`` and its chunks
    are written to ``<output_folder>/<name>.pdf.json``. A PDF whose JSON file
    already exists is skipped, so the function can be re-run to resume an
    interrupted batch. The JSON is first written to a ``.tmp`` file and then
    moved into place with ``os.replace``; an interrupted run therefore never
    leaves a truncated JSON file that a later run would mistake for a finished one.

    Args:
        folder_path (str): Path to the folder containing PDF files.
        output_folder (str): Path to save processed chunks.

    Returns:
        None. The results are only persisted to disk.
    """
    os.makedirs(output_folder, exist_ok=True)
    # Sorted so the processing order (and the progress log) is reproducible.
    pdf_files = sorted(f for f in os.listdir(folder_path) if f.endswith(".pdf"))

    for file in tqdm(pdf_files, desc="Processing PDFs", unit="file", leave=False):
        output_file = os.path.join(output_folder, f"{file}.json")

        # Skip processing if the file is already saved
        if os.path.exists(output_file):
            print(f"Skipping {file}, already processed.")
            continue

        file_path = os.path.join(folder_path, file)
        chunks = partition_pdf(
            filename=file_path,
            infer_table_structure=True,
            strategy="hi_res",
            chunking_strategy="by_title",
            max_characters=10000,
            combine_text_under_n_chars=2000,
            new_after_n_chars=6000,
        )

        # Convert chunks to JSON serializable format
        chunk_data = []
        for chunk in chunks:
            chunk_dict = chunk.to_dict()

            # Preserve original elements if available: replace the metadata
            # entry with plain-dict versions of the elements behind this chunk.
            orig_elements = getattr(chunk.metadata, "orig_elements", None)
            if orig_elements:
                chunk_dict["metadata"]["orig_elements"] = [
                    elem.to_dict() for elem in orig_elements
                ]

            chunk_data.append(chunk_dict)

        # Save to JSON file atomically: write next to the target, then rename.
        # The temp file lives in the same folder so os.replace stays on one
        # filesystem.
        temp_file = f"{output_file}.tmp"
        try:
            with open(temp_file, "w", encoding="utf-8") as f:
                json.dump(chunk_data, f, ensure_ascii=False, indent=4)
            os.replace(temp_file, output_file)
        finally:
            # Only present if the write or rename failed; remove the leftover.
            if os.path.exists(temp_file):
                os.remove(temp_file)

        print(f"Processed and saved {file}.")


In [None]:
# Partition every PDF in the policy folder and cache the chunks as JSON in the
# default output folder ("processed_chunks").
# NOTE(review): extract_chunks_from_pdfs has no return statement, so all_chunks
# is bound to None; the results live only in the JSON files on disk.
all_chunks = extract_chunks_from_pdfs("../../policy_documents")

Processing PDFs:   0%|                                             | 0/77 [00:00<?, ?file/s]

Skipping New Children's Money Back Plan - Sales Brochure .pdf, already processed.
Skipping Single Premium Endowment Plan - Sales Brochure .pdf, already processed.
Skipping New Money Back Plan- 20 Years - CIS  New Money Back Plan-20 years .pdf, already processed.
Skipping New Endowment Plan - Sales brochure .pdf, already processed.


1. Extract the tables from each JSON file's `metadata.orig_elements` separately.
2. Extract the text directly from the `text` of each chunk in the JSON files.
3. Summarise both the `text_list` and the `tables_list`.
4. Create the database so that each parent chunk and its summary share the same ID.
5. At query time, search the embeddings of the summaries and retrieve the matching original documents.
6. The original documents are then used to generate the answer.

- BART-large-CNN for summarising the chunks
- RoBERTa-base from Hugging Face for embeddings
- LangChain Chroma as the vector store