In [13]:
import os
from unstructured.partition.pdf import partition_pdf
import pandas as pd
from tqdm import tqdm
import json

In [8]:
def rename_policy_files(folder_path):
    """
    Renames PDF files in the specified folder by removing "LIC's" from their filenames.
    
    Args:
        folder_path (str): Path to the folder containing PDF files.
    """
    for filename in os.listdir(folder_path):
        if filename.endswith(".pdf") and "LIC's" in filename:
            new_filename = filename.replace("LIC's", "").strip()
            old_path = os.path.join(folder_path, filename)
            new_path = os.path.join(folder_path, new_filename)
            os.rename(old_path, new_path)
            print(f'Renamed: {filename} -> {new_filename}')

rename_policy_files("../../policy_documents")

In [5]:
# def extract_chunks_from_pdfs(folder_path):
#     """
#     Extracts chunks from all PDFs in the folder and returns a list of extracted chunks.
#     Displays a progress bar for tracking completion.

#     Args:
#         folder_path (str): Path to the folder containing PDF files.

#     Returns:
#         list: List of extracted chunks from all PDFs.
#     """
#     pdf_files = [f for f in os.listdir(folder_path) if f.endswith(".pdf")]
#     all_chunks = []
    
#     for file in tqdm(pdf_files, desc="Processing PDFs", unit="file"):
#         file_path = os.path.join(folder_path, file)
#         chunks = partition_pdf(
#             filename=file_path,
#             infer_table_structure=True,
#             strategy="hi_res",
#             chunking_strategy="by_title",
#             max_characters=10000,
#             combine_text_under_n_chars=2000,
#             new_after_n_chars=6000,
#         )
#         all_chunks.extend(chunks)
    
#     return all_chunks

In [18]:
def extract_chunks_from_pdfs(folder_path, output_folder="processed_chunks"):
    """
    Extracts chunks from all PDFs in the folder, saves them as JSON files, and resumes processing 
    from the last unprocessed file when re-run.

    Args:
        folder_path (str): Path to the folder containing PDF files.
        output_folder (str): Path to save processed chunks.
    """
    os.makedirs(output_folder, exist_ok=True)
    pdf_files = [f for f in os.listdir(folder_path) if f.endswith(".pdf")]
    
    for file in tqdm(pdf_files, desc="Processing PDFs", unit="file"):
        output_file = os.path.join(output_folder, f"{file}.json")
        
        # Skip processing if the file is already saved
        if os.path.exists(output_file):
            print(f"Skipping {file}, already processed.")
            continue
        
        file_path = os.path.join(folder_path, file)
        chunks = partition_pdf(
            filename=file_path,
            infer_table_structure=True,
            strategy="hi_res",
            chunking_strategy="by_title",
            max_characters=10000,
            combine_text_under_n_chars=2000,
            new_after_n_chars=6000,
        )
        
        # Save chunks as JSON
        chunk_data = [chunk.to_dict() for chunk in chunks]
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(chunk_data, f, ensure_ascii=False, indent=4)
        
        print(f"Processed and saved {file}.")

In [19]:
all_chunks = extract_chunks_from_pdfs("../../policy_documents")

Processing PDFs:   0%|                                                  | 0/77 [00:00<?, ?file/s]

Skipping New Children's Money Back Plan - Sales Brochure .pdf, already processed.
Skipping Single Premium Endowment Plan - Sales Brochure .pdf, already processed.
Skipping New Money Back Plan- 20 Years - CIS  New Money Back Plan-20 years .pdf, already processed.


Processing PDFs:   4%|█▋                                        | 3/77 [00:11<04:54,  3.98s/file]


KeyboardInterrupt: 

In [None]:
def load_chunks_from_json(output_folder="processed_chunks"):
    """
    Reads all saved chunk JSON files and reconstructs them into unstructured objects.
    
    Args:
        output_folder (str): Path where processed chunks are stored.
    
    Returns:
        list: List of unstructured document elements.
    """
    all_chunks = []
    json_files = [f for f in os.listdir(output_folder) if f.endswith(".json")]
    
    for json_file in tqdm(json_files, desc="Loading chunks", unit="file"):
        file_path = os.path.join(output_folder, json_file)
        with open(file_path, "r", encoding="utf-8") as f:
            chunk_data = json.load(f)
            
            # Convert dictionaries back into unstructured objects
            for chunk in chunk_data:
                element_type = chunk.get("type")
                if element_type == "Table":
                    all_chunks.append(Table.from_dict(chunk))
                elif element_type == "NarrativeText":
                    all_chunks.append(NarrativeText.from_dict(chunk))
                elif element_type == "Title":
                    all_chunks.append(Title.from_dict(chunk))
                elif element_type == "ListItem":
                    all_chunks.append(ListItem.from_dict(chunk))
                elif element_type == "Header":
                    all_chunks.append(Header.from_dict(chunk))
                elif element_type == "Image":
                    all_chunks.append(Image.from_dict(chunk))
                elif element_type == "Text":
                    all_chunks.append(Text.from_dict(chunk))
    
    return all_chunks
