In [1]:
import json
from typing import List

# Unstructured for document parsing
from unstructured.partition.pdf import partition_pdf
from unstructured.chunking.title import chunk_by_title

# LangChain components
from langchain_core.documents import Document
from langchain_google_genai import ChatGoogleGenerativeAI,GoogleGenerativeAIEmbeddings
from langchain_chroma import Chroma
from langchain_core.messages import HumanMessage
from dotenv import load_dotenv

load_dotenv()

True

In [2]:
path=r"D:\MultiModulRag\Backend\Pipeline_Database\Images"
def partition_document(file_path: str):
    """Extract elements from PDF using unstructured"""
    print(f"ðŸ“„ Partitioning document: {file_path}")
    
    elements = partition_pdf(
        filename=file_path,
        extract_images_in_pdf=True,
        strategy = "hi_res",
        hi_res_model_name="yolox",
        infer_table_structure=True,
        chunking_strategy="by_title",
        extract_image_block_types=["Image"],
        extract_image_block_output_dir=path,
        extract_image_block_to_payload=True
    )
    
    print(f"âœ… Extracted {len(elements)} elements")
    return elements

# Test with your PDF file

file_path = r"D:\MultiModulRag\docs\iesc102.pdf"  # Change this to your PDF path
elements = partition_document(file_path)


ðŸ“„ Partitioning document: D:\MultiModulRag\docs\iesc102.pdf


The `max_size` parameter is deprecated and will be removed in v4.26. Please specify in `size['longest_edge'] instead`.


âœ… Extracted 71 elements


***CHECKPOINTER_1 to save state***

In [4]:
import json
import pickle
from pathlib import Path
from unstructured.documents.elements import Element

def save_elements(elements, pkl_path: str, json_path: str = None):
    """
    Save a Python variable `elements` to pickle and optionally to JSON.
    Automatically converts unstructured Element objects to dicts for JSON.

    Args:
        elements: Python variable to save (list, dict, etc.)
        pkl_path: Path to save the pickle file (required)
        json_path: Path to save the JSON file (optional)
    """
    # Ensure parent directories exist
    Path(pkl_path).parent.mkdir(parents=True, exist_ok=True)
    if json_path:
        Path(json_path).parent.mkdir(parents=True, exist_ok=True)

    # Save as Pickle
    with open(pkl_path, "wb") as f:
        pickle.dump(elements, f)
    print(f"âœ… Saved elements to pickle: {pkl_path}")

    # Save as JSON (optional)
    if json_path:
        # Convert Element objects to dicts automatically
        def to_serializable(el):
            return el.to_dict() if isinstance(el, Element) else el
        
        elements_serializable = [to_serializable(el) for el in elements]

        with open(json_path, "w", encoding="utf-8") as f:
            json.dump(elements_serializable, f, indent=4, ensure_ascii=False)
        print(f"âœ… Saved elements to JSON: {json_path}")


# -----------------------------
# Example usage
# your Python variable, e.g., output of partition_pdf

pkl_file = r"D:\MultiModulRag\Backend\Pipeline_Database\Pickel\Checkpointer1.pkl"
json_file = r"D:\MultiModulRag\Backend\Pipeline_Database\JSON\Checkpointer1.json"

save_elements(elements, pkl_file, json_file) 

âœ… Saved elements to pickle: D:\MultiModulRag\Backend\Pipeline_Database\Pickel\Checkpointer1.pkl
âœ… Saved elements to JSON: D:\MultiModulRag\Backend\Pipeline_Database\JSON\Checkpointer1.json


In [5]:
import pickle

# Path to your pickle file
pkl_file = r"D:\MultiModulRag\Backend\Pipeline_Database\Pickel\Checkpointer1.pkl"

# Load pickle into a new variable
with open(pkl_file, "rb") as f:
    loaded_chunks = pickle.load(f)

print(f"âœ… Loaded {len(loaded_chunks)} elements from pickle")


âœ… Loaded 71 elements from pickle


In [6]:
import json

# Path to your pickle file
pkl_file = r"D:\MultiModulRag\Backend\Pipeline_Database\JSON\Checkpointer1.json"

# Load pickle into a new variable
with open(pkl_file, "rb") as f:
    loaded_chunks_json = json.load(f)

print(f"âœ… Loaded {len(loaded_chunks_json)} elements from json")


âœ… Loaded 71 elements from json


In [7]:
loaded_chunks[0].metadata.orig_elements[2].to_dict()

{'type': 'Image',
 'element_id': '93b5a74c-eb1e-4450-90e0-08a88f1a28f9',
 'text': 'EXTRA FREE FLOW IODISED SALT',
 'metadata': {'detection_class_prob': 0.8340976238250732,
  'coordinates': {'points': ((np.float64(159.99850463867188),
     np.float64(719.965576171875)),
    (np.float64(159.99850463867188), np.float64(966.1259155273438)),
    (np.float64(803.1172485351562), np.float64(966.1259155273438)),
    (np.float64(803.1172485351562), np.float64(719.965576171875))),
   'system': 'PixelSpace',
   'layout_width': 1700,
   'layout_height': 2280},
  'last_modified': '2025-10-21T15:08:59',
  'filetype': 'PPM',
  'languages': ['eng'],
  'page_number': 1,
  'image_base64': '/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBwcJCQgKDBQNDAsLDBkSEw8UHRofHh0aHBwgJC4nICIsIxwcKDcpLDAxNDQ0Hyc5PTgyPC4zNDL/2wBDAQkJCQwLDBgNDRgyIRwhMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjL/wAARCAD2AoMDASIAAhEBAxEB/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIh

In [8]:
loaded_chunks[0].metadata.orig_elements[0].to_dict()["type"] == "Image"

True

In [7]:
unique_types = set()
for el in loaded_chunks:
    for orig_el in el.metadata.orig_elements:
        unique_types.add(orig_el.to_dict()["type"])
unique_types_list = sorted(list(unique_types))

In [8]:
unique_types_list

['FigureCaption',
 'Formula',
 'Image',
 'ListItem',
 'NarrativeText',
 'Table',
 'Title',
 'UncategorizedText']

> Focus on the following elements for your RAG or document extraction pipeline:
> - **Text** â†’ Type: `Text`
> - **Table** â†’ Type: `Table`
> - **Image + FigureCaption (combined)** â†’ Type: `Image+Caption`
> - **Footer** â†’ Type: `Footer`
>
> Maintain the **type field** in each chunk so you always know what kind of content it contains.  
> This improves traceability, retrieval accuracy, and contextual organization across your RAG workflow.


In [1]:
import json
from typing import List

# Unstructured for document parsing
from unstructured.partition.pdf import partition_pdf
from unstructured.chunking.title import chunk_by_title

# LangChain components
from langchain_core.documents import Document
from langchain_google_genai import ChatGoogleGenerativeAI,GoogleGenerativeAIEmbeddings
from langchain_chroma import Chroma
from langchain_core.messages import HumanMessage
from dotenv import load_dotenv

load_dotenv()

True