In [2]:
import json
from typing import List

# Unstructured for document parsing
from unstructured.partition.pdf import partition_pdf
from unstructured.chunking.title import chunk_by_title

# LangChain components
from langchain_core.documents import Document
from langchain_google_genai import ChatGoogleGenerativeAI,GoogleGenerativeAIEmbeddings
from langchain_chroma import Chroma
from langchain_core.messages import HumanMessage
from dotenv import load_dotenv

load_dotenv()

True

In [None]:
from pathlib import Path
from unstructured.partition.pdf import partition_pdf
import os
import json
import pickle

class PDFPartitioner:
    """Partition a PDF with ``unstructured`` using a validated configuration.

    The instance stores the OCR language code, the chunking limits and the
    optional image/table extraction switches, and forwards them to
    ``partition_pdf`` in :meth:`partition_document`.
    """

    # Fixed image output directory
    IMAGE_OUTPUT_DIR = r"D:\MultiModulRag\Backend\SmartChunkClubing\Images"

    # Supported languages mapping: normalized language name -> code passed to
    # partition_pdf via ``languages``.
    # NOTE: "german_fraktur" used to be declared twice ("deu_frak" and "frk");
    # a dict literal keeps only the last value, so the "deu_frak" entry was
    # unreachable. It now has its own key, "german_fraktur_contrib".
    SUPPORTED_LANGUAGES = {
        "afrikaans": "afr", "amharic": "amh", "arabic": "ara", "assamese": "asm",
        "azerbaijani": "aze", "azerbaijani_cyrilic": "aze_cyrl", "belarusian": "bel",
        "bengali": "ben", "tibetan": "bod", "bosnian": "bos", "breton": "bre",
        "bulgarian": "bul", "catalan": "cat", "cebuano": "ceb", "czech": "ces",
        "chinese_simplified": "chi_sim", "chinese": "chi_sim", "chinese_traditional": "chi_tra",
        "cherokee": "chr", "corsican": "cos", "welsh": "cym", "danish": "dan",
        "danish_fraktur": "dan_frak", "german": "deu", "german_fraktur_contrib": "deu_frak",
        "dzongkha": "dzo", "greek": "ell", "english": "eng", "esperanto": "epo",
        "estonian": "est", "basque": "eus", "persian": "fas", "filipino": "fil",
        "finnish": "fin", "french": "fra", "german_fraktur": "frk", "western_frisian": "fry",
        "scottish_gaelic": "gla", "irish": "gle", "galician": "glg", "gujarati": "guj",
        "haitian": "hat", "hebrew": "heb", "hindi": "hin", "croatian": "hrv",
        "hungarian": "hun", "armenian": "hye", "indonesian": "ind", "icelandic": "isl",
        "italian": "ita", "javanese": "jav", "japanese": "jpn", "kannada": "kan",
        "georgian": "kat", "kazakh": "kaz", "khmer": "khm", "korean": "kor",
        "lao": "lao", "latin": "lat", "latvian": "lav", "lithuanian": "lit",
        "malayalam": "mal", "marathi": "mar", "macedonian": "mkd", "maltese": "mlt",
        "mongolian": "mon", "malay": "msa", "burmese": "mya", "nepali": "nep",
        "dutch": "nld", "norwegian": "nor", "polish": "pol", "portuguese": "por",
        "pashto": "pus", "romanian": "ron", "russian": "rus", "sanskrit": "san",
        "sinhala": "sin", "slovak": "slk", "slovenian": "slv", "spanish": "spa",
        "albanian": "sqi", "serbian": "srp", "swedish": "swe", "tamil": "tam",
        "telugu": "tel", "thai": "tha", "turkish": "tur", "ukrainian": "ukr",
        "urdu": "urd", "uzbek": "uzb", "vietnamese": "vie", "yiddish": "yid",
    }

    def __init__(
        self,
        language: str,
        max_characters: int,
        new_after_n_chars: int,
        combine_text_under_n_chars: int,
        extract_images: bool = False,
        extract_tables: bool = False
    ):
        """
        Initialize PDF Partitioner

        Args:
            language: Language name (e.g., 'english', 'spanish', 'hindi') - REQUIRED
            max_characters: Maximum characters per chunk - REQUIRED
            new_after_n_chars: Start new chunk after this many characters - REQUIRED
            combine_text_under_n_chars: Combine small text blocks under this count - REQUIRED
            extract_images: Whether to extract images from PDF (default: False)
            extract_tables: Whether to parse tables as structured HTML (default: False)

        Raises:
            ValueError: If ``language`` is not a key of SUPPORTED_LANGUAGES
                after normalization.
        """
        # Language configuration (required); stored as the language *code*
        self.language = self._validate_language(language)

        # Chunking configuration (all required)
        self.max_characters = max_characters
        self.new_after_n_chars = new_after_n_chars
        self.combine_text_under_n_chars = combine_text_under_n_chars

        # Optional features
        self.extract_images = extract_images
        self.extract_tables = extract_tables

        # Image output directory
        self.image_output_dir = Path(self.IMAGE_OUTPUT_DIR)

        # Only touch the filesystem when images will actually be written
        if self.extract_images:
            self.image_output_dir.mkdir(parents=True, exist_ok=True)

    @staticmethod
    def _normalize_language(language: str) -> str:
        """Return the lookup key for a user-supplied language name.

        Surrounding whitespace is dropped, the name is lower-cased and
        spaces/hyphens become underscores ("Chinese-Traditional" ->
        "chinese_traditional").
        """
        return language.strip().lower().replace(" ", "_").replace("-", "_")

    def _validate_language(self, language: str) -> str:
        """Validate and convert language name to code

        Raises:
            ValueError: If the normalized name is not in SUPPORTED_LANGUAGES.
        """
        lang_key = self._normalize_language(language)

        if lang_key in self.SUPPORTED_LANGUAGES:
            return self.SUPPORTED_LANGUAGES[lang_key]

        raise ValueError(
            f"‚ùå Unsupported language '{language}'.\n"
            f"   Use PDFPartitioner.get_supported_languages() to see valid options.\n"
            f"   Examples: english, hindi, spanish, french, etc."
        )

    def partition_document(self, file_path: str):
        """
        Extract elements from PDF using unstructured

        Args:
            file_path: Path to the PDF file

        Returns:
            List of extracted elements
        """
        print(f"\nüìÑ Partitioning document: {file_path}")
        print(f"   Language: {self.language}")
        print(f"   Extract images: {self.extract_images}")
        print(f"   Extract tables: {self.extract_tables}")
        print(f"   Chunking: max={self.max_characters}, new_after={self.new_after_n_chars}, combine_under={self.combine_text_under_n_chars}")

        # Build partition parameters
        partition_params = {
            # Core parameters (always required)
            "filename": file_path,
            "strategy": "hi_res",
            "hi_res_model_name": "yolox",
            "chunking_strategy": "by_title",
            "include_orig_elements": True,

            # Language configuration
            "languages": [self.language],

            # Chunking parameters
            "max_characters": self.max_characters,
            "new_after_n_chars": self.new_after_n_chars,
            "combine_text_under_n_chars": self.combine_text_under_n_chars,
        }

        # Add image extraction parameters if enabled
        if self.extract_images:
            partition_params.update({
                "extract_images_in_pdf": True,
                "extract_image_block_to_payload": True,
                "extract_image_block_output_dir": str(self.image_output_dir),
                "extract_image_block_types": ["Image"],
            })

        # Add table extraction parameter if enabled
        if self.extract_tables:
            partition_params["infer_table_structure"] = True

        # Partition the PDF
        elements = partition_pdf(**partition_params)

        print(f"‚úÖ Extracted {len(elements)} elements\n")
        return elements

    @classmethod
    def get_supported_languages(cls) -> list:
        """Get list of all supported language names"""
        return sorted(cls.SUPPORTED_LANGUAGES.keys())

    @staticmethod
    def _prompt_int(prompt: str, allow_zero: bool = False) -> int:
        """Keep asking on stdin until a valid integer is entered.

        Args:
            prompt: Text shown to the user.
            allow_zero: Accept 0 (non-negative) when True; otherwise the
                value must be strictly positive.
        """
        while True:
            try:
                value = int(input(prompt).strip())
            except ValueError:
                print("  ‚ùå Error: Please enter a valid integer\n")
                continue

            if allow_zero and value < 0:
                print("  ‚ùå Error: Must be a non-negative number\n")
                continue
            if not allow_zero and value <= 0:
                print("  ‚ùå Error: Must be a positive number\n")
                continue
            return value

    @staticmethod
    def _prompt_yes_no(prompt: str) -> bool:
        """Return True when the stdin answer is 'yes' or 'y' (case-insensitive)."""
        return input(prompt).strip().lower() in ['yes', 'y']

    @classmethod
    def from_terminal_input(cls):
        """Create PDFPartitioner instance from terminal input with full validation

        Returns:
            tuple: ``(partitioner, pdf_path)`` - the configured instance and
            the PDF path entered by the user.

        Raises:
            FileNotFoundError: If the entered path does not exist and the
                user declines to retry.
        """
        print("\n" + "=" * 70)
        print(" " * 20 + "PDF PARTITIONER CONFIGURATION")
        print("=" * 70)

        # ============ PDF FILE PATH ============
        print("\nüìÅ PDF FILE PATH:")
        while True:
            pdf_path = input("  Enter PDF file path: ").strip()

            if not pdf_path:
                print("  ‚ùå Error: Path cannot be empty. Please try again.\n")
                continue

            if not os.path.exists(pdf_path):
                print(f"  ‚ùå Error: File does not exist: {pdf_path}")
                if not cls._prompt_yes_no("  Do you want to try again? (yes/no): "):
                    raise FileNotFoundError(f"PDF file not found: {pdf_path}")
                continue

            if not pdf_path.lower().endswith('.pdf'):
                print("  ‚ö†Ô∏è  Warning: File does not have .pdf extension")
                if not cls._prompt_yes_no("  Do you want to proceed anyway? (yes/no): "):
                    continue

            print(f"  ‚úÖ File found: {pdf_path}\n")
            break

        # ============ LANGUAGE CONFIGURATION ============
        print("üìö LANGUAGE CONFIGURATION:")
        print(f"  Available languages (showing first 10): {', '.join(cls.get_supported_languages()[:10])}...")
        print(f"  Total {len(cls.SUPPORTED_LANGUAGES)} languages supported")
        print("  Examples: english, hindi, spanish, french, german, japanese, chinese\n")

        while True:
            language = input("  Enter language: ").strip()

            if not language:
                print("  ‚ùå Error: Language cannot be empty. Please try again.\n")
                continue

            # Same normalization the constructor applies, so anything accepted
            # here is guaranteed to pass _validate_language later.
            if cls._normalize_language(language) not in cls.SUPPORTED_LANGUAGES:
                print(f"  ‚ùå Error: Unsupported language '{language}'")
                print("     Type 'list' to see all languages, or try again")
                choice = input("  Your choice: ").strip().lower()

                if choice == 'list':
                    print("\n  Supported languages:")
                    langs = cls.get_supported_languages()
                    for i in range(0, len(langs), 5):
                        print("    " + ", ".join(langs[i:i+5]))
                    print()
                continue

            print(f"  ‚úÖ Language set: {language}\n")
            break

        # ============ CHUNKING PARAMETERS ============
        print("üìè CHUNKING CONFIGURATION (All Required):")

        max_characters = cls._prompt_int("  Max characters per chunk: ")
        new_after_n_chars = cls._prompt_int("  Start new chunk after N chars: ")
        combine_text_under_n_chars = cls._prompt_int(
            "  Combine text blocks under N chars: ", allow_zero=True
        )

        print(f"  ‚úÖ Chunking configured\n")

        # ============ OPTIONAL FEATURES ============
        print("üîß OPTIONAL FEATURES:")

        # Image extraction
        extract_images = cls._prompt_yes_no("  Do you want to extract images? (yes/no): ")
        if extract_images:
            print(f"  ‚úÖ Images will be saved to: {cls.IMAGE_OUTPUT_DIR}")

        # Table extraction
        extract_tables = cls._prompt_yes_no("  Do you want to extract tables? (yes/no): ")
        if extract_tables:
            print("  ‚úÖ Tables will be extracted as structured HTML")

        print("\n" + "=" * 70)
        print("‚úÖ Configuration Complete!")
        print("=" * 70 + "\n")

        # Create instance
        partitioner = cls(
            language=language,
            max_characters=max_characters,
            new_after_n_chars=new_after_n_chars,
            combine_text_under_n_chars=combine_text_under_n_chars,
            extract_images=extract_images,
            extract_tables=extract_tables
        )

        # Return both partitioner and pdf_path
        return partitioner, pdf_path

    def get_config_summary(self) -> dict:
        """Get current configuration as dictionary"""
        return {
            "language": self.language,
            "extract_images": self.extract_images,
            "extract_tables": self.extract_tables,
            "image_output_dir": str(self.image_output_dir),
            "chunking": {
                "max_characters": self.max_characters,
                "new_after_n_chars": self.new_after_n_chars,
                "combine_text_under_n_chars": self.combine_text_under_n_chars
            }
        }


# ✅ USAGE EXAMPLE - Interactive Terminal Mode

if __name__ == "__main__":
    # Interactive entry point: collect the configuration on stdin, partition
    # the PDF, then persist the elements as JSON and as a pickle.
    try:
        # Get configuration and PDF path from terminal
        partitioner, pdf_path = PDFPartitioner.from_terminal_input()

        # Process the document
        elements = partitioner.partition_document(pdf_path)

        print("üéâ Processing completed successfully!")
        print(f"üìä Summary: {len(elements)} elements extracted")

        # Define output directories
        json_dir = r"D:\MultiModulRag\Backend\SmartChunkClubing\JSON"
        pickle_dir = r"D:\MultiModulRag\Backend\SmartChunkClubing\Pickel"

        # Create directories if they don't exist
        os.makedirs(json_dir, exist_ok=True)
        os.makedirs(pickle_dir, exist_ok=True)

        # Extract filename (without extension)
        base_name = os.path.splitext(os.path.basename(pdf_path))[0]

        # Define output paths
        json_path = os.path.join(json_dir, f"{base_name}.json")
        pickle_path = os.path.join(pickle_dir, f"{base_name}.pkl")

        # Dump to JSON (best effort: a failure here must not prevent the pickle dump)
        try:
            with open(json_path, "w", encoding="utf-8") as jf:
                json.dump(
                    [el.to_dict() if hasattr(el, "to_dict") else str(el) for el in elements],
                    jf,
                    indent=4,
                    ensure_ascii=False
                )
            print(f"‚úÖ JSON saved at: {json_path}")
        except Exception as e:
            print(f"‚ùå Error saving JSON: {e}")

        # Dump to Pickle (best effort as well)
        try:
            with open(pickle_path, "wb") as pf:
                pickle.dump(elements, pf)
            print(f"‚úÖ Pickle saved at: {pickle_path}")
        except Exception as e:
            print(f"‚ùå Error saving Pickle: {e}")

    # Specific handlers must come before the generic one: a leading
    # `except Exception` used to shadow all of the handlers below.
    except FileNotFoundError as e:
        print(f"\n‚ùå File Error: {e}")
    except ValueError as e:
        print(f"\n‚ùå Configuration Error: {e}")
    except Exception as e:
        print(f"\n‚ùå Unexpected Error: {e}")

In [None]:
import json
import pickle
from pathlib import Path
from unstructured.documents.elements import Element

def save_elements(elements, pkl_path: str, json_path: str = None):
    """
    Save `elements` to a pickle file and optionally to a JSON file.

    The pickle receives `elements` unchanged. For JSON, `elements` is
    iterated and every item exposing a ``to_dict()`` method (such as
    unstructured Element objects) is replaced by that dict; other items are
    written as-is and must already be JSON-serializable.

    Args:
        elements: Iterable of items to save (typically the list returned by
            partition_pdf). Note that iterating a dict yields only its keys.
        pkl_path: Path to save the pickle file (required)
        json_path: Path to save the JSON file (optional)

    Raises:
        TypeError: From ``json.dump`` if an item without ``to_dict()`` is not
            JSON-serializable. The pickle has already been written by then.
    """
    # Ensure parent directories exist
    Path(pkl_path).parent.mkdir(parents=True, exist_ok=True)
    if json_path:
        Path(json_path).parent.mkdir(parents=True, exist_ok=True)

    # Save as Pickle
    with open(pkl_path, "wb") as f:
        pickle.dump(elements, f)
    print(f"‚úÖ Saved elements to pickle: {pkl_path}")

    if not json_path:
        return

    # Duck-type on to_dict() rather than isinstance(): plain JSON values never
    # define it, so this only affects objects json.dump could not handle anyway.
    elements_serializable = [
        el.to_dict() if hasattr(el, "to_dict") else el for el in elements
    ]

    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(elements_serializable, f, indent=4, ensure_ascii=False)
    print(f"‚úÖ Saved elements to JSON: {json_path}")


# -----------------------------
# Example usage
# `elements` is the Python variable to persist, e.g. the output of partition_pdf

pkl_file = r"D:\MultiModulRag\Backend\Pipeline_Database\Pickel\Checkpointer1.pkl"
json_file = r"D:\MultiModulRag\Backend\Pipeline_Database\JSON\Checkpointer1.json"

# Write the checkpoint in both formats
save_elements(elements, pkl_path=pkl_file, json_path=json_file)

In [None]:
import os
import base64
from pathlib import Path

class ContentSeparator:
    """Handles separation and storage of different content types from chunks

    For every chunk the separator collects the chunk text, the tables and
    base64 images found in ``chunk.metadata.orig_elements`` and the page
    numbers those elements came from. Image payloads are also decoded and
    written to ``image_dir`` as ``image_<n>.png``.
    """

    def __init__(self, image_dir):
        """
        Initialize the content separator

        WARNING: every regular file already present in ``image_dir`` is
        deleted so that the directory only holds images from this run.

        Args:
            image_dir: Directory path where images will be saved
        """
        self.image_dir = Path(image_dir)
        self.image_counter = 1  # 1-based index used in the next image filename

        # Setup directory
        self.image_dir.mkdir(parents=True, exist_ok=True)
        self._clear_existing_images()

    def _clear_existing_images(self):
        """Private method to clear existing images in directory"""
        for file in self.image_dir.glob("*"):
            if file.is_file():
                file.unlink()

    def separate_content_types(self, chunk):
        """
        Analyze and extract content types from a chunk

        Args:
            chunk: The chunk object to process; must expose ``text`` and may
                expose ``metadata.orig_elements``.

        Returns:
            dict: Dictionary containing separated content with the keys
            'text', 'tables', 'images_base64', 'images_dirpath', 'page_no'
            and 'types'.
        """
        content_data = {
            'text': chunk.text,
            'tables': [],
            'images_base64': [],
            'images_dirpath': [],
            'page_no': [],
            'types': ['text']
        }

        # A metadata field can be present but None, so hasattr() alone is not
        # a sufficient guard before iterating.
        chunk_metadata = getattr(chunk, 'metadata', None)
        orig_elements = getattr(chunk_metadata, 'orig_elements', None) or []

        for element in orig_elements:
            element_type = type(element).__name__

            # Handle page numbers: read the attribute directly instead of
            # serializing the whole element (and its image payload) with
            # to_dict(); elements without a page number are skipped.
            page_no = getattr(getattr(element, 'metadata', None), 'page_number', None)
            if page_no is not None and page_no not in content_data['page_no']:
                content_data['page_no'].append(page_no)

            # Handle tables
            if element_type == 'Table':
                self.process_table(element, content_data)

            # Handle images
            elif element_type == 'Image':
                self.process_image(element, content_data)

        return content_data

    def process_table(self, element, content_data):
        """Record a table element in ``content_data`` (mutated in place).

        The HTML rendering is preferred; when it is missing or empty the
        plain table text is stored instead.
        """
        if 'table' not in content_data['types']:
            content_data['types'].append('table')
        element_metadata = getattr(element, 'metadata', None)
        table_html = getattr(element_metadata, 'text_as_html', None) or element.text
        content_data['tables'].append(table_html)

    def process_image(self, element, content_data):
        """Record an image element in ``content_data`` and save it to disk.

        Elements without a base64 payload are ignored. A failure while
        decoding or writing the file is reported on stdout and does not stop
        processing; in that case the payload stays in 'images_base64' without
        a matching 'images_dirpath' entry.
        """
        element_metadata = getattr(element, 'metadata', None)
        image_base64 = getattr(element_metadata, 'image_base64', None)
        if not image_base64:
            return

        if 'image' not in content_data['types']:
            content_data['types'].append('image')

        content_data['images_base64'].append(image_base64)

        try:
            # NOTE(review): the file is always named *.png regardless of the
            # payload's actual image format - confirm downstream expectations.
            image_filename = f"image_{self.image_counter}.png"
            image_path = self.image_dir / image_filename

            with open(image_path, "wb") as img_file:
                img_file.write(base64.b64decode(image_base64))

            content_data['images_dirpath'].append(str(image_path))

            self.image_counter += 1  # Increment instance counter

        except Exception as e:
            print(f"     ‚ùå Failed to save image {self.image_counter}: {e}")

    def process_chunks(self, chunks):
        """
        Process multiple chunks

        Args:
            chunks: List of chunks to process

        Returns:
            list: List of content data dictionaries
        """
        all_content_data = []
        total_chunks = len(chunks)

        for current_chunk, chunk in enumerate(chunks, start=1):
            print(f"   Processing chunk {current_chunk}/{total_chunks}")

            content_data = self.separate_content_types(chunk)

            print(f"     Types found: {content_data['types']}")
            print(f"     Tables: {len(content_data['tables'])}, Images: {len(content_data['images_base64'])}")

            all_content_data.append(content_data)

        return all_content_data

    def get_table_count(self, content_data):
        """Get the total number of tables processed"""
        return sum(len(data['tables']) for data in content_data)

    def get_text_count(self, content_data):
        """Get the total number of text chunks processed"""
        return sum(1 for data in content_data if 'text' in data['types'])

    def get_image_count(self, content_data=None):
        """Get the total number of images processed

        Args:
            content_data: List returned by process_chunks(). When omitted,
                the ``*.png`` files currently in ``image_dir`` are counted
                instead.
        """
        if content_data is None:
            return len(list(self.image_dir.glob("*.png")))
        return sum(len(data['images_base64']) for data in content_data)

# ✅ USAGE - Clean and Simple!
if __name__ == "__main__":
    # Build the separator; images are written below this directory
    separator = ContentSeparator(
        image_dir=r"D:\MultiModulRag\Backend\Pipeline_Database\Images"
    )

    # Split every chunk of the checkpoint into text / tables / images
    all_content_data = separator.process_chunks(checkpoint)

    # Gather the totals first, then report them
    image_total = separator.get_image_count(all_content_data)
    table_total = separator.get_table_count(all_content_data)
    text_total = separator.get_text_count(all_content_data)

    print(f"Total images: {image_total}")
    print(f"Total tables: {table_total}")
    print(f"Total text chunks: {text_total}")

   Processing chunk 1/24
     Types found: ['text']
     Tables: 0, Images: 0
   Processing chunk 2/24
     Types found: ['text']
     Tables: 0, Images: 0
   Processing chunk 3/24
     Types found: ['text']
     Tables: 0, Images: 0
   Processing chunk 4/24
     Types found: ['text']
     Tables: 0, Images: 0
   Processing chunk 5/24
     Types found: ['text']
     Tables: 0, Images: 0
   Processing chunk 6/24
     Types found: ['text', 'image']
     Tables: 0, Images: 1
   Processing chunk 7/24
     Types found: ['text']
     Tables: 0, Images: 0
   Processing chunk 8/24
     Types found: ['text', 'image']
     Tables: 0, Images: 2
   Processing chunk 9/24
     Types found: ['text']
     Tables: 0, Images: 0
   Processing chunk 10/24
     Types found: ['text']
     Tables: 0, Images: 0
   Processing chunk 11/24
     Types found: ['text']
     Tables: 0, Images: 0
   Processing chunk 12/24
     Types found: ['text']
     Tables: 0, Images: 0
   Processing chunk 13/24
     Types found:

In [11]:
# get_image_count() takes the processed results as its argument
print(f"Total images: {separator.get_image_count(all_content_data)}")

Total images: 3


In [9]:
# Display the separated content of the third processed chunk (index 2)
all_content_data[2]

{'text': '1 Introduction\n\nRecurrent neural networks, long short-term memory [12] and gated recurrent [7] neural networks in particular, have been Ô¨Årmly established as state of the art approaches in sequence modeling and transduction problems such as language modeling and machine translation [29, 2, 5]. Numerous efforts have since continued to push the boundaries of recurrent language models and encoder-decoder architectures [31, 21, 13].\n\n‚àóEqual contribution. Listing order is random. Jakob proposed replacing RNNs with self-attention and started the effort to evaluate this idea. Ashish, with Illia, designed and implemented the Ô¨Årst Transformer models and has been crucially involved in every aspect of this work. Noam proposed scaled dot-product attention, multi-head attention and the parameter-free position representation and became the other person involved in nearly every detail. Niki designed, implemented, tuned and evaluated countless model variants in our original codebase

In [None]:

def partition_document(file_path: str, languages=None):
    """Extract elements from PDF using unstructured

    Runs ``partition_pdf`` with the "hi_res" strategy, chunks the result by
    title and keeps the original elements on each chunk. Image and table
    extraction are both enabled.

    Args:
        file_path: Path to the PDF file.
        languages: List of language codes for the document (see the table in
            the function body). Defaults to ``["eng"]`` when omitted or empty.

    Returns:
        The list of elements returned by ``partition_pdf``.
    """
    print(f"üìÑ Partitioning document: {file_path}")
    path = r"D:\MultiModulRag\Backend\SmartChunkClubing\Images"

    # Copy so the caller's list is never shared with the partitioner
    languages = list(languages) if languages else ["eng"]

    # Supported languages ("name": "code"):
    #   "afrikaans": "afr", "amharic": "amh", "arabic": "ara", "assamese": "asm",
    #   "azerbaijani": "aze", "azerbaijani - cyrilic": "aze_cyrl", "belarusian": "bel",
    #   "bengali": "ben", "tibetan": "bod", "bosnian": "bos", "breton": "bre",
    #   "bulgarian": "bul", "catalan; Valencian": "cat", "cebuano": "ceb", "czech": "ces",
    #   "chinese - simplified": "chi_sim", "chinese": "chi_sim",
    #   "chinese - traditional": "chi_tra", "cherokee": "chr", "corsican": "cos",
    #   "welsh": "cym", "danish": "dan", "danish - fraktur": "dan_frak", "german": "deu",
    #   "german - fraktur (contrib)": "deu_frak", "dzongkha": "dzo",
    #   "greek, modern": "ell", "greek": "ell", "english": "eng",
    #   "english, middle": "enm", "esperanto": "epo",
    #   "math / equation detection module": "equ", "estonian": "est", "basque": "eus",
    #   "faroese": "fao", "persian": "fas", "filipino (old - tagalog)": "fil",
    #   "filipino": "fil", "finnish": "fin", "french": "fra",
    #   "german - fraktur": "frk", "french, middle": "frm", "western frisian": "fry",
    #   "scottish gaelic": "gla", "irish": "gle", "galician": "glg",
    #   "greek, ancient": "grc", "gujarati": "guj", "haitian": "hat",
    #   "haitian creole": "hat", "hebrew": "heb", "hindi": "hin", "croatian": "hrv",
    #   "hungarian": "hun", "armenian": "hye", "inuktitut": "iku", "indonesian": "ind",
    #   "icelandic": "isl", "italian": "ita", "italian - old": "ita_old",
    #   "javanese": "jav", "japanese": "jpn", "kannada": "kan", "georgian": "kat",
    #   "georgian - old": "kat_old", "kazakh": "kaz", "central khmer": "khm",
    #   "kirghiz": "kir", "kyrgyz": "kir",
    #   "kurmanji (kurdish - latin script)": "kmr", "korean": "kor",
    #   "korean (vertical)": "kor_vert", "kurdish (arabic script)": "kur",
    #   "lao": "lao", "latin": "lat", "latvian": "lav", "lithuanian": "lit",
    #   "luxembourgish": "ltz", "malayalam": "mal", "marathi": "mar",
    #   "macedonian": "mkd", "maltese": "mlt", "mongolian": "mon", "maori": "mri",
    #   "malay": "msa", "burmese": "mya", "nepali": "nep", "dutch": "nld",
    #   "flemish": "nld", "norwegian": "nor", "occitan": "oci", "oriya": "ori",
    #   "orientation and script detection module": "osd", "panjabi": "pan",
    #   "punjabi": "pan", "polish": "pol", "portuguese": "por", "pushto": "pus",
    #   "pashto": "pus", "quechua": "que", "romanian": "ron", "moldavian": "ron",
    #   "moldovan": "ron", "russian": "rus", "sanskrit": "san", "sinhala": "sin",
    #   "sinhalese": "sin", "slovak": "slk", "slovak - fraktur": "slk_frak",
    #   "slovenian": "slv", "sindhi": "snd", "spanish": "spa", "castilian": "spa",
    #   "spanish - old": "spa_old", "castilian - old": "spa_old", "albanian": "sqi",
    #   "serbian": "srp", "serbian - latin": "srp_latn", "sundanese": "sun",
    #   "swahili": "swa", "swedish": "swe", "syriac": "syr", "tamil": "tam",
    #   "tatar": "tat", "telugu": "tel", "tajik": "tgk", "tagalog": "tgl",
    #   "thai": "tha", "tigrinya": "tir", "tonga": "ton", "turkish": "tur",
    #   "uighur": "uig", "uyghur": "uig", "ukrainian": "ukr", "urdu": "urd",
    #   "uzbek": "uzb", "uzbek - cyrilic": "uzb_cyrl", "vietnamese": "vie",
    #   "yiddish": "yid", "yoruba": "yor"

    elements = partition_pdf(
        # --- Core parameters (fixed) ---
        filename=file_path,                   # Path to the PDF file
        strategy="hi_res",                    # High-resolution strategy for better accuracy
        hi_res_model_name="yolox",            # Model used by the hi_res strategy
        chunking_strategy="by_title",         # Chunk content based on titles
        include_orig_elements=True,           # Keep the original elements on each chunk

        # --- Language (must be selected) ---
        languages=languages,

        # --- (Optional) Image extraction: both flags are set when images are wanted ---
        extract_images_in_pdf=True,           # Deprecated upstream in favour of extract_image_block_types
        extract_image_block_to_payload=True,  # Store image data in the element payload
        extract_image_block_output_dir=path,  # Directory for extracted images
        extract_image_block_types=["Image"],  # Element types treated as image blocks

        # --- (Optional) Tables ---
        infer_table_structure=True,           # Keep tables as structured HTML, not jumbled text

        # --- Chunking parameters ---
        max_characters=3000,                  # Maximum characters per chunk
        # NOTE(review): new_after_n_chars exceeds max_characters, so the soft
        # limit presumably never triggers before the hard one - confirm the
        # intended values.
        new_after_n_chars=3800,               # Starts a new chunk after this many characters
        combine_text_under_n_chars=200,       # Combines small text blocks under this character count
    )
    print(f"‚úÖ Extracted {len(elements)} elements")
    return elements