In [1]:
# chunk_exporter.py
import json
import uuid


In [2]:
def export_text_chunks(texts, source_pdf, output_file="chunks.jsonl"):
    """Append one JSON line per text element to ``output_file``.

    Args:
        texts: Iterable of text elements. An element's ``text`` attribute is
            used when it has one; otherwise the element is converted with
            ``str()``.
        source_pdf: Value stored under ``"source_pdf"`` in every record.
        output_file: Path of the JSONL file, opened in append mode (UTF-8).

    Each record gets a fresh UUID4 ``chunk_id``, ``modality`` set to
    ``"text"``, and ``None``/empty placeholders for the remaining fields.
    """
    with open(output_file, "a", encoding="utf-8") as sink:
        for element in texts:
            # Prefer the element's own ``text`` attribute; fall back to its
            # string form for plain values.
            try:
                body = element.text
            except AttributeError:
                body = str(element)

            serialized = json.dumps(
                {
                    "chunk_id": str(uuid.uuid4()),
                    "modality": "text",
                    "source_pdf": source_pdf,
                    "page_number": None,
                    "raw_text": body,
                    "image_b64": None,
                    "gold_questions": [],
                }
            )
            sink.write(f"{serialized}\n")


In [3]:
def export_image_chunks(images_b64, source_pdf, output_file="chunks.jsonl"):
    """Append one JSON line per image to ``output_file``.

    Args:
        images_b64: Iterable of image payloads; each is stored as-is under
            ``"image_b64"``.
        source_pdf: Value stored under ``"source_pdf"`` in every record.
        output_file: Path of the JSONL file, opened in append mode (UTF-8).

    Each record gets a fresh UUID4 ``chunk_id``, ``modality`` set to
    ``"image"``, and ``None``/empty placeholders for the remaining fields.
    """
    with open(output_file, "a", encoding="utf-8") as sink:
        for payload in images_b64:
            entry = dict(
                chunk_id=str(uuid.uuid4()),
                modality="image",
                source_pdf=source_pdf,
                page_number=None,
                raw_text=None,
                image_b64=payload,
                gold_questions=[],
            )
            sink.write(json.dumps(entry))
            sink.write("\n")


In [4]:
# App/Ingestion.py
import os
from App.Ingestion import create_chunks_from_pdf  # adjust import if needed


def process_pdfs_in_directory(directory_path):
    """Lazily yield ``(filename, elements)`` for each PDF in a directory.

    Entries of ``directory_path`` whose name ends with ``.pdf``
    (case-insensitive) are passed, as a joined path, to
    ``create_chunks_from_pdf``; all other entries are skipped. Only the
    top level of the directory is listed.
    """
    for entry in os.listdir(directory_path):
        if not entry.lower().endswith(".pdf"):
            continue
        pdf_path = os.path.join(directory_path, entry)
        yield entry, create_chunks_from_pdf(pdf_path)


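# NOTE(review): stray notebook cell output (looks like the tail of a tqdm import warning), not code:
#   from .autonotebook import tqdm as notebook_tqdm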


In [5]:
# App/Ingestion_chain.py
from App.Ingestion import table_text_segregation, get_images
from App.summarizer import summarize_texts_tables, summarize_images
from App.VectorDB import add_documents_to_vector_db


def ingestion_chain(pdf_directory, retriever):
    """Run the complete ingestion pipeline over a directory of PDFs.

    Pipeline per PDF: extract elements -> export raw chunks -> summarize ->
    add to the vector DB.

    Args:
        pdf_directory: Directory passed to ``process_pdfs_in_directory``.
        retriever: Forwarded to ``add_documents_to_vector_db`` as the
            ``retriever`` keyword argument.

    Returns:
        ``True`` once every PDF yielded by ``process_pdfs_in_directory`` has
        been processed.

    Raises:
        RuntimeError: If any step fails. The original exception is attached
            as ``__cause__`` so the root failure stays visible in tracebacks.
    """
    # NOTE(review): the leading symbols in the status messages below look like
    # mis-encoded emoji; they are runtime text and are left untouched here --
    # confirm the encoding this file was saved/exported with.
    try:
        # Imported inside the try block so an import failure is reported
        # through the same RuntimeError path as any other pipeline error.
        from App.ollama_running import ensure_ollama_running
        ensure_ollama_running()

        for pdf_name, elements in process_pdfs_in_directory(pdf_directory):
            print(f"\nüìò Ingesting: {pdf_name}")

            # Segregate elements
            tables, texts = table_text_segregation(elements)
            images = get_images(elements)

            print(
                f"üìÑ Texts: {len(texts)}, "
                f"üìä Tables: {len(tables)}, "
                f"üñºÔ∏è Images: {len(images)}"
            )

            # ---- EXPORT RAW CHUNKS (PERSISTENT DATASET) ----
            # Both exporters append to their default output file.
            export_text_chunks(
                texts=texts,
                source_pdf=pdf_name
            )

            export_image_chunks(
                images_b64=images,
                source_pdf=pdf_name
            )

            print("‚úÖ Chunks exported to chunks.jsonl")

            # ---- DENSE SUMMARIZATION ----
            text_summaries, table_summaries = summarize_texts_tables(texts, tables)
            img_summaries = summarize_images(images)

            print("‚úÖ Summarization complete")

            # ---- VECTOR DB INGESTION (RAG ONLY) ----
            add_documents_to_vector_db(
                texts,
                text_summaries,
                tables,
                table_summaries,
                images,
                img_summaries,
                retriever=retriever
            )

            print("‚úÖ Added to vector database")

        print("\nüéØ Ingestion completed for all PDFs.")
        return True

    except Exception as e:
        print(f"‚ùå Error in ingestion pipeline: {e}")
        # Chain explicitly so callers can reach the root cause via __cause__.
        raise RuntimeError(f"Ingestion failed: {e}") from e


In [None]:
# Obtain the retriever object from the console app helper; it is passed to
# ingestion_chain in the next cell.
from App.console_app import initialize_retriever
retriever = initialize_retriever()

In [None]:
# Run the ingestion pipeline over the PDFs in the given directory.
# NOTE: pdf_directory is a machine-specific absolute Windows path.
ingestion_chain(pdf_directory=r"D:\Projects\Multimodal-Retrieval-Augmented-Generation\Evaluation", retriever=retriever)  # Replace with actual path and retriever