--------
Processes all PDF files in the specified folder:
1. Ingests PDF files from the raw directory and converts them to Markdown using Marker.
2. Splits the text into smaller chunks using LangChain splitters.
3. Assigns stable document IDs via MD5 hashing.
4. Saves the processed chunks to a JSONL file in the processed directory.
-------

In [None]:
import os
import hashlib
import jsonlines
from tqdm import tqdm
from langchain_text_splitters import (
    MarkdownHeaderTextSplitter,
    RecursiveCharacterTextSplitter,
)
from marker.convert import convert_single_pdf
from marker.logger import configure_logging
from marker.models import load_all_models

#### Helper Functions

In [None]:
# ------------------------------------------------------------
# 1️⃣ Utility: Clean up PDF filenames (remove spaces/dots)
# ------------------------------------------------------------

def parse_file_name(filename: str) -> str:
    """Return *filename* with every space and dot removed from its name part.

    A trailing ``.pdf`` extension is recognised case-insensitively and kept
    exactly as given (so ``"My Report.PDF"`` becomes ``"MyReport.PDF"``).
    For any other name, all spaces and dots are stripped, including the dot
    of the extension.

    Args:
        filename: Bare file name (not a path).

    Returns:
        The cleaned file name.
    """
    # One pass that deletes both characters instead of chained replace() calls.
    strip_table = str.maketrans("", "", " .")

    # Compare only the suffix, case-insensitively, so "X.PDF" is treated like
    # "X.pdf" and does not lose its extension dot.
    if filename[-4:].lower() == ".pdf":
        return filename[:-4].translate(strip_table) + filename[-4:]
    return filename.translate(strip_table)



In [None]:
# ------------------------------------------------------------
# 2️⃣ Convert PDF → Markdown using Marker
# ------------------------------------------------------------

def convert_pdf_to_markdown(fname, reference_folder, model_lst):
    """Convert a PDF to Markdown text using Marker models.

    Args:
        fname: File name of the PDF inside ``reference_folder``.
        reference_folder: Directory that contains the PDF.
        model_lst: Models passed straight through to ``convert_single_pdf``.

    Returns:
        The first element of the 3-tuple returned by ``convert_single_pdf``
        (the converted text); the other two elements are discarded.
    """
    pdf_filename = os.path.join(reference_folder, fname)

    print(f"📄 Converting PDF: {pdf_filename}")
    full_text, _, _ = convert_single_pdf(pdf_filename, model_lst, batch_multiplier=1)

    return full_text

In [None]:
# ------------------------------------------------------------
# 3️⃣ Process a single PDF into smaller text chunks with metadata
# ------------------------------------------------------------

def download_and_process_pdf_file(
    f_key, text_splitter, markdown_splitter, model_lst, reference_folder="./raw/"
):
    """Convert one PDF to Markdown and split it into chunk records.

    The Markdown is first split by ``markdown_splitter`` and each resulting
    section is split again by ``text_splitter``.

    Args:
        f_key: File name of the PDF as listed in ``reference_folder``.
        text_splitter: Object with ``split_text(str)`` returning text chunks.
        markdown_splitter: Object with ``split_text(str)`` returning sections
            that expose ``page_content`` and ``metadata``.
        model_lst: Marker models forwarded to ``convert_pdf_to_markdown``.
        reference_folder: Directory that contains the PDF.

    Returns:
        A list of ``{"metadata": {...}, "content": str}`` dicts. ``metadata``
        holds ``document_id`` (first 10 hex chars of the MD5 of
        ``"<f_key>_part_<n>"``), ``pdf_name``, ``pdf_part`` and the section's
        own metadata.
    """
    # Files listed from disk keep their original names, so read the PDF under
    # that name when it exists. The cleaned name is only a fallback for callers
    # that stored the file under its cleaned form.
    if os.path.exists(os.path.join(reference_folder, f_key)):
        source_name = f_key
    else:
        source_name = parse_file_name(f_key)

    # Convert PDF to Markdown
    markdown_text = convert_pdf_to_markdown(source_name, reference_folder, model_lst)

    documents = []
    # Running chunk index across the whole PDF. A per-section index would
    # restart at 0 for every section and produce duplicate document IDs.
    part_index = 0

    # Split markdown by headers, then into smaller overlapping chunks
    for section in markdown_splitter.split_text(markdown_text):
        for chunk_text in text_splitter.split_text(section.page_content):
            # Stable ID derived from filename + chunk position
            raw_id = f"{f_key}_part_{part_index}"
            document_id = hashlib.md5(raw_id.encode()).hexdigest()[:10]

            # Metadata for tracking source & chunk position
            metadata_dict = {
                "document_id": document_id,
                "pdf_name": f_key,
                "pdf_part": part_index,
            }
            metadata_dict.update(section.metadata)

            documents.append({"metadata": metadata_dict, "content": chunk_text})
            part_index += 1

    return documents

In [None]:
# ------------------------------------------------------------
# 4️⃣ List all PDF files in a given folder
# ------------------------------------------------------------

def list_pdf_files(directory_path):
    """Return the names of the PDF files directly inside *directory_path*.

    The extension is matched case-insensitively. Only regular files are
    returned (a directory that happens to be named ``something.pdf`` is
    skipped), and the result is sorted so the processing order is the same on
    every run.

    Args:
        directory_path: Directory to scan (not recursive).

    Returns:
        Sorted list of bare file names.

    Raises:
        FileNotFoundError: If *directory_path* does not exist (from
            ``os.listdir``).
    """
    return sorted(
        entry
        for entry in os.listdir(directory_path)
        if entry.lower().endswith(".pdf")
        and os.path.isfile(os.path.join(directory_path, entry))
    )

# Resolve the raw-PDF folder relative to the current working directory:
# one level up when running from a "notebooks" directory, otherwise in place.
if os.getcwd().endswith("notebooks"):
    pdf_folder = os.path.join("..", "data", "raw")
else:
    pdf_folder = os.path.join(".", "data", "raw")

print("📂 Found PDF files:", list_pdf_files(pdf_folder))

In [None]:
# ------------------------------------------------------------
# 5️⃣ Main processing logic
# ------------------------------------------------------------

if __name__ == "__main__":
    # The raw folder is expected to exist already; it is never created here.
    folder_missing = not os.path.exists(pdf_folder)
    if folder_missing:
        raise FileNotFoundError(f"PDF folder not found: {pdf_folder}")

    # Collect the PDFs that the later cells will process
    filenames = list_pdf_files(pdf_folder)

    if filenames:
        print(f"Found {len(filenames)} PDF(s) in {pdf_folder}")
    else:
        print(f"No PDF files found in {pdf_folder}")


In [None]:

# Load Marker models for PDF → Markdown conversion.
# configure_logging() runs first, then load_all_models() (both imported from
# the marker package). The result is kept in `model_lst` and eventually passed
# to convert_single_pdf for every PDF.
# NOTE(review): loading is presumably slow / memory-heavy, so this cell should
# only need to run once per session — confirm.
configure_logging()
model_lst = load_all_models()


In [None]:
# Chunking parameters for the character-level splitter
chunk_size, chunk_overlap = 500, 100

# Markdown heading markers and the metadata key each one is recorded under
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
]

# First pass: split by headers. Second pass: split into overlapping chunks.
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)


In [None]:
# Flat list that accumulates the chunks of every PDF
documents = []

# Run each PDF through conversion + splitting, with a progress bar
for filename in tqdm(filenames, desc="Processing PDFs"):
    print(f"\n📑 Processing: {filename}")
    splitted_doc = download_and_process_pdf_file(
        filename, text_splitter, markdown_splitter, model_lst, reference_folder=pdf_folder
    )
    # In-place extend (not append) so the list stays flat
    documents += splitted_doc

In [None]:
# ------------------------------------------------------------
# 6️⃣ Save processed chunks to JSONL
# ------------------------------------------------------------

# The "processed" folder sits next to the raw-PDF folder
output_dir = os.path.join(os.path.dirname(pdf_folder), "processed")
output_path = os.path.join(output_dir, "docs_processed.jsonl")

# Create the target folder if needed, then write one JSON object per chunk
os.makedirs(output_dir, exist_ok=True)
with jsonlines.open(output_path, mode="w") as writer:
    writer.write_all(documents)

print(f"\n✅ Processing complete. Saved {len(documents)} chunks to {output_path}")

In [None]:
#len(documents)