In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
import os
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from pathlib import Path

In [None]:
def process_all_pdfs(pdf_directory):
    """
    Load every PDF found under a directory into one flat list of documents.

    The directory is searched recursively for ``*.pdf`` files, which are
    processed in sorted path order so repeated runs produce the same ordering.
    Each file is read with ``PyPDFLoader``; every document the loader returns
    gets ``source_file`` (the PDF's file name) and ``file_type`` (``"pdf"``)
    added to its metadata.

    :param pdf_directory: Directory to search (string or path-like)
    :return: Flat list of the documents loaded from all PDFs, in file order
    :raises Exception: Any error raised while loading a file; the failure is
        printed and then re-raised, so processing stops at that file
    """
    all_documents = []
    pdf_directory = Path(pdf_directory)

    # glob() yields paths in filesystem order; sort for a reproducible result.
    pdf_files = sorted(pdf_directory.glob("**/*.pdf"))
    print(f"Found {len(pdf_files)} PDF files in {pdf_directory}")

    for pdf_file in pdf_files:
        print(f"Loading PDF file: {pdf_file.name}")
        try:
            documents = PyPDFLoader(str(pdf_file)).load()

            # Tag each document so a chunk can be traced back to its file.
            for doc in documents:
                doc.metadata["source_file"] = pdf_file.name
                doc.metadata["file_type"] = "pdf"

            # extend (not append) keeps the result a flat list of documents
            # rather than a list of per-file lists.
            all_documents.extend(documents)
            print(f"Loaded {len(documents)} documents from {pdf_file.name}")

        except Exception as e:
            print(f"Failed to load {pdf_file.name} with PyPDFLoader: {e}")
            raise

    print(f"Loaded {len(all_documents)} documents from {len(pdf_files)} PDF files")
    return all_documents


all_pdf_documents = process_all_pdfs("../data/pdf")
# Show only the first few documents: echoing the whole list would flood the
# cell output with the full text of every loaded document.
all_pdf_documents[:3]

In [None]:
def split_documents(documents, chunk_size=1000, chunk_overlap=200):
    """
    Split documents into smaller chunks for better RAG performance.

    Builds a ``RecursiveCharacterTextSplitter`` that tries the separators
    paragraph break, line break, space and finally the empty string, in that
    order. It prints how many chunks were produced and, when there is at
    least one chunk, the first chunk's content and metadata as an example.

    :param documents: Documents to split
    :param chunk_size: Size of each chunk
    :param chunk_overlap: Size of the overlap repeated between chunks
    :return: List of chunk documents produced by the splitter
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", " ", ""]
    )
    split_docs = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(split_docs)} chunks")

    if split_docs:
        example = split_docs[0]
        print("\nExample chunk")
        # Print the whole chunk; the previous [:-1] slice silently dropped
        # its final character.
        print(f"\n{example.page_content}")
        print(f"Metadata: {example.metadata}")

    return split_docs

In [None]:
# Chunk the loaded PDF documents (chunk size 1000, overlap 200).
split_chunk = split_documents(
    documents=all_pdf_documents,
    chunk_size=1000,
    chunk_overlap=200,
)