In [15]:
### Load pdf files
from langchain.document_loaders import (
    PyPDFLoader,
    PyMuPDFLoader,
    UnstructuredPDFLoader,
)

### PyPDF loader technique

In [16]:
print("PypdfLoader:")
try:
    pypdf_loader =PyPDFLoader("data/pdf/attention.pdf")
    pypdf_docs= pypdf_loader.load()
    print("pypdf docs:",pypdf_docs)
    print(f"Loaded {len(pypdf_docs)} pages")
    print("Page 1 content:", pypdf_docs[0].page_content[:100])  # Print first 100 characters of page 1
    print("Metadata of Page 1:", pypdf_docs[0].metadata)  # Print metadata of page 1
except Exception as e:
    print("Error loading with PyPDFLoader:", e)

PypdfLoader:
pypdf docs: [Document(metadata={'producer': 'PyPDF', 'creator': 'PyPDF', 'creationdate': '2014-06-02T05:45:18-07:00', 'moddate': '2014-06-02T05:53:55-07:00', 'rgid': 'PB:232502654_AS:428335265259521@1479134493492', 'source': 'data/pdf/attention.pdf', 'total_pages': 10, 'page': 0, 'page_label': '1'}, page_content='See discussions, stats, and author profiles for this publication at: https://www.researchgate.net/publication/232502654\nComponents of attention\nArticle\xa0\xa0in \xa0\xa0Psychological Review · September 1971\nDOI: 10.1037/h0031333\nCITATIONS\n1,514\nREADS\n13,443\n2 authors, including:\nMichael Posner\nUniversity of Oregon\n468 PUBLICATIONS\xa0\xa0\xa0123,777 CITATIONS\xa0\xa0\xa0\nSEE PROFILE\nAll content following this page was uploaded by Michael Posner on 14 November 2016.\nThe user has requested enhancement of the downloaded file.'), Document(metadata={'producer': 'PyPDF', 'creator': 'PyPDF', 'creationdate': '2014-06-02T05:45:18-07:00', 'moddate': '2014-06-

In [17]:
##Method 2 : PyMuPDFLoader (fast and accurate)
print("PyMuPDFLoader:")
try:
    pymupdf_loader = PyMuPDFLoader("data/pdf/attention.pdf")
    pymupdf_docs = pymupdf_loader.load()
    print(f"Loaded {len(pymupdf_docs)} pages")
    print("Page 1 content:", pymupdf_docs[0].page_content[:100])  # Print first 100 characters of page 1
    print("Metadata of Page 1:", pymupdf_docs[0].metadata)  # Print metadata of page 1
except Exception as e:
    print("Error loading with PyMuPDFLoader:", e)

PyMuPDFLoader:
Loaded 10 pages
Page 1 content: See discussions, stats, and author profiles for this publication at: https://www.researchgate.net/pu
Metadata of Page 1: {'producer': '', 'creator': '', 'creationdate': '2014-06-02T05:45:18-07:00', 'source': 'data/pdf/attention.pdf', 'file_path': 'data/pdf/attention.pdf', 'total_pages': 10, 'format': 'PDF 1.4', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2014-06-02T05:53:55-07:00', 'trapped': '', 'modDate': "D:20140602055355-07'00'", 'creationDate': "D:20140602054518-07'00'", 'page': 0}


Pdf loader comparison and conclusion
pypdf_Loader:
    Simple and reliable
    Good for most pdf
    Preserves page number
    No basic text extraction
    use when : standard text files
PyMuPdfloader
    Fast processing
    Good text extraction
    Image extraction support
    use :when speed is important

In [18]:
# Compare performance and features of both loaders
comparison = {
    'PyPDFLoader': {
        'pages_loaded': len(pypdf_docs),
        'first_page_length': len(pypdf_docs[0].page_content),
        'metadata_keys': list(pypdf_docs[0].metadata.keys())
    },
    'PyMuPDFLoader': {
        'pages_loaded': len(pymupdf_docs),
        'first_page_length': len(pymupdf_docs[0].page_content),
        'metadata_keys': list(pymupdf_docs[0].metadata.keys())
    }
}

print("Loader Comparison Results:")
for loader, metrics in comparison.items():
    print(f"\n{loader}:")
    print(f"  - Pages loaded: {metrics['pages_loaded']}")
    print(f"  - Characters in first page: {metrics['first_page_length']}")
    print(f"  - Available metadata fields: {', '.join(metrics['metadata_keys'])}")

Loader Comparison Results:

PyPDFLoader:
  - Pages loaded: 10
  - Characters in first page: 499
  - Available metadata fields: producer, creator, creationdate, moddate, rgid, source, total_pages, page, page_label

PyMuPDFLoader:
  - Pages loaded: 10
  - Characters in first page: 498
  - Available metadata fields: producer, creator, creationdate, source, file_path, total_pages, format, title, author, subject, keywords, moddate, trapped, modDate, creationDate, page


pdf extraction and common issues

In [19]:
# Example of pdf extraction
raw_pdf_text = """
Attention is all you need.



      The dominant sequence transduction models are based on complex
recurrent or convolutional neural networks that include an encoder and a decoder. The best performing models also connect the encoder and decoder through an attention mechanism. We propose a new
simple network architecture,


    the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. Experiments on two machine translation tasks show these models to be superior in quality while being more parallelizable and requiring significantly less time to train. Our model achieves 28.4 BLEU on the WMT 2014 English-to-German translation
task, improving over the existing best results, 

including ensembles, by more than 2 BLEU. On the WMT 2014 English-to-French translation task, our model establishes a new single-model state-of-the-art BLEU score of 41.8 after training for 3.5 days on eight GPUs, a small fraction of the training costs of the best models from the literature. We show that the Transformer generalizes well to other tasks by applying it successfully to English constituency parsing both with large and limited training data.
    """

#apply cleaning functions
def clean_text(text):
    # Remove excessive whitespace and newlines
    text = ' '.join(text.split())
    #fix ligatures
    text = text.replace("f1","f1").replace("fi","fi")
    return text

cleaned = clean_text(raw_pdf_text)
print("before cleaning:", raw_pdf_text[:200])
print("---------\nafter cleaning:", cleaned[:200])


before cleaning: 
Attention is all you need.



      The dominant sequence transduction models are based on complex
recurrent or convolutional neural networks that include an encoder and a decoder. The best performin
---------
after cleaning: Attention is all you need. The dominant sequence transduction models are based on complex recurrent or convolutional neural networks that include an encoder and a decoder. The best performing models a


In [20]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [34]:
import pytesseract
from pdf2image import convert_from_path
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from typing import List

class SmartPdfProcessor:

    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 100):
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=[" "],
        )

    def process_pdf(self, pdf_path: str) -> List[Document]:
        loader = PyPDFLoader(pdf_path)
        pages = loader.load()
        processed_chunks: List[Document] = []

        for idx, page in enumerate(pages):
            text = self._clean_text(page.page_content)

            if len(text) < 50:
                imgs = convert_from_path(pdf_path, first_page=idx+1, last_page=idx+1)
                text = pytesseract.image_to_string(imgs[0])
                text = self._clean_text(text)
                if len(text) < 50:
                    continue

            chunks = self.text_splitter.create_documents(
                [text],
                [{
                    **page.metadata,
                    "page": idx+1,
                    "total_pages": len(pages),
                    "chunk_method": "smart_pdf_processor",
                    "char_count": len(text),
                }]
            )
            processed_chunks.extend(chunks)
            print(f"Processed page {idx+1}")

        return processed_chunks

    def _clean_text(self, text: str) -> str:
        return " ".join(text.split()).replace("ﬁ","fi").replace("ﬂ","fl")


In [35]:
preprocessor = SmartPdfProcessor()


In [23]:
preprocessor 

<__main__.SmartPdfProcessor at 0x30c896f30>

In [37]:
##process a pdf if available
try:
    smart_chunks = preprocessor.process_pdf("data/pdf/attention.pdf")
    print(f"Total smart chunks created: {len(smart_chunks)}")
    for i, chunk in enumerate(smart_chunks[:3]):  # Display first 3 chunks
        print(f"Chunk {i+1} metadata:", chunk.metadata)
        print(f"Chunk {i+1} content preview:", chunk.page_content[:100])


    # print("Sample chunk content:", smart_chunks[0].page_content[:200])  #
except Exception as e:
    print("Error processing PDF with SmartPdfProcessor:", e)

Processed page 1
Processed page 2
Processed page 3
Processed page 4
Processed page 5
Processed page 6
Processed page 7
Processed page 8
Processed page 9
Processed page 10
Total smart chunks created: 30
Chunk 1 metadata: {'producer': 'PyPDF', 'creator': 'PyPDF', 'creationdate': '2014-06-02T05:45:18-07:00', 'moddate': '2014-06-02T05:53:55-07:00', 'rgid': 'PB:232502654_AS:428335265259521@1479134493492', 'source': 'data/pdf/attention.pdf', 'total_pages': 10, 'page': 1, 'page_label': '1', 'chunk_method': 'smart_pdf_processor', 'char_count': 491}
Chunk 1 content preview: See discussions, stats, and author profiles for this publication at: https://www.researchgate.net/pu
Chunk 2 metadata: {'producer': 'PyPDF', 'creator': 'PyPDF', 'creationdate': '2014-06-02T05:45:18-07:00', 'moddate': '2014-06-02T05:53:55-07:00', 'rgid': 'PB:232502654_AS:428335265259521@1479134493492', 'source': 'data/pdf/attention.pdf', 'total_pages': 10, 'page': 2, 'page_label': '2', 'chunk_method': 'smart_pdf_processor', '