# PDF-Extraction

This notebook holds a couple of use-case-specific implementations for extracting certain data from PDF documents.

## Setup and Configuration

In [None]:
# -- imports --
import os
import pandas as pd
import pymupdf
import pymupdf4llm
from bs4 import BeautifulSoup
from docling.document_converter import DocumentConverter
from io import StringIO
from IPython.display import clear_output
from ollama import chat
from ollama import ChatResponse
from paddleocr import PaddleOCR
from paddleocr import PPStructureV3
from pydantic import BaseModel, RootModel
from marker.models import create_model_dict
from marker.output import text_from_rendered
from marker.config.parser import ConfigParser
from marker.converters.pdf import PdfConverter
from typing import List, Literal

# -- data paths --
# short document
example_pdf = "../data/pdf_labeled/2.pdf"
# rather difficult example page, taken from the same pdf as above
example_img = "../data/imgs_labeled/color/2_9.jpg"

# -- config --

## Extract text from machine-written documents

This section showcases three different tools for extracting text from machine-written PDF documents.

If layout isn't terribly important, PyMuPDF can be used for accurate extraction; otherwise both Docling and Marker seem to offer good options for semi-structured extraction (HTML, MD).

Most libraries have more functionality and config options than shown -> have a look into the respective docs before use!

*... maybe also have a look at pyPDFIUM which seemingly has similar performance to PyMuPDF*

In [None]:
def extract_with_pymupdf(markdown=True):
    """
    Extract the text of `example_pdf` with PyMuPDF.

    With `markdown=True` the document is converted via `pymupdf4llm.to_markdown`;
    otherwise the plain text of every page is returned, with the pages joined by a
    form feed character.
    """
    if not markdown:
        with pymupdf.open(example_pdf) as doc:
            page_texts = [page.get_text() for page in doc]
        # "\f" is the form feed character (chr(12)) and marks the page boundaries
        return "\f".join(page_texts)

    # takes (10x) longer than the plain-text path!
    return pymupdf4llm.to_markdown(example_pdf)

def extract_with_docling():
    """
    Convert `example_pdf` with Docling and return the resulting document as Markdown.
    """
    converter = DocumentConverter()
    document = converter.convert(example_pdf).document
    # exporters noted for this document object: export_to_markdown() | export_to_html()
    return document.export_to_markdown()

def extract_with_marker(use_llm=False):
    """
    Convert `example_pdf` to Markdown with Marker.

    Args:
        use_llm: If True, the LLM-assisted configuration is used (Ollama service on
            localhost with the `gemma3:27b` model). If False (default), the plain
            configuration without an LLM is used, which is what this function always
            did before the parameter existed.

    Returns:
        The Markdown text obtained from `text_from_rendered`.
    """
    if use_llm:
        config = {
            "output_format": "markdown",  # [markdown|json|html]
            "llm_service": "marker.services.ollama.OllamaService",
            "use_llm": True,
            "ollama_base_url": "http://localhost:11434",  # ollama-default: "http://localhost:11434"
            "ollama_model": "gemma3:27b",
            "TableConverter_use_llm": True,
            # + prompts!
        }
    else:
        config = {"output_format": "markdown"}

    config_parser = ConfigParser(config)

    converter = PdfConverter(
        config=config_parser.generate_config_dict(),
        artifact_dict=create_model_dict(),
        renderer=config_parser.get_renderer(),
        processor_list=config_parser.get_processors(),
        llm_service=config_parser.get_llm_service(),
    )

    conv_result = converter(example_pdf)
    # text_from_rendered yields three values; only the first (the text) is needed here
    markdown_text, _, _ = text_from_rendered(conv_result)
    return markdown_text


pymu_result = extract_with_pymupdf()
doc_result = extract_with_docling()
mrk_result = extract_with_marker()

clear_output() # clear logs from libraries

# Print one section per tool: title, thin rule, extracted text.
# Sections are separated from each other by three thick rules.
thin_rule = '-----------------------------------------------------------------'
thick_rule = '================================================================='
sections = (
    ('PyMuPDF', pymu_result),
    ('Docling', doc_result),
    ('Marker', mrk_result),
)
for position, (title, extracted) in enumerate(sections):
    if position > 0:
        print('\n'.join([thick_rule] * 3))
    print(title)
    print(thin_rule)
    print(extracted)

## Extract text from images using OCR

While libraries like Docling also offer an OCR option, better results have been achieved with Paddle-based tooling. If `use_structure=False` is set, a basic algorithm is used to create some rough structure.

⚠️ Paddle-based models may not work in the notebook. There has been a dependency issue between the ipy-packages (ipykernel, ipywidgets) and paddlepaddle. Using the code in an external Python file worked though, after running the following commands to install the dependencies needed for PaddleOCR:
```
pip install paddleocr[all]
pip install paddlepaddle-gpu==3.2.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/  # for an RTX 4090 with Cuda 12.7
```

In [None]:
# follow the installation guide: https://www.paddleocr.ai/main/en/version3.x/installation.html

class TextElement:
    """
    A recognized piece of text together with the four corner points of its bounding box,
    as obtained by performing OCR using PP-OCRv5.
    """
    def __init__(self, text, top_left, bot_left, top_right, bot_right):
        self.text = text
        # every corner is a tuple: index 0 = x-coordinate, index 1 = y-coordinate
        self.top_left, self.bot_left = top_left, bot_left
        self.top_right, self.bot_right = top_right, bot_right

    def __repr__(self):
        details = f"'{self.text}', with top left corner of bounding box at {self.top_left}"
        return f"TextElement({details})"


def perform_OCR(use_structure=False):
    """
    Run OCR on `example_img` and print the result; nothing is returned.

    With `use_structure=True` the PP-StructureV3 pipeline is run and every result object
    prints itself. Otherwise PaddleOCR is run and the recognized text elements of the first
    result are stitched into one string: consecutive elements are joined by a tab when their
    bounding boxes overlap vertically by more than 75 % (treated as the same line) and by a
    newline otherwise. If the OCR call yields no result at all, an empty string is printed.
    """
    if use_structure:
        pipeline = PPStructureV3(text_recognition_model_name="en_PP-OCRv4_mobile_rec",
                                device="gpu",
                                use_doc_orientation_classify=True,
                                use_doc_unwarping=False,
                                use_textline_orientation=False,
                                use_seal_recognition=True,
                                use_table_recognition=True,
                                use_formula_recognition=False,
                                use_chart_recognition=False,
                                use_region_detection=False)
        output = pipeline.predict(example_img)
        clear_output()

        for res in output:
            # res.save_to_markdown | json ()
            res.print()
        return

    ocr = PaddleOCR(lang='en', device='gpu')
    ocr_results = ocr.predict(example_img)

    text_elements: List[TextElement] = []
    if ocr_results:  # guard against an empty result list instead of failing on [0]
        ocr_item = ocr_results[0]
        # for each recognized text element (each "box" in the visual output of the OCR result)
        # extract text and coordinates; polygons and texts are parallel sequences
        for box_coords, text in zip(ocr_item["rec_polys"], ocr_item["rec_texts"]):
            if not text.strip():  # skip empty text
                continue
            # polygon points are used as: 0 = top left, 1 = top right, 2 = bottom right, 3 = bottom left
            text_elements.append(TextElement(
                text,
                tuple(box_coords[0]),
                tuple(box_coords[3]),
                tuple(box_coords[1]),
                tuple(box_coords[2]),
            ))

    # post-process to create formatted string
    parts = []
    for element, next_element in zip(text_elements, text_elements[1:]):
        parts.append(element.text.strip())
        # check if next text is on similar height, if not -> \n
        # if needed, tweak ratio threshold
        if vertical_overlap_ratio(element, next_element) <= 0.75:
            parts.append('\n')
        else:
            parts.append('\t')
    if text_elements:
        # the last element gets no trailing separator
        parts.append(text_elements[-1].text.strip())
    text = "".join(parts)

    clear_output()
    print(text)
    

def vertical_overlap_ratio(e1, e2):
    """
    Return how much two bounding boxes overlap vertically, relative to the height of the
    shorter box. Returns 0 if the shorter box has no height.
    """
    def vertical_extent(element):
        # widest possible vertical span of a box: smallest top y and largest bottom y
        upper = min(element.top_left[1], element.top_right[1])
        lower = max(element.bot_left[1], element.bot_right[1])
        return upper, lower

    upper1, lower1 = vertical_extent(e1)
    upper2, lower2 = vertical_extent(e2)

    # normalize by the smaller of the two heights
    shorter_height = min(lower1 - upper1, lower2 - upper2)
    if shorter_height == 0:
        return 0

    # height of the intersection; clamped to 0 when the boxes do not overlap
    shared_height = max(0, min(lower1, lower2) - max(upper1, upper2))
    return shared_height / shorter_height

# Only the plain OCR path is run: PPStruct v3 doesn't work due to a dependency missing(?)
perform_OCR(use_structure=False)

## Extract Stammdaten

Example of how to extract relevant data with a rule-based approach from pages that always keep the exact same layout.

In [None]:
# using marker to extract the text as HTML (a PDF reader that transforms to HTML could work as well though!)

config = dict(
    output_format='html',
    page_range='0',  # TODO: replace with Stammdaten page if used
)
config_parser = ConfigParser(config)

converter = PdfConverter(
    config=config_parser.generate_config_dict(),
    artifact_dict=create_model_dict(),
    renderer=config_parser.get_renderer(),
    processor_list=config_parser.get_processors(),
    llm_service=config_parser.get_llm_service(),
)

base = converter(example_pdf)
html_txt, _, _ = text_from_rendered(base)

## use bs4 to clean up the HTML and build a dictionary from it
soup = BeautifulSoup(html_txt, 'html.parser')

# turn every <br/> tag into a single space
for line_break in soup.find_all('br'):
    line_break.replace_with(' ')

# collapse line breaks / repeated whitespace inside <th> tags
for header_cell in soup.find_all("th"):
    words = header_cell.get_text(strip=True, separator=" ").split()
    header_cell.string = " ".join(words)

# build the dictionary: within each table row the <th> cells are read pairwise as
# (key, value); a trailing cell without a partner is ignored
stammdaten = {}
for row in soup.find_all('tr'):
    cells = row.find_all('th')
    for key_cell, value_cell in zip(cells[0::2], cells[1::2]):
        key = key_cell.get_text(strip=True)
        if key:  # avoid empty keys
            stammdaten[key] = value_cell.get_text(strip=True)

# remove output of marker
clear_output()

# print(stammdaten)
print(f"Name: {stammdaten['Vorname']} {stammdaten['Nachname']}")  # example output

## Extract Kursübersicht

Example of how to extract relevant data with an LLM approach from pages that differ in their layout.

In [None]:
# Note:
# Using PP-StructureV3 for now as it usually gives better results.
# Using the Markdown output as the HTML output remains empty if there are no tables.
# Using OCR in any case, as the tables we got here are usually photos of documents.

# Same image as in the setup cell. The path uses the '../data' root like the other data
# paths of this notebook (it previously pointed to './data', which is a different directory).
example_img = '../data/imgs_labeled/color/2_9.jpg'
pipeline = PPStructureV3(text_recognition_model_name="en_PP-OCRv4_mobile_rec",
                        device="gpu",
                        use_doc_orientation_classify=True,
                        use_doc_unwarping=True,
                        use_textline_orientation=False,
                        use_seal_recognition=True,
                        use_table_recognition=True,
                        use_formula_recognition=False,
                        use_chart_recognition=False,
                        use_region_detection=True
                        )

output = pipeline.predict(example_img)

# only the first result is used; the text is an HTML and Markdown hybrid
html = output[0].markdown['markdown_texts']

# define output format
# The JSON schema of `Courses` is what gets passed as `format` to the chat call below.
class Course(BaseModel):
    # one extracted course; the academic field is restricted to the two listed values
    academic_field: Literal["Computer Science", "Mathematics"]
    course_name: str
    awarded_credits: float
class Courses(RootModel[List[Course]]):
    # root model: the expected answer is a plain list of Course entries
    pass

# Prompt for the model: the OCR text of the page is embedded at the end.
prompt = f"""
                You are given an HTML representation of a scanned page. 
                1. Determine if it is an academic record or transcript showing a single student's grades and/or earned credits. 
                2. If yes, identify all courses whose subject matter falls under the academic fields of "Computer Science" or "Mathematics" — this includes courses that are clearly related even if the document does not use those exact headings. Use your judgment to classify them based on their titles, descriptions, or course codes. 
                3. If it is not an academic record or transcript, return no results.
                Here is the HTML: 
                {html}
            """

# single user message; the answer is constrained to the JSON schema of `Courses`
response: ChatResponse = chat(
    model='gemma3:27b',  # TODO: try qwen 3
    messages=[{'role': 'user', 'content': prompt}],
    format=Courses.model_json_schema()
)

# the message content is a JSON string; show it as a table
answer_json = response['message']['content']
print(pd.read_json(StringIO(answer_json)))