[Reference](https://medium.com/@alice.yang_10652/extract-9-different-types-of-data-from-pdfs-with-python-c7b60cfbb232)

# Step 1: Install Spire.PDF

In [1]:
pip install Spire.PDF

Collecting Spire.PDF
  Downloading spire_pdf-11.6.1-py3-none-manylinux_2_31_x86_64.whl.metadata (7.0 kB)
Collecting plum-dispatch==1.7.4 (from Spire.PDF)
  Downloading plum_dispatch-1.7.4-py3-none-any.whl.metadata (1.8 kB)
Downloading spire_pdf-11.6.1-py3-none-manylinux_2_31_x86_64.whl (41.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.2/41.2 MB[0m [31m21.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading plum_dispatch-1.7.4-py3-none-any.whl (24 kB)
Installing collected packages: plum-dispatch, Spire.PDF
Successfully installed Spire.PDF-11.6.1 plum-dispatch-1.7.4


# Step 2: Import Required Module

In [2]:
from spire.pdf import *

# Extract Different Types of Data from PDFs in Python

## 1. Extract Text

In [4]:
from spire.pdf import *

# Load a PDF document
doc = PdfDocument()

try:
    doc.LoadFromFile('text_sample.pdf')

    # Create a PdfTextExtractOptions object and request all text on the page
    options = PdfTextExtractOptions()
    options.IsExtractAllText = True

    # Extract text from each page; every page's text is followed by a newline.
    # Collecting the pieces and joining once avoids repeated string
    # concatenation inside the loop.
    page_texts = [
        PdfTextExtractor(doc.Pages[i]).ExtractText(options) + "\n"
        for i in range(doc.Pages.Count)
    ]
    text = "".join(page_texts)
finally:
    # Release the document even if loading or extraction fails, matching the
    # other examples in this notebook that close their documents.
    doc.Close()

# Save the extracted text to a text file (the 'output' directory must exist)
with open('output/ExtractedText.txt', 'w', encoding='utf-8') as f:
    f.write(text)

## 2. Extract Table

In [5]:
from spire.pdf import *

# Open the sample document that holds the tables
doc = PdfDocument()
doc.LoadFromFile("table_sample.pdf")

# A single extractor is reused for every page of the document
table_extractor = PdfTableExtractor(doc)

# Accumulates the report, one entry per output line
output = []

for page_index in range(doc.Pages.Count):
    page_tables = table_extractor.ExtractTable(page_index)

    # Pages without any detected table contribute nothing to the report
    if not page_tables:
        continue

    output.append(f"=== Page {page_index + 1} ===")
    for table_number, table in enumerate(page_tables, start=1):
        output.append(f"--- Table {table_number} ---")
        n_rows = table.GetRowCount()
        n_cols = table.GetColumnCount()

        # One tab-separated line per table row
        output.extend(
            "\t".join(table.GetText(r, c) for c in range(n_cols))
            for r in range(n_rows)
        )

        # Blank separator line after each table
        output.append("")

# Write the whole report in one go
report = "\n".join(output)
with open("output/ExtractedTables.txt", "w", encoding="utf-8") as file:
    file.write(report)

doc.Close()

## 3. Extract Metadata

In [6]:
from spire.pdf import *

# Open the document whose built-in properties are to be shown
doc = PdfDocument()
doc.LoadFromFile("AddBuiltinProperties.pdf")

# Print the built-in properties in a fixed order: (label, attribute name)
info = doc.DocumentInformation
for label, attribute in (
    ("Title:", "Title"),
    ("Author:", "Author"),
    ("Subject:", "Subject"),
    ("Keywords:", "Keywords"),
):
    print(label, getattr(info, attribute))

doc.Close()

## 4. Extract Bookmarks (Outlines)

In [7]:
from spire.pdf import *

def extract_bookmarks(bookmark_collection, level=0):
    """Recursively print bookmark titles and their page numbers.

    Each bookmark is printed on its own line as ``Title: <title>, Page: <n>``,
    indented by two spaces per nesting level. Children are printed directly
    after their parent (depth-first).

    Args:
        bookmark_collection: An object exposing ``Count`` and integer indexing.
            Each item must provide ``Title``, ``Destination`` and
            ``ConvertToBookmarkCollection()``. ``None`` is treated as an empty
            collection.
        level: Current nesting depth; controls the indentation of the output.
    """
    # A document without bookmarks may hand us nothing to iterate.
    if bookmark_collection is None:
        return

    indent = "  " * level

    # Loop through each bookmark in the current collection
    for i in range(bookmark_collection.Count):
        bookmark = bookmark_collection[i]

        # Not every bookmark has a destination; without this guard a single
        # such bookmark would abort the whole listing with an AttributeError.
        destination = bookmark.Destination
        if destination is not None:
            # +1 as in the original code, which treats PageNumber as zero-based.
            page = destination.PageNumber + 1
        else:
            page = "N/A"

        print(f"{indent}Title: {bookmark.Title}, Page: {page}")

        # Convert the current bookmark to a collection of nested bookmarks, if any
        child_bookmarks = bookmark.ConvertToBookmarkCollection()

        # Recurse only when there is at least one child
        if child_bookmarks is not None and child_bookmarks.Count > 0:
            extract_bookmarks(child_bookmarks, level + 1)

# Open the sample document that contains nested bookmarks
pdf = PdfDocument()
pdf.LoadFromFile("AddNestedBookmark.pdf")

# Walk the outline tree, starting from the document's top-level bookmarks
root_bookmarks = pdf.Bookmarks
extract_bookmarks(root_bookmarks)

# Release the document once the listing has been printed
pdf.Close()

## 5. Extract Attachments

In [8]:
from spire.pdf import *

import os

# Load PDF document
doc = PdfDocument()
doc.LoadFromFile("attachment_example.pdf")

# Make sure the destination directory exists before saving anything into it
output_dir = "Output"
os.makedirs(output_dir, exist_ok=True)

# Extract attachments
for i in range(doc.Attachments.Count):
    attachment = doc.Attachments[i]

    # The file name is read from the PDF itself, so it is untrusted input:
    # keep only the final path component so a name such as "../../x" or
    # "C:\\x" cannot write outside the output directory.
    raw_name = str(attachment.FileName or "")
    safe_name = os.path.basename(raw_name.replace("\\", "/"))
    if not safe_name:
        # Fall back to a positional name when nothing usable remains
        safe_name = f"attachment-{i}"

    attachment.Data.Save(os.path.join(output_dir, safe_name))

doc.Close()

## 6. Extract Images

In [9]:
from spire.pdf import *
import os

# Load PDF document
doc = PdfDocument()
doc.LoadFromFile("example.pdf")

# Create a PdfImageHelper object
helper = PdfImageHelper()
output_dir = "Images"

# Make sure the destination directory exists before saving anything into it
os.makedirs(output_dir, exist_ok=True)

# Running counter across the whole document. Naming files by page index alone
# made every image on the same page overwrite the previous one.
image_index = 0

# Extract images
for i in range(doc.Pages.Count):
    for info in helper.GetImagesInfo(doc.Pages[i]):
        info.Image.Save(os.path.join(output_dir, f"Image-{image_index}.png"))
        image_index += 1

doc.Close()

## 7. Extract Form Field Values

In [10]:
from spire.pdf import *

# Load PDF document
doc = PdfDocument()
doc.LoadFromFile("FormFields.pdf")

# Get the forms from the document
form = doc.Form
formWidget = PdfFormWidget(form)

# Output fragments; each already carries its own line ending.
# The last line of every field record ends with "\r\n".
content = []

# Extract form data, dispatching on the concrete widget type of each field
for i in range(formWidget.FieldsWidget.Count):
    field = formWidget.FieldsWidget[i]

    if isinstance(field, PdfTextBoxFieldWidget):
        content.append(f"Textbox Name: {field.Name}\n")
        content.append(f"Textbox Value: {field.Text}\r\n")

    elif isinstance(field, PdfListBoxWidgetFieldWidget):
        content.append(f"Listbox Name: {field.Name}\nListbox Items:\n")
        for j in range(field.Values.Count):
            content.append(f"{field.Values.get_Item(j).Value}\n")
        content.append(f"Listbox Selected Item: {field.SelectedValue}\r\n")

    elif isinstance(field, PdfComboBoxWidgetFieldWidget):
        content.append(f"Combobox Name: {field.Name}\nCombobox Items:\n")
        for j in range(field.Values.Count):
            content.append(f"{field.Values.get_Item(j).Value}\n")
        content.append(f"Combobox Selected Item: {field.SelectedValue}\r\n")

    elif isinstance(field, PdfRadioButtonListFieldWidget):
        content.append(f"Radio Button Name: {field.Name}\n")
        content.append(f"Radio Button Selected Item: {field.SelectedValue}\r\n")

    elif isinstance(field, PdfCheckBoxWidgetFieldWidget):
        content.append(f"Checkbox Name: {field.Name}\n")
        # Both states must end the record the same way; previously only the
        # unchecked state used the "\r\n" terminator shared by all other records.
        status = "Checked" if field.Checked else "Unchecked"
        content.append(f"Checkbox Status: {status}\r\n")

# Write all collected fragments (the 'output' directory must exist)
with open("output/GetFormValues.txt", "w", encoding="UTF-8") as f:
    f.writelines(content)

doc.Dispose()

## 8. Extract Hyperlinks

In [11]:
from spire.pdf import *

# Open the document to scan for links
doc = PdfDocument()
doc.LoadFromFile("hyperlink_sample.pdf")

# Collected link targets, in page order and then annotation order
urls = []

for page_index in range(doc.Pages.Count):
    page_annotations = doc.Pages[page_index].AnnotationsWidget

    # Nothing to inspect on pages that carry no annotations
    if page_annotations is None or page_annotations.Count <= 0:
        continue

    for annotation_index in range(page_annotations.Count):
        candidate = page_annotations.get_Item(annotation_index)

        # Two annotation types carry a link; each exposes it under a
        # different attribute name
        if isinstance(candidate, PdfTextWebLinkAnnotationWidget):
            urls.append(candidate.Url)
        elif isinstance(candidate, PdfUriAnnotationWidget):
            urls.append(candidate.Uri)

# Write one link per line
with open("Output/ExtractHyperlinks.txt", "w", encoding="utf-8") as file:
    file.writelines(link + "\n" for link in urls)

doc.Close()

## 9. Extract Annotations

In [12]:
from spire.pdf.common import *
from spire.pdf import *

# Open the document whose annotations are to be listed
pdf = PdfDocument()
pdf.LoadFromFile("Sample.pdf")

# Report lines, joined with newlines when written out
lines = []

for page_index in range(pdf.Pages.Count):
    lines.append(f"Page {page_index + 1}:")

    annotations = pdf.Pages[page_index].AnnotationsWidget

    if not annotations.Count:
        lines.append("No annotations found.")
    else:
        for i in range(annotations.Count):
            note = annotations.get_Item(i)
            note_text = note.Text.strip()
            modified = note.ModifiedDate.ToString()

            # Report the annotation unless it has neither text nor a real
            # modification date ("0001/1/1 0:00:00" is presumably the
            # library's string for an unset date; confirm against the docs)
            if note_text or modified != "0001/1/1 0:00:00":
                lines.extend(
                    [
                        "Annotation information:",
                        f"Text: {note_text or 'N/A'}",
                        f"ModifiedDate: {modified}",
                    ]
                )

    # Blank line between pages
    lines.append("")

report = "\n".join(lines)
with open("output/ExtractAnnotations.txt", "w", encoding="utf-8") as file:
    file.write(report)

pdf.Close()