In [2]:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import HTMLConverter
# from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO
 
# Convert a PDF to HTML with pdfminer and write the result to disk.
path = "../data/pdf_ex_table.pdf"
out_path = '../data/out.html'  # the HTML output is written here

rsrcmgr = PDFResourceManager()
codec = 'utf-8'
laparams = LAParams()

password = ""
maxpages = 0  # 0 is for all pages
caching = True
pagenos = set()

# The converter writes directly into `f`, so nothing is collected in memory
# and the builtin name `str` is deliberately NOT rebound in this cell.
# The `with` block closes both files even if a page fails to convert.
with open(path, 'rb') as fp, open(out_path, 'wb') as f:
    device = HTMLConverter(rsrcmgr, f, codec=codec, laparams=laparams)  # conversion target
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,
                                      caching=caching, check_extractable=True):
            interpreter.process_page(page)
    finally:
        # close the device while `f` is still open so it can finish its output
        device.close()

In [5]:
import fitz

# Input directory for the sample PDFs used in this notebook.
DATA_ROOT = "../data/"
# Output directory (inside DATA_ROOT) where the PyMuPDF examples write their results.
OUTPUT_ROOT = DATA_ROOT + "pymupdf_ex/"

def open_doc_pymupdf(path="../data/pdf_sample.pdf"):
    """Open the file at ``path`` with ``fitz.open`` and return the result.

    Args:
        path: Location of the file to open. When omitted, the sample PDF
            ``../data/pdf_sample.pdf`` is used.

    Returns:
        Whatever ``fitz.open(path)`` returns (the opened document).
    """
    return fitz.open(path)

# Open the default sample PDF; later cells inspect this `doc` object.
doc = open_doc_pymupdf()

In [2]:
# see what kind of methods the document has
from typing import List

def print_public_method(obj:object) -> List:
    """Return the public attribute names of ``obj``.

    Despite its name, this function prints nothing: it collects every name
    reported by ``dir(obj)`` that does not start with an underscore and
    returns them as a list, in the order ``dir`` yields them.

    Args:
        obj: Any object to inspect.

    Returns:
        A list of attribute-name strings without a leading underscore.
    """
    # A single "_" prefix test also excludes "__dunder__" names.
    return [name for name in dir(obj) if not name.startswith("_")]

# Collect the public attribute names of the opened document and print them.
methods = print_public_method(doc)
print(methods)

['FontInfos', 'FormFonts', 'Graftmaps', 'InsertedImages', 'ShownPages', 'add_layer', 'add_ocg', 'authenticate', 'can_save_incrementally', 'chapter_count', 'chapter_page_count', 'close', 'convert_to_pdf', 'copy_page', 'del_toc_item', 'del_xml_metadata', 'delete_page', 'delete_pages', 'embfile_add', 'embfile_count', 'embfile_del', 'embfile_get', 'embfile_info', 'embfile_names', 'embfile_upd', 'extract_font', 'extract_image', 'ez_save', 'find_bookmark', 'fullcopy_page', 'get_char_widths', 'get_layer', 'get_layers', 'get_new_xref', 'get_oc', 'get_ocgs', 'get_ocmd', 'get_outline_xrefs', 'get_page_fonts', 'get_page_images', 'get_page_labels', 'get_page_numbers', 'get_page_pixmap', 'get_page_text', 'get_page_xobjects', 'get_sigflags', 'get_toc', 'get_xml_metadata', 'has_annots', 'has_links', 'init_doc', 'insert_file', 'insert_page', 'insert_pdf', 'internal_keep_annot', 'isEncrypted', 'is_closed', 'is_dirty', 'is_encrypted', 'is_fast_webaccess', 'is_form_pdf', 'is_pdf', 'is_reflowable', 'is_re

## Extract text from a PDF

In [3]:
import fitz

# Dump the plain text of every page of the sample PDF into one output file.
doc = open_doc_pymupdf()
# `with` closes the output file even if text extraction or a write raises.
with open(OUTPUT_ROOT + "output.txt", "wb") as out:
    # iterate over the document pages
    for page in doc:
        # get plain text; encode it because the file is opened in binary mode
        text = page.get_text().encode("utf8")
        out.write(text)
        # write page delimiter (form feed 0x0c)
        out.write(bytes((12, )))

## Extract images from a PDF

In [4]:
doc = open_doc_pymupdf()

# Visit every page together with its 0-based index; the index is used in the
# progress message and in the output file names.
for page_idx, page in enumerate(doc):
    page_images = page.get_images()

    # report only the pages on which at least one image was found
    if page_images:
        print(f"Found {len(page_images)} images on page {page_idx}")

    # images are numbered from 1 within each page
    for img_idx, image_info in enumerate(page_images, start=1):
        # the first entry of each image record is the image's XREF
        xref = image_info[0]
        # build a Pixmap from the document and that XREF
        ##[TODO] What is a Pixmap obj?
        pix = fitz.Pixmap(doc, xref)

        # more than 3 components once alpha is discounted (CMYK): convert to RGB first
        needs_rgb_conversion = pix.n - pix.alpha > 3
        if needs_rgb_conversion:
            pix = fitz.Pixmap(fitz.csRGB, pix)

        # save the image as png, named after its page index and image number
        png_name = "page_%s-image_%s.png" % (page_idx, img_idx)
        pix.save(OUTPUT_ROOT + png_name)
        # drop the reference to the pixmap once it has been written
        pix = None

# list the public attributes of the last page visited (result is not displayed)
methods = print_public_method(page)

Found 1 images on page 2


## Merging PDF files with other types of file

- To control the page order when merging (which source pages are copied and where they are inserted in the target document),
- see the PyMuPDF wiki: [Inserting Pages from other PDFs](https://github.com/pymupdf/PyMuPDF/wiki/Inserting-Pages-from-other-PDFs)

In [7]:
# Open the sample PDF (the merge target) and the PDF that will be inserted into it.
doc = open_doc_pymupdf()
doc_table = open_doc_pymupdf(DATA_ROOT+"pdf_sample_table.pdf")

# merge the docs: insert doc_table into doc, then save the result as a new file
doc.insert_file(doc_table)
doc.save(OUTPUT_ROOT+"pdf_sample_merged.pdf")

## Adding a watermark to a PDF

In [9]:
# Bounding rectangle of the page at index 1; displayed as this cell's output.
doc[1].bound()

Rect(0.0, 0.0, 612.0, 792.0)

In [11]:
doc = open_doc_pymupdf()

# Iterate over the pages directly; the page index is not needed here.
for page in doc:
    # insert an img watermark from a file name to fit the page bounds
    # page.bound() -> (x0, y0, x1, y1)
    # The PNG is the one written by the image-extraction cell, so that cell
    # must have been run first. overlay=False requests background placement.
    page.insert_image(page.bound(), filename=OUTPUT_ROOT+"page_2-image_1.png", overlay=False)

doc.save(OUTPUT_ROOT+"pdf_sample_watermarked.pdf")

## Adding an image to a PDF

In [12]:
doc = open_doc_pymupdf()

# Iterate over the pages directly; the page index is not needed here.
for page in doc:
    # insert an image from a file name at the top left of the document:
    # the target rectangle spans (0, 0) to (50, 50).
    # The PNG is the one written by the image-extraction cell, so that cell
    # must have been run first.
    page.insert_image(fitz.Rect(0, 0, 50, 50), filename=OUTPUT_ROOT+"page_2-image_1.png")

doc.save(OUTPUT_ROOT+"pdf_sample_img_inserted.pdf")

## Rotating a PDF

In [16]:
# Rotate only the first page by 180 degrees and save the result as a new file.
doc = open_doc_pymupdf()
page = doc[0]
page.set_rotation(180)
doc.save(OUTPUT_ROOT+"pdf_sample_rotated.pdf")

## Cropping a PDF

In [19]:
# Set the crop box of the first page to the rectangle (100, 100)-(400, 400)
# and save the result as a new file.
doc = open_doc_pymupdf()
page = doc[0]
page.set_cropbox(
    fitz.Rect(100,100,400,400)
)
doc.save(OUTPUT_ROOT+"pdf_sample_cropped.pdf")

## Attaching Files

In [34]:
# Attach another PDF to the first page of the sample as a file annotation.
doc = open_doc_pymupdf()
page = doc[0]
# create the point where you want to add the attachment
point = fitz.Point(100, 100)

# open the file that will be attached
attachment = open_doc_pymupdf(DATA_ROOT+"pdf_sample_table.pdf")
# get the document byte data as a buffer
attachment_data = attachment.tobytes()

# add the file annotation with the point, data and the file name
file_annotation = page.add_file_annot(
    point,
    attachment_data,
    # Note: the third parameter (the file name) should
    # include the actual file extension.
    # By default the annotation is shown with the "PushPin" icon;
    # pass the `icon` parameter to choose a different one.
    "attachment.pdf"
)
doc.save(OUTPUT_ROOT+"pdf_sample_attached.pdf")

In [35]:
# Embed another PDF in the sample document (document-level embedded file).
doc = open_doc_pymupdf()
embedded_doc = open_doc_pymupdf(DATA_ROOT+"pdf_sample_table.pdf")
# get the document byte data as a buffer
embedded_data = embedded_doc.tobytes()

# embed with the file name and the data
# As with the file name given to add_file_annot, the name (here the first
# parameter) should include the actual file extension.
doc.embfile_add("embedded_file.pdf", embedded_data)

doc.save(OUTPUT_ROOT+"pdf_sample_embedded.pdf")

## Delete Pages