## 1. Installation

In [1]:
#!pip install pymupdf

In [2]:
import fitz  # PyMuPDF


## 2. Opening a PDF

In [3]:
doc = fitz.open("Unit-5.pdf")
print(len(doc))  # number of pages


14


## 3. Saving a PDF

In [4]:
doc.save("output.pdf")

In [5]:
## Optimized save (smaller file):
doc.save("output_optimized.pdf", garbage=4, deflate=True)


## 4. Working with Pages

In [6]:
## Access page:
page = doc[0]  # first page

In [7]:
page

page 0 of Unit-5.pdf

In [8]:
## Loop pages:
for page in doc:
    print(page.number)

0
1
2
3
4
5
6
7
8
9
10
11
12
13


## 5. Text Extraction

In [9]:
## Simple Text
text = page.get_text()
print(text)

88
Writing Skills
Check Your Progress 3
Carbon dioxide levels (and those of other green house gases, notably carbon monoxide,
methane and chlorofluorocarbons (CFCs) are rising at an alarming rate unprecedented in
the recent earth history and their increased presence will logically favour an increasingly
warmer surface environment.
While in the 1990s, carbon dioxide raised the Earth's surface temperature by 0.08oC, by the
2020s, the temperature would rise up to 0.12oC. Similarly the increased presence of CFCs
has undisputed effects on the ozone layer and the surface temperature would rise to 0.20oC.
Check Your Progress 4
The industrial waste and domestic sewage is pretreated together. The first step is the
mechanical screening of solids and its separation into solid waste and effluents. The solids
are further separated for primary and secondary sludge. The effluents or soluble fraction is
then aerated for aerobic digestion by microbes, which is once again screened for solids.
This efflu

In [10]:
## Structured Text (VERY IMPORTANT)
text_dict = page.get_text("dict")

In [11]:
text_dict

{'width': 595.0,
 'height': 842.0,
 'blocks': [{'number': 0,
   'type': 0,
   'bbox': (42.529998779296875,
    780.4600219726562,
    52.48999786376953,
    791.7999877929688),
   'lines': [{'spans': [{'size': 10.0,
       'flags': 4,
       'bidi': 0,
       'char_flags': 16,
       'font': 'Times-Roman',
       'color': 2367264,
       'alpha': 255,
       'ascender': 0.9110000133514404,
       'descender': -0.22300000488758087,
       'text': '88',
       'origin': (42.529998779296875, 789.5700073242188),
       'bbox': (42.529998779296875,
        780.4600219726562,
        52.48999786376953,
        791.7999877929688)}],
     'wmode': 0,
     'dir': (1.0, 0.0),
     'bbox': (42.529998779296875,
      780.4600219726562,
      52.48999786376953,
      791.7999877929688)}]},
  {'number': 1,
   'type': 0,
   'bbox': (42.529998779296875,
    50.85997009277344,
    281.81500244140625,
    62.19997024536133),
   'lines': [{'spans': [{'size': 8.0,
       'flags': 20,
       'bidi': 0,
   

text_dict contains:
1. blocks
2. lines
3. spans
4. fonts
5. positions

In [12]:
for block in text_dict["blocks"]:
    for line in block.get("lines", []):
        for span in line.get("spans", []):
            print(span["text"], span["size"], span["font"])

88 10.0 Times-Roman
Writing Skills 8.0 Times-Bold
Check Your Progress 3 10.0 Times-Bold
Carbon dioxide levels (and those of other green house gases, notably carbon monoxide, 10.0 Times-Roman
methane and chlorofluorocarbons (CFCs) are rising at an alarming rate unprecedented in 10.0 Times-Roman
the recent earth history and their increased presence will logically favour an increasingly 10.0 Times-Roman
warmer surface environment. 10.0 Times-Roman
While in the 1990s, carbon dioxide raised the Earth's surface temperature by 0.08 10.0 Times-Roman
o 5.800000190734863 Times-Roman
C, by the 10.0 Times-Roman
2020s, the temperature would rise up to 0.12 10.0 Times-Roman
o 5.800000190734863 Times-Roman
C. Similarly the increased presence of CFCs 10.0 Times-Roman
has undisputed effects on the ozone layer and the surface temperature would rise to 0.20 10.0 Times-Roman
o 5.800000190734863 Times-Roman
C. 10.0 Times-Roman
Check Your Progress 4 10.0 Times-Bold
The industrial waste and domestic sewage i

In [13]:
for block in text_dict["blocks"]:
    for line in block.get("lines", []):
        print(line)

{'spans': [{'size': 10.0, 'flags': 4, 'bidi': 0, 'char_flags': 16, 'font': 'Times-Roman', 'color': 2367264, 'alpha': 255, 'ascender': 0.9110000133514404, 'descender': -0.22300000488758087, 'text': '88', 'origin': (42.529998779296875, 789.5700073242188), 'bbox': (42.529998779296875, 780.4600219726562, 52.48999786376953, 791.7999877929688)}], 'wmode': 0, 'dir': (1.0, 0.0), 'bbox': (42.529998779296875, 780.4600219726562, 52.48999786376953, 791.7999877929688)}
{'spans': [{'size': 8.0, 'flags': 20, 'bidi': 0, 'char_flags': 24, 'font': 'Times-Bold', 'color': 2367264, 'alpha': 255, 'ascender': 0.9110000133514404, 'descender': -0.22300000488758087, 'text': 'Writing Skills', 'origin': (42.529998779296875, 59.010009765625), 'bbox': (42.529998779296875, 51.722007751464844, 100.2555923461914, 60.794010162353516)}], 'wmode': 0, 'dir': (1.0, 0.0), 'bbox': (42.529998779296875, 51.722007751464844, 100.2555923461914, 60.794010162353516)}
{'spans': [{'size': 10.0, 'flags': 20, 'bidi': 0, 'char_flags': 2

In [15]:
## Words with Coordinates (Best for ML / NLP)
words = page.get_text("words")

In [16]:
words

[(42.529998779296875,
  780.4600219726562,
  52.48999786376953,
  791.7999877929688,
  '88',
  0,
  0,
  0),
 (42.529998779296875,
  51.722007751464844,
  73.04999542236328,
  60.794010162353516,
  'Writing',
  1,
  0,
  0),
 (78.735595703125,
  51.722007751464844,
  100.2555923461914,
  60.794010162353516,
  'Skills',
  1,
  0,
  1),
 (184.25,
  50.85997009277344,
  211.697998046875,
  62.19997024536133,
  'Check',
  1,
  1,
  0),
 (213.79800415039062,
  50.85997009277344,
  235.030029296875,
  62.19997024536133,
  'Your',
  1,
  1,
  1),
 (237.19801330566406,
  50.85997009277344,
  274.4010314941406,
  62.19997024536133,
  'Progress',
  1,
  1,
  2),
 (276.81500244140625,
  50.85997009277344,
  281.81500244140625,
  62.19997024536133,
  '3',
  1,
  1,
  3),
 (184.25,
  71.37998962402344,
  213.96499633789062,
  82.7199935913086,
  'Carbon',
  2,
  0,
  0),
 (216.47097778320312,
  71.37998962402344,
  246.80099487304688,
  82.7199935913086,
  'dioxide',
  2,
  0,
  1),
 (249.306976318

In [17]:
## Each word:
## (x0, y0, x1, y1, "word", block_no, line_no, word_no)

In [18]:
## Example:
for w in words:
    print(w[4])  # actual word


88
Writing
Skills
Check
Your
Progress
3
Carbon
dioxide
levels
(and
those
of
other
green
house
gases,
notably
carbon
monoxide,
methane
and
chlorofluorocarbons
(CFCs)
are
rising
at
an
alarming
rate
unprecedented
in
the
recent
earth
history
and
their
increased
presence
will
logically
favour
an
increasingly
warmer
surface
environment.
While
in
the
1990s,
carbon
dioxide
raised
the
Earth's
surface
temperature
by
0.08oC,
by
the
2020s,
the
temperature
would
rise
up
to
0.12oC.
Similarly
the
increased
presence
of
CFCs
has
undisputed
effects
on
the
ozone
layer
and
the
surface
temperature
would
rise
to
0.20oC.
Check
Your
Progress
4
The
industrial
waste
and
domestic
sewage
is
pretreated
together.
The
first
step
is
the
mechanical
screening
of
solids
and
its
separation
into
solid
waste
and
effluents.
The
solids
are
further
separated
for
primary
and
secondary
sludge.
The
effluents
or
soluble
fraction
is
then
aerated
for
aerobic
digestion
by
microbes,
which
is
once
again
screened
for
solids.
This
efflu

In [20]:
words[0]

(42.529998779296875,
 780.4600219726562,
 52.48999786376953,
 791.7999877929688,
 '88',
 0,
 0,
 0)

## 6. Extract Text from Specific Region

In [21]:
rect = fitz.Rect(0, 0, 300, 200)  # area
text = page.get_text("text", clip=rect)

In [22]:
text

'Writing Skills\nCheck Your Progress 3\nCarbon dioxide levels (and th\nmethane and chlorofluorocar\nthe recent earth history and th\nwarmer surface environment.\nWhile in the 1990s, carbon di\n2020s, the temperature would\nhas undisputed effects on the\nCheck Your Progress 4\nThe industrial waste and dom\n'

## 7. Images

In [49]:
## Extract Image
doc = fitz.open("Get_Started_With_Smallpdf.pdf")
print(len(doc))

1


In [37]:
page = doc[0]

In [38]:
images = page.get_images(full=True)

for img in images:
    xref = img[0]
    base_image = doc.extract_image(xref)
    image_bytes = base_image["image"]

    with open("image.png", "wb") as f:
        f.write(image_bytes)


In [39]:
images

[]

In [40]:
images = page.get_drawings()

In [41]:
images

[{'items': [('l',
    Point(59.35049819946289, 733.756103515625),
    Point(59.35049819946289, 717.9581298828125)),
   ('l',
    Point(59.35049819946289, 717.9581298828125),
    Point(30.99949836730957, 717.9581298828125)),
   ('l',
    Point(30.99949836730957, 717.9581298828125),
    Point(30.99949836730957, 707.4600830078125)),
   ('l',
    Point(30.99949836730957, 707.4600830078125),
    Point(54.91749954223633, 667.756103515625)),
   ('l',
    Point(54.91749954223633, 667.756103515625),
    Point(73.47449493408203, 667.756103515625)),
   ('l',
    Point(73.47449493408203, 667.756103515625),
    Point(73.47449493408203, 706.3170776367188)),
   ('l',
    Point(73.47449493408203, 706.3170776367188),
    Point(80.99949645996094, 706.3170776367188)),
   ('l',
    Point(80.99949645996094, 706.3170776367188),
    Point(80.99949645996094, 717.9581298828125)),
   ('l',
    Point(80.99949645996094, 717.9581298828125),
    Point(73.47449493408203, 717.9581298828125)),
   ('l',
    Point(73.47

In [48]:
## Insert Image
rect = fitz.Rect(50, 50, 200, 200)
page.insert_image(rect, filename="logo.png")
output_filename = "logo-document.pdf"
doc.save(output_filename)
doc.close()


## 8. Tables

In [53]:
doc = fitz.open("Unit-5.pdf")
page = doc[3]  # Process the third page

# Detect tables on the page using table finder
tables = page.find_tables()

In [56]:
if not tables.tables:
    print("No tables found on this page.")
else:
    for index, table in enumerate(tables):
        print(f"\nTable {index+1} found:")

        # Convert the table to Markdown text
        md_table = table.to_markdown()
        print("\nMarkdown representation:")
        print(md_table)

        # Convert the table to a pandas DataFrame
        df_table = table.to_pandas()
        print("\nPandas DataFrame:")
        print(df_table)



Table 1 found:

Markdown representation:
|Age range|Body<br>Wt. Kg.|Energy<br>Kcal|MJ|Protein<br>g|Calcium<br>mg|Iron<br>mg|
|---|---|---|---|---|---|---|
|**Boys and**<br>**Girls**<br>0 up to 1 year<br>2 up to 4 years<br>4 up to 7 years<br>**Boys**<br>9 up to 12 years<br>2 up to 15 years<br>15 up to 18 years<br>**Girls**<br>9 up to 12 years<br>12 up to 15 years<br>15 up to 18 years|7.3<br>13.5<br>20.5<br>31.9<br>45.5<br>61.0<br>33.0<br>48.6<br>56.1|800<br>1400<br>1800<br>2500<br>2800<br>3000<br>2300<br>2300<br>2300|3.3<br>5.9<br>7.5<br>10.5<br>11.7<br>12.6<br>9.6<br>9.6<br>9.6|20<br>35<br>45<br>63<br>70<br>75<br>58<br>58<br>58|600<br>500<br>500<br>700<br>700<br>600<br>700<br>700<br>600|6<br>7<br>8<br>13<br>14<br>15<br>13<br>14<br>15|



Pandas DataFrame:
                                           Age range  \
0  Boys and\nGirls\n0 up to 1 year\n2 up to 4 yea...   

                                       Body\nWt. Kg.  \
0  7.3\n13.5\n20.5\n31.9\n45.5\n61.0\n33.0\n48.6\...   

       

## 9. Searching Text

In [57]:
text_instances = page.search_for("humans")

for inst in text_instances:
    print(inst)  # rectangle


Rect(413.83984375, 443.3800048828125, 447.2398376464844, 454.7200012207031)


In [58]:
## Highlight:
page.add_highlight_annot(inst)


'Highlight' annotation on page 3 of Unit-5.pdf

## 10. Annotations

In [61]:
## Add Highlight
rects = page.search_for("humans")

for r in rects:
    page.add_highlight_annot(r)


In [62]:
## Add Comment
annot = page.add_text_annot((100, 100), "Check this value")

## 12. Editing PDFs

In [63]:
## Insert Text
page.insert_text((50, 50), "Hello World", fontsize=12)

1

In [64]:
## Replace Text
rects = page.search_for("Old Value")

for r in rects:
    page.add_redact_annot(r)

page.apply_redactions()
page.insert_text((r.x0, r.y0), "New Value")


1

## 13. Merge PDFs

In [65]:
doc1 = fitz.open("Get_Started_With_Smallpdf.pdf")
doc2 = fitz.open("Unit-5.pdf")

doc1.insert_pdf(doc2)
doc1.save("merged.pdf")


## 14. Split PDFs

In [66]:
doc = fitz.open("merged.pdf")

for i, page in enumerate(doc):
    new_doc = fitz.open()
    new_doc.insert_pdf(doc, from_page=i, to_page=i)
    new_doc.save(f"page_{i}.pdf")


## 15. Render PDF as Image

In [67]:
pix = page.get_pixmap()
pix.save("page.png")

In [68]:
## High Resolution
pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))  # zoom 2x


In [69]:
pix

Pixmap(DeviceRGB, (0, 0, 1190, 1684), 0)

## 16. Metadata

In [70]:
print(doc.metadata)


{'format': 'PDF 1.7', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'creator': 'Adobe InDesign 15.1 (Macintosh)', 'producer': 'Adobe PDF Library 15.0', 'creationDate': "D:20201014170810+02'00'", 'modDate': "D:20201014170810+02'00'", 'trapped': '', 'encryption': None}


In [72]:
## Set MetaData
doc.set_metadata({
    "title": "My PDF",
    "author": "Ankit"
})


## 17. Draw Shapes

In [73]:
page.draw_rect(fitz.Rect(50, 50, 200, 200))
page.draw_line((50, 50), (200, 200))


Point(200.0, 200.0)

## 18. Forms

In [74]:
for widget in page.widgets():
    print(widget.field_name, widget.field_value)
