## PDFplumber Practice Questions (Easy–Medium)

1. Extract all text from the first page using pdfplumber.

In [3]:
import pdfplumber 
# Open the sample PDF for the duration of this block only.
with pdfplumber.open("files/simple.pdf") as pdf:
    # pdf.pages is indexed from 0, so index 0 is the first page.
    first_page = pdf.pages[0]
    text = first_page.extract_text()
    print(text)

This is a simple PDF file.
It contains just a few lines of text.
Generated using Python and ReportLab.


2. Extract text from all pages in your PDF.

In [5]:
import pdfplumber 
# Walk every page of the PDF and print its 1-based number and its text.
with pdfplumber.open("files/py_modules_test.pdf") as pdf :
    for number, page in enumerate(pdf.pages, start = 1) :
        print("Page number :", number)
        # Print the label and the text separately: passing both to a single
        # print() puts the default " " separator right after the "\n", which
        # prefixed the first extracted line of each page with a stray space.
        print("Content:")
        print(page.extract_text())
        # Blank line between pages for readability.
        print()

Page number : 1
Content:
 INTRODUCTION
This PDF is created to test PyPDF2 built-in features.
MODULES OVERVIEW
PYTHON HAS MANY BUILT-IN MODULES LIKE OS, SYS, MATH, RANDOM.

Page number : 2
Content:
 TEXT EXTRACTION TEST
This page contains normal readable text.
UPPERCASE HEADING SAMPLE
FILE HANDLING AND DATA PROCESSING

Page number : 3
Content:
 ENCRYPTION AND MERGING TEST
This page helps test merging, splitting, and encryption.



3. Count the total number of pages in your PDF.

In [8]:
import pdfplumber as pp
# The page count is simply the length of the PDF's page list.
with pp.open("files/py_modules_test.pdf") as pdf:
    page_count = len(pdf.pages)
    print("Number of pages :", page_count)

Number of pages : 3


4. Detect whether your PDF is scanned (no extractable text).

In [9]:
import pdfplumber as pp 
# A PDF is reported as scanned when no page yields any extractable text.
has_text = False
with pp.open("files/scanned_test_pdf.pdf") as pdf:
    for page in pdf.pages:
        page_text = page.extract_text()
        if not page_text:
            # Nothing extractable on this page; check the next one.
            continue
        # Show whatever text was found and remember that text exists.
        print(page_text)
        has_text = True

scanned = not has_text
if scanned:
    print("PDF is scanned(no extractable text).")

PDF is scanned(no extractable text).


5. Extract the first table on page 1 of your PDF.

In [13]:
import pdfplumber as pp 
# Print only the FIRST table found on page 1, one row per line.
with pp.open("files/pdfplumber_tables_test.pdf") as pdf :
    # extract_tables() yields every table on the page; each table is a
    # list of rows and each row is a list of cell values.
    tables1 = pdf.pages[0].extract_tables()
    if tables1 :
        # The task asks for the first table only, so take index 0 rather
        # than printing the rows of every table on the page back to back.
        for row in tables1[0] :
            print(row)
    else :
        # Same message the other table cells use when nothing is detected.
        print("No table found.")

['Name', 'Subject', 'Marks']
['Aadi', 'Maths', '88']
['Bharat', 'Physics', '76']
['Charan', 'Chemistry', '92']
['Divya', 'English', '81']


6. Extract all tables from your entire PDF.

In [21]:
import pdfplumber as pp
# Gather the tables of every page into one flat list, in page order.
with pp.open("files/pdfplumber_tables_test.pdf") as pdf:
    tables = [
        page_table
        for page in pdf.pages
        for page_table in page.extract_tables()
    ]

if not tables:
    print("No table found.")
else:
    # Number the tables from 1 and print each one row by row.
    for no, table in enumerate(tables, start=1):
        print("Table number :", no)
        for row in table:
            print(row)

Table number : 1
['Name', 'Subject', 'Marks']
['Aadi', 'Maths', '88']
['Bharat', 'Physics', '76']
['Charan', 'Chemistry', '92']
['Divya', 'English', '81']
Table number : 2
['Item', 'Price', 'Quantity', 'Total']
['Pen', '10', '5', '50']
['Notebook', '50', '2', '100']
['Pencil', '5', '10', '50']
['Eraser', '3', '4', '12']


7. Extract all words along with their bounding-box coordinates.

In [1]:
import pdfplumber as pp 
# Each extracted word is a dict holding its text plus position fields
# (x0, x1, top, bottom, ...); print them one per line across all pages.
with pp.open("files/simple.pdf") as pdf:
    all_words = (
        word
        for page in pdf.pages
        for word in page.extract_words()
    )
    for word in all_words:
        print(word)

{'text': 'This', 'x0': 78.0, 'x1': 96.89, 'top': 80.07000000000005, 'doctop': 80.07000000000005, 'bottom': 90.07000000000005, 'upright': True, 'height': 10.0, 'width': 18.89, 'direction': 'ltr'}
{'text': 'is', 'x0': 99.67, 'x1': 106.89, 'top': 80.07000000000005, 'doctop': 80.07000000000005, 'bottom': 90.07000000000005, 'upright': True, 'height': 10.0, 'width': 7.219999999999999, 'direction': 'ltr'}
{'text': 'a', 'x0': 109.67, 'x1': 115.23, 'top': 80.07000000000005, 'doctop': 80.07000000000005, 'bottom': 90.07000000000005, 'upright': True, 'height': 10.0, 'width': 5.560000000000002, 'direction': 'ltr'}
{'text': 'simple', 'x0': 118.01, 'x1': 146.9, 'top': 80.07000000000005, 'doctop': 80.07000000000005, 'bottom': 90.07000000000005, 'upright': True, 'height': 10.0, 'width': 28.89, 'direction': 'ltr'}
{'text': 'PDF', 'x0': 149.68, 'x1': 169.68, 'top': 80.07000000000005, 'doctop': 80.07000000000005, 'bottom': 90.07000000000005, 'upright': True, 'height': 10.0, 'width': 20.0, 'direction': 'lt

8. Extract all embedded images from a page.

In [13]:
import pdfplumber as pp 
# Print the metadata dict of every image object found on every page.
with pp.open("files/pdfplumber_images_test.pdf") as pdf:
    page_images = (img for page in pdf.pages for img in page.images)
    for img in page_images:
        print("Image details :", img)

Image details : {'x0': 172.6378, 'y0': 551.8898, 'x1': 422.63779999999997, 'y1': 701.8898, 'width': 249.99999999999997, 'height': 150.0, 'name': 'FormXob.5974296f20194b8b4a00d812ba9306a0', 'stream': <PDFStream(4): raw=942, {'BitsPerComponent': 8, 'ColorSpace': /'DeviceRGB', 'Filter': [/'ASCII85Decode', /'FlateDecode'], 'Height': 150, 'Length': 942, 'Subtype': /'Image', 'Type': /'XObject', 'Width': 250}>, 'srcsize': (250, 150), 'imagemask': None, 'bits': 8, 'colorspace': [/'DeviceRGB'], 'mcid': None, 'tag': None, 'object_type': 'image', 'page_number': 1, 'top': 140.0, 'bottom': 290.0, 'doctop': 140.0}
Image details : {'x0': 172.6378, 'y0': 401.8898, 'x1': 422.63779999999997, 'y1': 551.8897999999999, 'width': 249.99999999999997, 'height': 149.99999999999994, 'name': 'FormXob.f7d05691d5f68991fb50934ae33ba222', 'stream': <PDFStream(5): raw=1010, {'BitsPerComponent': 8, 'ColorSpace': /'DeviceRGB', 'Filter': [/'ASCII85Decode', /'FlateDecode'], 'Height': 150, 'Length': 1010, 'Subtype': /'Imag

9. Extract all lines and shapes (vector objects) from the PDF.

In [17]:
import pdfplumber as pp 
# For each page, report the count of line, rectangle and curve objects,
# followed by the objects themselves.
with pp.open("files/pdfplumber_vectors_test.pdf") as pdf:
    for page in pdf.pages:
        # (label, objects) pairs, in the order they are reported.
        vector_groups = (
            ("Lines :", page.lines),
            ("Rectangle :", page.rects),
            ("Curves :", page.curves),
        )
        for label, objects in vector_groups:
            print(label, len(objects))
            for vector_object in objects:
                print(vector_object)

Lines : 2
{'x0': 50.0, 'y0': 741.8898, 'x1': 550.0, 'y1': 741.8898, 'width': 500.0, 'height': 0.0, 'pts': [(50.0, 100.0), (550.0, 100.0)], 'linewidth': 2.0, 'stroke': True, 'fill': False, 'evenodd': False, 'stroking_color': 0, 'non_stroking_color': 0, 'mcid': None, 'tag': None, 'object_type': 'line', 'page_number': 1, 'path': [('m', (50.0, 100.0)), ('l', (550.0, 100.0))], 'dash': None, 'top': 100.0, 'bottom': 100.0, 'doctop': 100.0}
{'x0': 50.0, 'y0': 641.8898, 'x1': 550.0, 'y1': 691.8898, 'width': 500.0, 'height': 50.0, 'pts': [(50.0, 150.0), (550.0, 200.0)], 'linewidth': 2.0, 'stroke': True, 'fill': False, 'evenodd': False, 'stroking_color': 0, 'non_stroking_color': 0, 'mcid': None, 'tag': None, 'object_type': 'line', 'page_number': 1, 'path': [('m', (50.0, 150.0)), ('l', (550.0, 200.0))], 'dash': None, 'top': 150.0, 'bottom': 200.0, 'doctop': 150.0}
Rectangle : 2
{'x0': 50.0, 'y0': 491.8898, 'x1': 250.0, 'y1': 591.8897999999999, 'width': 200.0, 'height': 99.99999999999994, 'pts': [(

10. Extract a table from the PDF and save it as a CSV file.

In [27]:
import pdfplumber as pp 
import csv 
# Extract a table from page 1 and save it to files/table.csv.
with pp.open("files/pdfplumber_tables_test.pdf") as pdf :
    table = pdf.pages[0].extract_table()
    if table :
        # newline='' lets the csv module control line endings itself.
        # encoding is pinned to UTF-8 so the file does not depend on the
        # platform's default locale encoding (which can fail on non-ASCII
        # cell text).
        with open("files/table.csv", "w", newline='', encoding="utf-8") as file :
            writer = csv.writer(file)
            # Write every row of the table in one call.
            writer.writerows(table)
        print("Content written successfully.")
    else :
        print("No table found.")

Content written successfully.
