# PyPDF2 Tutorial

In [1]:
import PyPDF2 as pdf
from PyPDF2 import PdfReader, PdfWriter

In [2]:
# List the names exposed by the PyPDF2 package (imported above as `pdf`)
print(dir(pdf))



In [3]:
# Reading a PDF file
# Passing the path lets PdfReader open, read and close the file itself, so no
# file handle is left open for the rest of the notebook session.
reader = PdfReader("./sample.pdf")

In [4]:
# Read the document-information metadata of the PDF
# (for this sample: the '/Producer' and '/CreationDate' entries)
info = reader.metadata
info

{'/Producer': 'PyFPDF 1.7.2 http://pyfpdf.googlecode.com/',
 '/CreationDate': 'D:20230808144017'}

### Number of Pages in PDF

In [5]:
# Get number of pages: `reader.pages` is a sequence, so `len()` is the page count
num_pages = len(reader.pages)
num_pages

2

In [6]:
# Extract the text of the page at index 1 (the second page; indexing starts at 0)
text = reader.pages[1].extract_text()
text

'Cras gravida lectus eu nulla malesuada, vitae rhoncus lectus ultricies. Maecenas quis odio sem. Ut\nut diam neque. Curabitur eu risus nec nisi tempus mattis. Proin eget ante arcu. Nullam\ncondimentum urna elit, id gravida tortor ullamcorper in. Suspendisse et neque odio. Suspendisse\npotenti. Pellentesque sodales, ipsum vitae fermentum varius, velit dui gravida quam, non finibus\nlectus dolor vel sem. Nulla facilisi.\nSed a erat vel justo ullamcorper vulputate nec non tortor. Mauris ac arcu et turpis bibendum luctus\nut et sapien. Sed at turpis luctus, congue ex eu, convallis sapien. Nullam nec urna quis mauris\naccumsan aliquam. Maecenas in magna sit amet dolor interdum feugiat non eu mi. Ut nec odio et\nsem dictum malesuada. Nam ut velit vestibulum, dapibus ligula a, suscipit orci. Nullam aliquam vel\nurna eu tristique. Nulla sit amet quam ut est iaculis euismod ac nec nisl. Cras quis nunc vitae magna\ntincidunt dapibus non sit amet eros.'

### Get Metadata from PDF

In [7]:
# Function to get metadata
def get_pdf_metadata(pdf_file_path):
    """Return the document-information metadata of a PDF file.

    Args:
        pdf_file_path: path of the PDF to inspect.

    Returns:
        The value of ``PdfReader.metadata`` for that file, unchanged.

    The file is opened in binary mode and closed again before returning.
    """
    with open(pdf_file_path, "rb") as pdf_stream:
        return PdfReader(pdf_stream).metadata

# Try it on the sample document; the returned metadata is shown as the cell output
get_pdf_metadata("./sample.pdf")

{'/Producer': 'PyFPDF 1.7.2 http://pyfpdf.googlecode.com/',
 '/CreationDate': 'D:20230808144017'}

### Get Text from PDF

In [8]:
# Function to get text from PDF
def get_text_from_pdf(pdf_file_path):
    """Extract the text of every page of a PDF as one string.

    Args:
        pdf_file_path: path of the PDF to read.

    Returns:
        str: the per-page results of ``extract_text()`` in page order,
        concatenated back-to-back with no separator between pages.
    """
    with open(pdf_file_path, "rb") as pdf_stream:
        document = PdfReader(pdf_stream)
        # Extraction happens while the file is still open
        page_texts = [page.extract_text() for page in document.pages]
    return "".join(page_texts)

# Text of all pages of the sample document, joined into a single string
get_text_from_pdf("./sample.pdf")

'Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed pharetra neque vel elit scelerisque, eu\naliquam justo faucibus. Vivamus faucibus risus ut enim scelerisque, nec varius tortor convallis.\nFusce aliquam ex in tellus mollis dignissim. Proin ullamcorper, libero nec feugiat cursus, nunc nisl\nvarius nulla, in sollicitudin mi eros nec risus. Nulla eu vestibulum purus. Vivamus tincidunt justo felis,\neu bibendum nulla efficitur eget. Suspendisse a lacus purus.\nPraesent sit amet leo eget mi tristique interdum. Integer ac eros turpis. Sed luctus, purus in\nimperdiet consectetur, quam purus aliquam nibh, ac dictum metus purus a lectus. Maecenas\nfacilisis, odio nec facilisis commodo, sapien dui lobortis mauris, vel commodo nulla velit vel dolor. In\ncongue elit et odio vulputate, a facilisis ipsum dapibus. Sed sagittis libero in tellus consectetur, non\ndictum nisi cursus. Sed quis velit et sem elementum tempus. Ut facilisis, sapien at egestas mattis,\nturpis metus cursus justo, n

### Splitting PDFs
- Into Multiple PDFs (All Pages)
- Up to a Particular Page (Range of Pages)
- Last Page

In [16]:
# Splitting PDF into multiple PDF pages
import os
def split_pdf(pdf_file_path):
    """Write every page of a PDF to its own single-page PDF file.

    Args:
        pdf_file_path: path of the PDF to split.

    Each output is named ``<input path without extension>_page_<n>.pdf`` with
    ``n`` counted from 1, and a confirmation line is printed per file.
    """
    # The base name is the same for every page
    base_name = os.path.splitext(pdf_file_path)[0]

    with open(pdf_file_path, "rb") as source:
        document = PdfReader(source)
        for page_number, page in enumerate(document.pages, start=1):
            # A fresh writer per page keeps each output to exactly one page
            single_page_writer = PdfWriter()
            single_page_writer.add_page(page)

            output_fileName = f"{base_name}_page_{page_number}.pdf"
            with open(output_fileName, "wb") as target:
                single_page_writer.write(target)

            print("Created a PDF: {}".format(output_fileName))
            

In [17]:
# Writes one single-page PDF per page, named after the input file
split_pdf("./sample.pdf")

Created a PDF: ./sample_page_1.pdf
Created a PDF: ./sample_page_2.pdf


In [20]:
# Split PDF up to a particular page
import os
def get_pdf_upto(pdf_file_path, start_page:int=0, end_page:int=0):
    """Copy a contiguous range of pages of a PDF into one new PDF file.

    Args:
        pdf_file_path: path of the source PDF.
        start_page: index of the first page to copy (indexing starts at 0).
        end_page: index one past the last page to copy, i.e. the pages copied
            are ``range(start_page, end_page)``.

    The result is written to
    ``<input path without extension>_from_<start_page>_to_<end_page>.pdf``
    and a confirmation line is printed.

    Raises:
        ValueError: if ``start_page >= end_page``, i.e. the range selects no
            page. This includes calling the function with its defaults.
        IndexError: if the range reaches past the pages of the document.
    """
    if start_page >= end_page:
        raise ValueError(
            "empty page range: start_page={} must be smaller than end_page={}".format(
                start_page, end_page
            )
        )

    # The output name depends only on the arguments, so build it once up front
    fileName = os.path.splitext(pdf_file_path)[0]
    output_fileName = f"{fileName}_from_{start_page}_to_{end_page}.pdf"

    with open(pdf_file_path, "rb") as f:
        reader = PdfReader(f)
        writer = PdfWriter()

        # Collect the selected pages
        for page_num in range(start_page, end_page):
            writer.add_page(reader.pages[page_num])

        # Save while the source file is still open
        with open(output_fileName, "wb") as out:
            writer.write(out)

    print("Created a PDF: {}".format(output_fileName))
            

In [21]:
# Copy the pages at indices 0, 1 and 2 (the end index 3 is exclusive) into one PDF
get_pdf_upto("./sample2.pdf", 0, 3)

Created a PDF: ./sample2_from_0_to_3.pdf


In [23]:
# Get last page of PDF
def get_pdf_last_page(pdf_file_path):
    """Save the last page of a PDF as its own single-page PDF.

    Args:
        pdf_file_path: path of the source PDF.

    The page is written to ``<input path without extension>_last_page.pdf``
    and a confirmation line is printed.
    """
    with open(pdf_file_path, "rb") as source:
        document = PdfReader(source)

        # Index of the final page: one less than the page count
        last_index = len(document.pages) - 1
        final_page = document.pages[last_index]

        page_writer = PdfWriter()
        page_writer.add_page(final_page)

        base_name = os.path.splitext(pdf_file_path)[0]
        output_fileName = f"{base_name}_last_page.pdf"

        with open(output_fileName, "wb") as target:
            page_writer.write(target)

            print("Created a PDF: {}".format(output_fileName))

In [24]:
# Writes the final page of sample2.pdf to ./sample2_last_page.pdf
get_pdf_last_page("./sample2.pdf")

Created a PDF: ./sample2_last_page.pdf


### Merging PDFs

In [25]:
# fetch all PDF files
import os
def fetch_all_files(parent_folder:str):
    """Recursively collect the paths of all PDF files below a folder.

    Args:
        parent_folder: directory to search, including all of its
            sub-directories.

    Returns:
        list[str]: paths (``parent_folder`` joined with the relative location)
        of every file whose name ends in ``.pdf``, compared case-insensitively,
        in ascending lexicographic order. The list is empty when nothing
        matches or the folder does not exist.
    """
    target_files = []
    for path, _subdirs, files in os.walk(parent_folder):
        for name in files:
            # Compare case-insensitively so that e.g. "SCAN.PDF" is found too
            if name.lower().endswith(".pdf"):
                target_files.append(os.path.join(path, name))

    # os.walk reports entries in a filesystem-dependent order; sorting gives
    # callers that care about sequence (such as a merge) a reproducible one.
    # Note this is plain string order, so "page_10" sorts before "page_2".
    target_files.sort()
    return target_files

In [27]:
# List every .pdf file found under ./mergeMe/ (sub-folders included)
fetch_all_files("./mergeMe/")

['./mergeMe/sample_page_1.pdf', './mergeMe/sample_page_2.pdf']

In [34]:
from PyPDF2 import PdfMerger
def merge_pdf(list_of_pdfs, output_file_name="final_merged_file.pdf"):
    """Merge several PDFs into a single output file.

    Args:
        list_of_pdfs: iterable of PDF paths (or anything ``PdfMerger.append``
            accepts); they are appended in the order given.
        output_file_name: path of the merged PDF to write.

    The output file is created only after every input has been appended, so a
    failing input does not leave an empty output file behind. The merger is
    always closed afterwards, which releases the input files it opened.
    """
    merger = PdfMerger()
    try:
        for pdf_path in list_of_pdfs:
            merger.append(pdf_path)

        with open(output_file_name, "wb") as out:
            merger.write(out)
    finally:
        # Release the handles the merger holds on the appended inputs
        merger.close()

In [35]:
# Collect the PDFs that should be merged
pdf_list = fetch_all_files("./mergeMe/")

In [36]:
# Merge them into the default output file, final_merged_file.pdf
merge_pdf(pdf_list)

### Rotate a PDF Page

In [11]:
from PyPDF2 import PdfReader, PdfWriter
import os
def rotate_pdf(pdf_file_path, page_num:int, rotation:int=90):
    """Save a rotated copy of a single page of a PDF.

    Args:
        pdf_file_path: path of the source PDF.
        page_num: index of the page to copy (indexing starts at 0).
        rotation: angle handed to the page's ``rotate`` method (default 90).

    The page is written to
    ``<input path without extension>_<rotation>_rotated_page.pdf`` and a
    confirmation line is printed.
    """
    base_name = os.path.splitext(pdf_file_path)[0]
    output_fileName = f"{base_name}_{rotation}_rotated_page.pdf"

    with open(pdf_file_path, "rb") as source:
        document = PdfReader(source)
        page_writer = PdfWriter()

        # The copied page is the only one in the writer, hence index 0
        page_writer.add_page(document.pages[page_num])
        page_writer.pages[0].rotate(rotation)

        with open(output_fileName, "wb") as target:
            page_writer.write(target)

        print("Rotated a page: {}".format(output_fileName))

In [12]:
# Rotate the first page (index 0) by the default angle of 90
rotate_pdf("./sample.pdf", 0)

Rotated a page: ./sample_90_rotated_page.pdf


In [13]:
# Rotate the page at index 2 of sample2.pdf by 180
rotate_pdf("./sample2.pdf", 2, 180) # rotation angle must be multiple of 90

Rotated a page: ./sample2_180_rotated_page.pdf


### Working with images in a PDF

In [3]:
from PyPDF2 import PdfReader

In [4]:
# Extract Images from a PDF
def extract_images_from_pdf(pdf_file_path):
    """Save every image embedded in a PDF as a separate file.

    Args:
        pdf_file_path: path of the PDF to scan.

    Each image is written under its own ``name`` (as reported by the page's
    ``images`` collection), relative to the current working directory. When a
    later image has a name that this call already wrote, it is saved as
    ``<stem>_page_<n><ext>`` instead (``n`` is the 1-based page number, with a
    further ``_<k>`` counter if that is taken as well), so images that share a
    name no longer overwrite each other.
    """
    import os  # local import: this cell may run before the notebook imports os

    written_names = set()
    with open(pdf_file_path, "rb") as f:
        reader = PdfReader(f)
        for page_num, selected_page in enumerate(reader.pages, start=1):
            for img_file_obj in selected_page.images:
                target_name = img_file_obj.name
                if target_name in written_names:
                    # Same name seen before in this document: disambiguate
                    stem, ext = os.path.splitext(img_file_obj.name)
                    target_name = f"{stem}_page_{page_num}{ext}"
                    duplicate = 1
                    while target_name in written_names:
                        duplicate += 1
                        target_name = f"{stem}_page_{page_num}_{duplicate}{ext}"
                written_names.add(target_name)

                with open(target_name, "wb") as out:
                    out.write(img_file_obj.data)

In [5]:
# Save every embedded image of the PDF, using each image's own name as the file name
extract_images_from_pdf("./sample_pdf_with_images.pdf")

In [9]:
from PIL import Image
import os

In [10]:
# Convert Image to PDF
def convert_image_to_pdf(image_file_path):
    """Convert an image file to a PDF saved next to it.

    Args:
        image_file_path: path of the image to convert.

    The image is converted to RGB mode and saved as
    ``<input path without extension>.pdf``. The source image is opened in a
    ``with`` block so its file handle is released once the PDF is written.
    """
    fileName = f"{os.path.splitext(image_file_path)[0]}.pdf"
    with Image.open(image_file_path) as my_img:
        # Convert to RGB before saving, as the original cell did
        img = my_img.convert("RGB")
        img.save(fileName)

In [11]:
# Writes ./Im1.pdf: same path as the image, with the extension replaced by .pdf
convert_image_to_pdf("./Im1.jpg")