<a href="https://colab.research.google.com/github/AndreaKvc/PDF-manipulation/blob/main/PDF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#installing essential libraries
!pip install PyPDF2


In [None]:
#Merging multiple PDF files

import PyPDF2

def merge_pdfs(pdf_list, output_path):
  """Concatenate several PDF files into one PDF.

  Pages are appended in the order the inputs appear in ``pdf_list``. The
  merged document is written exactly once, after all inputs have been read.

  Args:
    pdf_list: Iterable of input PDFs; each item is handed to
      ``PyPDF2.PdfReader``.
    output_path: Path of the merged PDF to create (opened in ``'wb'`` mode,
      so an existing file is overwritten).
  """
  pdf_writer = PyPDF2.PdfWriter()
  has_input = False
  for pdf in pdf_list:
    has_input = True
    pdf_reader = PyPDF2.PdfReader(pdf)
    for page in pdf_reader.pages:
      pdf_writer.add_page(page)

  # With no inputs there is nothing to merge; like the original code,
  # do not create an output file in that case.
  if not has_input:
    return

  # Write once, after the loop, instead of rewriting the file per input.
  with open(output_path, 'wb') as out:
    pdf_writer.write(out)
  print(f"Merged PDF saved as {output_path}")

# Example: merge three PDFs. The relative paths mean the input files must
# already be present in the current working directory.
merge_pdfs(['Page+1.pdf','Page+2.pdf','Page+3.pdf'],'merged.pdf')

# The merged document is written to merged.pdf, which later cells read.

In [None]:
#Splitting PDF file to multiple PDF files/Pages
import PyPDF2

def split_pdf(pdf_path, output_dir):
  """Split a PDF into one single-page PDF per page.

  Each page is written to ``<output_dir>/page<N>.pdf`` where ``N`` starts
  at 1. ``output_dir`` is created if it does not exist yet.

  Args:
    pdf_path: Input PDF, handed to ``PyPDF2.PdfReader``.
    output_dir: Directory that receives the per-page files.
  """
  import os  # local import keeps this cell self-contained

  # Without this, open() below fails when the directory is missing.
  if output_dir:
    os.makedirs(output_dir, exist_ok=True)

  pdf_reader = PyPDF2.PdfReader(pdf_path)
  for page_number, page in enumerate(pdf_reader.pages, start=1):
    # A fresh writer per page so every output holds exactly one page.
    pdf_writer = PyPDF2.PdfWriter()
    pdf_writer.add_page(page)
    output_path = os.path.join(output_dir, f"page{page_number}.pdf")

    with open(output_path, 'wb') as out:
      pdf_writer.write(out)
    print(f"Saved {output_path}")
# Example: split merged.pdf (produced by the merge cell above) into one PDF
# per page, written under the pdf_files directory.
split_pdf('merged.pdf', 'pdf_files')

In [None]:
#installing essential libraries

!pip install pdfplumber

In [None]:
#Extracting text from Pdf file

import pdfplumber

def extract_text(pdf_path, output_text_path):
  """Extract the text of every page of a PDF into a text file.

  The text of each page is followed by a newline. Pages for which
  ``extract_text()`` yields nothing contribute only that newline.

  Args:
    pdf_path: Input PDF, handed to ``pdfplumber.open``.
    output_text_path: Path of the text file to create; written as UTF-8.
  """
  with pdfplumber.open(pdf_path) as pdf:
    # extract_text() may give None for a page without text (e.g. a scanned
    # image); fall back to '' so the concatenation cannot raise TypeError.
    page_texts = [(page.extract_text() or '') + '\n' for page in pdf.pages]
  full_text = ''.join(page_texts)

  # Explicit encoding: PDF text is often non-ASCII and the platform default
  # encoding may not be able to represent it.
  with open(output_text_path, 'w', encoding='utf-8') as f:
    f.write(full_text)
  print(f"Extracted text is saved as {output_text_path}")

# Example: dump the text of merged.pdf into output.txt.
extract_text('merged.pdf', 'output.txt')

In [None]:
#Install PyMuPDF (imported as fitz), which is used below to extract images from PDF files
!pip install pymupdf

In [None]:
#Extract images from PDF file.

import fitz # PyMuPDF

def extract_images(pdf_path, output_dir):
  """Save every image listed on each page of a PDF as a separate file.

  Files are named ``image_<page>_<index>.<ext>`` (both numbers start at 1)
  and the extension is the one reported by PyMuPDF for the image.
  ``output_dir`` is created if it does not exist yet.

  Args:
    pdf_path: Input PDF, handed to ``fitz.open``.
    output_dir: Directory that receives the image files.
  """
  import os  # local import keeps this cell self-contained

  # Without this, open() below fails when the directory is missing.
  if output_dir:
    os.makedirs(output_dir, exist_ok=True)

  pdf_document = fitz.open(pdf_path)
  try:
    for page_index in range(len(pdf_document)):
      page = pdf_document.load_page(page_index)

      for img_index, img in enumerate(page.get_images(full=True), start=1):
        xref = img[0]  # first entry of each image tuple is its xref
        base_image = pdf_document.extract_image(xref)
        image_ext = base_image["ext"]
        image_filename = os.path.join(
            output_dir, f"image_{page_index + 1}_{img_index}.{image_ext}")

        with open(image_filename, 'wb') as image_file:
          image_file.write(base_image["image"])

        print(f"Saved {image_filename}")
  finally:
    # Release the document even if an image fails to extract or save.
    pdf_document.close()


#Lets use the above function
# The call used to sit inside the function's loop, which made the function
# call itself endlessly and meant it was never invoked from the cell.
# __name__ is "__main__" when the notebook runs, so the example still
# executes there but not if this code is imported as a module.
if __name__ == "__main__":
  extract_images('merged.pdf', 'folder')

In [None]:
#install lib pypdf2
!pip install pypdf2

In [None]:
#Creating password to protect pdf (Encrypted PDF file).
def encrypt_pdf(input_pdf, output_pdf, password):
  """Write a password-protected copy of a PDF.

  All pages of ``input_pdf`` are copied into a new document, which is then
  encrypted with ``password`` and saved to ``output_pdf``.

  Args:
    input_pdf: Input PDF, handed to ``PyPDF2.PdfReader``.
    output_pdf: Path of the encrypted PDF to create.
    password: Password passed to ``PdfWriter.encrypt``.
  """
  pdf_reader = PyPDF2.PdfReader(input_pdf)
  pdf_writer = PyPDF2.PdfWriter()

  for page in pdf_reader.pages:
    pdf_writer.add_page(page)

  pdf_writer.encrypt(password)

  with open(output_pdf, 'wb') as out:
    pdf_writer.write(out)

  print(f"Encrypted PDF file is saved as {output_pdf}")


#Lets use above function
# The call used to be indented into the function body, which made the
# function call itself endlessly and meant encrypted.pdf was never created.
# __name__ is "__main__" when the notebook runs, so the example still
# executes there but not if this code is imported as a module.
# NOTE: the hard-coded password is for demonstration only.
if __name__ == "__main__":
  encrypt_pdf('merged.pdf', 'encrypted.pdf', 'pass123')

In [None]:
#Remove the password from a password-protected PDF (decryption)

def decrypt_pdf(input_pdf, output_pdf, password):
  """Write an unprotected copy of a password-protected PDF.

  The input is unlocked with ``password`` and all of its pages are copied
  into a new, unencrypted document saved to ``output_pdf``.

  Args:
    input_pdf: Encrypted input PDF, handed to ``PyPDF2.PdfReader``.
    output_pdf: Path of the decrypted PDF to create.
    password: Password passed to ``PdfReader.decrypt``.
  """
  pdf_reader = PyPDF2.PdfReader(input_pdf)
  # NOTE(review): the return value of decrypt() is not checked; presumably a
  # wrong password surfaces as a PyPDF2 error when the pages are read below
  # -- confirm.
  pdf_reader.decrypt(password)
  pdf_writer = PyPDF2.PdfWriter()

  for page in pdf_reader.pages:
    pdf_writer.add_page(page)

  with open(output_pdf, 'wb') as out:
    # This write was missing, which left output_pdf as an empty file.
    pdf_writer.write(out)
  print(f"Decrypted PDF file is saved as {output_pdf}")
# Example: remove the password from encrypted.pdf, using the same password
# that the encryption example passes ('pass123').
decrypt_pdf('encrypted.pdf', 'decrypted.pdf', 'pass123')

In [None]:
!pip install pypdf2

In [None]:
#Re-arranging pages in pdf file

def rearrange_pages(input_pdf, output_pdf, page_order):
  """Write a copy of a PDF with its pages in a caller-chosen order.

  Args:
    input_pdf: Input PDF, handed to ``PyPDF2.PdfReader``.
    output_pdf: Path of the rearranged PDF to create.
    page_order: Iterable of zero-based page indices into the input; the
      output contains exactly these pages, in this order. An index may be
      omitted (dropping that page) or repeated.
  """
  pdf_reader = PyPDF2.PdfReader(input_pdf)
  pdf_writer = PyPDF2.PdfWriter()

  for page_num in page_order:
    # Was `page_writer.add_page(...)`, an undefined name (NameError).
    pdf_writer.add_page(pdf_reader.pages[page_num])

  # Write once, after all pages are queued, not once per page.
  with open(output_pdf, 'wb') as out:
    pdf_writer.write(out)

  print(f"Rearranged PDF is saved {output_pdf}")

# Example: page_order holds zero-based page indices, so [2, 1, 0] writes
# pages 3, 2 and 1 of merged.pdf, i.e. the first three pages in reverse
# order.

rearrange_pages('merged.pdf', 'rearranged.pdf', [2, 1, 0])
# TODO: also demonstrate removing a page (e.g. page number 1) from the file.

In [None]:
!pip install PyPDf2
#Read metadata of the PDF file

def read_metadata(pdf_file):
  """Print the document metadata of a PDF, one ``key : value`` per line.

  Only the header line is printed when the PDF carries no metadata.

  Args:
    pdf_file: Input PDF, handed to ``PyPDF2.PdfReader``.
  """
  pdf_reader = PyPDF2.PdfReader(pdf_file)
  metadata = pdf_reader.metadata

  print("Metadata of the PDF file is:")
  # metadata is None for a PDF without a document-information dictionary;
  # treat that as "no entries" instead of failing on None.items().
  for key, value in (metadata or {}).items():
    print(f"{key} : {value}")

# Example: print the metadata of merged.pdf.
read_metadata('merged.pdf')

In [None]:
# NOTE: the original heading here was "Optimize the size of the PDF file (compressing PDF file)", but this cell does not compress anything.
# Add Metadata to the PDF files (e.g. Title, Author, etc.)
import PyPDF2

def add_metadata(input_pdf, output_pdf, title, author):
  """Write a copy of a PDF with title and author metadata set.

  All pages of ``input_pdf`` are copied to a new document whose ``/Title``
  and ``/Author`` entries are set from the arguments and whose
  ``/Producer`` entry is set to an empty string.

  Args:
    input_pdf: Input PDF, handed to ``PyPDF2.PdfReader``.
    output_pdf: Path of the PDF to create.
    title: Value stored under ``/Title``.
    author: Value stored under ``/Author``.
  """
  pdf_reader = PyPDF2.PdfReader(input_pdf)
  pdf_writer = PyPDF2.PdfWriter()

  for page in pdf_reader.pages:
    pdf_writer.add_page(page)

  # Set the metadata directly using add_metadata
  metadata = {
    '/Title': title,
    '/Author': author,
    '/Producer': ''
  }
  pdf_writer.add_metadata(metadata)

  # The `with` statement was indented by one space, matching no enclosing
  # block, so the whole cell failed with an IndentationError.
  with open(output_pdf, 'wb') as out:
    pdf_writer.write(out)
  print(f"PDF with added metadata saved as {output_pdf}")

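# Usage (left commented out; the original call is truncated after the author name, and
# read_metadata('metadata_added.pdf') below needs the file this call would create):
# add_metadata('Portrait.pdf', 'metadata_added.pdf', 'Sample File for Python Coding', 'Dr.Raj...')
# Reading PDF Metadata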
import PyPDF2

def read_metadata(pdf_file):
  """Print the document metadata of a PDF, one `` key: value`` per line.

  This redefines the ``read_metadata`` from the earlier metadata cell with a
  slightly different output format. Only the header line is printed when the
  PDF carries no metadata.

  Args:
    pdf_file: Input PDF, handed to ``PyPDF2.PdfReader``.
  """
  pdf_reader = PyPDF2.PdfReader(pdf_file)
  metadata = pdf_reader.metadata

  # These lines were indented by one space, matching no enclosing block, so
  # the cell failed with an IndentationError.
  print("Metadata of the PDF:")
  # metadata is None for a PDF without a document-information dictionary;
  # treat that as "no entries" instead of failing on None.items().
  for key, value in (metadata or {}).items():
    print(f" {key}: {value}")

# Usage: print the metadata of metadata_added.pdf. That file must exist
# already; the add_metadata example that would create it is commented out.
read_metadata('metadata_added.pdf')

In [None]:
# Rotating Pages - Portrait to Landscape
import PyPDF2
def rotate_pages(input_pdf, output_pdf, rotation):
  """Write a copy of a PDF with every page rotated by the same angle.

  Args:
    input_pdf: Input PDF, handed to ``PyPDF2.PdfReader``.
    output_pdf: Path of the rotated PDF to create.
    rotation: Angle passed to each page's ``rotate()``; the usage example
      passes -90 (or 270) to turn a landscape page into portrait.
  """
  pdf_reader = PyPDF2.PdfReader(input_pdf)
  pdf_writer = PyPDF2.PdfWriter()

  for page in pdf_reader.pages:
    page.rotate(rotation)
    pdf_writer.add_page(page)

  # Write once, after all pages are rotated, instead of rewriting the file
  # (and printing the message) for every page.
  with open(output_pdf, 'wb') as out:
    pdf_writer.write(out)
  print(f"New PDF saved as {output_pdf}")

# Usage - Landscape to Portrait (use an angle of either -90 or 270)
rotate_pages('Landscape.pdf','Portrait.pdf', -90)