In [None]:
# https://realpython.com/pdf-python/
# https://www.geeksforgeeks.org/working-with-pdf-files-in-python/

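# NOTE: this notebook uses the legacy camelCase PyPDF2 API (PdfFileReader, getPage,
# extractText, ...), which was removed in PyPDF2 3.0 - keep the version pinned below 3.0.
#!pip install --upgrade PyPDF2==2.12.1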

## 1. Extracting text from PDF file

In [None]:
# importing required modules
import PyPDF2

In [None]:
# Open the sample PDF for reading in binary mode ('rb').
pdfFileObj = open('00372915.pdf', 'rb')

# We opened 00372915.pdf in binary mode and saved the file object as pdfFileObj.
# The file stays open until pdfFileObj.close() is called in the last cell of this section.

In [None]:
# Wrap the open file object in a PyPDF2 reader.
pdfReader = PyPDF2.PdfFileReader(pdfFileObj)

# Here, we create an object of the PdfFileReader class of the PyPDF2 module, passing it the
# PDF file object; the resulting reader (pdfReader) is what the following cells query.

In [None]:
# Print the number of pages in the PDF file.
print(pdfReader.numPages)

# The numPages property of the reader gives the number of pages in the PDF file.
# The original author reports 20 pages for this file (see the cell output after running).

In [None]:
# Fetch the first page (index 0) as a page object.
pageObj = pdfReader.getPage(0)

# Now, we create an object of the PageObject class of the PyPDF2 module.
# The PDF reader object has a getPage() function which takes the page number
# (starting from index 0) as its argument and returns the page object.

In [None]:
# Extract the text of the page fetched above and print it.
print(pageObj.extractText())

# The page object has an extractText() function that extracts the text from the PDF page.

In [None]:
# Close the file object opened at the start of this section.
pdfFileObj.close()

# At last, we close the PDF file object. pdfReader was built on this file object,
# so re-open the file and re-create the reader before using it again.

## 2. Rotating PDF pages

In [None]:
def PDFrotate(origFileName, newFileName, rotation):
    """Write a copy of a PDF in which every page is rotated clockwise.

    Args:
        origFileName: path of the PDF to read.
        newFileName: path of the PDF to create. It is opened in 'wb' mode, so
            an existing file at that path is overwritten. It must differ from
            origFileName, otherwise the source would be truncated while it is
            still needed.
        rotation: clockwise angle in degrees, passed unchanged to
            PageObject.rotateClockwise() (PyPDF2 documents it as a multiple
            of 90).

    Both files are opened with ``with`` so they are closed even when reading,
    rotating or writing raises; the previous manual close() calls were skipped
    on any error.
    """
    # The input stays open until the output has been written, as in the
    # original code, which closed it only after pdfWriter.write().
    with open(origFileName, 'rb') as pdfFileObj:
        pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
        pdfWriter = PyPDF2.PdfFileWriter()

        # Rotate each page and queue it for the new document.
        for page in range(pdfReader.numPages):
            pageObj = pdfReader.getPage(page)
            pageObj.rotateClockwise(rotation)
            pdfWriter.addPage(pageObj)

        # The output is opened only after all pages were processed, so a
        # failure above does not leave an empty output file behind.
        with open(newFileName, 'wb') as newFile:
            pdfWriter.write(newFile)



In [None]:
def main():
    """Rotate every page of the sample PDF by 270 degrees clockwise into a new file."""
    source_pdf, rotated_pdf = '00372915.pdf', 'rotated_example.pdf'
    clockwise_degrees = 270

    PDFrotate(source_pdf, rotated_pdf, clockwise_degrees)


if __name__ == "__main__":
    # Run the rotation demo when this cell is executed.
    main()

## 3. Merging PDF files

In [None]:
def PDFmerge(pdfs, output):
    """Concatenate several PDFs, in the given order, into one output file.

    Args:
        pdfs: iterable of inputs, each passed unchanged to
            PdfFileMerger.append() (this notebook passes file paths).
        output: path of the merged PDF; opened in 'wb' mode, so an existing
            file at that path is overwritten.

    The merger is closed in a ``finally`` block so that the input files it
    opened are released even when appending or writing raises; previously it
    was never closed.
    """
    pdfMerger = PyPDF2.PdfFileMerger()
    try:
        # Append the inputs one by one, preserving their order.
        for pdf in pdfs:
            pdfMerger.append(pdf)

        # Write the combined document to the output file.
        with open(output, 'wb') as f:
            pdfMerger.write(f)
    finally:
        pdfMerger.close()

In [None]:
def main():
    """Merge the sample PDF and the rotated copy into a single output file.

    'rotated_example.pdf' is the file written by the rotation demo, so that
    section has to be run first.
    """
    merge_job = {
        'pdfs': ['00372915.pdf', 'rotated_example.pdf'],
        'output': 'combined_example.pdf',
    }
    PDFmerge(**merge_job)


if __name__ == "__main__":
    # Run the merge demo when this cell is executed.
    main()

## 4. Splitting PDF file

In [None]:
def PDFsplit(pdf, splits):
    """Split one PDF into consecutive page ranges, one output file per range.

    Args:
        pdf: path of the PDF to split.
        splits: page indices (0-based) at which a new output file starts,
            e.g. [2, 4] yields pages 0-1, 2-3 and 4-end. An empty sequence
            yields a single file containing every page (previously this
            raised IndexError). Values are assumed to be ascending and no
            larger than the page count; they are not validated here.

    Output files are written next to the input and named
    ``<input without trailing .pdf><i>.pdf`` with i = 0, 1, 2, ...
    """
    # Strip only a trailing '.pdf'. Splitting on the first '.pdf' anywhere in
    # the path would cut it short when a directory name contains '.pdf'.
    suffix = '.pdf'
    base = pdf[:-len(suffix)] if pdf.endswith(suffix) else pdf

    # ``with`` closes the input even if a page lookup or a write fails.
    with open(pdf, 'rb') as pdfFileObj:
        pdfReader = PyPDF2.PdfFileReader(pdfFileObj)

        # Consecutive pairs of boundaries are the [start, end) page ranges:
        # [0, s0, s1, ..., numPages] -> (0, s0), (s0, s1), ..., (sN, numPages).
        boundaries = [0] + list(splits) + [pdfReader.numPages]

        for i, (start, end) in enumerate(zip(boundaries, boundaries[1:])):
            # One writer per output file.
            pdfWriter = PyPDF2.PdfFileWriter()
            for page in range(start, end):
                pdfWriter.addPage(pdfReader.getPage(page))

            outputpdf = base + str(i) + '.pdf'
            with open(outputpdf, 'wb') as f:
                pdfWriter.write(f)

In [None]:
def main():
    """Split the sample PDF into three parts, cutting at page indices 2 and 4."""
    sample_pdf = '00372915.pdf'
    cut_points = [2, 4]

    PDFsplit(sample_pdf, cut_points)

In [None]:
if __name__ == "__main__":
    # Calls whichever main() was defined most recently in the kernel. This notebook
    # defines main() three times (rotate, merge, split), so run the split cell's
    # definition immediately before this cell.
    main()
    
# The output is three new PDF files: split 1 (pages 0-1), split 2 (pages 2-3) and split 3 (page 4 to the end),
# written as 003729150.pdf, 003729151.pdf and 003729152.pdf.