# Extracting the relevant pages from the PDF

In [6]:
# importing required modules
from PyPDF2 import PdfReader, PdfWriter

In [7]:
# open the source PDF; the path is relative to the notebook's working directory
source_pdf = '2022 Rechnung.pdf'
reader = PdfReader(source_pdf)

In [8]:
# Locate the page that contains the word "Inhaltsverzeichnis" (table of contents).
# `page_ihv` keeps its initial value 0 when no page contains the word.

page_ihv = 0
number_of_pages = len(reader.pages)

print(number_of_pages)

for page_number, page in enumerate(reader.pages):
    text = page.extract_text()

    if "Inhaltsverzeichnis" not in text:
        continue

    # first match wins: remember its zero-based index and stop scanning
    page_ihv = page_number
    break

206


#### Finding the range of PDF pages that is of interest to us, using the page numbers listed in the "Inhaltsverzeichnis" (table of contents).
#### I have a string with the whole content of the "Inhaltsverzeichnis" page. In it, I look for the page number (the first run of consecutive digits, called "start") that follows the first entry containing the words "Aufwand" and "Ertrag".
#### I also look up the next run of consecutive digits in the string, i.e. the page number of the following entry (which equals "end" + 1).
#### Once I know these two numbers, I am able to automatically extract the pages of interest.

In [9]:
# getting the content of the "Inhaltsverzeichnis" page (a plain string)
content_ihv = reader.pages[page_ihv].extract_text()


def _split_at_first_number(text):
    """Split `text` around its first run of consecutive digits.

    Returns a tuple `(before, digits, after)` where `digits` is the first
    maximal run of digit characters in `text`, `before` is everything in
    front of it and `after` is everything behind it.

    Raises:
        ValueError: if `text` contains no digit at all.
    """
    first = next((i for i, char in enumerate(text) if char.isdigit()), None)
    if first is None:
        raise ValueError("Parsing error: no page number found")

    # advance to the end of the digit run; bounded by len(text) so that a
    # number sitting at the very end of the string does not raise IndexError
    stop = first
    while stop < len(text) and text[stop].isdigit():
        stop += 1

    return text[:first], text[first:stop], text[stop:]


# now we need the relevant spots: keep everything after the first occurrence of "Aufwand"
# (maxsplit=1, otherwise a second "Aufwand" further down would cut the text short)
if "Aufwand" not in content_ihv:
    raise ValueError('Parsing error: "Aufwand" not found on the "Inhaltsverzeichnis" page')
content_ihv = content_ihv.split("Aufwand", 1)[1]


### The following code looks up the page numbers

# "shortened" is the text between "Aufwand" and the first number, "start" is that
# number (still a string here) and "remainder" is everything behind it.
# Slicing by position (instead of splitting on the digits) keeps the remainder
# intact even if the same digits show up again in a later page number.
shortened, start, remainder = _split_at_first_number(content_ihv)

# "shortened" must also contain the words "Ertrag" and "lle" (from "Alle" or "alle");
# this way we are sure to have picked the right entry and therefore the right page number.
# Both words are tested explicitly: `"Ertrag" and "lle" not in shortened` would only test "lle".
if "Ertrag" not in shortened or "lle" not in shortened:
    raise Exception("Parsing error")

# the next number in the remainder is the page number of the following entry
end = _split_at_first_number(remainder)[1]

# according to the "Inhaltsverzeichnis", its own page is number 1 (visible by having a look at the pdf).
# Let's see what our index of that page is
print(page_ihv) # 2 for us -> therefore we need to add that difference ("page_ihv" - 1) to both the "start" & "end" parameters

# convert the printed page numbers into indices of `reader.pages`;
# the following entry's page number - 1 is the last page of interest to us
start = int(start) + page_ihv - 1
end = (int(end) - 1) + page_ihv - 1

2


In [11]:
# Copies the pages start..end (both inclusive) into a new PDF and skips
# empty / semi-empty pages (fewer than MIN_TEXT_LENGTH extracted characters).

MIN_TEXT_LENGTH = 50  # pages whose extracted text is shorter than this are ignored

writer = PdfWriter()

# looping through the pages needed
for page_index in range(start, end + 1):

    content = reader.pages[page_index].extract_text()

    if len(content) < MIN_TEXT_LENGTH: # to ignore empty / semi-empty pages
        continue

    writer.add_page(reader.pages[page_index])

# export the right pages; the file is opened only once all pages are collected,
# and the `with` block closes it even if writing fails
with open('2022 SG reduced.pdf', 'wb') as export:
    writer.write(export)