# Reading Bern reports singularily

## 1. Some reports are not read by PyPDF2 reader

#### Let's try to read them singularily. These are 2016, 2018, 2019, 2021

In [1]:
from tabula import read_pdf
import pandas as pd
from PyPDF2 import PdfReader, PdfWriter
import numpy as np


In [2]:
reader = PdfReader("not_read\\2019.pdf") # error fixed by installing "pip install pycryptodome"

ignore '/Perms' verify failed


#### I was pointed to this command through this source: https://stackoverflow.com/questions/73701005/pypdf2-error-pycryptodome-is-required-for-aes-algorithm

In [3]:
number_of_pages = len(reader.pages)
page_ihv = 0

print(number_of_pages)

for page_number in range(2): # number_of_pages
    page = reader.pages[page_number]
    text = page.extract_text()
    print(text)


311
Stadt Bern
19Jahresbericht 2019
Jahresrechnung
Band 1
Stadt Bern
Erlacherhof
Junkerngasse 47Postfach3000 Bern 8
T
 031 321 62 1
0
E
 
stadtkanzlei@bern.ch
www.bern.ch


In [4]:
reader.pages[0].extract_text()

'Stadt Bern\n19Jahresbericht 2019\nJahresrechnung\nBand 1'

#### File can be read, let's jump back to bern_page_selector.py

## 2. The following code is needed to determine at which coordinates the reports from Bern need to be cut (2005 Bern is used as a base case)

In [5]:
reader = PdfReader("reduced reports\\reduced 2005.pdf", "r")
page = reader.pages[1]

In [6]:
print(page.mediabox) 

RectangleObject([0, 0, 595.22, 842])


In [7]:
writer = PdfWriter()

In [8]:
for page in range(len(reader.pages)):
    single_page = reader.pages[page]
    single_page.mediabox.lower_left = 30, 46
    single_page.mediabox.upper_right = 567, 775
    writer.add_page(single_page)

In [9]:
export = open('2005 Bern cropped.pdf', 'wb')
writer.write(export)
export.close()

#### The base case coordinates are the following. These values can now be included (hardcoded) in the bern_data_extraction.py file

In [10]:
(LL_x_base, LL_y_base) = (0, 0)
(UR_x_base, UR_y_base) = (595.22, 842)

## 3. Now, we read the report with Tabula

In [11]:
df_tabula = read_pdf("cut reports\\cut reduced 2005.pdf", pages="all")

In [12]:
# lets analyze two single dfs
print(df_tabula[0].head())
print(df_tabula[1].head())
print(df_tabula[2].head())

# no matter which of the 3 DFs we take, we need to fix the headings
print(df_tabula[0].columns)

                             Nettoergebnis            0.00          0.00.1   
0                            Total Aufwand  881'200'987.48  896'918'778.28  \
1                       30 Personalaufwand  287'409'820.07  289'731'702.86   
2   300 Personalaufw.Behörden/Kommissionen    2'151'979.60    2'220'426.54   
3  301 Löhne Verwaltungs-/Betriebspersonal  233'559'984.28  236'494'605.65   
4                 302 Löhne der Lehrkräfte       38'415.85        5'000.00   

           0.00.2  
0  868'458'439.71  
1  276'038'513.98  
2    2'930'023.80  
3  224'251'160.52  
4       26'819.35  
                  37 Durchlaufende Beiträge  5'286'123.05  6'649'800.00   
0               371 Durchl. Beiträge Kanton  3'639'700.00  5'000'000.00  \
1  372 Durchl. Beiträge Gemeinden/-verbände     73'130.00    105'000.00   
2        375 Durchl. Beiträge private Inst.    960'000.00    952'800.00   
3    376 Durchl. Beiträge private Haushalte    613'293.05    592'000.00   
4      38 Einlagen in Spezialfinanzi

#### The first row of every PDFs is taken as heading data. It needs to go down and become the row.

#### The heading is always "Type", "Rechnung {year of report}", "Budget {year of report}", "Rechnung {year of report - 1}"

In [13]:
df_1 = df_tabula[0].copy()

In [14]:
# Copying the header to the first row
df_1.iloc[0] = df_1.columns

In [15]:
year = int("2005")  # variable of the year, now it is hardcoded but will be extracted from the file name (in the python bern_data_extraction script)

In [16]:
columns = [f"Type", f"Rechnung {year}", f"Budget {year}", f"Rechnung {year - 1}"]   # new columns

In [17]:
# setting the first row

df_1.columns = columns

#### Now that it worked for one page, let's do it for all of them

In [18]:
for single_df in df_tabula:
    single_df.iloc[0] = single_df.columns
    single_df.columns = columns

In [19]:
# concatenating the DFs

df = pd.concat(df_tabula)

In [20]:
df.reset_index(drop=True)

Unnamed: 0,Type,Rechnung 2005,Budget 2005,Rechnung 2004
0,Nettoergebnis,0.00,0.00.1,0.00.2
1,30 Personalaufwand,287'409'820.07,289'731'702.86,276'038'513.98
2,300 Personalaufw.Behörden/Kommissionen,2'151'979.60,2'220'426.54,2'930'023.80
3,301 Löhne Verwaltungs-/Betriebspersonal,233'559'984.28,236'494'605.65,224'251'160.52
4,302 Löhne der Lehrkräfte,38'415.85,5'000.00,26'819.35
...,...,...,...,...
97,480 Entnahmen aus Spezialfinanzierungen,-5'235'151.80,-324'645.28,-915'201.00
98,49 Interne Verrechnungen,-45'424'876.78,-44'176'538.96,-40'839'106.34
99,490 Intern verrechneter Aufwand,-31'658'822.28,-30'094'572.96,-27'052'171.14
100,491 Intern verrechnete Passivzinsen,-9'966'889.52,-10'291'653.00,-10'158'499.20


In [21]:
df

Unnamed: 0,Type,Rechnung 2005,Budget 2005,Rechnung 2004
0,Nettoergebnis,0.00,0.00.1,0.00.2
1,30 Personalaufwand,287'409'820.07,289'731'702.86,276'038'513.98
2,300 Personalaufw.Behörden/Kommissionen,2'151'979.60,2'220'426.54,2'930'023.80
3,301 Löhne Verwaltungs-/Betriebspersonal,233'559'984.28,236'494'605.65,224'251'160.52
4,302 Löhne der Lehrkräfte,38'415.85,5'000.00,26'819.35
...,...,...,...,...
14,480 Entnahmen aus Spezialfinanzierungen,-5'235'151.80,-324'645.28,-915'201.00
15,49 Interne Verrechnungen,-45'424'876.78,-44'176'538.96,-40'839'106.34
16,490 Intern verrechneter Aufwand,-31'658'822.28,-30'094'572.96,-27'052'171.14
17,491 Intern verrechnete Passivzinsen,-9'966'889.52,-10'291'653.00,-10'158'499.20


#### Looks good. Now it can be added in a compact form to bern_data_extraction.py