# Data extraction for one city for one year

## From the PDF with the income statement data of the city of St. Gallen (year 2022) till the final dataframe and the excel file

#### Importing the needed libraries

In [1]:
from tabula import read_pdf
import pandas as pd
from PyPDF2 import PdfWriter, PdfReader


## 1. Importing the income statements from one year for one municipality

In [2]:
df = read_pdf("2022 SG.pdf", pages="all")

In [3]:
# This cell aims to show that df is a list of DFs: one DF per page extracted from the document

print(type(df)) # it is a list 
print(len(df)) # of 4 elements

print(df[0]) # the single DFs is what we are looking for
print(len(df[0])) # the lenght is also correct

<class 'list'>
4
   Erfolgsrechnung nach Kostenart Alle DirektionenRechnung Budget   
0                                           2021 2022              \
1                             667 662 996 676 632 100               
2                            669 234 686 -654 888 370               
3                              -1 571 690  21 743 730               
4                             264 728 060 273 568 210               
5                                 2 709 454 1 867 310               
6                             144 538 179 143 738 970               
7                               68 680 690 67 865 300               
8                                             889 730               
9                                   844 506 8 801 740               
10                              39 799 252 40 945 200               
11                                5 245 198 5 956 230               
12                                2 910 781 3 503 730               
13               

#### After understanding the shape of what we have imported, we can see that we have a list of dataframes (4 in our case, so one per page)

In [4]:
print(df[0].head())

  Erfolgsrechnung nach Kostenart Alle DirektionenRechnung Budget   
0                                          2021 2022              \
1                            667 662 996 676 632 100               
2                           669 234 686 -654 888 370               
3                             -1 571 690  21 743 730               
4                            264 728 060 273 568 210               

  Nachtragskredit  Unnamed: 0  Konto   
0            2022         NaN    NaN  \
1      69 858 232         NaN    NaN   
2             NaN         NaN    NaN   
3      69 858 232         NaN    NaN   
4       1 929 450         NaN   30.0   

                                      Unnamed: 1   Rechnung 2022  Unnamed: 2   
0                                            NaN   AufwandErtrag         NaN  \
1                                        Aufwand  806 710 659.40         NaN   
2                                         Ertrag  802 024 241.24         NaN   
3  Aufwandüberschuss (+) / Ert

In [5]:
print(df[0].columns)

Index(['Erfolgsrechnung nach Kostenart Alle DirektionenRechnung Budget',
       'Nachtragskredit', 'Unnamed: 0', 'Konto', 'Unnamed: 1', 'Rechnung 2022',
       'Unnamed: 2', 'Abweichung'],
      dtype='object')


#### We can see that Tabula is good but not perfect. It also read the title displayed outside the frame of the table. To make things easier, we cut the pdf using the PyPDF2 library 

## 2. PDF cropping

In [6]:
# instantiating a reader object and reading its first page

reader = PdfReader("2022 SG.pdf", "r")
page = reader.pages[1]

In [7]:
# this is used to understand how the file is read by the PdfReader

print(page.mediabox) 

# N.B. although the PDF is displayed in the landscape format in Adobe DC, it has a vertical shape according to the numbers

RectangleObject([0, 0, 595.276, 841.89])


In [8]:
# instantiating a writer object

writer = PdfWriter()

In [9]:
# this code cuts every page of the pdf and saves them alltogehter in the writer object

for page in range(len(reader.pages)):
    single_page = reader.pages[page]
    # single_page.mediabox.upper_right = 500, 700
    single_page.mediabox.upper_left = 0, 800
    single_page.mediabox.lower_left = 0, 25
    writer.add_page(single_page)

# given the vertical orientation, the lowerleft and upperleft commands need to be used
# This instead of upperright, what had led to an unexpected output initially

In [10]:
# exporting the cropped PDF

export = open('2022 SG cropped.pdf', 'wb')
writer.write(export)
export.close()

#### Seeing that some of the PDFs (when going through all the ones from SG) got wrongly cropped, we need to take something into account

## 3. Fixing cropping errors

In [11]:
# one of the PDFs wrongly cropped is the one from 2018

# reading it
reader_18 = PdfReader("2018 SG.pdf", "r")
page_18 = reader_18.pages[1]

# reading the base case report (2022 SG)
reader = PdfReader("2022 SG.pdf", "r")
page = reader.pages[1]

In [12]:
# how the one from 2018 is read         
print(page_18.mediabox, "\n") 

# vs how the 2022 was read
print(page.mediabox, "\n") 

RectangleObject([0, 0, 642.283, 888.898]) 

RectangleObject([0, 0, 595.276, 841.89]) 



#### The 2022 report was used as a base case to determine what parameters to use to successfully cut the PDF. That is how we need to adapt the "cutting" code

In [13]:
# coordinates of the base case (2022 report)
(LL_x_base, LL_y_base) = page.cropbox.lower_left
(UR_x_base, UR_y_base) = page.cropbox.upper_right

In [14]:
# coordinates of the specific year's report
(LL_x, LL_y) = page_18.cropbox.lower_left
(UR_x, UR_y) = page_18.cropbox.upper_right

In [15]:
# difference of the report's coordinates with the base case
D_LL_x = LL_x - LL_x_base
D_LL_y = LL_y - LL_y_base
D_UR_x = UR_x - UR_x_base
D_UR_y = UR_y - UR_y_base

In [16]:
# printing the difference
print(D_LL_x, D_LL_y, D_UR_x, D_UR_y)

23.505 23.505 23.502 23.503


In [17]:
# writer object
writer = PdfWriter()

In [18]:
# adjusted "cutting" code
for page in range(len(reader_18.pages)):
    single_page = reader_18.pages[page]
    single_page.mediabox.lower_left = (0 + D_LL_x, 25 + D_LL_y)
    single_page.mediabox.upper_left = (0 + D_LL_x, 800 + D_UR_y)
    writer.add_page(single_page)

In [19]:
# exporting the cropped PDF

export = open('2018 SG cropped.pdf', 'wb')
writer.write(export)
export.close()

#### It worked, the coordinates of the base case report will be hardcoded in the python script used for all PDFs. Now we are left with the problem of the 2021 report being cut too little.

In [20]:
# reading the sizes of the 2021 SG report

# reading it
reader_21 = PdfReader("2021 SG.pdf", "r")
page_21 = reader_21.pages[1]

# how the 2021 is read
print(page_21.mediabox) 


RectangleObject([23.5039, 23.504, 618.78, 865.394])


In [21]:
# rereading the base case report (2022 SG)
reader = PdfReader("2022 SG.pdf", "r")
page = reader.pages[1]

# coordinates of the base case (2022 report)
(LL_x_base, LL_y_base) = page.cropbox.lower_left
(UR_x_base, UR_y_base) = page.cropbox.upper_right

# coordinates of the specific year's report
(LL_x, LL_y) = page_21.cropbox.lower_left
(UR_x, UR_y) = page_21.cropbox.upper_right

# difference of the report's coordinates with the base case
D_LL_x_21 = LL_x - LL_x_base
D_LL_y_21 = LL_y - LL_y_base
D_UR_x_21 = UR_x - UR_x_base
D_UR_y_21 = UR_y - UR_y_base

In [22]:
# let's see the difference 
print(D_LL_x_21, D_LL_y_21, D_UR_x_21, D_UR_y_21)


# coordinates report
print("\nReport: ")
print(LL_x, LL_y)
print(UR_x, UR_y)

# coordinates base case (SG 2022)
print("\nBase case: ")
print(LL_x_base, LL_y_base)
print(UR_x_base, UR_y_base)

0 0 47.007 47.008

Report: 
0 0
642.283 888.898

Base case: 
0 0
595.276 841.89


In [23]:
# given that we have different size values (when reading the 2021 pdf report),
# we need to adjust it in a different way. We cannot simply make a transfer of coordinates as before
# Let's calculate the factor by which this should happen
# that is on the x axis (UR_x / UR_x_base) and y axis (UR_y / UR_y_base)

factor_x =  UR_x / UR_x_base
factor_y = UR_y / UR_y_base

In [24]:
# this is by how much that difference should change
print(round(factor_x, 2), round(factor_y, 2))

1.08 1.06


In [25]:
# writer object
writer = PdfWriter()

In [26]:
# adjusted "cutting" code
for page in range(len(reader_18.pages)):
    single_page = reader_18.pages[page]
    single_page.mediabox.upper_left = (0, 800 * factor_y)    
    single_page.mediabox.lower_left = (0, 25 * factor_y)
    writer.add_page(single_page)

In [27]:
# exporting the cropped PDF

export = open('2021 SG cropped.pdf', 'wb')
writer.write(export)
export.close()

In [28]:
# how the just cropped PDF is being read

reader_21 = PdfReader("2021 SG cropped.pdf", "r")
page_21 = reader_21.pages[1]

# how the 2021 cropped is read
print(page_21.mediabox) 

RectangleObject([0, 26.39590682868308211286510115, 642.283, 844.6690185178586276116832368])


In [29]:
reader_base = PdfReader("2022 SG.pdf", "r")
reader_18 = PdfReader("2018 SG.pdf", "r")
reader_21 = PdfReader("2021 SG.pdf", "r")

In [30]:
reader = reader_21

In [31]:
## in the end the solution is to have a code that depends on whether that difference is constant or not.
## If so, there is a traslation. If not the factor is calculated

# base case
(LL_x_base, LL_y_base) = (0, 0)
(UR_x_base, UR_y_base) = (595.276, 841.89)

# difference from the base case report St. Gallen (reduced 2022 Rechnung). This report was used to determine the cutting coordinates
(LL_x, LL_y) = reader.pages[1].cropbox.lower_left
(UR_x, UR_y) = reader.pages[1].cropbox.upper_right

print((LL_x, LL_y))
print((UR_x, UR_y))

D_LL_x = round(float(LL_x) - LL_x_base)
D_LL_y = round(float(LL_y) - LL_y_base)
D_UR_x = round(float(UR_x) - UR_x_base)
D_UR_y = round(float(UR_y) - UR_y_base)

print(D_LL_x, D_LL_y, D_UR_x, D_UR_y)

if D_LL_x == D_LL_y == D_UR_x == D_UR_y:
    # print("all is equal")

    formula_UL = (0 + D_LL_x, 800 + D_UR_y)
    formula_LL = (0 + D_LL_x, 25 + D_LL_y)

else:
    factor_x =  float(UR_x) / UR_x_base
    factor_y = float(UR_y) / UR_y_base
    print(factor_x, factor_y)

    formula_UL = (0, 800 * factor_y)
    formula_LL = (0, 25 * factor_y)

# cutting every page of the PDF and saving them alltogehter in the writer object

writer = PdfWriter()

for page in range(len(reader.pages)):
    single_page = reader.pages[page]
    single_page.mediabox.upper_left = formula_UL
    single_page.mediabox.lower_left = formula_LL
    writer.add_page(single_page)

export = open("2021 SG cropped.pdf", 'wb')
writer.write(export)
export.close() 


(0, 0)
(642.283, 888.898)
0 0 47 47
1.0789667313985447 1.0558362731473234


#### The formula works and will be implemented in the overall data extraction script like that. The only issue is that the 2021 report got wrongly cut. The reason is that the report has not the same width / breadth ratio. Therefore, I will have to cut it manually. At least, the formula, should work for all forms that share the same ratio

In [32]:
## adding a code that knows the base ratio and raises an error in case of a different one
## to signal that the report needs a manual cut

base_ratio = round(841.89 / 595.276, 2)

print(base_ratio)
print(round(UR_y - LL_y, 2))
print(round(UR_x - LL_x, 2))
print(round((UR_y - LL_y) / (UR_x - LL_x), 2))

# if float(round((UR_y - LL_y) / (UR_x - LL_x), 2)) != base_ratio: # this cell was commented out, 
#     raise Exception("Different ratio!")                          # to allow the workbook to run fully
 

1.41
888.90
642.28
1.38


In [33]:
## manual cropping of the 2021 report. Manual in the sense that different values are tried until it makes sense

writer = PdfWriter()

for page in range(len(reader_21.pages)):
    single_page = reader_21.pages[page]
    single_page.mediabox.lower_left = (0 , 48)
    single_page.mediabox.upper_left = (0 , 824.898)
    writer.add_page(single_page)

export = open("2021 SG cropped.pdf", 'wb')
writer.write(export)
export.close()

#### Now that the PDF got successfully cropped, we can reread it with Tabula and proceed

## 4. Data manipulation with Tabula

In [34]:
df_tabula = read_pdf("2022 SG cropped.pdf", pages="all")

#### Analyzing input type and content

In [35]:
# lets analyze two single dfs
print(df_tabula[0].head())
print(df_tabula[1].head())

# no matter which of the 4 DFs we take, we need to fix the first row, whose data also belongs to the columns
print(df_tabula[0].columns)

      Rechnung        Budget Nachtragskredit  Konto   
0         2021          2022            2022    NaN  \
1  667 662 996   676 632 100      69 858 232    NaN   
2  669 234 686  -654 888 370             NaN    NaN   
3   -1 571 690    21 743 730      69 858 232    NaN   
4  264 728 060   273 568 210       1 929 450   30.0   

                                      Unnamed: 0   Rechnung 2022  Unnamed: 1   
0                                            NaN   AufwandErtrag         NaN  \
1                                        Aufwand  806 710 659.40         NaN   
2                                         Ertrag  802 024 241.24         NaN   
3  Aufwandüberschuss (+) / Ertragsüberschuss (-)    4 686 418.16         NaN   
4                                Personalaufwand  261 488 439.52         NaN   

         Abweichung  
0  RG / BU inkl. NK  
1        60 220 327  
2      -147 135 871  
3       166 146 612  
4       -14 009 220  
    Rechnung     Budget Nachtragskredit  Konto   
0     

#### This code takes the column names, adds the part that got saved as first row and removes the first row. We also remove the one empty column and rename another 2 columns

In [36]:
first_df = df_tabula[0].copy() # copying the df of the first page to apply manipulation to it

# saving old column names
old_col_names = first_df.columns.to_list()


# list with new column names
new_col_names = []
for name in old_col_names:
    first_line = " " + str(first_df.loc[0, name])
    if "nan" in first_line: # not copying nan values
        first_line = ""
    elif "AufwandErtrag" in first_line: # renaming "AufwandErtrag" with "Aufwand / Ertrag"
        first_line = " Aufwand / Ertrag"
    new_col_names.append(name + first_line)


# overwriting the column names
first_df.columns = new_col_names 


first_df = first_df.iloc[1:, :]  # eliminating the first row
first_df.rename({'Unnamed: 0': 'Beschreibung'}, axis=1, inplace=True) # renaming column "Unnamed: 0" -> "Beschreibung"  
first_df.drop(columns=['Unnamed: 1'], inplace=True) # removing "Unnamed: 1"



In [37]:
first_df.head()

Unnamed: 0,Rechnung 2021,Budget 2022,Nachtragskredit 2022,Konto,Beschreibung,Rechnung 2022 Aufwand / Ertrag,Abweichung RG / BU inkl. NK
1,667 662 996,676 632 100,69 858 232,,Aufwand,806 710 659.40,60 220 327
2,669 234 686,-654 888 370,,,Ertrag,802 024 241.24,-147 135 871
3,-1 571 690,21 743 730,69 858 232,,Aufwandüberschuss (+) / Ertragsüberschuss (-),4 686 418.16,166 146 612
4,264 728 060,273 568 210,1 929 450,30.0,Personalaufwand,261 488 439.52,-14 009 220
5,2 709 454,1 867 310,,300.0,Behörden und Kommissionen,1 732 456.22,-134 854


#### Making the dataframe cleaning for all the dataframes extracted (one per page of the PDF document)

In [38]:
cleaned_dfs = []

for single_df in df_tabula:

    print(single_df.head())

    old_col_names = single_df.columns.to_list()
    new_col_names = []
    for name in old_col_names:
        first_line = " " + str(single_df.loc[0, name])
        if "nan" in first_line: # not copying nan values
            first_line = ""
        elif "AufwandErtrag" in first_line: # renaming "AufwandErtrag" with "Aufwand / Ertrag"
            first_line = " Aufwand / Ertrag"
        new_col_names.append(name + first_line)
    single_df.columns = new_col_names 
    single_df = single_df.iloc[1:, :]  # eliminating the first row
    single_df.rename({'Unnamed: 0': 'Beschreibung'}, axis=1, inplace=True)

    print(single_df.head())

    single_df.drop(columns=['Unnamed: 1'], inplace=True) # removing "Unnamed: 1"

    print(single_df.head())


    cleaned_dfs.append(single_df)

      Rechnung        Budget Nachtragskredit  Konto   
0         2021          2022            2022    NaN  \
1  667 662 996   676 632 100      69 858 232    NaN   
2  669 234 686  -654 888 370             NaN    NaN   
3   -1 571 690    21 743 730      69 858 232    NaN   
4  264 728 060   273 568 210       1 929 450   30.0   

                                      Unnamed: 0   Rechnung 2022  Unnamed: 1   
0                                            NaN   AufwandErtrag         NaN  \
1                                        Aufwand  806 710 659.40         NaN   
2                                         Ertrag  802 024 241.24         NaN   
3  Aufwandüberschuss (+) / Ertragsüberschuss (-)    4 686 418.16         NaN   
4                                Personalaufwand  261 488 439.52         NaN   

         Abweichung  
0  RG / BU inkl. NK  
1        60 220 327  
2      -147 135 871  
3       166 146 612  
4       -14 009 220  
  Rechnung 2021   Budget 2022 Nachtragskredit 2022  Kont

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  single_df.rename({'Unnamed: 0': 'Beschreibung'}, axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  single_df.drop(columns=['Unnamed: 1'], inplace=True) # removing "Unnamed: 1"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  single_df.rename({'Unnamed: 0': 'Beschreibung'}, axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_gu

In [39]:
cleaned_dfs

[   Rechnung 2021   Budget 2022 Nachtragskredit 2022  Konto   
 1    667 662 996   676 632 100           69 858 232    NaN  \
 2    669 234 686  -654 888 370                  NaN    NaN   
 3     -1 571 690    21 743 730           69 858 232    NaN   
 4    264 728 060   273 568 210            1 929 450   30.0   
 5      2 709 454     1 867 310                  NaN  300.0   
 6    144 538 179   143 738 970            1 666 500  301.0   
 7     68 680 690    67 865 300                  NaN  302.0   
 8            NaN       889 730              103 920  303.0   
 9        844 506     8 801 740               19 000  304.0   
 10    39 799 252    40 945 200               55 300  305.0   
 11     5 245 198     5 956 230                  NaN  306.0   
 12     2 910 781     3 503 730               84 730  309.0   
 13    82 390 279    91 265 930            8 274 285   31.0   
 14    10 462 247    12 651 940              843 250  310.0   
 15     5 369 902     7 331 850              688 045  3

#### combining all dataframes (one per page extracted) into a single df

In [40]:
# I am cominbing all dataframes (one per page extracted) into a single df

df = pd.concat(cleaned_dfs)

In [41]:
# veryfying that the output is correct

print(df)
print(df.info()) # as we can see, during the process the column "Nachtragskredit 2022.0" got added. 
                 # This could be do to tabula creating it by mistake during the reading process

                 # Also, the row numbers repeat themselves, so we have to reset the row indexes

   Rechnung 2021   Budget 2022 Nachtragskredit 2022  Konto   
1    667 662 996   676 632 100           69 858 232    NaN  \
2    669 234 686  -654 888 370                  NaN    NaN   
3     -1 571 690    21 743 730           69 858 232    NaN   
4    264 728 060   273 568 210            1 929 450   30.0   
5      2 709 454     1 867 310                  NaN  300.0   
..           ...           ...                  ...    ...   
12    -4 166 715    -4 058 500                  NaN  492.0   
13   -10 850 201   -11 999 310                  NaN  493.0   
14   -12 481 111   -10 440 050                  NaN  494.0   
15    -9 855 945   -24 058 510                  NaN  498.0   
16   -17 095 596           NaN                  NaN  499.0   

                                     Beschreibung   
1                                         Aufwand  \
2                                          Ertrag   
3   Aufwandüberschuss (+) / Ertragsüberschuss (-)   
4                                 Personala

In [42]:
df.drop(columns=['Nachtragskredit 2022.0'], inplace=True) # removing "Nachtragskredit 2022.0"
df.reset_index(inplace=True) # resetting indexes

In [43]:
# check

print(df)
print(df.info()) # we do not need the former index row either, so let's drop it

     index Rechnung 2021   Budget 2022 Nachtragskredit 2022  Konto   
0        1   667 662 996   676 632 100           69 858 232    NaN  \
1        2   669 234 686  -654 888 370                  NaN    NaN   
2        3    -1 571 690    21 743 730           69 858 232    NaN   
3        4   264 728 060   273 568 210            1 929 450   30.0   
4        5     2 709 454     1 867 310                  NaN  300.0   
..     ...           ...           ...                  ...    ...   
99      12    -4 166 715    -4 058 500                  NaN  492.0   
100     13   -10 850 201   -11 999 310                  NaN  493.0   
101     14   -12 481 111   -10 440 050                  NaN  494.0   
102     15    -9 855 945   -24 058 510                  NaN  498.0   
103     16   -17 095 596           NaN                  NaN  499.0   

                                      Beschreibung   
0                                          Aufwand  \
1                                           Ertrag 

In [44]:
df.drop(columns=['index'], inplace=True) # removing "index" column

In [45]:
# check

print(df)
print(df.info()) # brilliant

    Rechnung 2021   Budget 2022 Nachtragskredit 2022  Konto   
0     667 662 996   676 632 100           69 858 232    NaN  \
1     669 234 686  -654 888 370                  NaN    NaN   
2      -1 571 690    21 743 730           69 858 232    NaN   
3     264 728 060   273 568 210            1 929 450   30.0   
4       2 709 454     1 867 310                  NaN  300.0   
..            ...           ...                  ...    ...   
99     -4 166 715    -4 058 500                  NaN  492.0   
100   -10 850 201   -11 999 310                  NaN  493.0   
101   -12 481 111   -10 440 050                  NaN  494.0   
102    -9 855 945   -24 058 510                  NaN  498.0   
103   -17 095 596           NaN                  NaN  499.0   

                                      Beschreibung   
0                                          Aufwand  \
1                                           Ertrag   
2    Aufwandüberschuss (+) / Ertragsüberschuss (-)   
3                          

#### Now that everything works, I am saving the merged df with financial information from the year 2022 from the City of St. Gallen as an Excel file and manually check that all revelvant information got read. That is the case

In [46]:
df.to_excel("Combined 2022 SG.xlsx")

## 5. Fixing Pandas errors

#### When running the Pandas code created before to manipulate the single DF on multiple reports, we get an error. It says "['Unnamed: 1'] not found in axis". This is an undesired column that it apparently is not found. 

#### We also get pointed to a documentation link (https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy).

#### This seems to be related to some warnings we got when creating the code before

In [47]:
# code we used
cleaned_dfs = []

for single_df in df_tabula:

    print(single_df.head())

    old_col_names = single_df.columns.to_list()
    new_col_names = []
    for name in old_col_names:
        first_line = " " + str(single_df.loc[0, name])
        if "nan" in first_line: # not copying nan values
            first_line = ""
        elif "AufwandErtrag" in first_line: # renaming "AufwandErtrag" with "Aufwand / Ertrag"
            first_line = " Aufwand / Ertrag"
        new_col_names.append(name + first_line)
    single_df.columns = new_col_names 
    single_df = single_df.iloc[1:, :]  # eliminating the first row
    # single_df.rename({'Unnamed: 0': 'Beschreibung'}, axis=1, inplace=True)
    single_df = single_df.rename({'Unnamed: 0': 'Beschreibung'}, axis=1)

    print(single_df.head())

    # single_df.drop(columns=['Unnamed: 1'], inplace=True) # removing "Unnamed: 1"
    single_df = single_df.drop(columns=['Unnamed: 1']) 

    print(single_df.head())


    cleaned_dfs.append(single_df)

  Rechnung 2021   Budget 2022 Nachtragskredit 2022  Konto   
0          2021          2022                 2022    NaN  \
1   667 662 996   676 632 100           69 858 232    NaN   
2   669 234 686  -654 888 370                  NaN    NaN   
3    -1 571 690    21 743 730           69 858 232    NaN   
4   264 728 060   273 568 210            1 929 450   30.0   

                                      Unnamed: 0   
0                                            NaN  \
1                                        Aufwand   
2                                         Ertrag   
3  Aufwandüberschuss (+) / Ertragsüberschuss (-)   
4                                Personalaufwand   

  Rechnung 2022 Aufwand / Ertrag  Unnamed: 1 Abweichung RG / BU inkl. NK  
0                  AufwandErtrag         NaN            RG / BU inkl. NK  
1                 806 710 659.40         NaN                  60 220 327  
2                 802 024 241.24         NaN                -147 135 871  
3                   

#### Browsing a bit around, we get recommended not to use the replace parameter (https://stackoverflow.com/questions/33727667/pandas-settingwithcopywarning-a-value-is-trying-to-be-set-on-a-copy-of-a-slice). This indeed helps getting rid of the warnings

#### The warnings have disappeared, but the error is still there in the main python script. That means, it was not related to the warnings

In [48]:
# rerunning the code with "report 2008", This is the first one processed an throwing an error

df_tabula_08 = read_pdf("cut reduced 2008 Rechnung.pdf", pages="all")


cleaned_dfs = []

for single_df in df_tabula_08:

    print(single_df.head())
    print(type(single_df.head()))

    old_col_names = single_df.columns.to_list()
    new_col_names = []
    for name in old_col_names:
        first_line = " " + str(single_df.loc[0, name])
        if "nan" in first_line: # not copying nan values
            first_line = ""
        elif "AufwandErtrag" in first_line: # renaming "AufwandErtrag" with "Aufwand / Ertrag"
            first_line = " Aufwand / Ertrag"
        new_col_names.append(name + first_line)
    single_df.columns = new_col_names 
    single_df = single_df.iloc[1:, :]  # eliminating the first row
    single_df = single_df.rename({'Unnamed: 0': 'Beschreibung'}, axis=1)

    print(single_df.head())

    # single_df = single_df.drop(columns=['Unnamed: 1']) ## commented out to allow for the notebook to run as a whole
    #                                                    ## to see the error, you need to uncomment it

    print(single_df.head())


    cleaned_dfs.append(single_df)

                                      Rechnung\r2007  Voranschlag\r2008   
0  609 422 232.71\r\r\r\r222 319 590.30\r\r\r2 00...                NaN  \
1  117 742 875.05\r\r\r63 572 285.75\r\r\r10 707 ...                NaN   

   Konto  Rechnung\r2008  Abweichung von\rBudget u. Nachtr.  
0    NaN             NaN                                NaN  
1    NaN             NaN                                NaN  
<class 'pandas.core.frame.DataFrame'>
  Rechnung\r2007 609 422 232.71\r\r\r\r222 319 590.30\r\r\r2 002 011.35   
1  117 742 875.05\r\r\r63 572 285.75\r\r\r10 707 ...                     \

   Voranschlag\r2008  Konto  Rechnung\r2008  Abweichung von\rBudget u. Nachtr.  
1                NaN    NaN             NaN                                NaN  
  Rechnung\r2007 609 422 232.71\r\r\r\r222 319 590.30\r\r\r2 002 011.35   
1  117 742 875.05\r\r\r63 572 285.75\r\r\r10 707 ...                     \

   Voranschlag\r2008  Konto  Rechnung\r2008  Abweichung von\rBudget u. Nachtr.  
1    

In [49]:
df_tabula_08[0].head()

Unnamed: 0,Rechnung\r2007 609 422 232.71\r\r\r\r222 319 590.30\r\r\r2 002 011.35,Voranschlag\r2008,Konto,Rechnung\r2008,Abweichung von\rBudget u. Nachtr.
0,609 422 232.71\r\r\r\r222 319 590.30\r\r\r2 00...,,,,
1,117 742 875.05\r\r\r63 572 285.75\r\r\r10 707 ...,,,,


#### As can be seen, the issue is due to Tabula. The PDFs somehow, do not get read properly. Let's see if it can be fixed. Otherwise this might mean excluding a big chunk of PDFs, which would not allow a proper data analysis and comparison between different years.

#### The reader.ipynb in the "tabula reading documents" folder is used to get a feeling for how many reports are affected by wrong scanning. Only the datraframes from 2014, 2016 and 2022 seem to have a colum "Unnamed: 1". This is not even a sufficient condition to tell that these three dataframes are read correctly.

#### The next step is to try out another solution

In [50]:
# trying out with PyPDF2, that was already imported (added the PdfReader to the import statement) and with the report SG 2008


## reader object
reader = PdfReader("cut reduced 2008 Rechnung.pdf")  # cut reduced 2008 Rechnung.pdf

## finding the page with the "Inhaltsverzeichnis" (table of contents)
number_of_pages = len(reader.pages)
page_ihv = 0

for page_number in range(number_of_pages):

    page = reader.pages[page_number]

    # print(page.extract_text())
    print(repr(page.extract_text()))



'Rechnung\n2007Voranschlag\n2008Konto Rechnung\n2008Abweichung von \nBudget u. Nachtr.\n9   589 392 548.78  575 287 100  3 A u f w a n d   609 422 232.71   504 933+\n    30 385 600 P    \n    3 244 600 S    \n        \n   212 618 936.90  224 104 700  30 Personalaufwand   222 319 590.30  4 543 010–\n    1 386 000 S    \n    1 371 900 P    \n   2 024 242.15  2 102 200  300 Behörden und Kommissionen  2 002 011.35   105 689–\n     5 500 S    \n   111 351 725.65  118 530 700  301 Löhne  117 742 875.05  2 491 625–    1 019 600 S    \n     684 200 P    \n   64 014 100.65  63 941 800  302 Löhne der Lehrkräfte  63 572 285.75  1 140 714–     687 700 P    \n     83 500 S    \n   10 427 703.85  10 936 400  303 Sozialversicherungsbeiträge  10 707 193.45   229 207–   14 987 267.45  17 467 400  304 Versicherungs- / Sparkassenbeiträge  18 258 719.75   791 320+   1 058 047.10  1 107 800  305 Unfallversicherungsbeiträge  1 071 308.35   36 492–    709 980.05   794 300  306 Dienstkleider und Verpflegungsz

In [51]:
df_tabula = read_pdf("cut reduced 2016 Rechnung.pdf", pages="all")

In [52]:
df_tabula[1]

Unnamed: 0,Rechnung\r2015,Voranschlag\r2016,Konto,Rechnung\r2016,Abweichung von\rBudget u. Nachtr.
0,40 169 875.24\r3 074 341.83\r37 337 464.16\r24...,,,,
1,8 655 864.97\r144 852 556.39\r\r\r\r35 009 206...,,,,
