In [1]:
import pandas as pd
import numpy as np
import camelot.io as camelot
import sys

# put the directory three levels up on sys.path so driveFolderHandler can be imported
sys.path.insert(0, '../../../')
from driveFolderHandler import downloadTempDriveFolder, deleteTempDriveFolder

In [2]:
# Mirror the shared Drive folder with the Delaware PDFs into a local scratch
# directory; `path` is reused below to locate the PDFs and to clean up at the end.
path = "../temp-docs"
downloadTempDriveFolder(folderID='1vWwEj2wILIswfYPVl0HVUOt-ZDngxNDa', path=path)

Retrieving folder list


Processing file 1CGRvnTlzmI2ufFM6G9MvjTeD0Sbr1lgq DE_IUP_2023.pdf
Processing file 1m0N8GFVNiyF9VRzg7YR7m0_fn4n4uDoP DE_PPL_2023.pdf
Building directory structure completed


Retrieving folder list completed
Building directory structure
Downloading...
From: https://drive.google.com/uc?id=1CGRvnTlzmI2ufFM6G9MvjTeD0Sbr1lgq
To: /Users/pcork/epic/dw-dashboard/year1/DE/temp-docs/DE_IUP_2023.pdf
100%|██████████| 1.30M/1.30M [00:00<00:00, 9.49MB/s]
Downloading...
From: https://drive.google.com/uc?id=1m0N8GFVNiyF9VRzg7YR7m0_fn4n4uDoP
To: /Users/pcork/epic/dw-dashboard/year1/DE/temp-docs/DE_PPL_2023.pdf
100%|██████████| 860k/860k [00:00<00:00, 3.02MB/s]

Files stored in ../temp-docs



Download completed


## Delaware

### Appendix A, Comprehensive PPL

In [3]:
# Pull the tables found on pages 25-29 of the IUP PDF with camelot's stream
# parser, then report how many tables came back.
iup_pdf = path + "/DE_IUP_2023.pdf"
de_ppl = camelot.read_pdf(iup_pdf, pages='25-29', flavor='stream', row_tol=1)
print(len(de_ppl))

Parsing Delaware IUP Appendix A
5


In [33]:
# Stack every table camelot extracted (five in the recorded run, one per parsed
# page) into a single frame with a fresh 0..n-1 index, then show the row count.
# Iterating over the table list instead of indexing 0..4 by hand means no table
# is silently dropped, and nothing breaks, if the number of tables changes.
de_ppl_output = pd.concat([table.df for table in de_ppl], ignore_index=True)
print(len(de_ppl_output))
# The count is far larger than the number of PPL entries because the parser
# splits each multi-line PDF row into several rows; they are recombined below.

192


In [112]:
# Manual step, performed outside this notebook:
#   1. Dump the raw camelot rows to a temporary CSV:
#      de_ppl_output.to_csv("../data/year1/csv/8-Delaware_temp_output.csv", index=False)
#   2. In Excel, add a "key" column and give every row fragment that belongs to
#      the same PDF row the same key value, so that grouping by "key" rebuilds
#      the rows as they appear in the PDF.
#
# Load the hand-keyed result: the same data as the temp output plus the extra
# 'key' column.
de_ppl_input = pd.read_csv("../data/year1/csv/8-Delaware_temp_input.csv")

In [113]:
# Replace NAs with empty strings first (needed for the string joins below),
# then keep only the rows that were given a key; rows with a blank key are dropped.
de_ppl_input = (
    de_ppl_input
    .fillna('')
    .query("key != ''")
    .copy()
)

In [114]:
# preview the first 20 keyed rows to eyeball the key assignment
de_ppl_input.head(20)

Unnamed: 0,key,0,1,2,3,4,5,6,7,8,9,10,11,12,13
3,1,,,,,,,,,,,,,,Anticipated to
4,1,,,,,Consolidation to,,,,,,,,,be Bypassed
5,1,,Countryside,,Upgrades and,Artesian - See Adam,,,,,,,State,,for State
6,1,1.0,Hamlet MHP,66.0,Interconnect,Gould emails,"$745,470.00",,800.0,State,,,Grant,"$745,470.00",Funds
7,2,,,,Water System,"New distribution, fire",,,,,,,,,Anticipated to
8,2,,,,Improvements and,"hydrants, water meters",,,,,,,,,be Bypassed
9,2,,Stage Village,,Consolidation with,and connections to,,,,,,,State,,for State
10,2,2.0,MHP,93.0,Delmar,Delmar,"$687,800.00",U,730.0,State,,,Grant,"$687,800.00",Funds
11,3,,,,,"System Upgrades, add",,,,,,,,,
12,3,,Willow Tree,,MHP Water System,storage Kitts,,,,Suppleme,,,Sup,,


In [115]:
# Map each positional column of the hand-keyed CSV to the PPL heading it holds.
ppl_column_names = {
    '0': 'Rank',
    '1': 'Water System/Borrower',
    '2': 'Population Served',
    '3': 'Comprehensive Project Name',
    '4': 'Project Description',
    '5': 'Amount',
    '6': 'DAC? - A, E, W, U',
    '7': 'Total Points',
    '8': 'Funding Appropriation',
    '9': 'Financing',
    '10': 'Terms',
    '11': 'Anticipated Subsidy',
    '12': 'Anticipated Subsidy Amount',
    '13': 'Notes',
}

# For every column, join the text fragments of all rows sharing a key with a
# single space and trim the outer whitespace. transform() returns one value per
# input row, so each group's combined text is repeated on all of its rows (the
# repeats are removed in the next step). Cells go through str() so that a cell
# read_csv typed as a number cannot make the join raise a TypeError.
# NOTE: this rebinds `de_ppl`, which previously held the camelot table list.
rows_by_key = de_ppl_input.groupby(['key'])
de_ppl = pd.DataFrame({
    ppl_name: rows_by_key[raw_name].transform(
        lambda parts: " ".join(map(str, parts)).strip()
    )
    for raw_name, ppl_name in ppl_column_names.items()
})

In [116]:
# Each source row carries its group's combined values, so collapse the repeated
# rows down to one per distinct combination and renumber from 0.
# NOTE(review): no row is removed by position here; the rows holding the PDF's
# column headings appear to be skipped by the gaps in the later iloc slices
# (positions 14 and 50) - confirm.
de_ppl = de_ppl.drop_duplicates().reset_index(drop=True)

In [117]:
# Pages were parsed differently depending on their contents, so the grouped
# frame is repaired slice by slice and stitched back together afterwards.

# Positions 0-13 came through cleanly; set them aside unchanged.
de_ppl_1 = de_ppl.iloc[:14].copy().reset_index(drop=True)

In [118]:
# Positions 15-30 (starting at 12, Greenwood): Project Description and Amount
# were merged into one cell, sometimes separated by "\n", which pushed every
# later value one column to the left. Undo that here.
de_ppl_2 = de_ppl.iloc[15:31].reset_index(drop=True).copy()

# Columns affected by the shift, in left-to-right order.
shift_cols_2 = ['Amount', 'DAC? - A, E, W, U', 'Total Points',
                'Funding Appropriation', 'Financing', 'Terms',
                'Anticipated Subsidy', 'Anticipated Subsidy Amount', 'Notes']

# Move each value one column to the right. Walk from the rightmost column
# leftwards so every source column is read before it is overwritten.
for target, source in zip(reversed(shift_cols_2[1:]), reversed(shift_cols_2[:-1])):
    de_ppl_2[target] = de_ppl_2[source]

# Split on the dollar sign rather than the line break, because the line break
# is not always present. The amount is the trailing piece, so split once from
# the right: a description that itself contains a "$" still yields exactly two
# columns instead of breaking the two-column assignment.
de_ppl_2[['Project Description', 'Amount']] = (
    de_ppl_2['Project Description'].str.rsplit('$', n=1, expand=True)
)

# put the "$" removed by the split back in front of the amount
de_ppl_2['Amount'] = "$" + de_ppl_2['Amount']

# drop the line break where one was present
de_ppl_2['Project Description'] = de_ppl_2['Project Description'].str.replace("\n", "", regex=False)

# (19, Newark) is the only row where project name and description were
# conflated, so it is patched by hand. Columns are located by name instead of
# by the positional indices 3 and 4.
name_pos = de_ppl_2.columns.get_loc('Comprehensive Project Name')
desc_pos = de_ppl_2.columns.get_loc('Project Description')
de_ppl_2.iloc[9, name_pos] = "SWFWTP PFAs Treatment - $4.4M EC"
de_ppl_2.iloc[9, desc_pos] = 'PFA WTP Upgrades'

In [119]:
# Positions 31-49 need no repair; carry them over exactly as they are.
de_ppl_3 = de_ppl.iloc[31:50]

In [120]:
# Positions 51-75: the DAC value was attached to the Amount cell (separated by
# "\n"; only a handful of rows actually have a DAC value), which pushed every
# later value one column to the left. Undo that here.
de_ppl_4 = de_ppl.iloc[51:76].reset_index(drop=True).copy()

# Columns affected by the shift, in left-to-right order.
shift_cols_4 = ['DAC? - A, E, W, U', 'Total Points', 'Funding Appropriation',
                'Financing', 'Terms', 'Anticipated Subsidy',
                'Anticipated Subsidy Amount', 'Notes']

# Move each value one column to the right. Walk from the rightmost column
# leftwards so every source column is read before it is overwritten.
for target, source in zip(reversed(shift_cols_4[1:]), reversed(shift_cols_4[:-1])):
    de_ppl_4[target] = de_ppl_4[source]

# Split the merged cell into Amount and DAC at the first line break only, so a
# stray extra line break cannot produce a third column.
de_ppl_4[['Amount', 'DAC? - A, E, W, U']] = (
    de_ppl_4['Amount'].str.split('\n', n=1, expand=True)
)

# Rows without a DAC value come back as missing; blank them out. Assign the
# result back instead of calling fillna(inplace=True) on the column selection:
# that chained in-place form warns in recent pandas and does not modify the
# frame once copy-on-write is enabled.
de_ppl_4['DAC? - A, E, W, U'] = de_ppl_4['DAC? - A, E, W, U'].fillna('')

In [121]:
# Stitch the four repaired slices back together under a fresh 0..n-1 index.
de_ppl_final = pd.concat([de_ppl_1, de_ppl_2, de_ppl_3, de_ppl_4], ignore_index=True)

# Words that were broken across PDF lines ended up with a stray space after
# the fragments were joined; repair the known cases.
broken_words = {
    "Suppleme ntal": "Supplemental",
    "Protectio n": "Protection",
}
for broken, repaired in broken_words.items():
    de_ppl_final['Funding Appropriation'] = (
        de_ppl_final['Funding Appropriation'].str.replace(broken, repaired, regex=False)
    )
In [122]:
# write the rebuilt PPL table to CSV, leaving out the index column
de_ppl_final.to_csv("../data/year1/csv/8-Delaware_PPL.csv", index=False)

In [4]:
# clean up: remove the temporary download folder and the PDFs inside it
deleteTempDriveFolder(path)

Deleted folder and all files within ../temp-docs
