In [1]:
## Import necessary packages
import pandas as pd
import numpy as np
import tkinter as tk
import PyPDF2 
import tabula
import tkinter
import camelot
import ghostscript
import re
import os
import csv
import urllib
import io
import warnings
warnings.filterwarnings("ignore")

# New York

### Table 1
Although the format of each page is largely the same, each page does not parse evenly due to different notes or text widths on each page.

There are, however, several commonalities that allow a handful of pages to share one parsing routine. Each shared format is named after the first page in the document on which it appears.

Others are treated as one-off data cleaning processes.

In [9]:
# Extract the tables on PDF pages 38-70 of the final 2023 Intended Use Plan
# using camelot's whitespace-based ("stream") parser.
ny_iup_2023_url = "https://www.health.ny.gov/environmental/water/drinking/iup/2023/docs/final_2023_intended_use_plan.pdf"
ny_pdfs = camelot.read_pdf(
    ny_iup_2023_url,
    pages='38-70',
    flavor='stream',
    row_tol=30,
    split_text=True,
    flag_size=True,
)

In [10]:
# Canonical column names, in output order, for the whole document. Every page
# parser selects these at the end so the per-page frames line up when combined.
final_cols = [
    'Project.Number',
    'County',
    'System.Name',
    'Borrower',
    'Description',
    'Pop',
    'Project.Cost',
    'Score',
    'Cumulative.Total',
]

In [11]:
# accumulator: each page parser appends its cleaned DataFrame here
final_list_of_dfs = list()

In [12]:
def reduce_noise(df):
    '''Remove fragments of footnote/notice text that end up inside table cells.

    The notices printed around the tables get chopped into pieces by the PDF
    parser and land in arbitrary cells. Each known fragment is removed from
    every cell of the frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Parsed table whose cells may contain notice fragments.

    Returns
    -------
    pandas.DataFrame
        A new frame with the fragments removed; ``df`` itself is not modified.
    '''

    # Every entry is applied as a regular expression (regex=True), in this order.
    # Non-adjacent repeats are kept deliberately: removing one fragment can expose
    # text that an earlier pattern would only match on a second pass.
    # Regex metacharacters are escaped with "\\" so the literals contain no
    # invalid escape sequences.
    noise_patterns = (
        "\nNOTE:  Short - Term Loan Closed, Project Cost Shown \n",
        "is Remaining Balance Not Funded With Short Term Loan",
        "\nvaluation",
        "\nEligibility Line \\*\\*",
        "\nNOTE: Fin\nRemainin",
        "\net Rate Prog\nh Market Rat",
        "\nram-Cost Shown is \ne Financing",
        "\nanced through SRF Mark\ng Balance Not Funded Wit",
        "\nNOTE: F\nRemaini",
        "\nt Rate Prog\n Market Ra",
        "\nram-Cost Shown is \nte Financing",
        "\ninanced through SRF Marke\nng Balance Not Funded With",
        "\nNOTE: Fi\nRemainin",
        "\nNOTE: F\nRemainin",
        "\nt Rate Pr\n Market R",
        "\nogram-Cost Shown is \nate Financing",
        "\nnanced through SRF Marke\ng Balance Not Funded With",
        "\ninanced through SRF Marke\ng Balance Not Funded With",
        "\nNOTE:  Short - Term Loan Closed, Project Cost Shown \n",
        "is Remaining Balance Not Funded With Short Term Loan",
        "\nNOT\nis Re",
        "\nst Shown \n Term Loan",
        "\nd, Project Co\nd With Short",
        "\net Rate Prog\nh Market Rat",
        # NOTE(review): this entry starts with a literal "n", not "\n" -- possibly a
        # typo for the "\nram-Cost ..." fragment that is applied further down.
        "nram-Cost Shown is \ne Financing",
        "\nE:  Short - Term Loan Close\nmaining Balance Not Funde",
        "\nNOTE: F\nRemaini",
        "\ninanced through SRF Mark\nng Balance Not Funded Wit",
        "\nram-Cost Shown is \ne Financing",
        # the complete, unsplit notice lines
        "NOTE: Financed through SRF Market Rate Program-Cost Shown is",
        "Remaining Balance Not Funded With Market Rate Financing",
    )

    for pattern in noise_patterns:
        df = df.replace(pattern, "", regex=True)

    return df

#### New York: "Page 0" Format
This format works for pages 0, 1, and 3

In [13]:
def page_zero_format(i:int):
    '''Parse table ``i`` of ``ny_pdfs`` using the "page 0" layout.

    In this layout column 4 holds Pop, Score, Cumulative.Total and Project.Cost
    separated by line breaks, and column 2 holds "System.Name / Borrower".
    The cleaned frame, reordered to ``final_cols``, is appended to
    ``final_list_of_dfs``; nothing is returned.

    Parameters
    ----------
    i : int
        Index of the table within ``ny_pdfs``.
    '''

    # remove note noise across columns
    ny0 = reduce_noise(ny_pdfs[i].df)

    # drop the header row and renumber the remaining rows from 0
    ny0 = ny0.iloc[1:].copy().reset_index(drop=True)

    # split combined columns (this is the order the values appear in the cell)
    ny0[["Pop", "Score", "Cumulative.Total", "Project.Cost"]] = ny0[4].str.split("\n", expand=True)

    # split system and borrower based on the " / " between them
    ny0[2] = ny0[2].str.replace("\n", "", regex=False)
    ny0[["System.Name", "Borrower"]] = ny0[2].str.split(" / ", expand=True)

    # drop the expanded source columns and name the remaining original ones
    ny0 = ny0.drop(columns=[2, 4]).rename(
        columns={0: 'Project.Number', 1: 'County', 3: 'Description'})

    final_list_of_dfs.append(ny0[final_cols])


#### New York: "Page 5" Format
This block of code works on almost all rows in pages 5, 7, 9, 11-13, 15-19, and 21-32.

One exception is that some rows will not correctly split the System and Borrower column, leaving both values in the System column and a None object in the Borrower column. This will be resolved once aggregated.

In [14]:

def page_five_format(i:int):
    '''Parse table ``i`` of ``ny_pdfs`` using the "page 5" layout.

    In this layout Pop, Project.Cost, Score and Cumulative.Total already sit in
    their own columns (4-7); only column 2 ("System.Name / Borrower") needs to
    be split. The cleaned frame, reordered to ``final_cols``, is appended to
    ``final_list_of_dfs``; nothing is returned.

    Parameters
    ----------
    i : int
        Index of the table within ``ny_pdfs``.
    '''

    # drop the header row and renumber the remaining rows from 0
    ny5 = ny_pdfs[i].df.iloc[1:].copy().reset_index(drop=True)

    # replace note noise split across columns
    ny5 = reduce_noise(ny5)

    # rows without a " / " separator keep everything in System.Name and get a
    # missing Borrower
    ny5[2] = ny5[2].str.replace("\n", "", regex=False)
    ny5[["System.Name", "Borrower"]] = ny5[2].str.split(" / ", expand=True)

    ny5 = ny5.drop(columns=[2]).rename(
        columns={0: 'Project.Number', 1: 'County', 3: 'Description', 4: 'Pop',
                 5: 'Project.Cost', 6: 'Score', 7: "Cumulative.Total"})

    # remove the extra noise that isn't caught by previous filters
    # NOTE(review): this strips every literal "n" from Pop -- presumably leftovers
    # of broken-up notice text; confirm no legitimate value contains an "n".
    ny5['Pop'] = ny5['Pop'].str.replace("n", "", regex=True)
    ny5 = ny5.replace("\n", "", regex=True)

    # notice fragments that only become contiguous once the line breaks are gone
    leftover_fragments = (
        "anced through SRF Marke Balance Not Funded With",
        "anced through SRF Markeg Balance Not Funded With",
        "anced through SRF MarkeBalance Not Funded With",
    )
    for fragment in leftover_fragments:
        ny5['Project.Cost'] = ny5['Project.Cost'].str.replace(fragment, "", regex=True)

    final_list_of_dfs.append(ny5[final_cols])

### New York: One-Off Formats

#### Page 2

In [15]:
def page_two_format(i):
    '''Parse table ``i`` of ``ny_pdfs`` using the one-off layout of page 2.

    Three raw columns each hold several fields and are split apart; the result,
    reordered to ``final_cols``, is appended to ``final_list_of_dfs``.
    '''

    # skip the header row, then remove note noise across columns
    table = reduce_noise(ny_pdfs[i].df.iloc[1:].copy())

    # (raw column, separator, resulting fields in the order they appear)
    split_plan = [
        (0, "\n", ['Project.Number', 'County']),
        (1, "/", ['System.Name', 'Borrower']),
        (3, "\n", ['Pop', 'Score', 'Cumulative.Total', 'Project.Cost']),
    ]
    for raw_col, separator, fields in split_plan:
        table[fields] = table[raw_col].str.split(separator, expand=True)

    # the raw columns are no longer needed; column 2 is the description
    table = table.drop(columns=[0, 1, 3]).rename(columns={2: 'Description'})

    # remove linebreaks
    table = table.replace("\n", "", regex=True)

    final_list_of_dfs.append(table[final_cols])

#### Page 4

In [16]:
def page_four_format(i):
    '''Parse table ``i`` of ``ny_pdfs`` using the one-off layout of page 4.

    Same column layout as the "page 5" format, plus one hand-corrected
    population value. The result, reordered to ``final_cols``, is appended to
    ``final_list_of_dfs``.
    '''

    # skip the header row and renumber rows from 0
    table = ny_pdfs[i].df.iloc[1:].copy().reset_index(drop=True)

    # manually fix truncated pop value (row 4, raw column 4)
    table.iat[4, 4] = '6,552,718'

    # split System.Name and Borrower
    table[['System.Name', 'Borrower']] = table[2].str.split(" /", expand=True)

    # remove note noise across columns
    table = reduce_noise(table)

    column_names = {0: 'Project.Number', 1: 'County', 3: 'Description', 4: 'Pop',
                    5: 'Project.Cost', 6: 'Score', 7: 'Cumulative.Total'}
    table = table.drop(columns=[2]).rename(columns=column_names)

    # remove linebreaks
    table = table.replace("\n", "", regex=True)

    final_list_of_dfs.append(table[final_cols])

#### Page 6

In [17]:
def page_six_format(i):
    '''Parse table ``i`` of ``ny_pdfs`` using the one-off layout of page 6.

    On this page the population is embedded in the description text, so the
    populations are supplied by hand and then removed from each description.
    The result, reordered to ``final_cols``, is appended to
    ``final_list_of_dfs``; nothing is returned.

    Parameters
    ----------
    i : int
        Index of the table within ``ny_pdfs``.
    '''

    ny6 = reduce_noise(ny_pdfs[i].df)

    # skip the header row and renumber rows from 0
    ny6 = ny6.iloc[1:].copy().reset_index(drop=True)

    # split System.Name and Borrower
    ny6[['System.Name', 'Borrower']] = ny6[2].str.split(" /", expand=True)

    # manually add population data (one value per data row, in row order)
    ny6['Pop'] = ['1,500', '15,779', '7,606', '1,040', '6,552,718', '150',
                  '450', '1,907', '5,500', '4,500', '400', '350', '25,000',
                  '20,600', '12,000', '4,000', '9,889']

    # for each row, remove the population stuck in the middle of the description
    # (raw column 3); the column is addressed by name rather than by position
    ny6[3] = [description.replace(population, "")
              for description, population in zip(ny6[3], ny6['Pop'])]

    ny6 = ny6.rename(columns={0: 'Project.Number', 1: 'County', 3: 'Description',
                              4: 'Project.Cost', 5: 'Score', 6: 'Cumulative.Total'})

    ny6 = ny6.replace("\n", "", regex=True)

    final_list_of_dfs.append(ny6[final_cols])

#### Page 8

In [18]:
def page_eight_format(i):
    '''Parse table ``i`` of ``ny_pdfs`` using the one-off layout of page 8.

    System name, borrower and description are supplied by hand for this page;
    the remaining fields come from the parsed table. The result, reordered to
    ``final_cols``, is appended to ``final_list_of_dfs``.
    '''
    table = reduce_noise(ny_pdfs[i].df)
    table = table.iloc[1:].copy().reset_index(drop=True)

    # manually reconstructed values, one entry per data row in row order
    manual_columns = {
        'System.Name': [
            'AUBURN', 'ALEXANDRIA VILLAGE', 'NORTH GRANVILLE WD #1', 'ELMIRA WATER BOARD',
            'NCCRWP', 'HORSEHEADS VILLAGE', 'WD #3', 'MALTAVILLE WATER ', 'MASSENA (T)',
            'COOK STREET WD', 'ASHLAND WD', 'PHILADELPHIA (T)', 'KENSICO WATER DISTRICT',
            'ALBANY CITY', 'WD2EXT1NORTH&WTP RELOCATION', 'SALAMANCA CITY', 'BAINBRIDGE VILLAGE',
        ],
        'Borrower': [
            'Auburn (C)', 'Alexandria Bay (V)', 'Granville (T)', 'Elmira Water Board', 'Chautauqua County',
            'Horseheads (T)', 'Campbell (T)', 'Malta (T)', 'Massena (T)', 'Norfolk (T)', 'Ashland (T)',
            'Philadelphia (T)', 'Mount Pleasant (T)', 'Albany MWFA', 'Chautauqua (T)', 'Salamanca (C)', 'Bainbridge (V)',
        ],
        'Description': [
            'Comprehensive upgrades to City of Auburn Water Filtration Plant, Upgrade Water Treatment Plant',
            'WTP upgrades and water main replacement., Upgrade Water Treatment Plant, Upgrade Distribution System',
            'Source development and disribution improvements for the North Granville WD., Upgrade Ground Water Source, Upgrade Distribution System',
            'Elmira Refresh Initiative., Upgrade Transmission, Upgrade Distribution System',
            'North Chautauqua County Regional Water Project., New Water District',
            'Oak Hill Water District Extension, New Distribution System',
            'Serve private contaminated/poor yielding wells., Extend Water District',
            'Maltaville WD., New Water District',
            'Proposed Service Area No. 1, New Water District',
            'Extend Distribution System',
            'Ashland WD Extension Along NYS Rt. 23., Extend Distribution System',
            'New Proposed Water System, New Distribution System',
            'Kensico WD Storage Tank and Treatment Improvements, Upgrade Storage, Upgrade Distribution System',
            'Feura Bush WTP Upgrades., Upgrade Water Treatment Plant, Upgrade Storage',
            'New Surface Water Treatment Rule Compliance Facility, Extend Transmission, Extend Distribution System',
            'Abandon Existing WTP, New Well Source, New Pump Building., Upgrade Water Treatment Plant',
            'Water System Improvements - WM Replacement, New Well, Booster PS Improvements, New Ground Water Source, Upgrade Pump Station, Upgrade Distribution System',
        ],
    }
    table = table.assign(**manual_columns)

    parsed_names = {0: 'Project.Number', 1: 'County', 3: 'Pop', 4: 'Project.Cost',
                    5: 'Score', 6: 'Cumulative.Total'}
    table = table.rename(columns=parsed_names)

    # strip line breaks everywhere, then every literal "n" from the cost column
    table = table.replace("\n", "", regex=True)
    table['Project.Cost'] = table['Project.Cost'].str.replace("n", "", regex=True)

    final_list_of_dfs.append(table[final_cols])

#### Fix errors in 10, 14, 20

In [19]:
def page_ten_format(i):
    '''Parse table ``i`` of ``ny_pdfs`` using the one-off layout of page 10.

    Column 4 holds Pop, Score, Cumulative.Total and Project.Cost separated by
    line breaks, but after ``reduce_noise`` the number of pieces per cell is
    inconsistent, so each cell is split and cleaned row by row instead of with
    ``expand=True``. The result, reordered to ``final_cols``, is appended to
    ``final_list_of_dfs``; nothing is returned.

    Parameters
    ----------
    i : int
        Index of the table within ``ny_pdfs`` (the caller passes 10).

    Raises
    ------
    IndexError
        If a cell in column 4 yields fewer than four values after cleaning.
    '''

    ny10 = reduce_noise(ny_pdfs[i].df)

    ny10 = ny10.iloc[1:].copy().reset_index(drop=True)

    # split on the first "/" only; n must be passed by keyword in pandas >= 2.0
    ny10[['System.Name', 'Borrower']] = ny10[2].str.split("/", n=1, expand=True)

    value_cols = ['Pop', 'Score', 'Cumulative.Total', 'Project.Cost']

    # artefacts left behind by reduce_noise() and the line-break split
    leftovers = ("", " ", "r")

    split_rows = []
    for cell in ny10[4]:
        pieces = [piece for piece in cell.split("\n") if piece not in leftovers]
        # the first four remaining pieces are the values, in value_cols order
        split_rows.append((pieces[0], pieces[1], pieces[2], pieces[3]))

    # assign whole columns at once instead of chained per-cell writes, which do
    # not reach the frame when pandas copy-on-write is active
    ny10[value_cols] = pd.DataFrame(split_rows, columns=value_cols, index=ny10.index)

    ny10 = ny10.rename(columns={0: 'Project.Number', 1: 'County', 3: 'Description'})

    ny10 = ny10.replace("\n", "", regex=True)

    final_list_of_dfs.append(ny10[final_cols])

In [20]:
def page_fourteen_format(i):
    '''Parse table ``i`` of ``ny_pdfs`` using the one-off layout of page 14.

    Same column layout as the "page 5" format, with one row whose system name
    contains a "/" corrected by hand. The result, reordered to ``final_cols``,
    is appended to ``final_list_of_dfs``; nothing is returned.

    Parameters
    ----------
    i : int
        Index of the table within ``ny_pdfs``.
    '''

    ny14 = reduce_noise(ny_pdfs[i].df)

    ny14 = ny14.iloc[1:].copy().reset_index(drop=True)

    # split on the first "/" only; n must be passed by keyword in pandas >= 2.0
    ny14[['System.Name', 'Borrower']] = ny14[2].str.split("/", n=1, expand=True)

    # manually fix row that splits incorrectly due to extra / in name
    # (rows were renumbered from 0 above, so label 10 is the 11th data row)
    ny14.loc[10, 'System.Name'] = "COUNTY RT. 57/HUNTLEYRD WATER DISTRICT"
    ny14.loc[10, 'Borrower'] = "Schroeppel (T)"

    ny14 = ny14.rename(columns={0: 'Project.Number', 1: 'County', 3: 'Description', 4: 'Pop',
                                5: 'Project.Cost', 6: 'Score', 7: 'Cumulative.Total'})

    ny14 = ny14.replace("\n", "", regex=True)

    final_list_of_dfs.append(ny14[final_cols])

In [21]:
def page_twenty_format(i):
    '''Parse table ``i`` of ``ny_pdfs`` using the one-off layout of page 20.

    Column 4 holds Pop, Score, Cumulative.Total and Project.Cost separated by
    line breaks, but after ``reduce_noise`` the number of pieces per cell is
    inconsistent, so each cell is split and cleaned row by row instead of with
    ``expand=True``. One row whose system name splits incorrectly is corrected
    by hand. The result, reordered to ``final_cols``, is appended to
    ``final_list_of_dfs``; nothing is returned.

    Parameters
    ----------
    i : int
        Index of the table within ``ny_pdfs`` (the caller passes 20).

    Raises
    ------
    IndexError
        If a cell in column 4 yields fewer than four values after cleaning.
    '''

    ny20 = reduce_noise(ny_pdfs[i].df)

    ny20 = ny20.iloc[1:].copy().reset_index(drop=True)

    # split on the first "/" only; n must be passed by keyword in pandas >= 2.0
    ny20[['System.Name', 'Borrower']] = ny20[2].str.split("/", n=1, expand=True)

    # manually fix row that splits incorrectly due to extra / in name
    # NOTE(review): the separator between CITY and TOWN is a tab character, written
    # here as "\t" so it is visible; given the "extra /" remark it may have been
    # meant to be "/" -- confirm against the PDF.
    ny20.loc[8, 'System.Name'] = "POUGHKEEPSIE CITY\tTOWN TREATMENT PLANT"
    ny20.loc[8, 'Borrower'] = "Poughkeepsie JWB"

    value_cols = ['Pop', 'Score', 'Cumulative.Total', 'Project.Cost']

    # artefacts left behind by reduce_noise() and the line-break split
    leftovers = ("", " ")

    split_rows = []
    for cell in ny20[4]:
        pieces = [piece for piece in cell.split("\n") if piece not in leftovers]
        # the first four remaining pieces are the values, in value_cols order
        split_rows.append((pieces[0], pieces[1], pieces[2], pieces[3]))

    # assign whole columns at once instead of chained per-cell writes, which do
    # not reach the frame when pandas copy-on-write is active
    ny20[value_cols] = pd.DataFrame(split_rows, columns=value_cols, index=ny20.index)

    ny20 = ny20.rename(columns={0: 'Project.Number', 1: 'County', 3: 'Description'})

    ny20 = ny20.replace("\n", "", regex=True)

    final_list_of_dfs.append(ny20[final_cols])

### Finalize Merged New York Data

In [22]:
# start from an empty accumulator so re-running this cell does not duplicate pages
final_list_of_dfs = []

# table indexes (positions in ny_pdfs) that share a layout
zero_list = [0, 1, 3]
five_list = [5, 7, 9, *range(11, 14), *range(15, 20), *range(21, 33)]

# index -> parser; built lowest priority first so the shared layouts win any
# clash, matching an if/elif chain that tests zero_list, then five_list, then
# the one-off pages
parser_for_table = {
    2: page_two_format,
    4: page_four_format,
    6: page_six_format,
    8: page_eight_format,
    10: page_ten_format,
    14: page_fourteen_format,
    20: page_twenty_format,
}
parser_for_table.update({table_idx: page_five_format for table_idx in five_list})
parser_for_table.update({table_idx: page_zero_format for table_idx in zero_list})

for num in range(len(ny_pdfs)):
    parse_table = parser_for_table.get(num)
    # tables without a registered parser are skipped
    if parse_table is not None:
        parse_table(num)


In [23]:
# combine every parsed page into one table with a fresh 0..n-1 index
ny_tbl = pd.concat(final_list_of_dfs).reset_index(drop=True)

# fix minor corrections as they are discovered
ny_tbl['Project.Number'] = ny_tbl['Project.Number'].str.replace("  S", "", regex=True)

# extract the 'code' that ends up attached to some project #s (everything that is
# not a digit) and then keep only the first five characters as the project number
ny_tbl['Code'] = ny_tbl['Project.Number'].str.replace(r'(\d)', '', regex=True)
ny_tbl['Project.Number'] = ny_tbl['Project.Number'].str[:5]

# for each row where Sys.Name/Borrower didn't split correctly, resplit on the
# first "/" and store the values; .at writes directly into the frame (chained
# `df[col].iloc[i] = ...` writes are lost under pandas copy-on-write)
for row_label in ny_tbl.index[ny_tbl['Borrower'].isna()]:
    name_parts = ny_tbl.at[row_label, 'System.Name'].split("/", 1)
    ny_tbl.at[row_label, 'Borrower'] = name_parts[1]
    ny_tbl.at[row_label, 'System.Name'] = name_parts[0]

### removing misc characters
ny_tbl = ny_tbl.replace("  ", " ", regex=True)
# str.strip() returns a new Series, so the result has to be assigned back
for text_col in ('Description', 'System.Name', 'Borrower'):
    ny_tbl[text_col] = ny_tbl[text_col].str.strip()

ny_tbl

Unnamed: 0,Project.Number,County,System.Name,Borrower,Description,Pop,Project.Cost,Score,Cumulative.Total,Code
0,18660,Ulster,ELLENVILLE (VILLAGE) WATER DIS,Ellenville (V),Phase 2 of #15003 - New Well - replace GWUDI w...,4323,"$2,800,000",H,"$2,800,000",
1,18700,Ulster,ROSENDALE WATER DISTRICT,Rosendale (T),"Phase 2 of project 18111. Cost overrun., Upgra...",2200,"$2,070,000",H,"$4,870,000",
2,17163,Essex,TICONDEROGA WD,Ticonderoga (T),New Surface Water Treatment Rule Compliance Fa...,5000,$0,H,"$4,870,000",
3,18061,Franklin,TUPPER LAKE V,Tupper Lake (V),New wells to replace SW source and inadequate ...,5500,$0,H,"$4,870,000",
4,18381,Erie,LAWTONS WATER CO,North Collins (T),Aquisit'n of private PWS & improvements for SW...,95,$0,H,"$4,870,000",
...,...,...,...,...,...,...,...,...,...,...
553,18323,Erie,ORCHARD PARK VILLAGE,Orchard Park (V),"Quaker St Water Line Replacement., Upgrade Dis...",3100,"$1,900,000",10,"$2,794,026,832",
554,18620,Columbia,KINDERHOOK VILLAGE,Kinderhook (V),Williams Street & Albany Avenue Water Main Rep...,1385,"$1,864,500",10,"$2,795,891,332",
555,18145,Dutchess,STAATSBURG WATER SYSTEM,DCWWA,Upgrade Distribution System,1072,"$1,815,600",10,"$2,797,706,932",
556,18447,Schenectady,PRINCETOWN WATER SUPPLY,Princetown (T),Replacement of corroded distribution system co...,600,"$1,214,500",10,"$2,798,921,432",


In [25]:
# write ny_tbl to disk; index=False leaves the row index out of the file
ny_base_csv_path = "final_ppls/32-NewYork_base_ppl.csv"
ny_tbl.to_csv(ny_base_csv_path, index=False)

### New York: Table 2
BIL-GS

In [937]:
# Extract the tables on PDF pages 72-78 of the final 2023 Intended Use Plan
# using camelot's whitespace-based ("stream") parser.
ny_iup_2023_url = "https://www.health.ny.gov/environmental/water/drinking/iup/2023/docs/final_2023_intended_use_plan.pdf"
ny_pdfs_2 = camelot.read_pdf(
    ny_iup_2023_url,
    pages='72-78',
    flavor='stream',
    row_tol=30,
    split_text=True,
    flag_size=True,
)

In [1249]:
ny_bil_list = []

for i in range(len(ny_pdfs_2)):
    # remove first row; copy so the column assignments below act on an
    # independent frame rather than on a slice of the parsed table
    ny_bil = ny_pdfs_2[i].df.iloc[1:].copy()
    # split sys name and borrower
    ny_bil[['System.Name', 'Borrower']] = ny_bil[2].str.split("/", expand=True)
    # drop line breaks
    ny_bil = ny_bil.replace("\n", "", regex=True)
    # rename columns
    ny_bil = ny_bil.rename(columns={0: 'Project.Number', 1: 'County', 3: 'Description', 4: 'Pop',
                                    5: "Project.Cost", 6: 'Score', 7: 'Cumulative.Total'})
    # follow standardized order
    ny_bil = ny_bil[final_cols]
    # get rid of summary rows (rows with an empty project number)
    ny_bil = ny_bil[ny_bil["Project.Number"] != ''].copy()
    # add to list
    ny_bil_list.append(ny_bil)

# turn into single dataframe
# NOTE(review): the per-page row labels are kept (no reset_index), so index
# values repeat across pages -- confirm that is intended.
ny_bil_df = pd.concat(ny_bil_list)

#len = 120 as intended

### removing misc characters
ny_bil_df = ny_bil_df.replace("  ", " ", regex=True)
# str.strip() returns a new Series, so the result has to be assigned back
for text_col in ('Description', 'System.Name', 'Borrower'):
    ny_bil_df[text_col] = ny_bil_df[text_col].str.strip()

ny_bil_df

Unnamed: 0,Project.Number,County,System.Name,Borrower,Description,Pop,Project.Cost,Score,Cumulative.Total
1,18181,Oswego,ORWELL WD,Orwell (T),New Source to Address Source Capacitiy Violati...,150,"$3,027,500",155,"$3,027,500"
2,18563,Herkimer,ILION (V) WATER WORKS,Ilion (V),Medium Priority Distribution Imprvts and Easte...,8610,"$28,474,350",145,"$31,501,850"
3,18811,Schuyler,WATKINS GLEN VILLAGE,Watkins Glen (V),Water System Improvements. Address CT and Turb...,2149,"$22,052,300",145,"$53,554,150"
4,18846,Otsego,RICHFIELD SPRINGS VILLAGE,Richfield Springs (V),"RW Transmission Main SWTR Compliance., New Pum...",1200,"$1,155,569",140,"$54,709,719"
5,19151,St.Lawrence,MORRISTOWN VILLAGE,Morristown (T),WTP & Distribution Improvements. River Road Ea...,490,"$16,789,000",140,"$71,498,719"
...,...,...,...,...,...,...,...,...,...
2,19139,Suffolk,SUFFOLK COUNTY WATER AUTHORITY,SCWA,Replacement of Well No. 2A at the Oval Drive W...,1009264,"$1,200,000",15,"$910,911,781"
3,19178,Westchester,YONKERS CITY,Yonkers (C),"Replace 30"" water main under Saw Mill River Pk...",196086,"$4,321,880",15,"$915,233,661"
4,19275,Suffolk,RIVERHEAD WD,Riverhead (T),"New Ground Storage Tank at East Wind Drive, Ne...",28000,"$7,427,500",15,"$922,661,161"
5,19148,Suffolk,SUFFOLK COUNTY WATER AUTHORITY,SCWA,"Replacement of 10,000 feet of water main., New...",1009264,"$3,427,390",10,"$926,088,551"


In [1256]:
# write ny_bil_df to disk
# NOTE(review): no index=False here, so pandas' default also writes the row index
# as the first CSV column -- confirm that is intended.
ny_bil_csv_path = "final_ppls/32-NewYork_bil_ppl.csv"
ny_bil_df.to_csv(ny_bil_csv_path)

### New York: Table 3
Lead

In [5]:
# Extract the tables on PDF pages 8-13 of the draft 2023 IUP amendment 4
# using camelot's whitespace-based ("stream") parser.
ny_iup_amendment_4_url = "https://health.ny.gov/environmental/water/drinking/iup/2023/docs/draft_iup_amendment_4_2023.pdf"
ny_pdfs_3 = camelot.read_pdf(
    ny_iup_amendment_4_url,
    pages='8-13',
    flavor='stream',
    row_tol=32,
    split_text=True,
    flag_size=True,
)

Invalid stream (index 50) within object 703 0: Stream has ended unexpectedly
Invalid stream (index 50) within object 703 0: Stream has ended unexpectedly
Invalid stream (index 50) within object 703 0: Stream has ended unexpectedly
Invalid stream (index 50) within object 703 0: Stream has ended unexpectedly


In [6]:
ny_lead_list = []

for i in range(len(ny_pdfs_3)):

    # tables 0-4 lose their first and last row (header and footer);
    # any later table loses only its first row
    kept_rows = slice(1, -1) if i < 5 else slice(1, None)
    ny_l = ny_pdfs_3[i].df.iloc[kept_rows].copy().reset_index(drop=True)

    # DAC status becomes its own column: "DAC" where column 2 contains it, else ""
    ny_l['DAC'] = ny_l[2].str.extract("(DAC)").fillna("")

    # line breaks become spaces
    ny_l = ny_l.replace("\n", " ", regex=True)

    # manual oddities: <s> markup left in the cells and a stray "m " in column 4
    for markup in ("</s>", "<s>"):
        ny_l = ny_l.replace(markup, "", regex=True)
    ny_l[4] = ny_l[4].str.replace("m ", "", regex=True)

    # take the DAC marker out of the combined name before splitting it
    ny_l[2] = ny_l[2].replace("DAC", "", regex=True)
    ny_l[['System.Name','Borrower']] = ny_l[2].str.split("/", expand=True)

    lead_names = {0:"Project.Number", 1:'County', 3:"Description", 4:"Pop",
                  5:"Project.Cost", 6:"Score", 7:"Cumulative.Total"}
    ny_l = ny_l.rename(columns=lead_names).drop(columns=[2])

    lead_order = ["Project.Number", 'County', "System.Name", "Borrower", "DAC",
                  "Pop", "Description", "Project.Cost", "Score", "Cumulative.Total"]
    ny_l = ny_l[lead_order]

    ny_lead_list.append(ny_l)

In [7]:
# combine the per-page lead tables into one frame with a fresh 0..n-1 index
ny_lead_tbl = pd.concat(ny_lead_list).reset_index(drop=True)

### removing misc characters
ny_lead_tbl = ny_lead_tbl.replace("  ", " ", regex=True)
# str.strip() returns a new Series, so the result has to be assigned back
for text_col in ('Description', 'System.Name', 'Borrower'):
    ny_lead_tbl[text_col] = ny_lead_tbl[text_col].str.strip()

ny_lead_tbl

Unnamed: 0,Project.Number,County,System.Name,Borrower,DAC,Pop,Description,Project.Cost,Score,Cumulative.Total
0,19287,Rensselaer,TROY CITY PWS,Troy (C),DAC,49170,"Lead service line inventory., Upgrade Distribu...","$572,000",70,"$572,000"
1,19411,Cayuga,AUBURN,Auburn (C),DAC,29788,"Lead Service Line Inventory., Upgrade Distribu...","$917,988",65,"$1,489,988"
2,19365,Orange,NEWBURGH CITY,Newburgh (C),DAC,28000,"Lead Service Line Inventory., Upgrade Distribu...","$968,000",65,"$2,457,988"
3,19326,Dutchess,AMENIA WATER DISTRICT NO 1,Amenia (T),DAC,1000,"Lead Service Line Inventory., Upgrade Distribu...","$481,000",50,"$2,938,988"
4,19426,Herkimer,ILION (V) WATER WORKS,Ilion (V),DAC,8610,"Lead service line replacement., Upgrade Distri...","$4,071,590",45,"$7,010,578"
...,...,...,...,...,...,...,...,...,...,...
104,19439,Orange,MONROE (T),Monroe (T),DAC,1500,"Lead Service Line Inventory, Upgrade Distribut...","$91,250",15,"$315,009,561"
105,19389,Dutchess,RED HOOK WATER DISTRICT #1,Red Hook (T),DAC,1440,"Lead Service Line Inventory, Upgrade Distribut...","$178,000",15,"$315,187,561"
106,19434,Columbia,KINDERHOOK VILLAGE,Kinderhook (V),,1385,"Lead service line inventory., Upgrade Distribu...","$192,000",15,"$315,379,561"
107,19380,Dutchess,MILLERTON VILLAGE,Millerton (V),DAC,1150,"Lead Service Line Inventory, Upgrade Distribut...","$128,000",15,"$315,507,561"


In [8]:
# write ny_lead_tbl to disk
# NOTE(review): no index=False here, so pandas' default also writes the row index
# as the first CSV column -- confirm that is intended.
ny_lead_csv_path = "final_ppls/32-NewYork_lead.csv"
ny_lead_tbl.to_csv(ny_lead_csv_path)

### New York: Document 3, Table 1
EC - NOTE: Code produces PFAS/DAC columns that are sometimes, but not always, off by one row. Given the small size of the table, this is manually fixed in a spreadsheet prior to upload.

In [950]:
# Extract the tables on PDF pages 8-9 of the final 2023 IUP amendment 2
# using camelot's whitespace-based ("stream") parser.
ny_iup_amendment_2_url = "https://health.ny.gov/environmental/water/drinking/iup/2023/docs/final_iup_amendment_2_2023.pdf"
ny_pdfs_4 = camelot.read_pdf(
    ny_iup_amendment_2_url,
    pages='8-9',
    flavor='stream',
    row_tol=30,
    split_text=True,
    flag_size=True,
)

In [1252]:
ny_ec_list = []

for i in range(len(ny_pdfs_4)):

    ny_ec = ny_pdfs_4[i].df

    if i < 1:
        # cut out header and footer on the first table
        ny_ec = ny_ec.iloc[1:-1].copy().reset_index(drop=True)
    else:
        # later tables only lose the header row
        ny_ec = ny_ec.iloc[1:].copy().reset_index(drop=True)

    # make DAC/PFAS status its own column: the marker where column 2 contains
    # it, otherwise an empty string
    ny_ec['DAC'] = ny_ec[2].str.extract("(DAC)").fillna("")
    ny_ec['PFAS'] = ny_ec[2].str.extract("(PFAS)").fillna("")

    ny_ec = ny_ec.replace("\n", " ", regex=True)

    # remove DAC/PFAS status from sysname or borrower; "PFAS/" goes first so its
    # slash is not mistaken for the name/borrower separator below
    for marker in ("DAC", "PFAS/", "PFAS"):
        ny_ec[2] = ny_ec[2].replace(marker, "", regex=True)

    # split into columns
    ny_ec[['System.Name','Borrower']] = ny_ec[2].str.split("/", expand=True)

    ny_ec = ny_ec.rename(columns={0:"Project.Number", 1:'County', 3:"Description", 4:"Pop",
                                  5:"Project.Cost", 6:"Score", 7:"Cumulative.Total"})

    ny_ec = ny_ec.drop(columns=[2])

    # NOTE(review): "Pop" is renamed above but not selected here -- confirm it is
    # meant to be left out of this table.
    ny_ec = ny_ec[["Project.Number", 'County', "System.Name", "Borrower", "DAC", "PFAS",
                   "Description", "Project.Cost", "Score", "Cumulative.Total"]]

    ny_ec_list.append(ny_ec)

ny_ec_tbl = pd.concat(ny_ec_list).reset_index(drop=True)

### removing misc characters
ny_ec_tbl = ny_ec_tbl.replace("  ", " ", regex=True)
# str.strip() returns a new Series, so the result has to be assigned back
for text_col in ('Description', 'System.Name', 'Borrower'):
    ny_ec_tbl[text_col] = ny_ec_tbl[text_col].str.strip()

ny_ec_tbl

Unnamed: 0,Project.Number,County,System.Name,Borrower,DAC,PFAS,Description,Project.Cost,Score,Cumulative.Total
0,19489,Clinton,SCHUYLER FALLS (T),Clinton County,,,Morrisonville WD Sands Road Extension to serve...,"$5,500,000",125,"$5,500,000"
1,18973,Suffolk,RIVERHEAD WD,Riverhead (T),DAC,PFAS,Consolidation WD Extension - Forge Rd MHPs due...,"$1,185,000",110,"$6,685,000"
2,18967,Nassau,TOWN OF HEMPSTEAD WD,Hempstead (T),DAC,PFAS,"1,4-dioxane treatment for Well No. 13 at Levit...","$8,110,000",80,"$14,795,000"
3,19078,Suffolk,GREENLAWN WD,Greenlawn WD,,,"Well No. 11 - AOP treatment for 1,4-dioxane re...","$5,546,550",80,"$20,341,550"
4,19007,Nassau,PLAINVIEW WD,Plainview WD,,,"Plant No. 3 - AOP/GAC treatment for 1,4-dioxan...","$4,196,941",80,"$24,538,491"
5,19008,Nassau,PLAINVIEW WD,Plainview WD,,PFAS,"Plant No. 4 - AOP/GAC treatment for 1,4-dioxan...","$7,508,700",80,"$32,047,191"
6,19066,Nassau,GARDEN CITY PARK WD,Garden City Park WD,,,"Plant No. 8 - AOP/GAC treatment for 1,4-dioxan...","$7,272,000",80,"$39,319,191"
7,19110,Rensselaer,POESTENKILL WD #2,Poestenkill (T),,PFAS,Creation of Poestenkill WD #2 to provide water...,"$5,711,000",70,"$45,030,191"
8,19125,Ontario,RUSHVILLE VILLAGE,Rushville (V),,PFAS,WTP Improvements to address harmful algal bloo...,"$1,371,239",70,"$46,401,430"
9,19104,Suffolk,SUFFOLK COUNTY WATER AUTHORITY,SCWA,,,Connection of public water to residences in Ca...,"$6,850,000",60,"$53,251,430"


In [1258]:
# write ny_ec_tbl to disk
# NOTE(review): no index=False here, so pandas' default also writes the row index
# as the first CSV column -- confirm that is intended.
ny_ec_csv_path = "final_ppls/32-NewYork_ec.csv"
ny_ec_tbl.to_csv(ny_ec_csv_path)