In [1]:
## Import necessary packages
# pandas does the table cleanup, concatenation and CSV export in the cells below.
import pandas as pd
# NOTE(review): numpy is not referenced by any cell visible in this section -- confirm it is needed elsewhere.
import numpy as np
# camelot extracts the tables from the state PDF documents.
import camelot
import warnings
# Suppresses every Python warning from this point on, which also hides any
# library warnings raised while parsing the PDFs.
warnings.filterwarnings("ignore")

##  Indiana

### Q4 Final PPL

Earlier parsing efforts also included the documents from prior quarters. It has since been determined that each quarter's list is both comprehensive and complete, so only the final tables from the Q4 documents are needed.

In [2]:
# Q4 Final project priority list (file name dated 6.5.23)
in_ppl_pdf_url = "https://www.in.gov/ifa/srf/files/Revised-DWSRF-2023-DW-Project-Priority-List-Q4-Final-6.5.23.pdf"

print("Parsing...")
# Pull the tables from pages 1-2 using camelot's stream parser with a
# row tolerance of 10.
in_pdf_1 = camelot.read_pdf(
    in_ppl_pdf_url,
    flavor='stream',
    pages='1-2',
    row_tol=10,
)
# Report how many tables camelot found
print(len(in_pdf_1))

Parsing...
2


In [3]:
# Header for the first extracted table, where rank and score are separate columns
in_ppl_1_header = [
    'PPL Rank',
    'PPL Score',
    'Participant',
    'MHI',
    'Population Served',
    'PWSID No(s).',
    'SRF Project No.',
    'Project Description',
    'Lead Service Line Replacement Cost',
    'Emerging Contaminants?',
    'Disadvantaged Community?',
    'Estimated Green Project Reserve Cost',
    'Green Project Reserve Category',
    'Current User Rate (per 4,000 gallons)',
    'Estimated Post-Project User Rate (per 4,000 gallons)',
    'Requested Funds',
    'Cumulative Requested Funds',
]

# First extracted table; the header is assigned directly onto camelot's DataFrame
in_ppl_1 = in_pdf_1[0].df
in_ppl_1.columns = in_ppl_1_header

# Skip the first three rows (the column-name rows from the PDF) and renumber
# from zero so the frame is ready to be stacked with the other pieces.
in_ppl_1 = in_ppl_1.iloc[3:].copy().reset_index(drop=True)

In [4]:
# Columns shared by every piece of the PPL, excluding the rank/score fields
in_ppl_2_shared_cols = [
    'Participant',
    'MHI',
    'Population Served',
    'PWSID No(s).',
    'SRF Project No.',
    'Project Description',
    'Lead Service Line Replacement Cost',
    'Emerging Contaminants?',
    'Disadvantaged Community?',
    'Estimated Green Project Reserve Cost',
    'Green Project Reserve Category',
    'Current User Rate (per 4,000 gallons)',
    'Estimated Post-Project User Rate (per 4,000 gallons)',
    'Requested Funds',
    'Cumulative Requested Funds',
]

# The first 30 rows of the second extracted table continue the main list,
# before it transitions to the applicant-only projects.
in_ppl_2 = in_pdf_1[1].df.iloc[:30].copy()

# Rank and score were mixed together in one cell here, so label that
# column with a temporary name.
in_ppl_2.columns = ['Rank.Score', *in_ppl_2_shared_cols]

# Break the combined cell apart on the newline into rank and score
in_ppl_2_rank_score = in_ppl_2['Rank.Score'].str.split("\n", expand=True)
in_ppl_2[['PPL Rank', 'PPL Score']] = in_ppl_2_rank_score

# Discard the temporary column, restore the same column order as page 1,
# and renumber the rows ahead of the merge.
in_ppl_2 = in_ppl_2.drop(columns=['Rank.Score'])
in_ppl_2 = in_ppl_2[['PPL Rank', 'PPL Score', *in_ppl_2_shared_cols]].copy()
in_ppl_2 = in_ppl_2.reset_index(drop=True)

In [5]:
# Header for the application-only table. Its first column combines rank and
# score into one feature, so there is no separate 'PPL Score' column.
in_ppl_3_header = [
    'PPL Rank',
    'Participant',
    'MHI',
    'Population Served',
    'PWSID No(s).',
    'SRF Project No.',
    'Project Description',
    'Lead Service Line Replacement Cost',
    'Emerging Contaminants?',
    'Disadvantaged Community?',
    'Estimated Green Project Reserve Cost',
    'Green Project Reserve Category',
    'Current User Rate (per 4,000 gallons)',
    'Estimated Post-Project User Rate (per 4,000 gallons)',
    'Requested Funds',
    'Cumulative Requested Funds',
]

# Rows 34-38 of the second extracted table hold the application-only projects
in_ppl_3 = in_pdf_1[1].df.iloc[34:39].copy()
in_ppl_3.columns = in_ppl_3_header

# Renumber from zero to prepare for merging
in_ppl_3 = in_ppl_3.reset_index(drop=True)

In [6]:
# Stack the three pieces in document order with a fresh 0..n-1 index.
# in_ppl_3 has no 'PPL Score' column, so pandas leaves it missing for those rows.
in_ppl_pieces = [in_ppl_1, in_ppl_2, in_ppl_3]
in_ppl_output = pd.concat(in_ppl_pieces, ignore_index=True)

# Write the combined list to csv without the index column
in_ppl_output.to_csv(
    "../data/year1/csv/14-Indiana_Q4Final_PPL.csv",
    index=False,
)

### Final Lead PPL 

In [7]:
# Q4 Final lead service line replacement priority list (file name dated 6.5.23)
in_lead_pdf_url = "https://www.in.gov/ifa/srf/files/Revised-DWSRF-2023-Lead-Service-Line-Replacement-Project-Priority-List-Q4-Final-6.5.23.pdf"

print("Parsing...")
# Only page 1 is read, with the same stream settings as the main PPL.
in_pdf_2 = camelot.read_pdf(
    in_lead_pdf_url,
    flavor='stream',
    pages='1',
    row_tol=10,
)
# Report how many tables camelot found
print(len(in_pdf_2))

Parsing...
1


In [8]:
# Header for the lead PPL table (same 17 columns as the main PPL's first table)
in_lead_ppl_header = [
    'PPL Rank',
    'PPL Score',
    'Participant',
    'MHI',
    'Population Served',
    'PWSID No(s).',
    'SRF Project No.',
    'Project Description',
    'Lead Service Line Replacement Cost',
    'Emerging Contaminants?',
    'Disadvantaged Community?',
    'Estimated Green Project Reserve Cost',
    'Green Project Reserve Category',
    'Current User Rate (per 4,000 gallons)',
    'Estimated Post-Project User Rate (per 4,000 gallons)',
    'Requested Funds',
    'Cumulative Requested Funds',
]

# The only extracted table; the header is assigned directly onto camelot's DataFrame
in_lead_ppl = in_pdf_2[0].df
in_lead_ppl.columns = in_lead_ppl_header

# Keep positional rows 5-13 and renumber them from zero.
# NOTE(review): the rows outside that window are presumably header/footer text -- confirm against the PDF.
in_lead_ppl = in_lead_ppl.iloc[5:14]
in_lead_ppl = in_lead_ppl.reset_index(drop=True).copy()

In [9]:
# Export the cleaned lead PPL to csv without the index column
in_lead_ppl.to_csv(
    path_or_buf="../data/year1/csv/14-Indiana_Q4Final_LeadPPL.csv",
    index=False,
)

### 