In [1]:
import pandas as pd
import numpy as np
import camelot.io as camelot
import sys

 # adding folders up to system path to access functions
sys.path.insert(0, '../../../')
from driveFolderHandler import downloadTempDriveFolder, deleteTempDriveFolder

In [2]:
# Local scratch directory that receives the files pulled from the Drive folder.
path = "../temp-docs"

downloadTempDriveFolder(
    path=path,
    folderID='1y2pdF3789Hhvc7Cr7MajOwQ2Pzvbi6ne',
)

Retrieving folder contents


Processing file 1QaAlIsoaAhH-9-1frmmrbSP-Omq7MzSe OH_IUPandPPL_2023.pdf
Building directory structure completed


Retrieving folder contents completed
Building directory structure
Downloading...
From: https://drive.google.com/uc?id=1QaAlIsoaAhH-9-1frmmrbSP-Omq7MzSe
To: /Users/pcork/epic/dw-dashboard/year1/OH/temp-docs/OH_IUPandPPL_2023.pdf
100%|██████████| 1.35M/1.35M [00:00<00:00, 11.0MB/s]

Files stored in ../temp-docs



Download completed


# Ohio

### Ohio Table 1: Project Priority and Intended Projects List for PY 2023

In [3]:
# Parse pages 25-34 of the Ohio IUP/PPL document with camelot's stream flavor.
ohio_pdf_file = path + "/OH_IUPandPPL_2023.pdf"
oh_pdfs = camelot.read_pdf(ohio_pdf_file, flavor='stream', pages='25-34')

# number of tables camelot detected across those pages
print(len(oh_pdfs))

Parsing Ohio, 35/50...
15


In [60]:
# Column names applied to every parsed page of Table 1.
ppl_columns = ['Entity', 'Project', 'PWSID', 'Population', 'County', 'Estimated.Loan.Amount', 'Loan.Type', 'Estimated.Award.Date', 'Rate']


def _tidy_ppl_page(raw_page):
    """Trim one parsed table and give it the Table 1 column names.

    The first five rows are sliced off. On some pages the space between
    entity and project creates a phantom column; when the frame has ten
    columns, the column labelled 1 is removed so every page ends up with
    the same nine names and concatenates cleanly.
    """
    body = raw_page.iloc[5:].copy()
    if body.shape[1] == 10:
        body = body.drop(columns=[1])
    body.columns = ppl_columns
    return body


list_of_dfs = [_tidy_ppl_page(oh_pdfs[table_idx].df) for table_idx in range(len(oh_pdfs))]

oh = pd.concat(list_of_dfs).reset_index(drop=True)

# keep only the first 670 rows (what follows is totals and noise, per the
# original note), then collapse exact duplicate rows
oh = oh.iloc[:670].copy().drop_duplicates().reset_index(drop=True)

# NOTE(review): unlike the other table exports in this notebook, this call
# writes the index as an unnamed first column -- confirm whether downstream
# readers expect it before adding index=False.
oh.to_csv("../data/35-Ohio_PPL_Base.csv")

In [59]:
# preview the last ten rows of the combined Table 1 frame
oh.tail(10)

Unnamed: 0,Entity,Project,PWSID,Population,County,Estimated.Loan.Amount,Loan.Type,Estimated.Award.Date,Rate
430,Willard,Park Street Water Line Replacement,OH3901511,9979,Huron,"$895,295",Construction,Oct-22,SML/LSL
431,Willard,Water System Improvements,OH3901511,9979,Huron,"$6,910,000",Construction,Dec-22,SML
432,Wilmington,Raw Water Main Relocation,OH1401211,12401,Clinton,"$1,200,000",Construction,Jan-23,STD
433,Woodville,Elevated Water Storage Tank,OH7200912,2135,Sandusky,"$1,417,370",Construction,Jun-23,SML
434,Woodville,Elevated Water Storage Tank,OH7200912,2135,Sandusky,"$67,055",Design,Jul-22,PLN/DES
435,Woodville,Lime Lagoon Rehabilitation,OH7200912,2135,Sandusky,"$416,745",Construction,Jun-23,SML
436,Woodville,Water Line Improvements Phase 3,OH7200912,2135,Sandusky,"$80,560",Design,Jul-22,PLN/DES
437,Woodville,Water Line Improvements Phase 3,OH7200912,2135,Sandusky,"$557,530",Construction,Jan-23,SML/LSL
438,Woodville,Waterline Improvements Phase 4,OH7200912,2135,Sandusky,"$114,990",Design,Mar-23,PLN/DES
439,Zanesville,Pioneer Water Reservoir Replacement,OH6002712,25470,Muskingum,"$3,547,300",Construction,Sep-22,STD


### Ohio Table 2: Projects Eligible for Disadvantaged Community Principal Forgiveness in PY 2023

In [3]:
# Parse page 35 (Table 2) of the Ohio IUP/PPL document with the stream flavor.
ohio_pdf_file = path + "/OH_IUPandPPL_2023.pdf"
oh_pdfs_2 = camelot.read_pdf(ohio_pdf_file, flavor='stream', pages='35')

# number of tables camelot detected on that page
print(len(oh_pdfs_2))

1


In [5]:
# Column names for the principal-forgiveness table.
pf_columns = ['Entity', 'Project', 'County', 'Estimated.Loan.Amount', 'Estimated.Principal.Forgiveness', 'Loan.Type', 
                    'Estimated.Award.Date', 'Project.Score', 'Readiness.To.Proceed', 'Rate']

# take the first detected table, keep rows 7 through 49, and renumber from zero
oh_2 = (
    oh_pdfs_2[0].df
    .iloc[7:50]
    .copy()
    .reset_index(drop=True)
)
oh_2.columns = pf_columns

oh_2.to_csv("../data/35-Ohio_PPL_PF.csv", index=False)

### Ohio Table 3: Projects Eligible for Regionalization Principal Forgiveness and Discount in PY23

In [1284]:
# Parse pages 36-37 (Table 3) of the Ohio IUP/PPL document with the stream flavor.
ohio_pdf_file = path + "/OH_IUPandPPL_2023.pdf"
oh_pdfs_3 = camelot.read_pdf(ohio_pdf_file, flavor='stream', pages='36-37')

# number of tables camelot detected across those pages
print(len(oh_pdfs_3))

Parsing Ohio, 35/50...
2


In [1285]:
# Column names shared by both pages of Table 3.
oh_3_columns = ['Entity', 'Project', 'County', 'Estimated.Loan.Amount', 'Estimated.Principal.Forgiveness', 'Loan.Type', 
                    'Estimated.Award.Date', 'Project.Score', 'Readiness.To.Proceed', 'Rate']

# format the first page: skip the first seven rows (presumably title/header
# lines -- confirm against the PDF)
oh_3_0 = oh_pdfs_3[0].df.iloc[7:].copy()
oh_3_0.columns = oh_3_columns

# format the second page: keep rows 7-9 and remove the extra column labelled 1
# so the frame lines up with the ten column names
oh_3_1 = oh_pdfs_3[1].df.iloc[7:10].copy()
oh_3_1 = oh_3_1.drop(columns=[1])
oh_3_1.columns = oh_3_columns

# combine and save
oh_3 = pd.concat([oh_3_0, oh_3_1]).reset_index(drop=True)
oh_3.to_csv("../data/35-Ohio_PPL_RegionalPF.csv", index=False)

# preview: must be the cell's last expression to be displayed; head() is
# deterministic and, unlike sample(10), does not raise on frames with fewer
# than ten rows
oh_3.head(10)

### Ohio Table 4: Projects Eligible for HAB/PFAS Discount in PY 2023

In [1287]:
# Parse page 38 (Table 4) of the Ohio IUP/PPL document with the stream flavor.
ohio_pdf_file = path + "/OH_IUPandPPL_2023.pdf"
oh_pdfs_4 = camelot.read_pdf(ohio_pdf_file, flavor='stream', pages='38')

# number of tables camelot detected on that page
print(len(oh_pdfs_4))

Parsing Ohio, 35/50...
1


In [None]:
# first (and only) table detected on the Table 4 page
oh_4 = oh_pdfs_4[0].df
# drop the first five rows and renumber from zero
oh_4 = oh_4.iloc[5:].copy().reset_index(drop=True)
# set column names
oh_4.columns = ['Entity', 'Project', 'County', 'Estimated.Loan.Amount', 'Loan.Type', 'Estimated.Award.Date', 'Rate']

# write next to the other Ohio exports; the previous literal began with a
# stray "f" inside the quotes, which pointed at a non-existent "f.." directory
oh_4.to_csv("../data/35-Ohio_PPL_HAB_PFAS.csv", index=False)

## Ohio Table 5: Projects Eligible for Lead Service Line (LSL) Funding

In [61]:

# Parse pages 39-40 (Table 5) of the Ohio IUP/PPL document with the stream flavor.
ohio_pdf_file = path + "/OH_IUPandPPL_2023.pdf"
oh_pdfs_5 = camelot.read_pdf(ohio_pdf_file, flavor='stream', pages='39-40')

# number of tables camelot detected across those pages
print(len(oh_pdfs_5))

Parsing Ohio, 35/50...
2


In [75]:
# Table 5, first page: name the columns, keep rows 5 through 43, renumber.
first_lsl_page = oh_pdfs_5[0].df
first_lsl_page.columns = ['Entity', 'Project', 'County', 'Estimated.Loan.Amount', 'Estimated.LSL.Eligible.Costs', 'Loan.Type', 'Estimated.Award.Date', 'Rate']
oh_5 = first_lsl_page.iloc[5:44].copy().reset_index(drop=True)

# both money columns: strip the line break first, then the leftover space
for money_col in ('Estimated.Loan.Amount', 'Estimated.LSL.Eligible.Costs'):
    oh_5[money_col] = oh_5[money_col].str.replace("\n", "").str.replace(" ", "")

oh_5

Unnamed: 0,Entity,Project,County,Estimated.Loan.Amount,Estimated.LSL.Eligible.Costs,Loan.Type,Estimated.Award.Date,Rate
0,Addyston,Lead Service Line Replacement,Hamilton,"$88,269","$88,269",Design,Jun-23,PLN/DES
1,Addyston,Lead Service Line Replacement,Hamilton,"$1,881,488","$1,881,488",Construction,Jun-23,LSL
2,Addyston,Sekitan Avenue Water Main and LSL Replacement,Hamilton,"$79,233","$22,057",Design,Jan-23,PLN/DES
3,Addyston,Sekitan Avenue Lead Service Line Replacement,Hamilton,"$558,394","$558,394",Construction,Jun-23,LSL
4,Akron,Lead Service Line Replacement Program 2022,Summit,"$5,000,000","$5,000,000",Construction,Sep-22,LSL
5,Alliance,"Lead Service Line Replacement Project, Phase 3",Stark,"$1,000,000","$1,000,000",Construction,Jun-23,LSL
6,Bellaire,Lead Service Line Replacement,Belmont,"$1,000,000","$1,000,000",Construction,Sep-22,LSL
7,Bowerston,Water System (LSLR) Improvements Project,Harrison,"$225,000","$225,000",Construction,Mar-23,LSL
8,Cadiz,Lead Service Line Replacement Project,Harrison,"$578,000","$578,000",Construction,Sep-22,LSL
9,Cincinnati,Apple Cooper Hanfield Water Main Replacement,Hamilton,"$2,307,000","$963,300",Construction,May-23,STD/LSL


In [76]:
# Table 5, second page: same treatment as the first page.
second_lsl_page = oh_pdfs_5[1].df
second_lsl_page.columns = ['Entity', 'Project', 'County', 'Estimated.Loan.Amount', 'Estimated.LSL.Eligible.Costs', 'Loan.Type', 'Estimated.Award.Date', 'Rate']

# both money columns: strip the line break first, then the leftover space
for money_col in ('Estimated.Loan.Amount', 'Estimated.LSL.Eligible.Costs'):
    second_lsl_page[money_col] = second_lsl_page[money_col].str.replace("\n", "").str.replace(" ", "")

# keep rows 5 through 24 and renumber from zero
oh_5_1 = second_lsl_page.iloc[5:25].copy().reset_index(drop=True)
oh_5_1

Unnamed: 0,Entity,Project,County,Estimated.Loan.Amount,Estimated.LSL.Eligible.Costs,Loan.Type,Estimated.Award.Date,Rate
0,Licking County,Harbor Hills Watermain and Lead Service Line R...,Licking,"$11,000,000","$1,300,000",Construction,Jan-23,SML/LSL
1,Licking County,Prescott Estates Water Line and New Tank*,Licking,"$2,901,300","$466,000",Construction,Feb-23,SML/LSL
2,Licking County - Harbor Hills,Water Main and Lead Service Line Replacement*,Licking,"$950,000","$369,204",Design,Aug-22,PLN/DES
3,Lockland,Water Treatment System Upgrades,Hamilton,"$9,900,000","$900,000",Construction,Jun-23,SML/LSL
4,Lorain,2023 Lead Service line Replacement Project,Lorain,"$5,000,000","$5,000,000",Construction,May-23,LSL
5,Malvern,Phase 1 Waterline Replacement,Carroll,"$506,000","$17,750",Construction,Sep-22,DIS/LSL
6,Malvern,Waterline Replacement Phase 2,Carroll,"$3,886,000","$62,000",Construction,Jun-23,DIS/LSL
7,New London,Coleman Court/Clinton Street Lead Waterline Re...,Huron,"$369,204","$369,204",Construction,Jun-23,LSL
8,North Baltimore,Watermain Replacement Project,Wood,"$4,838,251","$30,000",Construction,Oct-22,SML/LSL
9,Port Clinton,Water and Sanitary Sewer Infrastructure Improv...,Ottawa,"$10,704,265","$750,000",Construction,Jun-23,REG/LSL


In [77]:
# Stack the two Table 5 pages and write the result out.
# NOTE(review): re-running this cell without re-running the page cells
# appends the second page again, because oh_5 is rebound to the combined frame.
lsl_pages = [oh_5, oh_5_1]
oh_5 = pd.concat(lsl_pages)

# NOTE(review): this path differs from the "../data/" folder used by the other
# Ohio exports in this notebook -- confirm it is intentional.
oh_5.to_csv("final_ppls/35-Ohio_LSLR.csv", index=False)

In [4]:
# clean up: remove the temporary download folder and the files inside it
deleteTempDriveFolder(path)

Deleted folder and all files within ../temp-docs
