In [1]:
## Import necessary packages
import pandas as pd
import numpy as np
import tkinter as tk
import PyPDF2 
import tabula
import tkinter
import camelot
import ghostscript
import re
import os
import csv
import urllib
import io
import warnings
# Install a global filter that ignores every warning raised after this point.
# NOTE(review): this also hides deprecation notices from the libraries imported
# above — confirm that silencing all categories is intended.
warnings.filterwarnings("ignore")

## Maryland

### Comprehensive PPL

In [2]:
# Location of the Comprehensive PPL document and the page span to parse
md_comp_url = "https://mde.maryland.gov/programs/water/WQFA/Documents/FFY22%20SFY24%20Final%20DW%20PPL.pdf"
md_comp_pages = '1-9'

# pull the tables out of those pages using camelot's lattice flavor
md_comp = camelot.read_pdf(md_comp_url, pages=md_comp_pages, flavor='lattice')

# report how many entries camelot returned
md_comp_count = len(md_comp)
print(md_comp_count)

9


In [21]:
# Parse every table camelot extracted into tidy columns, collecting one cleaned
# DataFrame per table; the pieces are concatenated after the loop.
list_of_dfs = []

# Raw camelot columns (0-8) plus the scratch columns created while parsing.
# They are dropped once the final fields have been extracted.
md_scratch_columns = [0, 1, 2, 3, 4, 5, 6, 7, 8, 'Size', 'Name.Borrower', 'Size.County', 'Details',
                      'LoanDetails', 'PWSID.Pop.ProjectNumber', 'Pop.ProjectNumber']

for i in range(len(md_comp)):
    # read in each page, skipping its first row (presumably the repeated header row)
    md = md_comp[i].df.iloc[1:,].copy()

    ## split the 0 column (Rank/Points)
    # regex looks for a number greedily, then any amount of white space, then another number greedily
    md[['Rank', 'Points']] = md[0].str.extract(r'(\d+)\s*(\d+)')


    ## split the 1 column (Project Title / Number / Population) by the opening parenthesis before the pwsid
    # multi-character patterns are treated as regular expressions by str.split, so they are
    # written as raw strings: "\(" in a normal string literal is an invalid escape sequence
    md[['ProjectTitle', 'PWSID.Pop.ProjectNumber']] = md[1].str.split(r"\n\(MD", expand=True)

    # split PWSID from Population and ProjectNumber by the closing parenthesis
    md[['PWSID', 'Pop.ProjectNumber']] = md["PWSID.Pop.ProjectNumber"].str.split(r"\)\n", expand=True)
    # add MD back to the PWSID after using it to split earlier columns to avoid issues with multiple splits
    md['PWSID'] = "MD" + md['PWSID']

    # remove the population prefix; regex is passed explicitly because the default of
    # str.replace changed between pandas versions ("." is kept as a wildcard, which also
    # matches a literal period)
    md['Pop.ProjectNumber'] = md['Pop.ProjectNumber'].str.replace(r"Ben.Pop=\n", "", regex=True)

    # split population and project number
    md[['Population', 'ProjectNumber']] = md['Pop.ProjectNumber'].str.split("\n", expand=True)

    # replace line breaks with spaces in ProjectTitle
    md['ProjectTitle'] = md['ProjectTitle'].str.replace("\n", " ", regex=False)


    ## column 2 only needs line breaks replaced with spaces
    md['ProjectDescription'] = md[2].str.replace("\n", " ", regex=False)


    ## column 3 (Applicant Name / County) split by system size and characters around it
    # n=1 guarantees at most two resulting columns, matching the two target columns
    md[["Name.Borrower", "Size.County"]] = md[3].str.split(r"\nSystem Size =\n", expand=True, n=1)

    # split by breaks only once because names are only one line but borrower can be two,
    # then replace remaining line breaks with spaces
    md[["Name", "Borrower"]] = md['Name.Borrower'].str.split("\n", n=1, expand=True)
    md['Borrower'] = md['Borrower'].str.replace("\n", " ", regex=False)

    # split by line break
    md[['Size', 'County']] = md['Size.County'].str.split("\n", expand=True)


    ## column 4 only needs to be split by line breaks, but this data is not currently used in the dashboard
    # md[["LegDistrict", "CongDistrict"]] = md[4].str.split("\n", n=1, expand=True)


    ## column 5 needs to have Disadvantaged extracted from it, the rest is not currently used in the dashboard
    # na=False makes a missing cell count as "No" instead of being treated as truthy by np.where
    md['Disadvantaged'] = np.where(md[5].str.contains("DISADV.", na=False), "Yes", "No")

    ## column 6 can be ignored because we currently use no dates in the dashboard

    ## column 7 splits by line break between Total Cost and a variable we won't use for dashboard purposes
    md[['TotalCost', 'Details']] = md[7].str.split("\n", n=1, expand=True)


    ## column 8 - in each row, the numbers are the same, so just keep the first, similar to column 7
    md[['RequestedFunding', 'LoanDetails']] = md[8].str.split("\n", n=1, expand=True)

    # drop in-process columns
    md = md.drop(columns=md_scratch_columns)

    # append to list
    list_of_dfs.append(md)

# concat with common column names
md_output = pd.concat(list_of_dfs)


In [22]:
# Rebuild the combined table from the per-page frames with a fresh 0..n-1 index
# rather than trimming md_output in place, so re-running this cell always gives
# the same result (the in-place form removed two more rows on every re-run).
md_output = pd.concat(list_of_dfs, ignore_index=True)
# drop total rows (the last two rows of the combined table)
md_output = md_output.iloc[:-2,]

In [23]:
# Bare expression: shows the cleaned table as this cell's output for a visual check
md_output

Unnamed: 0,Rank,Points,ProjectTitle,PWSID,Population,ProjectNumber,ProjectDescription,Name,Borrower,County,Disadvantaged,TotalCost,RequestedFunding
0,1,117,NEW WINTERS RUN WATER TREATMENT FACILITY,MD0120003,15500,DW0016,PROBLEM: 70-YEAR OLD PLANT SITUATED IN 100-YR ...,BARRY L. SUITS,MARYLAND AMERICAN WATER,Harford County,No,18000000,9000000
1,2,110,POCOMOKE CITY WIIN-CIPP LINING MARKET STREET M...,MD023006,4075,DW0004,PROBLEM: HIGH RUST AND CONTAMINANTS IN WATER S...,LINDA MCNEIL,POCOMOKE CITY,Worcester County,Yes,2256700,843923
2,3,100,BRIERCREST APARTMENTS WATER SERVICE,MD0100004,60,DW0017,PROBLEM: CONTAMINATED DRINKING WATER SOURCE SE...,RODNEY WINEBRENNER,FREDERICK COUNTY,Frederick County,No,121178,121178
3,4,97,WINCHESTER ROAD WATER LINE REPLACEMENT (LAVALE),MD0010016,6218,DW0025,PROBLEM: AGING AND FAILING WATER MAIN REQUIRES...,DAVID S. WENDT,LAVALE SANITARY DISTRICT,Allegany County,Yes,5072800,4602800
4,5,97,SPRINGFIELD WELLS / FAIRHAVEN WELL HOUSE,MD006002,32673,DW0015,PROBLEM: PROVIDE A RELIABLE SUPPLEMENTAL AND B...,JEFFREY D. CASTONGUAY,BOARD OF CARROLL CO COMMISSIONERS,Carroll County,No,4030000,1000000
5,6,95,SPRINGVIEW MOBILE HOME PARK WATER SERVICE,MD0100030,93,DW0021,PROBLEM: SOURCE DRINKING WATER CONTAMINATED WI...,RODNEY WINEBRENNER,FREDERICK COUNTY,Frederick County,No,632772,632772
6,7,90,COULBOURNE LANE WELL BACKUP POWER,MD0230007,2150,DW0002,PROBLEM: NO BACKUP POWER TO LARGEST PRODUCING ...,RICK POLLITT,TOWN OF SNOW HILL,Worcester County,Yes,223064,218064
7,8,90,EMMITSBURG NORTH SETON AVENUE WATER LINE REPLA...,MD0100010,2770,DW0018,PROBLEM: WATER PRESSURE COMPLIANCE AND RESOLVE...,CATHY WILLETS,TOWN OF EMMITSBURG,Frederick County,No,1170552,1145552
8,9,90,EMMITSBURG DEPAUL STREET WATER LINE REPLACEMENT,MD0100010,6925,DW0019,PROBLEM: WATER PRESSURE COMPLIANCE AND RESOLVE...,CATHY WILLETS,TOWN OF EMMITSBURG,Frederick County,No,1135000,1110000
9,10,90,MACK WATER STORAGE TANK REPLACEMENT,MD0210010,25580,DW0034,PROBLEM: TANK HAS EXCEEDED USEFUL LIFE AND DEV...,WILLIAM LUHN,CITY OF HAGERSTOWN -- UTILITIES DEPARTMENT,Washington County,Yes,4600000,4000000


In [24]:
# Write the cleaned table to CSV, leaving the DataFrame index out of the file
md_output_csv_path = "../data/year1/csv/20-Maryland_ComprehensivePPL.csv"
md_output.to_csv(md_output_csv_path, index=False)