In [1]:
import pandas as pd
import numpy as np
import camelot.io as camelot
import ghostscript
import sys

# add the directory three levels up to sys.path so driveFolderHandler can be imported
sys.path.insert(0, '../../../')
from driveFolderHandler import downloadTempDriveFolder, deleteTempDriveFolder

In [2]:
# local folder that receives the files downloaded from the Drive folder below;
# later cells read the PDFs from here and the final cell deletes it again
path = "../temp-docs"
downloadTempDriveFolder(
    folderID='1o_DxZ5MoX6OmOsNG88_iuXY-WILYiY_p',
    path=path,
)

Retrieving folder list


Processing file 1_uGOMbnd44RCXV_LVCLaryuzHtD5ENCm MD_IUP_2023.pdf
Processing file 1oEdooB90yJKh2N6pv96Dc4Bv94a3ruOx MD_PPL_2023.pdf
Building directory structure completed


Retrieving folder list completed
Building directory structure
Downloading...
From: https://drive.google.com/uc?id=1_uGOMbnd44RCXV_LVCLaryuzHtD5ENCm
To: /Users/pcork/epic/dw-dashboard/year1/MD/temp-docs/MD_IUP_2023.pdf
100%|██████████| 595k/595k [00:00<00:00, 6.72MB/s]
Downloading...
From: https://drive.google.com/uc?id=1oEdooB90yJKh2N6pv96Dc4Bv94a3ruOx
To: /Users/pcork/epic/dw-dashboard/year1/MD/temp-docs/MD_PPL_2023.pdf
100%|██████████| 738k/738k [00:00<00:00, 6.31MB/s]

Files stored in ../temp-docs



Download completed


## Maryland

### Comprehensive PPL

In [None]:

# extract the tables on pages 1-9 of the downloaded PPL with camelot's lattice parser
md_comp = camelot.read_pdf(
    path + "/MD_PPL_2023.pdf",
    pages='1-9',
    flavor='lattice',
)
# number of tables camelot returned for those pages
print(len(md_comp))

In [21]:
# Clean every table camelot extracted from the comprehensive PPL and collect the
# cleaned frames; md_comp must already hold the tables read from the PDF.
list_of_dfs = []

for i in range(len(md_comp)):
    # work on a copy of the extracted table, skipping its first row
    # NOTE(review): the first row is presumably the header repeated on each page - confirm
    md = md_comp[i].df.iloc[1:,].copy()

    ## column 0 (Rank / Points)
    # the regex captures a run of digits, any amount of white space (including
    # the line break), then a second run of digits
    md[['Rank', 'Points']] = md[0].str.extract(r'(\d+)\s*(\d+)')

    ## column 1 (Project Title / PWSID / Population / Project Number)
    # split on the line break and opening parenthesis in front of the PWSID;
    # raw strings keep the regex escapes from being parsed as invalid Python escapes
    md[['ProjectTitle', 'PWSID.Pop.ProjectNumber']] = md[1].str.split(r"\n\(MD", expand=True)

    # split the PWSID from population and project number at the closing parenthesis
    md[['PWSID', 'Pop.ProjectNumber']] = md["PWSID.Pop.ProjectNumber"].str.split(r"\)\n", expand=True)
    # the "MD" prefix was consumed by the first split, so add it back to the PWSID
    md['PWSID'] = "MD" + md['PWSID']

    # remove the population label; regex=False makes "." a literal period
    # regardless of the pandas version's default
    md['Pop.ProjectNumber'] = md['Pop.ProjectNumber'].str.replace("Ben.Pop=\n", "", regex=False)

    # split population and project number at the line break
    md[['Population', 'ProjectNumber']] = md['Pop.ProjectNumber'].str.split("\n", expand=True)

    # replace line breaks with spaces in ProjectTitle
    md['ProjectTitle'] = md['ProjectTitle'].str.replace("\n", " ", regex=False)

    ## column 2 only needs line breaks replaced with spaces
    md['ProjectDescription'] = md[2].str.replace("\n", " ", regex=False)

    ## column 3 (Applicant Name / Borrower / System Size / County)
    # split at the system size label; n=1 guarantees at most two pieces, matching
    # the two target columns
    md[["Name.Borrower", "Size.County"]] = md[3].str.split("\nSystem Size =\n", expand=True, n=1)

    # split at the first line break only: names take one line but the borrower can
    # take two, so the remaining line breaks become spaces
    md[["Name", "Borrower"]] = md['Name.Borrower'].str.split("\n", n=1, expand=True)
    md['Borrower'] = md['Borrower'].str.replace("\n", " ", regex=False)

    # split system size from county at the line break
    md[['Size', 'County']] = md['Size.County'].str.split("\n", expand=True)

    ## column 4 only needs to be split by line breaks, but this data is not currently used in the dashboard
    # md[["LegDistrict", "CongDistrict"]] = md[4].str.split("\n", n=1, expand=True)

    ## column 5: flag disadvantaged projects, the rest is not currently used in the dashboard
    # na=False maps a missing cell to "No" instead of letting NaN count as truthy
    # NOTE(review): the pattern is a regex, so the trailing "." matches any
    # character after "DISADV" - confirm whether a literal period was intended
    md['Disadvantaged'] = np.where(md[5].str.contains("DISADV.", na=False), "Yes", "No")

    ## column 6 can be ignored because we currently use no dates in the dashboard

    ## column 7: keep the first line (Total Cost); the remainder is not used in the dashboard
    md[['TotalCost', 'Details']] = md[7].str.split("\n", n=1, expand=True)

    ## column 8: the numbers in each row are the same, so keep only the first line, as for column 7
    md[['RequestedFunding', 'LoanDetails']] = md[8].str.split("\n", n=1, expand=True)

    # drop the raw and intermediate columns (md is already a copy, so reassigning is enough)
    md = md.drop(columns=[0, 1, 2, 3, 4, 5, 6, 7, 8, 'Size', 'Name.Borrower', 'Size.County',
                          'Details', 'LoanDetails', 'PWSID.Pop.ProjectNumber', 'Pop.ProjectNumber'])

    # append to list
    list_of_dfs.append(md)

# stack the per-table frames, which share the same column names
md_output = pd.concat(list_of_dfs)


In [22]:
# Rebuild md_output from list_of_dfs on every run instead of trimming it in
# place, so re-running this cell cannot drop additional rows.
# ignore_index=True renumbers the rows 0..n-1; iloc[:-2] drops the two trailing total rows.
md_output = pd.concat(list_of_dfs, ignore_index=True).iloc[:-2]

In [24]:
# write the cleaned comprehensive PPL to csv, leaving out the row index
md_output.to_csv(
    path_or_buf="../data/year1/csv/20-Maryland_ComprehensivePPL.csv",
    index=False,
)

In [4]:
# remove the temporary download folder and the files in it
# NOTE(review): run this only after the parsing cells above, which read the PDFs from `path`
deleteTempDriveFolder(path)

Deleted folder and all files within ../temp-docs
