In [2]:
import pandas as pd
import time
import numpy as np
import tika
import glob

from tika import parser

# Load the geocoded priorities list into a DataFrame.
df = pd.read_json("./data/priorities_list_waddr_geocoded.json")

In [11]:
#pull out the unique id from the narrative url, which can be matched to the downloaded pdfs
def extract_id(row):
    """Attach a ``site_id`` entry derived from the row's narrative URL.

    The id is the text after the final ``/`` of ``row.site_narrative_url``.

    Parameters
    ----------
    row : pandas.Series
        A row exposing a ``site_narrative_url`` entry.

    Returns
    -------
    pandas.Series
        The same row with ``site_id`` set to the last URL segment, or to
        ``None`` when the URL entry is absent or is not a string.
    """
    try:
        row['site_id'] = row.site_narrative_url.split('/')[-1]
    except AttributeError:
        # Raised when the entry is missing from the row or its value has no
        # ``.split`` (e.g. None/NaN). Anything else is unexpected and is
        # allowed to surface instead of being silently swallowed.
        row['site_id'] = None
    return row
# Run extract_id over every row so each record gains a site_id entry.
df = df.apply(extract_id, axis="columns")

In [26]:
#extract text from the pdf documents
def extract_text(row):
    """Attach the text of the row's narrative PDF as ``row['text']``.

    The PDF is looked up at ``./data/epa_narratives/<site_id>.pdf`` and parsed
    with ``parser.from_file``; the ``"content"`` entry of the result is
    stripped of leading and trailing whitespace.

    Parameters
    ----------
    row : pandas.Series
        A row exposing a ``site_id`` entry.

    Returns
    -------
    pandas.Series
        The same row with ``text`` set to the extracted text, or to ``None``
        when any step (building the path, parsing, stripping) raises.
    """
    try:
        file = './data/epa_narratives/' + row.site_id + '.pdf'
        parsed = parser.from_file(file)
        row['text'] = parsed["content"].strip()
    except Exception:
        # Best-effort: a failure for one document leaves its text empty.
        # Catching ``Exception`` rather than using a bare ``except`` lets
        # KeyboardInterrupt through, so a long extraction run can be stopped.
        row['text'] = None
    return row
# Pull the PDF text into each row, then discard the helper site_id column.
df = df.apply(extract_text, axis="columns").drop(columns=["site_id"])

# For demographic information, I will use the Census Bureau planning database:
* https://www.census.gov/research/data/planning_database/2015/

In [46]:
# Preparing the priorities list to merge.
# The incoming FIPS_Block_Group column is kept under the name FIPS_Full, and a
# shortened FIPS_Block_Group key is derived from it for the join with the
# planning database.
if 'FIPS_Full' not in df.columns:
    # Guard keeps the cell re-runnable: a second rename would otherwise
    # produce two columns that are both called FIPS_Full.
    df = df.rename(columns={'FIPS_Block_Group': 'FIPS_Full'})

# Missing codes become 0. Assigning the whole column avoids the chained
# `df.col[mask] = 0` form, which writes to a temporary under pandas
# copy-on-write and would leave the NaNs in place for the integer cast.
# int64 is spelled out so the width does not depend on the platform default.
df['FIPS_Full'] = df['FIPS_Full'].fillna(0).astype('int64')

# Integer-dividing by 1000 drops the last three digits of the full code.
df['FIPS_Block_Group'] = df['FIPS_Full'] // 1000

In [58]:
# Read the 2015 block-group planning database and expose its GIDBG
# identifier under the FIPS_Block_Group name used by the priorities list.
pdb = pd.read_csv(
    '/btrvol/Research/Datasets/Census/PDB_2015_Block_Group.csv',
    encoding="ISO-8859-1",
).rename(columns={'GIDBG': 'FIPS_Block_Group'})

In [66]:
# Left-join the planning database onto the priorities list (every row of the
# list is kept, matched or not) and write the combined table to JSON.
df_priority = pd.merge(df, pdb, how='left', on='FIPS_Block_Group')
df_priority.to_json('./data/priorities_list_full.json')

In [70]:
# adding in an indicator for whether or not a block group has a superfund site
def has_superfund(row):
    """Flag one planning-database row by whether its block group has a site.

    Sets ``row['has_superfund']`` to 1 when ``row.FIPS_Block_Group`` occurs in
    the module-level ``df.FIPS_Block_Group`` and to 0 otherwise, then returns
    the row.

    Kept for row-at-a-time use; the full table below is flagged with a single
    vectorized ``isin`` rather than by calling this once per row.
    """
    row['has_superfund'] = int(row.FIPS_Block_Group in list(df.FIPS_Block_Group))
    return row


# One vectorized membership test over the whole column. The row-wise
# `pdb.apply(has_superfund, axis=1)` rebuilt the list of site codes for every
# planning-database row and passed each row through a per-row Series.
pdb_block = pdb.assign(
    has_superfund=pdb['FIPS_Block_Group'].isin(df['FIPS_Block_Group']).astype(int)
)

pdb_block.to_csv("./data/pdb_block_group.csv", index=False)

In [74]:
# adding in an indicator for whether or not a census tract has a superfund site
# Swap in the tract-level planning database, naming its GIDTR identifier
# FIPS_Tract, and derive the matching FIPS_Tract code on the priorities list.
pdb = pd.read_csv(
    '/btrvol/Research/Datasets/Census/PDB_2015_Tract.csv',
    encoding="ISO-8859-1",
).rename(columns={'GIDTR': 'FIPS_Tract'})
df['FIPS_Tract'] = df['FIPS_Full'] // 10000

def has_superfund(row):
    """Flag one planning-database row by whether its tract has a site.

    Sets ``row['has_superfund']`` to 1 when ``row.FIPS_Tract`` occurs in the
    module-level ``df.FIPS_Tract`` and to 0 otherwise, then returns the row.

    Note: this reuses the name of the block-group ``has_superfund`` defined
    earlier in the notebook and replaces it from this cell onward. It is kept
    for row-at-a-time use; the full table below is flagged with a single
    vectorized ``isin`` rather than by calling this once per row.
    """
    row['has_superfund'] = int(row.FIPS_Tract in list(df.FIPS_Tract))
    return row


# One vectorized membership test over the whole column. The row-wise
# `pdb.apply(has_superfund, axis=1)` rebuilt the list of site codes for every
# planning-database row and passed each row through a per-row Series.
pdb_tract = pdb.assign(
    has_superfund=pdb['FIPS_Tract'].isin(df['FIPS_Tract']).astype(int)
)

pdb_tract.to_csv("./data/pdb_tract.csv", index=False)