### Libraries

In [1]:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import pandas as pd
import numpy as np
import pickle
from datetime import datetime
from time import sleep
import swifter
import pickle

In [2]:
# EPA page listing the Superfund National Priorities List (NPL) sites by
# state; this is the page the notebook scrapes.
site = "https://www.epa.gov/superfund/national-priorities-list-npl-sites-state"

In [3]:
# Load the NPL page in a headless Firefox session and keep the rendered HTML
# in `site_source`.
brower = None  # stays None if Firefox fails to start, so cleanup can tell
try:
    fireFoxOptions = webdriver.FirefoxOptions()
    # Pass the command-line flag directly: newer Selenium 4 releases no longer
    # honour the `headless` attribute on the options object.
    fireFoxOptions.add_argument('-headless')
    brower = webdriver.Firefox(options=fireFoxOptions)
    brower.get(site)
    # Fixed pause before reading the DOM -- presumably to let the page's
    # scripts finish populating the tables.
    sleep(10)
    site_source = brower.page_source
finally:
    if brower is not None:
        try:
            # quit() ends the whole driver session; close() only closes the
            # current window.
            brower.quit()
        except Exception as exc:
            # Cleanup is best-effort: report the problem instead of masking
            # an error raised while scraping.
            print(f'Could not shut down Firefox cleanly: {exc!r}')

In [4]:
# Parse the rendered page and collect every <table> element found in it.
soup = BeautifulSoup(markup=site_source, features='html.parser')
scrap_tables = soup.find_all('table')

In [5]:
def tableDataTextSTATE(table):
    """
    Convert one scraped <table> into a list of rows, header row first.

    Adapted from eusoubrasileiro on Stack Overflow:
    https://stackoverflow.com/questions/2935658/beautifulsoup-get-the-contents-of-a-specific-table

    The column names are read from the <th> cells of the table's second <tr>;
    every following <tr> is treated as a data row. Two extra columns are
    appended to each row:

    * ``State`` -- stripped text of the first <span> inside the table
      (``None`` when the table has no <span>).
    * ``URL``   -- ``href`` of the first <a> in the row's last <td>
      (``None`` when that cell has no link or the link has no ``href``).

    Rows without any <td> cells are skipped rather than raising.

    Parameters
    ----------
    table : bs4 Tag (or any object offering ``find_all``/``find``)
        The <table> element to parse.

    Returns
    -------
    list[list]
        ``[header, row, row, ...]``, each a flat list of cell values.

    Raises
    ------
    IndexError
        If the table has fewer than two <tr> elements.
    """
    def rowgetDataText(tr, coltag='td'):  # td (data) or th (header)
        return [cell.get_text(strip=True) for cell in tr.find_all(coltag)]

    trs = table.find_all('tr')
    # The header always carries the two extra column names, so it is always
    # emitted (callers rely on element 0 being the header).
    rows = [rowgetDataText(trs[1], 'th') + ['State', 'URL']]

    # The state label is the same for every row: look it up once.
    state_tag = table.find('span')
    state = state_tag.get_text(strip=True) if state_tag is not None else None

    for tr in trs[2:]:
        cells = tr.find_all('td')
        if not cells:
            # No data cells, so there is nothing to record and no last cell
            # to read a link from.
            continue
        link = cells[-1].find('a')
        url = link.get('href') if link is not None else None
        rows.append([cell.get_text(strip=True) for cell in cells] + [state, url])
    return rows

In [6]:
# Run the table parser over every scraped <table>; each result is a list of
# rows whose first element is the header row.
list_table_results = list(map(tableDataTextSTATE, scrap_tables))

In [7]:
# One DataFrame per parsed table: the first row supplies the column names and
# the remaining rows supply the data.
list_dfs = []
for parsed_rows in list_table_results:
    header, body = parsed_rows[0], parsed_rows[1:]
    list_dfs.append(pd.DataFrame(body, columns=header))

In [8]:
# Spot-check a single per-table DataFrame (index 3; the saved output below
# shows the Arizona sites).
list_dfs[3]

Unnamed: 0,Site Name,City,Site EPA ID,Listing Date,Site Score,FederalFacilityIndicator,Additional Information,Site Location,State,URL
0,Apache Powder Co.,St. David,AZD008399263,08/30/1990,39.09,No,Site Listing NarrativeSite Progress ProfileFed...,Site Location,Arizona,https://epa.maps.arcgis.com/apps/webappviewer/...
1,Hassayampa Landfill,Hassayampa,AZD980735666,07/22/1987,42.79,No,Site Listing NarrativeSite Progress ProfileFed...,Site Location,Arizona,https://epa.maps.arcgis.com/apps/webappviewer/...
2,Indian Bend Wash Area,Scottsdale,AZD980695969,09/08/1983,42.24,No,Site Listing NarrativeSite Progress ProfileFed...,Site Location,Arizona,https://epa.maps.arcgis.com/apps/webappviewer/...
3,Iron King Mine - Humboldt Smelter,Dewey-Humboldt,AZ0000309013,09/03/2008,52.69,No,Site Listing NarrativeSite Progress ProfileFed...,Site Location,Arizona,https://epa.maps.arcgis.com/apps/webappviewer/...
4,"Motorola, Inc. (52nd Street Plant)",Phoenix,AZD009004177,10/04/1989,40.83,No,Site Listing NarrativeSite Progress ProfileFed...,Site Location,Arizona,https://epa.maps.arcgis.com/apps/webappviewer/...
5,Phoenix-Goodyear Airport Area,"Avondale, Goodyear",AZD980695902,09/08/1983,45.91,No,Site Listing NarrativeSite Progress ProfileFed...,Site Location,Arizona,https://epa.maps.arcgis.com/apps/webappviewer/...
6,Tucson International Airport Area,Tucson,AZD980737530,09/08/1983,57.8,No,Site Listing NarrativeSite Progress ProfileFed...,Site Location,Arizona,https://epa.maps.arcgis.com/apps/webappviewer/...
7,Williams Air Force Base,Chandler,AZ7570028582,11/21/1989,37.93,Yes,Site Listing NarrativeSite Progress ProfileFed...,Site Location,Arizona,https://epa.maps.arcgis.com/apps/webappviewer/...
8,Yuma Marine Corps Air Station,Yuma,AZ0971590062,02/21/1990,32.24,Yes,Site Listing NarrativeSite Progress ProfileFed...,Site Location,Arizona,https://epa.maps.arcgis.com/apps/webappviewer/...


In [9]:
# Stack the per-table frames into one DataFrame with a fresh 0..n-1 index.
df_sites = pd.concat(list_dfs, ignore_index=True)

In [10]:
# Add NaN placeholder columns for the coordinates; a later cell overwrites
# them with the scraped values.
for coord_column in ('latitude', 'longitude'):
    df_sites[coord_column] = np.nan

In [14]:
# Pair each map URL with its row position, giving (position, url) tuples, so
# every scrape result can be matched back to its row in df_sites.
input_list = list(enumerate(df_sites['URL']))

In [17]:
# Show the first (position, url) work item to confirm its shape.
input_list[0]

(0,
 "https://epa.maps.arcgis.com/apps/webappviewer/index.html?id=33cebcdfdd1b4c3a8b51d416956c41f1&query=Superfund_National_Priorities_List__NPL__Sites_with_Status_Information_7557,SITE_EPA_ID='AL6210020008'")

In [25]:
import requests
from urllib.parse import urljoin
from multiprocessing.pool import ThreadPool, Pool
from bs4 import BeautifulSoup
from selenium import webdriver
import threading
from time import sleep
from random import randint

def get_links(link):
    """Return absolute URLs of the question links found on the page at `link`.

    Fetches `link` over HTTP, selects every element matching
    ``.summary .question-hyperlink`` and resolves its ``href`` against `link`.

    NOTE(review): no other cell in the visible notebook calls this helper.

    Parameters
    ----------
    link : str
        URL of the page to fetch.

    Returns
    -------
    list[str]
        Absolute URLs, in document order.
    """
    # A timeout keeps an unresponsive server from blocking the kernel forever.
    res = requests.get(link, timeout=30)
    soup = BeautifulSoup(res.text, "lxml")
    # Resolve relative hrefs against the page that was fetched. The original
    # joined against `url`, a name that is not defined inside this function.
    return [
        urljoin(link, items.get("href"))
        for items in soup.select(".summary .question-hyperlink")
    ]

# Per-thread storage so each worker thread keeps its own WebDriver instance.
threadLocal = threading.local()


def get_driver():
    """Return the calling thread's Firefox WebDriver, creating it on first use.

    The driver is stored on `threadLocal`, so repeated calls from the same
    thread reuse one browser and different threads never share one.

    NOTE(review): nothing in this block quits the drivers it creates; shutting
    the browsers down is left to the caller.

    Returns
    -------
    selenium.webdriver.Firefox
        The headless driver owned by the current thread.
    """
    driver = getattr(threadLocal, 'driver', None)
    if driver is None:
        fireFoxOptions = webdriver.FirefoxOptions()
        # Pass the command-line flag directly: newer Selenium 4 releases no
        # longer honour the `headless` attribute on the options object.
        fireFoxOptions.add_argument('-headless')
        driver = webdriver.Firefox(options=fireFoxOptions)
        threadLocal.driver = driver
    return driver


def get_latlong(url):
    """Scrape latitude and longitude from one site's map page.

    Parameters
    ----------
    url : tuple
        ``(index, page_url)``: `index` is only used for the progress line and
        is returned unchanged; `page_url` is the page that gets loaded.

    Returns
    -------
    list
        Always three items, ``[latitude, longitude, url]``. Both coordinates
        are ``np.nan`` when the page could not be loaded or parsed.
    """
    driver = get_driver()

    try:
        # Random 1-10 s delay -- presumably to stagger the worker threads'
        # requests.
        sleep(randint(1, 10))
        driver.get(url[1])
        # Fixed pause before reading the DOM -- presumably to let the page
        # finish rendering.
        sleep(5)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        # The coordinates are read from the 3rd and 4th <span> of the first
        # table on the page.
        coord_spans = soup.find_all('table')[0].find_all('span')[2:4]
        coords = [float(span.get_text(strip=True)) for span in coord_spans]
        if len(coords) != 2:
            # A short slice would otherwise produce a result with fewer than
            # three items and break callers that index into it.
            raise ValueError(f'expected 2 coordinate spans, found {len(coords)}')
        lat_long_out = coords + [url]
    except Exception:
        # Best-effort scrape: record the failure as NaN. KeyboardInterrupt and
        # SystemExit still propagate, so the run can be interrupted.
        lat_long_out = [np.nan, np.nan, url]
    print(f'{url[0]} - {lat_long_out[0:2]}')
    return lat_long_out
if __name__ == '__main__':
    # Sample map URL (same value as input_list[0][1] in the saved output); the
    # pool call below does not use it.
    url = "https://epa.maps.arcgis.com/apps/webappviewer/index.html?id=33cebcdfdd1b4c3a8b51d416956c41f1&query=Superfund_National_Priorities_List__NPL__Sites_with_Status_Information_7557,SITE_EPA_ID='AL6210020008'"
    # Scrape every (index, url) pair on 20 worker threads; map() returns the
    # results in input order.
    pool = ThreadPool(20)
    try:
        test = pool.map(get_latlong, input_list)
    finally:
        # Same teardown the context-manager form performs on exit.
        pool.terminate()

170 - [39.127781, -75.466669]
85 - [38.703889, -119.656111]
255 - [21.388889, -157.983333]
17 - [55.626442, -132.558853]
153 - [41.980556, -73.047222]
0 - [33.3381, -86.3268]
238 - [31.2225, -84.20333]
136 - [39.236111, -106.2692]
34 - [33.1825, -92.673333]
51 - [37.367361, -120.5636]
221 - [29.060831, -81.265]
272 - [41.327778, -89.304167]
119 - [33.946178, -118.176922]
204 - [27.678611, -82.477778]
187 - [28.057778, -82.3175]
102 - [34.404169, -118.9028]
171 - [39.1558, -75.5277]
86 - [37.318611, -121.8644]
289 - [41.7068, -87.9422]
306 - [42.2923, -88.4402]
68 - [37.4214, -122.1434]
323 - [39.186075, -85.930472]
256 - [47.543061, -116.1617]
18 - [31.88, -110.24]
1 - [33.399722, -86.405]
137 - [40.055833, -105.503611]
154 - [41.43195, -73.035281]
239 - [34.0207, -85.2717]
52 - [39.1125, -123.193]
324 - [41.614061, -87.426447]
35 - [34.878, -92.1312]
222 - [27.136667, -80.199167]
273 - [41.8326, -88.1787]
120 - [37.40805, -122.0786]
188 - [29.675, -82.323061]
205 - [30.3442, -81.6265]

In [37]:
# Empty accumulators for the values unpacked from the scrape results.
lat_list, long_list, index_list, url_list = [], [], [], []
# Peek at the first result to confirm its [lat, long, (index, url)] layout.
test[0]

[33.3381,
 -86.3268,
 (0,
  "https://epa.maps.arcgis.com/apps/webappviewer/index.html?id=33cebcdfdd1b4c3a8b51d416956c41f1&query=Superfund_National_Priorities_List__NPL__Sites_with_Status_Information_7557,SITE_EPA_ID='AL6210020008'")]

In [38]:
# Split each [lat, long, (index, url)] result into four parallel lists.
# The lists are rebuilt from scratch here, so re-running this cell no longer
# appends duplicates to lists created in an earlier cell.
lat_list = [result[0] for result in test]
long_list = [result[1] for result in test]
index_list = [result[2][0] for result in test]
url_list = [result[2][1] for result in test]

In [42]:
# Attach the scraped values to df_sites. The two `test_*` columns carry the
# index and URL returned by the workers so alignment can be checked below.
for column_name, column_values in (
    ('latitude', lat_list),
    ('longitude', long_list),
    ('test_index', index_list),
    ('test_url', url_list),
):
    df_sites[column_name] = column_values

In [46]:
# Sanity check: the URLs returned by the workers match df_sites row for row.
df_sites['URL'].tolist() == df_sites['test_url'].tolist()

True

In [47]:
# Sanity check: the positions returned by the workers match the frame's index.
df_sites.index.tolist() == df_sites['test_index'].tolist()

True

In [48]:
# Number of rows whose latitude could not be scraped.
int(df_sites['latitude'].isna().sum())

1

In [49]:
# Number of rows whose longitude could not be scraped.
int(df_sites['longitude'].isna().sum())

1

In [50]:
# Show the rows that still lack a longitude.
df_sites.loc[df_sites['longitude'].isna()]

Unnamed: 0,Site Name,City,Site EPA ID,Listing Date,Site Score,FederalFacilityIndicator,Additional Information,Site Location,State,URL,latitude,longitude,test_index,test_url
969,Boarhead Farms,Bridgeton Township,PAD047726161,03/31/1989,39.92,No,Site Listing NarrativeSite Progress ProfileFed...,Site Location,Pennsylvania,https://epa.maps.arcgis.com/apps/webappviewer/...,,,969,https://epa.maps.arcgis.com/apps/webappviewer/...


In [None]:
def get_latlong(url):
    """Scrape latitude and longitude from one site's map page.

    This cell re-defines `get_latlong`, which an earlier cell also defines;
    the definition here is the one used by the cells that follow.

    Parameters
    ----------
    url : tuple
        ``(index, page_url)``: `index` is only used for the progress line and
        is returned unchanged; `page_url` is the page that gets loaded.

    Returns
    -------
    list
        Always three items, ``[latitude, longitude, url]``. Both coordinates
        are ``np.nan`` when the page could not be loaded or parsed.
    """
    driver = get_driver()

    try:
        # Random 1-10 s delay -- presumably to stagger requests.
        sleep(randint(1, 10))
        driver.get(url[1])
        # Fixed pause before reading the DOM -- presumably to let the page
        # finish rendering.
        sleep(5)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        # The coordinates are read from the 3rd and 4th <span> of the first
        # table on the page.
        coord_spans = soup.find_all('table')[0].find_all('span')[2:4]
        coords = [float(span.get_text(strip=True)) for span in coord_spans]
        if len(coords) != 2:
            # A short slice would otherwise produce a result with fewer than
            # three items and break callers that index into it.
            raise ValueError(f'expected 2 coordinate spans, found {len(coords)}')
        lat_long_out = coords + [url]
    except Exception:
        # Best-effort scrape: record the failure as NaN. KeyboardInterrupt and
        # SystemExit still propagate, so the run can be interrupted.
        lat_long_out = [np.nan, np.nan, url]
    print(f'{url[0]} - {lat_long_out[0:2]}')
    return lat_long_out

In [56]:
# Retry the one site that came back without coordinates (row 969). The 0 is
# only the label printed in the progress line.
fix_nan = get_latlong((0, df_sites.loc[969, 'URL']))

0 - [40.545831, -75.125]


In [60]:
# Write the retried coordinates into row 969; fix_nan is laid out as
# [latitude, longitude, ...], so zip stops after the two coordinates.
for coord_column, coord_value in zip(('latitude', 'longitude'), fix_nan):
    df_sites.loc[969, coord_column] = coord_value

In [61]:
# Re-count missing longitudes after the retry.
int(df_sites['longitude'].isna().sum())

0

In [62]:
# Re-count missing latitudes after the retry.
int(df_sites['latitude'].isna().sum())

0

In [66]:
# Persist the geocoded site table. The `with` block closes the file as soon
# as the dump finishes, so the pickle is fully flushed to disk.
with open('df_superfund_geo.pkl', 'wb') as pickle_file:
    pickle.dump(df_sites, pickle_file)