### Libraries

In [1]:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import pandas as pd
import numpy as np
import pickle
from datetime import datetime
from time import sleep
import swifter

In [2]:
site = 'https://www.epa.gov/superfund/national-priorities-list-npl-sites-state'

In [3]:
try:
    fireFoxOptions = webdriver.FirefoxOptions()
    fireFoxOptions.headless = True
    brower = webdriver.Firefox(options=fireFoxOptions)
    brower.get(site)
    sleep(10)
    site_source = brower.page_source
finally:
    try:
        brower.close()
    except:
        pass

In [4]:
soup = BeautifulSoup(site_source, 'html.parser')
scrap_tables = soup.findAll('table')

In [5]:
def tableDataTextSTATE(table):    
    """
    function pulled from eusoubrasileiro on stackoverflow
    https://stackoverflow.com/questions/2935658/beautifulsoup-get-the-contents-of-a-specific-table
    Parses a html segment started with tag <table> followed 
    by multiple <tr> (table rows) and inner <td> (table data) tags. 
    It returns a list of rows with inner columns. 
    Accepts only one <th> (table header/data) in the first row.
    """
    def rowgetDataText(tr, coltag='td'): # td (data) or th (header)       
        return [td.get_text(strip=True) for td in tr.find_all(coltag)]  
    rows = []
    trs = table.find_all('tr')
    headerow = rowgetDataText(trs[1], 'th') + ['State','URL']
    if headerow: # if there is a header row include first
        rows.append(headerow)
        trs = trs[2:]
    for tr in trs: # for every table row
        rows.append(rowgetDataText(tr, 'td') + [table.find('span').get_text(strip=True), tr.findAll('td')[-1].find('a')['href']]) # data row       
    return rows

In [6]:
list_table_results = [tableDataTextSTATE(x) for x in scrap_tables]

In [7]:
list_dfs = [pd.DataFrame(x[1:], columns=x[0]) for x in list_table_results]

In [8]:
# look at one of the state results
list_dfs[3]

Unnamed: 0,Site Name,City,Site EPA ID,Listing Date,Site Score,FederalFacilityIndicator,Additional Information,Site Location,State,URL
0,Apache Powder Co.,St. David,AZD008399263,08/30/1990,39.09,No,Site Listing NarrativeSite Progress ProfileFed...,Site Location,Arizona,https://epa.maps.arcgis.com/apps/webappviewer/...
1,Hassayampa Landfill,Hassayampa,AZD980735666,07/22/1987,42.79,No,Site Listing NarrativeSite Progress ProfileFed...,Site Location,Arizona,https://epa.maps.arcgis.com/apps/webappviewer/...
2,Indian Bend Wash Area,Scottsdale,AZD980695969,09/08/1983,42.24,No,Site Listing NarrativeSite Progress ProfileFed...,Site Location,Arizona,https://epa.maps.arcgis.com/apps/webappviewer/...
3,Iron King Mine - Humboldt Smelter,Dewey-Humboldt,AZ0000309013,09/03/2008,52.69,No,Site Listing NarrativeSite Progress ProfileFed...,Site Location,Arizona,https://epa.maps.arcgis.com/apps/webappviewer/...
4,"Motorola, Inc. (52nd Street Plant)",Phoenix,AZD009004177,10/04/1989,40.83,No,Site Listing NarrativeSite Progress ProfileFed...,Site Location,Arizona,https://epa.maps.arcgis.com/apps/webappviewer/...
5,Phoenix-Goodyear Airport Area,"Avondale, Goodyear",AZD980695902,09/08/1983,45.91,No,Site Listing NarrativeSite Progress ProfileFed...,Site Location,Arizona,https://epa.maps.arcgis.com/apps/webappviewer/...
6,Tucson International Airport Area,Tucson,AZD980737530,09/08/1983,57.8,No,Site Listing NarrativeSite Progress ProfileFed...,Site Location,Arizona,https://epa.maps.arcgis.com/apps/webappviewer/...
7,Williams Air Force Base,Chandler,AZ7570028582,11/21/1989,37.93,Yes,Site Listing NarrativeSite Progress ProfileFed...,Site Location,Arizona,https://epa.maps.arcgis.com/apps/webappviewer/...
8,Yuma Marine Corps Air Station,Yuma,AZ0971590062,02/21/1990,32.24,Yes,Site Listing NarrativeSite Progress ProfileFed...,Site Location,Arizona,https://epa.maps.arcgis.com/apps/webappviewer/...


In [9]:
# combine all the small dfs together
df_sites = pd.concat(list_dfs).reset_index(drop = True)

In [10]:
df_sites['latitude'] = np.nan
df_sites['longitude'] = np.nan

In [11]:
def get_coordinates(url):
    try:
        lat_long = []
        from selenium import webdriver
        from bs4 import BeautifulSoup
        from time import sleep
        fireFoxOptions = webdriver.FirefoxOptions()
        fireFoxOptions.headless = True
        brower = webdriver.Firefox(options=fireFoxOptions)
        brower.get(url)
        sleep(5)
        site_source = brower.page_source
        soup = BeautifulSoup(site_source, 'html.parser')
        scrap_tables = soup.findAll('table')
        lat_long = scrap_tables[0].findAll('span')[2:4]
        lat_long = [float(x.get_text(strip = True)) for x in lat_long]
    except:
        print(url)
    finally:
        try:
            brower.close()
            return lat_long
        except:
            pass
    

In [20]:
def get_coordinates(url):
    lat_long = []
    from selenium import webdriver
    from bs4 import BeautifulSoup
    from time import sleep
    fireFoxOptions = webdriver.FirefoxOptions()
    fireFoxOptions.headless = True
    brower = webdriver.Firefox(options=fireFoxOptions)
    brower.get(url)
    sleep(5)
    site_source = brower.page_source
    soup = BeautifulSoup(site_source, 'html.parser')
    scrap_tables = soup.findAll('table')
    try:
        lat_long = scrap_tables[0].findAll('span')[2:4]
        lat_long = [float(x.get_text(strip = True)) for x in lat_long] + [url]
        brower.close()
    except:
        lat_long = [np.nan, np.nan, url]

    return lat_long


In [19]:
url = "https://epa.maps.arcgis.com/apps/webappviewer/index.html?id=33cebcdfdd1b4c3a8b51d416956c41f1&query=Superfund_National_Priorities_List__NPL__Sites_with_Status_Information_7557,SITE_EPA_ID='AL6210020008'"

In [15]:
get_coordinates(url)

[33.3381, -86.3268]

In [None]:
# parallel processing isn't working because of limitations for selenium
import multiprocessing
pool = multiprocessing.Pool(processes = 30)

result = pool.map(get_coordinates, list(df_sites['URL']))

In [21]:
lat_long_results = []
for i,x in enumerate(df_sites['URL']):
    lat_long_results.append(get_coordinates(x))
    if i % 5 == 0:
        print(i)

0
5
10
15
20
25
30
35
40
45
50
55
60
65
70
75
80
85
90
95
100
105
110
115
120
125
130
135
140
145
150
155
160
165
170
175
180
185
190
195
200
205
210
215
220
225
230
235
240
245
250
255
260
265
270
275
280
285
290
295
300
305
310
315
320
325
330
335
340
345
350
355
360
365
370
375
380
385
390
395
400
405
410
415
420
425
430
435
440
445
450
455
460
465
470
475
480
485
490
495
500
505
510
515
520
525
530
535
540
545
550
555
560
565
570
575
580


WebDriverException: Message: invalid argument: can't kill an exited process
