This script will navigate to the ECOS website, access links from a list of invertebrate species, determine if any 5-year reviews have been conducted for those species, and then scrape the recommended actions from each 5-year review PDF file for further analysis.

In [210]:
# Install libraries
!pip install PyPDF2
!pip install requests
!pip install bs4
!pip install selenium
!pip install undetected-chromedriver
!pip install pdfplumber

Collecting pdfplumber
  Obtaining dependency information for pdfplumber from https://files.pythonhosted.org/packages/f8/d3/f58c2d5d86a585e438c6708f568eca79e7c4e6ee3d5210cf8b31d38cb021/pdfplumber-0.10.3-py3-none-any.whl.metadata
  Downloading pdfplumber-0.10.3-py3-none-any.whl.metadata (38 kB)
Collecting pdfminer.six==20221105 (from pdfplumber)
  Downloading pdfminer.six-20221105-py3-none-any.whl (5.6 MB)
     ---------------------------------------- 0.0/5.6 MB ? eta -:--:--
     ---------------------------------------- 0.0/5.6 MB 960.0 kB/s eta 0:00:06
     ---------------------------------------- 0.1/5.6 MB 1.1 MB/s eta 0:00:06
      --------------------------------------- 0.1/5.6 MB 1.0 MB/s eta 0:00:06
     - -------------------------------------- 0.2/5.6 MB 1.3 MB/s eta 0:00:05
     - -------------------------------------- 0.2/5.6 MB 1.3 MB/s eta 0:00:05
     -- ------------------------------------- 0.3/5.6 MB 1.2 MB/s eta 0:00:05
     -- ------------------------------------- 0.3/5

In [211]:
# Load libraries
import pdfplumber
from bs4 import BeautifulSoup
from functools import reduce
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By

from lxml import html

import pandas as pd
import requests
import re
import io
import undetected_chromedriver as uc
import time


In [91]:
# Root of the ECOS site; species links found in the report are prefixed with it.
SP_BASE_URL = "https://ecos.fws.gov"

# Ad-hoc species report form used to build the index of ESA-listed species.
CUSTOM_QUERY_URL = SP_BASE_URL + "/ecp0/reports/ad-hoc-species-report-input"

# Form controls driven through Selenium:
CUSTOM_JS_QUERY = "kingdom"            # `name` attribute of the <select> element
CUSTOM_OPTION_VALUE = "Invertebrates"  # visible text of the option to choose
CUSTOM_SUBMIT_QUERY = "submit"         # `id` of the element used to submit the form

In [102]:
# Open the ad-hoc species report form with Selenium, restrict it to the
# configured kingdom, submit it, and capture the HTML of the results page.

# Seconds to wait after submitting the form for the results to load.
RESULTS_LOAD_WAIT_SECONDS = 5

driver = webdriver.Chrome()
try:
    driver.get(CUSTOM_QUERY_URL)

    # Save the current tab handle so a newly opened results tab can be told apart
    original_window = driver.current_window_handle

    # Clear any pre-selected options in the kingdom <select>, then pick the
    # one we want and submit the form
    select = Select(driver.find_element(By.NAME, CUSTOM_JS_QUERY))
    select.deselect_all()
    select.select_by_visible_text(CUSTOM_OPTION_VALUE)
    driver.find_element(By.ID, CUSTOM_SUBMIT_QUERY).submit()
    time.sleep(RESULTS_LOAD_WAIT_SECONDS)

    # If the results opened in a new tab, switch to it; otherwise stay on the
    # current tab
    for window_handle in driver.window_handles:
        if window_handle != original_window:
            driver.switch_to.window(window_handle)
            break

    # Grab the HTML of the results page
    webpage = driver.page_source
finally:
    # Always shut the browser down, even if a step above failed, so that no
    # orphaned Chrome/chromedriver process is left running.
    driver.quit()

In [103]:
# Parse the HTML of the results page.
# The parser is named explicitly so the result does not depend on which
# parsers happen to be installed (and to avoid bs4's "no parser specified"
# warning). lxml is already imported by this notebook.
soup = BeautifulSoup(webpage, "lxml")

In [104]:
# Find HTML links to species pages and save them to a list of absolute URLs
from urllib.parse import urljoin

links = [a.get('href') for a in soup.find_all('a', href=True)]

# urljoin resolves relative hrefs against the base URL and leaves hrefs that
# are already absolute untouched (plain concatenation would corrupt those).
sp_page_links = [urljoin(SP_BASE_URL, href) for href in links if "species" in href]

# Drop repeated links (keeping first-seen order) so that no species page is
# scraped, and reported, more than once.
sp_page_links = list(dict.fromkeys(sp_page_links))

## Species Drill Down ##
Now we will navigate into individual species pages, parse out relevant traits like listing year, current status, review reports, etc. We will also save the URLs to the PDF files of the review reports to drill down even further.

In [163]:
# Some notes on species page parsing just to keep things a bit organized:
# Scientific Name = italic text in div ID 'speciesProfile' >> 'j-species-name'
# Listing Status = id is 'listingStatus', text in span 'listingEnd'
# Listing Location = visual text, text in span 'endangerStatus'
# Listing Years = div id 'j-listing-status-summary', table id 'DataTables_Table_0', table body, 2nd <td> tags
# Five-year Review Years = div id 'j-five-year-reviews', table id 'DataTables_Table_4', table body, 1st <td> tags
# Five-year Review URLS = div id 'j-five-year-reviews', table id 'DataTables_Table_4', table body, 2nd column <a> tags
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.options import Options

# Column order of the summary table (one row per species page).
SUMMARY_COLUMNS = ['taxon_id', "scientificName", "currentListingStatus", "currentListingLocality",
                   "allListingDates", "allReviewDates", "allReviewURLs"]

# Seconds to wait after loading a species page before reading its HTML.
PAGE_RENDER_WAIT_SECONDS = 2


def _parse_species_page(page_html, species_url):
    """Extract one summary-table record from a species profile page.

    Parameters
    ----------
    page_html : str
        HTML source of the species profile page.
    species_url : str
        URL the page was loaded from; its trailing digits are used as the taxon id.

    Returns
    -------
    dict or None
        A dict keyed by SUMMARY_COLUMNS, or None when the URL does not end in
        digits or the page has no 'speciesProfile' div (i.e. it is not a
        usable species profile). Multi-valued fields are joined with "; ".
    """
    taxon_match = re.search(r'(\d+)$', species_url)
    page_soup = BeautifulSoup(page_html, "lxml")
    profile = page_soup.find('div', {'id': 'speciesProfile'})
    if taxon_match is None or profile is None:
        return None

    # Header fields; a missing tag yields an empty string rather than an error
    name_tag = profile.find('i')
    status_tag = profile.find('span', {'id': 'listingStatus'})
    location_tag = profile.find('span', {'class': 'endangerStatus'})

    scientific_name = name_tag.get_text() if name_tag is not None else ""
    listing_status = status_tag.get_text().replace("Listing Status: ", "") if status_tag is not None else ""
    listing_location = location_tag.get_text().capitalize() if location_tag is not None else ""

    # Listing dates: 2nd cell of every body row of the listing table
    listing_dates = []
    listing_table = page_soup.find('table', {'id': 'DataTables_Table_0'})
    listing_body = listing_table.find('tbody') if listing_table is not None else None
    if listing_body is not None:
        for row in listing_body.find_all('tr'):
            cells = row.find_all('td')
            if len(cells) > 1:
                listing_dates.append(cells[1].text)

    # 5-year reviews: date in the 1st cell, link to the document in the 2nd.
    # A row without a link contributes a date but no URL, so the two lists
    # can differ in length in that (unexpected) case.
    review_dates = []
    review_urls = []
    review_section = page_soup.find('div', {'id': 'j-five-year-reviews'})
    review_body = review_section.find('tbody') if review_section is not None else None
    if review_body is not None:
        for row in review_body.find_all('tr'):
            cells = row.find_all('td')
            if not cells:
                continue
            review_dates.append(cells[0].text.replace("/", "-"))
            review_link = cells[1].find('a', href=True) if len(cells) > 1 else None
            if review_link is not None:
                review_urls.append(review_link.get('href'))

    return {
        'taxon_id': taxon_match.group(1),
        'scientificName': scientific_name,
        'currentListingStatus': listing_status,
        'currentListingLocality': listing_location,
        'allListingDates': "; ".join(str(x) for x in listing_dates),
        'allReviewDates': "; ".join(str(x) for x in review_dates),
        'allReviewURLs': "; ".join(str(x) for x in review_urls),
    }


# Ensure that selenium is operating silently to prevent annoying driver pop-ups
chrome_options = Options()
chrome_options.add_argument("--headless")

species_records = []   # one dict per successfully parsed species page
skipped_links = []     # links that failed to load or were not species profiles

# One browser is reused for every species page and is always shut down at the
# end; starting a new browser per page without quitting it leaks a Chrome
# process for every species.
driver = webdriver.Chrome(options = chrome_options)
try:
    for i in sp_page_links:
        # Try to get the species profile page; if the page is wonky, record
        # the link and move to the next one. Selenium reports load failures
        # and timeouts as WebDriverException subclasses, not as the builtin
        # TimeoutError.
        try:
            driver.get(i)
        except (WebDriverException, TimeoutError):
            skipped_links.append(i)
            continue

        time.sleep(PAGE_RENDER_WAIT_SECONDS)

        record = _parse_species_page(driver.page_source, i)
        if record is None:
            skipped_links.append(i)
            continue
        species_records.append(record)
finally:
    driver.quit()

# Build the table once from all records (avoids repeated concat inside the
# loop and gives every row a unique index).
summary_dataframe = pd.DataFrame(species_records, columns = SUMMARY_COLUMNS)

if skipped_links:
    print(f"Skipped {len(skipped_links)} link(s) that could not be loaded or parsed:")
    for skipped in skipped_links:
        print(f"  {skipped}")

In [164]:
# Write the summary dataframe to a .csv file
from pathlib import Path

summary_csv_path = Path("../data/summary_table.csv")
# Create the output folder if needed; to_csv fails when it does not exist.
summary_csv_path.parent.mkdir(parents=True, exist_ok=True)
summary_dataframe.to_csv(summary_csv_path)

In [170]:
# Pull a random sample of species that have at least one 5-year review URL.
# Their PDF review documents are later parsed into sentences and saved to a
# .csv for manual annotation.

# Maximum number of species to sample.
REVIEW_SAMPLE_SIZE = 50

summary_dataframe_linked = summary_dataframe[summary_dataframe['allReviewURLs'] != ""]
# Cap the sample size at the number of available rows: DataFrame.sample raises
# a ValueError when asked for more rows than exist (without replacement).
summary_dataframe_linked = summary_dataframe_linked.sample(
    n = min(REVIEW_SAMPLE_SIZE, len(summary_dataframe_linked)),
    random_state=0,
)

In [179]:
# Keep only the first review URL listed for each sampled species
# (everything before the first ";" in the 'allReviewURLs' string)
first_review_url = summary_dataframe_linked['allReviewURLs'].str.split(";", n=1).str[0]
summary_dataframe_linked['parsedURL'] = first_review_url

In [213]:
# Download each sampled review PDF and extract its text into a list
# (one string per document, pages concatenated in order)

# Seconds to wait on the server before giving up; without a timeout a stalled
# download would hang the notebook indefinitely.
PDF_REQUEST_TIMEOUT_SECONDS = 120

output = []
for i in summary_dataframe_linked['parsedURL']:
    response = requests.get(i, timeout=PDF_REQUEST_TIMEOUT_SECONDS)
    # Fail with a clear HTTP error instead of handing an error page to the PDF parser
    response.raise_for_status()

    # The context manager closes the PDF even if text extraction raises
    with pdfplumber.open(io.BytesIO(response.content)) as reader:
        # Guard against pages for which no text is returned
        text = "".join((page.extract_text() or "") for page in reader.pages)

    output.append(text)

In [225]:
# Split every document's text into sentences and collect them in one flat list
from nltk.tokenize import punkt
import nltk 
import string

sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
strip_punctuation = str.maketrans('', '', string.punctuation)

long_sentences = []
for document_text in output:
    # Collapse every run of characters other than letters, digits and "."
    # (including line breaks) into a single space before tokenizing
    cleaned_text = re.sub('[^A-Za-z0-9.]+', ' ', document_text).strip()
    for candidate in sent_detector.tokenize(cleaned_text):
        # Keep only sentences of at least 50 characters
        if len(candidate) >= 50:
            long_sentences.append(candidate)

# Trim surrounding whitespace and remove the remaining punctuation
output_sentences = [kept.strip().translate(strip_punctuation) for kept in long_sentences]

In [226]:
# Randomly select up to 1,500 sentences from the flattened list of sentences
# and save them to a .csv for manual annotation
import csv
import random

SENTENCE_SAMPLE_SIZE = 1500
SENTENCE_SAMPLE_SEED = 0  # fixed seed so the same sample is drawn on every run

# Cap the sample size: random sampling raises a ValueError when asked for more
# items than the list contains.
sentence_rng = random.Random(SENTENCE_SAMPLE_SEED)
output_sentences_sample = sentence_rng.sample(
    output_sentences, min(SENTENCE_SAMPLE_SIZE, len(output_sentences))
)

# newline='' is required by the csv module; without it extra blank lines can
# appear in the file on Windows.
with open("../data/sentenceSample.csv", 'w', newline='') as myfile:
    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL, dialect='excel')
    # NOTE(review): this writes all sampled sentences as ONE row (one sentence
    # per column). If one sentence per row is wanted for annotation, use
    # wr.writerows([s] for s in output_sentences_sample) - confirm with whoever
    # consumes this file before changing the layout.
    wr.writerow(output_sentences_sample)