# Scraping Birds
1. Create driver
2. Go to [base website](https://www.vogelwarte.ch/en/birds/birds-of-switzerland/)
3. For each bird:
    * Go to the bird's dedicated page (via the site search)
    * Scrape the information
    * Go back to the base website
4. Quit driver
5. Export the scraped data to a CSV file

In [13]:
### set up tools

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.common import TimeoutException
from selenium.common import NoSuchElementException
from bs4 import BeautifulSoup
import pandas as pd
import time

In [14]:
### clock for measuring time

class Clock:
    """Track per-iteration durations and estimate the time left for a run.

    The estimate is the mean duration of the iterations recorded so far,
    multiplied by the number of list items not yet processed, in minutes.
    """

    def __init__(self, birds_list):
        """Prepare a clock for a run over ``birds_list``.

        Args:
            birds_list: Sized collection being iterated; only its length
                is used.
        """
        self.iter_times = []
        self.list_length = len(birds_list)
        # factor that converts seconds to minutes
        self.MINUTE_CONVERSION = 1/60

    def estimated_time_left(self, start, end):
        """Record one finished iteration and return the estimated minutes left.

        Args:
            start: Timestamp (seconds) taken when the iteration began.
            end: Timestamp (seconds) taken when the iteration finished.

        Returns:
            Mean iteration time so far times the number of remaining
            iterations, converted to minutes.
        """
        self.iter_times.append(end - start)

        # Count completed iterations by list length. Looking the duration up
        # with ``list.index`` returns the first matching position, which is
        # wrong as soon as two iterations take exactly the same time.
        completed = len(self.iter_times)
        average = sum(self.iter_times) / completed
        # never report a negative estimate if called more often than planned
        remaining = max(self.list_length - completed, 0)

        return average * remaining * self.MINUTE_CONVERSION

In [15]:
### set up data containers

# bird names to look up, taken from the "name" column of birds.csv
birds = pd.read_csv('birds.csv')["name"].tolist()

# one result list per scraped feature; each gets one entry per bird
latin_names, foods, habitats, nest_sites, presences = [], [], [], [], []

# progress clock sized to the number of birds
clock = Clock(birds)

In [16]:
### set up web driver

# Chrome options: run the browser headless
options = Options()
options.add_argument("--headless")

# path to the local chromedriver executable
service = Service("/Users/Alain/chromedriver_mac64/chromedriver")

driver = webdriver.Chrome(options=options, service=service)

# explicit waits on this driver time out after 3 seconds
wait = WebDriverWait(driver, timeout=3)

In [None]:
### scrape data
#
# For every bird: open the base page, search for the bird, parse its species
# page and append one value per feature to the result lists. A bird that
# cannot be scraped gets empty strings so all lists stay aligned with `birds`.

# go to base website
base_URL = "https://www.vogelwarte.ch/en/birds/birds-of-switzerland/"

try:
    # enumerate gives the progress counter directly; birds.index(bird) would
    # rescan the list each time and report the wrong position for duplicates
    for position, bird in enumerate(birds, start=1):

        start = time.time()

        try:
            # the base-page load is inside the try so that a slow page only
            # costs this one bird instead of aborting the whole run
            driver.get(base_URL)
            wait.until(
                EC.presence_of_element_located((By.CLASS_NAME, 'searchField'))
            )

            # go to specific website
            # NOTE(review): the search box is picked by position (12th <input>)
            # — confirm this index still matches the page layout
            search = driver.find_elements(By.TAG_NAME, "input")[11]
            search.send_keys(bird)

            search.send_keys(Keys.ENTER)
            wait.until(
                EC.presence_of_element_located((By.CLASS_NAME, 'speciesHeader'))
            )

            # find features
            web_page = driver.page_source
            soup = BeautifulSoup(web_page, "lxml")

            # `string=` is the current name of the deprecated `text=` filter
            latin_name = soup.find("h3", string=bird).next_sibling.text
            characteristics = soup.find_all("div", class_="vds-col")
            food = characteristics[9].text
            habitat = characteristics[11].text
            nest_site = characteristics[15].text
            # NOTE(review): unlike the other fields this keeps the sibling
            # node itself rather than its .text — confirm that is intended
            presence = soup.find("h2", string="Status (in CH)").next_sibling

        except (TimeoutException, NoSuchElementException,
                AttributeError, IndexError):
            # Timeout / missing element: the page did not load as expected.
            # AttributeError / IndexError: a heading or characteristic was
            # not found in the parsed page (find() returned None, or the
            # element list was shorter than expected).
            latin_name = ""
            food = ""
            habitat = ""
            nest_site = ""
            presence = ""

        # store features
        latin_names.append(latin_name)
        foods.append(food)
        habitats.append(habitat)
        nest_sites.append(nest_site)
        presences.append(presence)

        # display status
        estimated_time_left = clock.estimated_time_left(start, time.time())
        print(f"{position}/{len(birds)} - "
              f"estimated time left: {round(estimated_time_left, 2)} min - {bird}")

finally:
    # always shut the browser down, even if the loop stops on an error
    driver.quit()

In [None]:
### store scraped data

# one column per scraped feature, next to the English names that were searched
df = pd.DataFrame(
    data={
        'English Name': birds,
        'Latin Name': latin_names,
        'Food': foods,
        'Habitat': habitats,
        'Nest Site': nest_sites,
        'Status': presences,
    }
)

# show the table in the notebook
df

In [None]:
### clean data set

# turn the comma-separated food text into one row per food item
df["Food"] = df["Food"].str.split(",")
df = df.explode("Food")

# Splitting on "," keeps any whitespace that surrounded the comma, so the
# same food would appear as different values (e.g. "seeds" vs " seeds",
# assuming "a, b" style lists — confirm against the scraped text).
df["Food"] = df["Food"].str.strip()

# show the table in the notebook
df

In [11]:
# export data set
# write the table to disk, leaving the DataFrame index out of the file

df.to_csv(path_or_buf='birds_scraped_data.csv', index=False)