# Selenium

In [None]:
#| default_exp selenium_utils

In [None]:
#| export
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

When scraping with Selenium, we should present headers that mimic a popular browser and client. This minimises the chance that a website identifies this software as an automated process and blocks it:

In [None]:
#| export
# Request headers carrying a desktop Chrome-on-Windows User-Agent string.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/71.0.3578.98 Safari/537.36"
    ),
}

We will automate a Chrome browser to navigate webpages and extract data. First we need to install the appropriate ChromeDriver binary. Run this cell once before attempting any scrape:

In [None]:
#| export
# Build the Chrome `Service`, using the value returned by
# `ChromeDriverManager().install()` as the driver executable path.
# NOTE(review): the recorded output of this cell is a ValueError ("no such
# driver by url ... LATEST_RELEASE_115...") with webdriver-manager 3.8.6 —
# presumably a newer webdriver-manager release is required; confirm.
service = Service(executable_path=ChromeDriverManager().install())

ValueError: There is no such driver by url https://chromedriver.storage.googleapis.com/LATEST_RELEASE_115.0.5790

In [None]:
# Remove the installed webdriver_manager package. pip prompts "Proceed (Y/n)?",
# so this cell needs an interactive answer to complete.
!pip uninstall webdriver_manager

Found existing installation: webdriver-manager 3.8.6
Uninstalling webdriver-manager-3.8.6:
  Would remove:
    /Users/seangreaves/miniconda3/envs/deliveroo/lib/python3.10/site-packages/tests_negative/*
    /Users/seangreaves/miniconda3/envs/deliveroo/lib/python3.10/site-packages/tests_xdist/*
    /Users/seangreaves/miniconda3/envs/deliveroo/lib/python3.10/site-packages/webdriver_manager-3.8.6.dist-info/*
    /Users/seangreaves/miniconda3/envs/deliveroo/lib/python3.10/site-packages/webdriver_manager/*
Proceed (Y/n)? ^C
[31mERROR: Operation cancelled by user[0m[31m
[0m

In [None]:
#| export
# Sanity check: stop here if no usable `service` object was created above.
assert service

In [None]:
#| export
def initialise_driver(service, # Instance of `selenium.webdriver.chrome.service.Service`
                      headless:bool=False): # Run Chrome headless [True] or with a visible window [False]
    "Initialises a Chrome WebDriver, optionally without a visible browser window"
    options = Options()
    if headless:
        # Assigning `options.headless` is deprecated (it emits a warning) and is
        # removed in later Selenium 4 releases; passing the Chrome command-line
        # switch is the supported way to request headless mode.
        options.add_argument("--headless=new")
    return webdriver.Chrome(service=service, options=options)

# Start a Chrome session with no visible window.
driver = initialise_driver(service, headless=True)

  options.headless = headless


In [None]:
#| export
# Sanity check: stop here if the WebDriver was not created.
assert driver

Let's go ahead and start driving a Chrome browser. We set `headless` to `True`, so the browser window will not be visible. You can set `headless=False` if you'd like to watch the browser.

We can now try to access a webpage and get an HTML element.

In [None]:
#| export
def get_element_by(url:str, # url of the page to load
                   driver, # driver initialised with `initialise_driver`
                   css_component:str="id", # accepted values: `id`, `class_name`, `css_selector`
                   css_component_value:str="", # id, class name or CSS selector of the element to wait for
                   timeout:int=15, # seconds to wait for element to appear before timeout error
                  ):
    """Loads `url` and returns the selenium web element matching the given locator.

    The element is looked up by id, class name or CSS selector, depending on
    `css_component`. Waits up to `timeout` seconds for the element to be
    present in the page; `WebDriverWait.until` raises if it never appears.
    Prints a message and returns `None`, without loading the page, when
    `css_component` is not one of the accepted values.
    """
    accepted_components = ("id", "class_name", "css_selector")
    # Validate before any lookup: indexing a dict with an unknown key would
    # raise KeyError and the message below would never be shown.
    if css_component not in accepted_components:
        print("Invalid css_component provided. Should be either 'id', 'class_name' or 'css_selector'")
        return None
    # The accepted names are the lower-case forms of the matching `By`
    # attributes (By.ID, By.CLASS_NAME, By.CSS_SELECTOR).
    locator_strategy = getattr(By, css_component.upper())
    driver.get(url)
    wait = WebDriverWait(driver, timeout)
    return wait.until(
        EC.presence_of_element_located((locator_strategy, css_component_value))
    )

Let's attempt to grab the footer element from the Autonomy ADU homepage:

In [None]:
# Load the ADU homepage and wait for the element whose id is "footer".
base_url = "https://autonomy.work/adu/"
filter_input = get_element_by(
    base_url,
    driver,
    css_component="id",
    css_component_value="footer",
)

In [None]:
# Sanity check: the lookup above returned an element.
assert filter_input

In [None]:
#| hide
import nbdev

# Export the notebook's `#| export` cells to the library module.
nbdev.nbdev_export()