# Selenium

### Here we use Selenium alongside geckodriver to create a bot browser instance, so we can get past content that is hidden behind JavaScript events

In [None]:
!pip install selenium

In [1]:
from selenium import webdriver
from bs4 import BeautifulSoup as BS
import pandas as pd
pd.set_option('display.max_columns', None)

### Creating a webdriver

In [2]:
# Location of the geckodriver binary, relative to the notebook's working directory.
geckodriver_path = './geckodriver'
# Start a Firefox session that the rest of the notebook drives through `driver`.
driver = webdriver.Firefox(executable_path=geckodriver_path)

Creating a Selenium Firefox webdriver using geckodriver results in a real browser window that we can control with Python.


![example-img](../src/selenium-webdriver.png)

Using this driver, we can go to any page we'd like.

In [26]:
url = 'https://www.woolworths.co.za'

In [28]:
driver.get(url)

![example-img](../src/selenium-url.png)

### Why Selenium?

I think it's best to learn by example, so let's look at a simple one!

Say we want to scrape the nutritional information for a list of Woolworths products. To develop the flow, we start with a single product.

In [29]:
# Build the product URL by substituting the product ID into the query string,
# then point the browser at it.
product_id = 8000500037874
url_template = 'https://www.woolworths.co.za/cat?Ntt={}&Dy=1'
product_url = url_template.format(product_id)
driver.get(product_url)

In [32]:
soup = BS(driver.page_source, 'lxml')

### Inspect the HTML to see which element or class name we can use to find the table

In [35]:
soup.find('table')

### Nothing found since the table is hidden until an event occurs

In [37]:
# Collect every element carrying the accordion toggle class.
# The generic `find_elements(by, value)` form is used because the
# `find_elements_by_*` helpers were removed in Selenium 4.3, while this form
# works on both Selenium 3 and 4 ('class name' is the value of By.CLASS_NAME).
clickable_list = driver.find_elements('class name', 'accordion__toggle--chrome')
clickable_list

[<selenium.webdriver.firefox.webelement.FirefoxWebElement (session="7ae68c35-30cb-4bba-a653-4755614afbea", element="7273feb7-aaa9-41e8-91d2-961be6b8f00a")>,
 <selenium.webdriver.firefox.webelement.FirefoxWebElement (session="7ae68c35-30cb-4bba-a653-4755614afbea", element="d8c73d68-ee43-498b-82c8-1cb7ef0df00e")>,
 <selenium.webdriver.firefox.webelement.FirefoxWebElement (session="7ae68c35-30cb-4bba-a653-4755614afbea", element="89d3787c-afc3-408c-9bb2-d55a5efe54b8")>,
 <selenium.webdriver.firefox.webelement.FirefoxWebElement (session="7ae68c35-30cb-4bba-a653-4755614afbea", element="1e3c34a9-309b-4d51-b78e-f6b53ea8cfd3")>,
 <selenium.webdriver.firefox.webelement.FirefoxWebElement (session="7ae68c35-30cb-4bba-a653-4755614afbea", element="84507d68-a57b-4157-b6cb-3a22d49971a3")>]

In [38]:
type(clickable_list[0])

selenium.webdriver.firefox.webelement.FirefoxWebElement

In [42]:
dir(clickable_list[0])

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_execute',
 '_id',
 '_parent',
 '_upload',
 '_w3c',
 'anonymous_children',
 'clear',
 'click',
 'find_anonymous_element_by_attribute',
 'find_element',
 'find_element_by_class_name',
 'find_element_by_css_selector',
 'find_element_by_id',
 'find_element_by_link_text',
 'find_element_by_name',
 'find_element_by_partial_link_text',
 'find_element_by_tag_name',
 'find_element_by_xpath',
 'find_elements',
 'find_elements_by_class_name',
 'find_elements_by_css_selector',
 'find_elements_by_id',
 'find_elements_by_link_text',
 'find_elements_by_name',
 'find_elements_by_partial_link_text',
 'find_elements_by_tag_name',
 'find_

In [47]:
clickable_list[0].text

'DETAILS'

In [55]:
# Click only the toggle whose label reads "nutritional information"
# (compared case-insensitively); every other toggle is left alone.
wanted_label = 'nutritional information'
for toggle in clickable_list:
    if toggle.text.lower() != wanted_label:
        continue
    toggle.click()

### Reload soup after click!

In [57]:
soup = BS(driver.page_source, 'lxml')

In [58]:
soup.find('table')

<table cellpadding="0" cellspacing="0" class="table table-scroll__table table--zebra table--nutrition"><thead class="table__head"><tr class="table-scroll__row"><th class="pdp-desc-font">Description</th><th class="pdp-desc-font">Per<br/>100g/ml</th><th class="pdp-desc-font">Per<br/>Serving</th><th class="pdp-desc-font">Measurement</th><th class="pdp-desc-font">% NRV<br/> per<br/>serving</th></tr></thead><tbody><tr class="table-scroll__row"><th>Portion Size</th><td>100</td><td></td><td></td><td></td></tr><tr class="table-scroll__row"><th>Energy</th><td>2419</td><td>301</td><td>kJ</td><td>-</td></tr><tr class="table-scroll__row"><th>Protein</th><td>8.8</td><td>1.1</td><td>g</td><td>-</td></tr><tr class="table-scroll__row"><th>Carbohydrate</th><td>42.3</td><td>5.3</td><td>g</td><td>-</td></tr><tr class="table-scroll__row"><th>    Of which Sugars</th><td>36.4</td><td>4.5</td><td>g</td><td>-</td></tr><tr class="table-scroll__row"><th>Total Fat</th><td>41.9</td><td>5.2</td><td>g</td><td>-</td

### Tables in html
- Rows are represented with `<tr>`
- Headers are represented with `<th>`
- Data is represented with `<td>`

```
<table>
 <tr><th>top-header1</th><th>top-header2</th><th>top-header3</th><th>top-header4</th></tr>
 <tr><th>side-header1</th><td>x_1</td><td>y_1</td><td>z_1</td></tr>
 <tr><th>side-header2</th><td>x_2</td><td>y_2</td><td>z_2</td></tr>
 <tr><th>side-header3</th><td>x_3</td><td>y_3</td><td>z_3</td></tr>
</table>
```
<table>
 <tr><th>top-header1</th><th>top-header2</th><th>top-header3</th><th>top-header4</th></tr>
 <tr><th>side-header1</th><td>x_1</td><td>y_1</td><td>z_1</td></tr>
 <tr><th>side-header2</th><td>x_2</td><td>y_2</td><td>z_2</td></tr>
 <tr><th>side-header3</th><td>x_3</td><td>y_3</td><td>z_3</td></tr>
</table>


![example](../src/selenium-table.png)

In [59]:
table_rows = soup.find_all('tr')
table_rows

[<tr class="table-scroll__row"><th class="pdp-desc-font">Description</th><th class="pdp-desc-font">Per<br/>100g/ml</th><th class="pdp-desc-font">Per<br/>Serving</th><th class="pdp-desc-font">Measurement</th><th class="pdp-desc-font">% NRV<br/> per<br/>serving</th></tr>,
 <tr class="table-scroll__row"><th>Portion Size</th><td>100</td><td></td><td></td><td></td></tr>,
 <tr class="table-scroll__row"><th>Energy</th><td>2419</td><td>301</td><td>kJ</td><td>-</td></tr>,
 <tr class="table-scroll__row"><th>Protein</th><td>8.8</td><td>1.1</td><td>g</td><td>-</td></tr>,
 <tr class="table-scroll__row"><th>Carbohydrate</th><td>42.3</td><td>5.3</td><td>g</td><td>-</td></tr>,
 <tr class="table-scroll__row"><th>    Of which Sugars</th><td>36.4</td><td>4.5</td><td>g</td><td>-</td></tr>,
 <tr class="table-scroll__row"><th>Total Fat</th><td>41.9</td><td>5.2</td><td>g</td><td>-</td></tr>,
 <tr class="table-scroll__row"><th>    Of which mono unsaturated fatty acids</th><td>25.1</td><td>3.1</td><td>g</td><t

In [15]:
# Map each row's header text to the text of its first data cell.
# The first row is skipped because it holds the column headings.
nutrient_dict = {
    table_row.th.text: table_row.td.text
    for table_row in table_rows[1:]
}

In [16]:
nutrient_dict

{'Portion Size': '100',
 'Energy': '2419',
 'Protein': '8.8',
 'Carbohydrate': '42.3',
 '    Of which Sugars': '36.4',
 'Total Fat': '41.9',
 '    Of which mono unsaturated fatty acids': '25.1',
 '    Of which poly unsaturated fatty acids': '3.1',
 '    Of which saturated fatty acids': '13.7',
 '    Of which trans fatty acids': '0.1',
 'Cholesterol': '4.9',
 'Dietary Fibre': '4.5',
 'Sodium': '43'}

### With the scrape flow working for 1 product, let's put it into a function

In [17]:
def scrape_product_nutrition(product_id):
    """Scrape the nutritional table of a single Woolworths product.

    Uses the notebook-level ``driver`` to load the product page, clicks the
    "nutritional information" accordion toggle so the table is present in the
    page source, and parses the resulting HTML with BeautifulSoup.

    Parameters
    ----------
    product_id : int or str
        Product identifier substituted into the product URL.

    Returns
    -------
    dict
        ``'product_id'`` (as a string), ``'product_name'`` (``None`` when the
        page has no element with class ``prod-name``), followed by one entry
        per table row mapping the row's header text to the text of its first
        data cell. Header and cell text are stored exactly as found.
    """
    product_url = 'https://www.woolworths.co.za/cat?Ntt={}&Dy=1'.format(str(product_id))

    driver.get(product_url)

    # The generic `find_elements(by, value)` form works on Selenium 3 and 4;
    # the `find_elements_by_*` helpers were removed in Selenium 4.3.
    clickable_list = driver.find_elements('class name', 'accordion__toggle--chrome')

    for clickable in clickable_list:
        if clickable.text.lower() == 'nutritional information':
            clickable.click()

    # Parse only after the click: the table is hidden until the toggle is used.
    soup = BS(driver.page_source, 'lxml')

    # A page without a product name would otherwise raise AttributeError and
    # abort a whole batch run; record the name as missing instead.
    name_tag = soup.find(attrs={'class': 'prod-name'})

    nutrient_dict = {
        'product_id': str(product_id),
        'product_name': name_tag.text if name_tag is not None else None,
    }

    # The first row holds the column headings, so it is skipped.
    for row in soup.find_all('tr')[1:]:
        header_cell, value_cell = row.th, row.td
        # `find_all('tr')` covers every table row on the page; rows without
        # both a header and a data cell cannot be turned into an entry.
        if header_cell is None or value_cell is None:
            continue
        nutrient_dict[header_cell.text] = value_cell.text

    return nutrient_dict

### Check that it scrapes correctly

In [60]:
scrape_product_nutrition(8000500037874)

{'product_id': '8000500037874',
 'product_name': 'Ferrero Rocher 200g',
 'Portion Size': '100',
 'Energy': '2419',
 'Protein': '8.8',
 'Carbohydrate': '42.3',
 '    Of which Sugars': '36.4',
 'Total Fat': '41.9',
 '    Of which mono unsaturated fatty acids': '25.1',
 '    Of which poly unsaturated fatty acids': '3.1',
 '    Of which saturated fatty acids': '13.7',
 '    Of which trans fatty acids': '0.1',
 'Cholesterol': '4.9',
 'Dietary Fibre': '4.5',
 'Sodium': '43'}

In [61]:
essential_list = [3046920029759, 6009204330887, 6009801741758, 6001275000003, 6009178222607]

In [65]:
# Scrape every product in `essential_list`, keeping one result dict per product
# in the same order as the IDs.
products = [scrape_product_nutrition(essential_id) for essential_id in essential_list]

In [66]:
df = pd.DataFrame(products)

In [67]:
df

Unnamed: 0,product_id,product_name,Portion Size,Energy,Calories,Protein,Carbohydrate,Of which Sugars,Total Fat,Of which saturated fatty acids,Cholesterol,Dietary Fibre,Sodium,Glycaemic carbohydrates,of which saturated fat,of which monounsaturated fat,of which polyunsaturated fat,Dietary Fibre#,Total sodium,Vitamin B1 or Thiamin,Vitamin B2 or Riboflavin,Tinamide or niacin,Vitamin B6 or pyridoxine,Folic Acid or Folacin,Biotin,Calcium,Phosphorus,Iron,Magnesium,Zinc
0,3046920029759,Lindt Excellence 90% Cocoa Dark Chocolate 100g,100,2483,592.0,10.0,14,7.0,55,30,1.6,5.0,0.01,,,,,,,,,,,,,,,,,
1,6009204330887,Coconut Nectar with Chocolate 330ml,100,141,,0.4,4,7,<0.1,<0.1,,0.7,17.0,,,,,,,,,,,,,,,,,
2,6009801741758,Futurelife® Low GI Strawberry Flavour Cereal 500g,100,1673,,18.0,,,9.8,,,,,15.0,2.9,3.5,3.4,6.1,284.0,,,,,,,,,,,
3,6001275000003,Jungle Oats 1Kg,100,1443,,12.3,56.2,,9.1,,,3.7,0.0,,,,,,,0.53,0.3,1.4,0.2,25.0,15.0,45.0,440.0,4.0,140.0,3.5
4,6009178222607,Fynbos Honey 375g,100,1404\tkJ,,0.3,82.3\tg,82.3\tg,0g,,0.0,0.0,5.0,,,,,,,,,,,,,,,,,


## That concludes scraping with Selenium!