In [1]:
import time

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

### Web Scraper
This component retrieves the product description for a given product URL.

Note: Before crawling or scraping data from a website, make sure the target URLs are permitted by the website's robots.txt.
For ASDA, robots.txt can be found here - https://groceries.asda.com/robots.txt

In [3]:
def get_product_info(url):
    """
    This function is used to get product description from a given product URL.

    A new Firefox WebDriver session is started for every call and is always
    closed again before the function returns or raises.

    Input: url (String)
    Output: product_details (Dictionary) with product code and other descriptions.
            The key "Product Code" holds the last whitespace-separated token of
            the product-code element; every other key is a description
            sub-title mapped to the text found under it.
    Raises: selenium TimeoutException if the product code (30s) or the
            description sections (10s) do not show up in time.
    """
    cookie_accept_button_id = "onetrust-accept-btn-handler"
    product_code_class = "pdp-main-details__product-code"
    product_description_class = "pdp-description-reviews__product-details-cntr"
    product_desc_title_class = "pdp-description-reviews__product-details-title"
    product_desc_content_class = "pdp-description-reviews__product-details-content"

    # Add any description titles that you would like to exclude
    exclude_titles = ["Net Content", "Ingredients", "Nutritional Values", "Storage",
                      "Manufacturer Address", "Return To", "ASDA Product Information"]

    # Initialize Firefox WebDriver
    driver = webdriver.Firefox()
    try:
        # Load the product page
        driver.get(url)

        # Handle the cookie-consent banner. It is accepted when it appears, but a
        # page that never shows it must not abort the whole scrape.
        try:
            accept_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.ID, cookie_accept_button_id))
            )
        except TimeoutException:
            accept_button = None
        if accept_button is not None:
            accept_button.click()

        product_details = {}

        # Get product code (the code is the last token of the element's text)
        product_code = WebDriverWait(driver, 30).until(
            EC.presence_of_element_located((By.CLASS_NAME, product_code_class))
        )
        product_details["Product Code"] = product_code.text.split()[-1]

        # Wait for at least one description section before collecting all of them
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, product_description_class))
        )

        # Get all product descriptions. find_elements(By..., ...) is used because
        # the legacy find_element(s)_by_* helpers are no longer available in
        # current Selenium 4 releases.
        descriptions = driver.find_elements(By.CLASS_NAME, product_description_class)

        # Iterate through all product descriptions
        for desc in descriptions:
            # Get description sub-title
            title = desc.find_element(By.CLASS_NAME, product_desc_title_class).text
            if title in exclude_titles:
                continue
            # Get content under the sub-section
            content = desc.find_element(By.CLASS_NAME, product_desc_content_class).text
            product_details[title] = content

        return product_details
    finally:
        # Always release the browser, even when a wait or lookup above fails.
        driver.quit()

In [4]:
# Example: scrape a single ASDA product page.
product_url = "https://groceries.asda.com/product/cornflakes-honey-nut/kelloggs-crunchy-nut-clusters-chocolate/1000383170056"
# Last expression of the cell, so the returned dictionary is shown as the cell output.
get_product_info(product_url)

{'Product Code': '7673249',
 'Life Style': 'Suitable for Vegetarians.',
 'Allergy Advice': 'May Contain: Cereals Containing Gluten. Contains: Barley, Milk, Oats, Peanuts, Soya, Wheat.',
 'Additives': 'Free From: Artificial Colours, Artificial Flavours.',
 'Features': "- Kellogg's Crunchy Nut Cluster Chocolate breakfast cereal is made with natural grains.. - Kellogg’s Crunchy Nut Clusters Chocolate breakfast cereal are made from crunchy golden oat clusters combined with peanuts and chocolate.. - Kellogg's Crunchy Nut Clusters Chocolate breakfast cereal is made with wholegrain.. - Kellogg's Crunchy Nut Clusters Chocolate breakfast cereal is made with no artificial colours or flavours.. - Try our range of Crunchy Nut cereals, granola, and snack bars - The trouble is they all taste too good!.",
 'Recycling Info': 'Recycle: Box.',
 'Country of Origin': 'United Kingdom',
 'Product Information': 'Breakfast just got crunchier with Kellogg’s Crunchy Nut Clusters Chocolate breakfast cereal. Each