In [1]:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait 
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import ElementNotInteractableException
from selenium.webdriver.common.by import By

import time
import pandas as pd

### Web Crawler
This section collects product URLs from a given product catalogue URL on the ASDA groceries website.
Catalogue URLs can be found in the sitemap - https://groceries.asda.com/sitemap-category.xml 

Note: Before crawling or scraping a website, make sure the URLs you request are allowed by the site's robots.txt.
ASDA's robots.txt can be found here - https://groceries.asda.com/robots.txt 

In [2]:
def get_all_product_urls(url: str):
    """
    Collect every product link listed under a given ASDA catalogue URL
    (Sitemap: https://groceries.asda.com/sitemap-category.xml)

    A Firefox WebDriver session is opened, the consent alert is accepted and
    each results page is visited in turn by clicking the "next page" arrow.
    The browser is always closed, even when a step fails.

    Input: url (String) - catalogue URL to crawl
    Output: product_urls_df (DataFrame) with page number, product name, url
    Raises: selenium TimeoutException when the consent button, the pagination
            control or a product listing does not appear within its timeout.
    """
    columns = ["page_no", "product_name", "url"]
    # Rows are gathered in a plain list and turned into a DataFrame once at the
    # end; concatenating a one-row frame per product is quadratic.
    records = []

    # Initialize Firefox WebDriver
    driver = webdriver.Firefox()
    try:
        # Load the page
        driver.get(url)

        # Alert: wait for the accept button and click it
        accept_button = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, "onetrust-accept-btn-handler"))
        )
        accept_button.click()

        # Get number of pages
        last_page = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, ".co-pagination__last-page"))
        )
        max_no_pages = int(last_page.text)
        print(f"Max number of pages = {max_no_pages}")

        # Iterate through each page (page numbers are 1-based)
        for page_no in range(1, max_no_pages + 1):
            # Get main product listing
            # NOTE(review): presence alone may be satisfied by the previous
            # page's listing right after a click - confirm the listing has
            # actually refreshed if per-page counts look off.
            main_content = WebDriverWait(driver, 30).until(
                EC.presence_of_element_located(
                    (By.XPATH, '//div[@data-module-type="ProductListing"]')
                )
            )

            # Get all URLs (anchor tags); save page number, product name and link
            for anchor in main_content.find_elements(By.CLASS_NAME, "co-product__anchor"):
                records.append(
                    {
                        "page_no": page_no,
                        "product_name": anchor.text,
                        "url": anchor.get_attribute("href"),
                    }
                )

            # For the last page, do not click on right arrow to go to next page
            if page_no == max_no_pages:
                break

            # Click on next page arrow, retrying up to 3 times
            arrow_wait = WebDriverWait(driver, 10)  # Adjust the timeout as needed
            for _ in range(3):
                try:
                    right_arrow = arrow_wait.until(
                        EC.element_to_be_clickable(
                            (By.CSS_SELECTOR, ".co-pagination__arrow--right")
                        )
                    )
                    right_arrow.click()
                    break
                except ElementNotInteractableException:
                    time.sleep(5)
            else:
                # Every attempt failed, so the browser is still on the current
                # page. Stop here instead of scraping the same page again under
                # the next page number.
                print(f"Could not open page {page_no + 1}; returning pages collected so far")
                break
    finally:
        # Always release the browser, including when a wait times out
        driver.quit()

    product_urls_df = pd.DataFrame(records, columns=columns)
    return product_urls_df
    

In [3]:
# Catalogue (aisle) page to crawl - the "everyday family cereals" listing
url = "https://groceries.asda.com/aisle/food-cupboard/cereals-cereal-bars/everyday-family-cereals/1215337189632-1215337194729-1215650880276"
# Crawl every page of the catalogue and keep the resulting DataFrame for the cells below
product_urls = get_all_product_urls(url)
# Last expression of the cell: display the collected product URLs
product_urls

Max number of pages = 4


Unnamed: 0,page_no,product_name,url
0,1,Weetabix Crispy Minis Chocolate Chip,https://groceries.asda.com/product/family-cere...
1,1,Kellogg's Coco Pops Breakfast Cereal,https://groceries.asda.com/product/family-cere...
2,1,Kellogg's Rice Krispies Breakfast Cereal,https://groceries.asda.com/product/family-cere...
3,1,Kellogg's Special K Milk Chocolate Multigrain ...,https://groceries.asda.com/product/special-k/k...
4,1,Kellogg's Rice Krispies Multigrain Shapes Honey,https://groceries.asda.com/product/family-cere...
...,...,...,...
164,4,Sistema Red Klip-it Microwaveable Soup Mug,https://groceries.asda.com/product/lunch-boxes...
165,4,ASDA Fibre Flakes Red Berry,https://groceries.asda.com/product/bran-flakes...
166,4,George Home Peel Lid Food Storers - Colour May...,https://groceries.asda.com/product/storage-con...
167,4,Nestle Go Free Gluten Free Honey Nut Flakes,https://groceries.asda.com/product/gluten-free...


In [4]:
# Count how many product URLs were collected for each catalogue page
product_urls.groupby(by="page_no")["url"].count().reset_index()

Unnamed: 0,page_no,url
0,1,48
1,2,52
2,3,54
3,4,15


In [5]:
def get_product_info(url, exclude_titles=None):
    """
    Get the product code and description sections from a given product URL.

    A Firefox WebDriver session is opened for the page, the consent alert is
    accepted, and every description block on the page is read. The browser is
    always closed, even when a step fails.

    Input: url (String) - product page URL
           exclude_titles (iterable of String, optional) - description titles
           to leave out; when omitted, the default list below is used.
    Output: product_details (Dictionary) with "Product Code" plus one entry
            per kept description section (title -> content text).
    Raises: selenium TimeoutException when the consent button, the product
            code or the description container does not appear within its timeout.
    """
    product_code_class = "pdp-main-details__product-code"
    product_description_class = "pdp-description-reviews__product-details-cntr"
    product_desc_title_class = "pdp-description-reviews__product-details-title"
    product_desc_content_class = "pdp-description-reviews__product-details-content"

    # Add any description titles that you would like to exclude
    if exclude_titles is None:
        exclude_titles = ["Net Content", "Ingredients", "Nutritional Values", "Storage",
                          "Manufacturer Address", "Return To", "ASDA Product Information"]
    excluded = set(exclude_titles)

    # Initialize Firefox WebDriver
    driver = webdriver.Firefox()
    try:
        # Load the product page
        driver.get(url)

        # Handle Alert: wait for the accept button and click it
        accept_button = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, "onetrust-accept-btn-handler"))
        )
        accept_button.click()

        product_details = {}

        # Get product code: the last whitespace-separated token of the element text
        product_code = WebDriverWait(driver, 30).until(
            EC.presence_of_element_located((By.CLASS_NAME, product_code_class))
        )
        product_details["Product Code"] = product_code.text.split()[-1]

        # Wait until at least one description block is present
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, product_description_class))
        )

        # Get all product descriptions
        descriptions = driver.find_elements(By.CLASS_NAME, product_description_class)

        # Iterate through all product descriptions
        for desc in descriptions:
            # Get description sub-title
            title = desc.find_element(By.CLASS_NAME, product_desc_title_class).text
            if title in excluded:
                continue
            # Get content under the sub-section
            content = desc.find_element(By.CLASS_NAME, product_desc_content_class).text
            product_details[title] = content
    finally:
        # Always release the browser, including when a wait times out
        driver.quit()

    return product_details

In [6]:
# Example product page used to demonstrate get_product_info
product_url = "https://groceries.asda.com/product/cornflakes-honey-nut/kelloggs-crunchy-nut-clusters-chocolate/1000383170056"
# Last expression of the cell: display the scraped details dictionary
get_product_info(product_url)

{'Product Code': '7673249',
 'Life Style': 'Suitable for Vegetarians.',
 'Allergy Advice': 'May Contain: Cereals Containing Gluten. Contains: Barley, Milk, Oats, Peanuts, Soya, Wheat.',
 'Additives': 'Free From: Artificial Colours, Artificial Flavours.',
 'Features': "- Kellogg's Crunchy Nut Cluster Chocolate breakfast cereal is made with natural grains.. - Kellogg’s Crunchy Nut Clusters Chocolate breakfast cereal are made from crunchy golden oat clusters combined with peanuts and chocolate.. - Kellogg's Crunchy Nut Clusters Chocolate breakfast cereal is made with wholegrain.. - Kellogg's Crunchy Nut Clusters Chocolate breakfast cereal is made with no artificial colours or flavours.. - Try our range of Crunchy Nut cereals, granola, and snack bars - The trouble is they all taste too good!.",
 'Recycling Info': 'Recycle: Box.',
 'Country of Origin': 'United Kingdom',
 'Product Information': 'Breakfast just got crunchier with Kellogg’s Crunchy Nut Clusters Chocolate breakfast cereal. Each