In [2]:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait 
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import ElementNotInteractableException, TimeoutException
from selenium.webdriver.common.by import By

import time
import pandas as pd

### Web Crawler
This section collects product URLs from a given product catalogue URL on the ASDA groceries website.
These catalogue URLs can be found in the sitemap - https://groceries.asda.com/sitemap-category.xml 

Note: Before crawling or scraping data from a website, ensure that the URLs are allowed by the website's robots.txt.
For ASDA, robots.txt can be found here - https://groceries.asda.com/robots.txt 

In [1]:
def _accept_cookie_banner(driver, timeout: int = 10) -> None:
    """Click the OneTrust "accept" button if it becomes clickable within ``timeout`` seconds."""
    try:
        accept_button = WebDriverWait(driver, timeout).until(
            EC.element_to_be_clickable((By.ID, "onetrust-accept-btn-handler"))
        )
        accept_button.click()
    except TimeoutException:
        # A missing banner should not abort the crawl.
        print("Cookie banner did not appear; continuing without dismissing it.")


def _click_next_page(driver, retries: int = 3, pause: float = 5) -> bool:
    """Click the pagination right arrow; return True on success, False once all retries fail."""
    wait = WebDriverWait(driver, 10)
    for _ in range(retries):
        try:
            right_arrow = wait.until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, ".co-pagination__arrow--right"))
            )
            right_arrow.click()
            return True
        except (ElementNotInteractableException, TimeoutException):
            # Give the page a moment to settle before trying again.
            time.sleep(pause)
    return False


def get_all_product_urls(url: str):
    """
    Collect all product URLs from a given ASDA catalogue URL
    (Sitemap: https://groceries.asda.com/sitemap-category.xml).

    A Firefox WebDriver session is opened for the duration of the call and is
    always closed before returning, including when an exception is raised.

    Input: url (String) - catalogue (product listing) page to crawl
    Output: product_urls_df (DataFrame) with columns page_no, product_name, url
            and parent_url (the input url), or None when the product listing
            does not appear on the page within the timeout.
    """
    print(f"Input URL: {url}")
    listing_locator = (By.XPATH, '//div[@data-module-type="ProductListing"]')
    records = []

    # Initialize Firefox WebDriver
    driver = webdriver.Firefox()
    try:
        # Load the page and dismiss the cookie alert if it is shown
        driver.get(url)
        _accept_cookie_banner(driver)

        # Check if it is the right page
        try:
            WebDriverWait(driver, 30).until(EC.presence_of_element_located(listing_locator))
        except TimeoutException:
            print("Could not load this page to crawl for product URLs, please check the URL provided.")
            return None

        # Get number of pages; fall back to a single page when the pagination
        # control is missing or its text is not a number
        try:
            last_page = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, ".co-pagination__last-page"))
            )
            max_no_pages = int(last_page.text)
        except (TimeoutException, ValueError):
            max_no_pages = 1
        print(f"Max number of pages = {max_no_pages}")

        # Iterate through each page
        for page_no in range(1, max_no_pages + 1):
            # Get main product listing
            # NOTE(review): presence of the listing container may not guarantee that the
            # next page's products have replaced the previous ones - confirm against the site.
            main_content = WebDriverWait(driver, 30).until(
                EC.presence_of_element_located(listing_locator)
            )

            # Save page number, product name and link for every product anchor
            for anchor in main_content.find_elements(By.CLASS_NAME, "co-product__anchor"):
                records.append(
                    {"page_no": page_no, "product_name": anchor.text, "url": anchor.get_attribute("href")}
                )

            # For the last page, do not click on right arrow to go to next page
            if page_no == max_no_pages:
                break

            # Stop instead of re-scraping the same page when navigation fails
            if not _click_next_page(driver):
                print(f"Could not move past page {page_no}; stopping early.")
                break
    finally:
        driver.quit()

    # Build the frame once from the collected rows (avoids quadratic concat in the loop)
    product_urls_df = pd.DataFrame(records, columns=["page_no", "product_name", "url"])
    product_urls_df["parent_url"] = url
    print(f"Number of URLs extracted: {product_urls_df.shape[0]}")
    return product_urls_df
    

In [4]:
if __name__ == "__main__":
    # Example run: crawl one catalogue page and summarise URLs found per page.
    url = "https://groceries.asda.com/aisle/food-cupboard/cereals-cereal-bars/everyday-family-cereals/1215337189632-1215337194729-1215650880276"
    product_urls = get_all_product_urls(url)
    # get_all_product_urls returns None when the listing page could not be loaded,
    # so guard before calling DataFrame methods on the result.
    if product_urls is None:
        print("No product URLs were extracted.")
    else:
        print("Number of URLs extracted from each page: ")
        print(product_urls.groupby(by="page_no")["url"].count().reset_index())

Max number of pages = 4
