In [1]:
from splinter import Browser
from selenium.common.exceptions import WebDriverException, ElementClickInterceptedException
from bs4 import BeautifulSoup
import time

In [2]:
def close_modal(browser):
    """Dismiss the modal overlay if one is shown on the current page.

    Waits up to 5 seconds for the modal backdrop to appear. When it does,
    the modal's close button is clicked; if the button is missing or the
    click is intercepted by another element, the backdrop itself is clicked
    through JavaScript instead.

    Args:
        browser: Splinter browser (or compatible object) providing
            ``is_element_present_by_css``, ``find_by_css`` and
            ``execute_script``.

    Returns:
        None.
    """
    modal_selector = 'div.Modal__Backdrop'
    close_button_selector = 'span.Modal__Close'

    # Nothing to do when no modal shows up within the wait window.
    if not browser.is_element_present_by_css(modal_selector, wait_time=5):
        return

    print("ATTEND TO CLOSE MODAL")

    # Look the button up first: taking `.first` of an empty result raises,
    # which previously aborted the whole scrape when the backdrop was shown
    # without a close button.
    close_buttons = browser.find_by_css(close_button_selector)
    if close_buttons:
        try:
            close_buttons.first.click()
            print("CLOSED MODAL")
            return
        except ElementClickInterceptedException:
            # Another element is covering the button; fall through to the
            # JavaScript fallback below.
            pass

    # Fallback: click the backdrop from JavaScript. The null check keeps the
    # script from throwing if the modal vanished in the meantime.
    browser.execute_script(
        "var backdrop = document.querySelector('.Modal__Backdrop');"
        " if (backdrop) { backdrop.click(); }"
    )

In [3]:
def close_cookie_banner(browser):
    """Hide the cookie banner on the current page, if there is one.

    Waits up to 10 seconds for ``div#cookieBanner`` to be present. When it
    is found, the element is hidden by setting its inline ``display`` style
    to ``none`` through JavaScript (the element is not clicked or removed).

    Args:
        browser: Splinter browser (or compatible object) providing
            ``is_element_present_by_css`` and ``execute_script``.

    Returns:
        None.
    """
    banner_css = 'div#cookieBanner'

    banner_found = browser.is_element_present_by_css(banner_css, wait_time=10)
    if not banner_found:
        return

    print("ATTEND TO CLOSE COOKIE")
    # Hide the banner via its inline style rather than interacting with it.
    hide_banner_js = "document.querySelector('" + banner_css + "').style.display = 'none'"
    browser.execute_script(hide_banner_js)
    print("CLOSED COOKIE")

In [4]:
# def expand_accordion_button(browser, index):
#     button_selector = f'button.accordionButton[aria-expanded="true"][aria-controls="undefined_content_{index}"]'
#     if browser.is_element_present_by_css(button_selector, wait_time=5):
#         try:
#             browser.find_by_css(button_selector).first.click()
#         except ElementClickInterceptedException:
#             browser.execute_script(f"document.querySelector('{button_selector}').click()")

In [5]:
# def extract_text_from_div(browser, index):
#     script = f"return document.querySelector('div#undefined_content_{index}').innerHTML.trim();"
    
#     try:
#         div_html = browser.execute_script(script)

#         # Parse HTML content with BeautifulSoup
#         soup = BeautifulSoup(div_html, 'html.parser')

#         # Remove specific CSS styles
#         for style_tag in soup('style'):
#             style_tag.decompose()

#         # Extract text content from cleaned HTML
#         div_text = soup.get_text(separator=' ', strip=True)

#         if index == 0:
#             print(f"Product Description: {div_text}")
#         elif index == 1:
#             print(f"Ingredients: {div_text}")
#         elif index == 2:
#             print(f"Guaranteed Analysis: {div_text}")
#     except Exception as e:
#         print(f"Error extracting text from <div_{index}>: {e}")

In [6]:
def scrape_product_page(browser):
    """Print details of the product page currently open, then go back.

    Parses the browser's current HTML and prints the product name, the
    price and the selected variant. Price and variant are printed as
    ``None`` when their element is not in the page. Afterwards the browser
    is sent one step back in its history.

    Args:
        browser: Splinter browser currently showing a product page.

    Returns:
        None. The scraped values are only printed.

    Raises:
        LookupError: if the product title heading is not found, i.e. the
            current page does not look like a product page. In that case
            nothing is printed and the browser is NOT navigated back.
    """
    # Give the product container up to 10 seconds to render. The result is
    # deliberately not enforced; the title check below decides whether the
    # page is usable.
    browser.is_element_present_by_css('div.productContainer__productContainer', wait_time=10)

    # Parse a snapshot of the page with BeautifulSoup.
    soup = BeautifulSoup(browser.html, 'html.parser')

    def text_of(tag, css_class):
        """Return the stripped text of the first matching element, or None."""
        node = soup.find(tag, class_=css_class)
        return node.text.strip() if node else None

    product_name = text_of('h1', 'productHeader__title')
    if product_name is None:
        # Fail with a clear message instead of the AttributeError that
        # calling `.text` on a missing element used to produce.
        raise LookupError(
            "Product title (h1.productHeader__title) not found; "
            "the current page does not look like a product page."
        )

    product_price = text_of('div', 'productDetails__price')
    selected_variant = text_of('button', 'productDetails__selected')

    print(f"Product Name: {product_name}")
    print(f"Product Price: {product_price}")
    print(f"Selected Variant: {selected_variant}")
    print("\n" + "=" * 30 + "\n")

    # Go back to the previous page (the list of product cards)
    browser.back()


In [7]:
def scrape_page(browser):
    """Open and scrape every product linked from the current listing page.

    The listing HTML is parsed once to collect the product links. Each link
    is then clicked through JavaScript, the product page is handed to
    ``scrape_product_page``, and the browser is brought back to the listing
    before the next product is processed. Errors for a single product are
    printed and do not stop the loop.

    Args:
        browser: Splinter browser currently showing a product listing page.

    Returns:
        The BeautifulSoup document of the listing page as it was parsed at
        the start of the call.
    """
    # Wait for the product grid to be present
    browser.is_element_present_by_css('div.DynamicProductList__Table', wait_time=10)

    # Parse a snapshot of the listing page with BeautifulSoup
    soup = BeautifulSoup(browser.html, 'html.parser')

    # Product cards are anchors carrying the link to the product page
    product_cards = soup.find_all('a', class_='ProductResultName', href=True)

    # Check if any product cards were found before proceeding
    if not product_cards:
        print("No product cards found on the page.")
        return soup

    # Remember where the listing lives so we can tell, after each product,
    # whether the browser still has to be sent back.
    listing_url = browser.url

    for product_card in product_cards:
        # Extract the href attribute from the product card
        product_href = product_card['href']

        # Try to click the product card link using JavaScript
        try:
            print("Attend to click product")
            print(product_href)
            browser.execute_script(f"document.querySelector('a.ProductResultName[href=\"{product_href}\"]').click()")

            # Scrape the individual product page. Its return value is not
            # stored: assigning it to `soup` used to overwrite the listing
            # document that this function returns.
            scrape_product_page(browser)
            print("Clicked product")
        except Exception as e:
            print(f"Error clicking the product card link: {e}")

        # scrape_product_page() may already have navigated back. Only go
        # back when the browser is not on the listing, so the history is
        # never rewound twice for one product.
        if browser.url != listing_url:
            browser.back()

        # Close the modal and cookie banner that may reappear on the listing
        close_modal(browser)
        close_cookie_banner(browser)

    return soup

In [8]:
# Category listing page where the crawl starts
base_url = 'https://www.petvalu.ca/category/dog/dry-food/30-081318-003'

# Launch a visible (non-headless) Chrome window driven by Splinter
browser = Browser('chrome', headless=False)

# Open the starting page
browser.visit(base_url)

In [9]:
# Number of listing pages to walk through (upper bound of the pagination loop)
num_pages_to_scrape = 10

In [10]:
# Walk the listing pages one at a time, scraping every product linked from each.
for page_num in range(1, num_pages_to_scrape + 1):
    print(f"Scraping information from page {page_num}...\n")

    # scrape_page() opens each product on this listing page, scrapes it and
    # returns to the listing. scrape_product_page() is intentionally not
    # called here: at this point the browser shows a listing page, not a
    # product page, so that call could only fail.
    scrape_page(browser)

    # The requested number of pages has been scraped; do not navigate on to
    # a page that will never be processed.
    if page_num == num_pages_to_scrape:
        break

    # Navigate to the next page using Splinter's built-in method
    try:
        next_button = browser.find_link_by_partial_text('Next')
        if next_button:
            next_button.first.click()
            # Short pause to let the next listing page start loading
            time.sleep(2)
        else:
            print("No 'Next' button found. Exiting...")
            break
    except Exception as e:
        print(f"Error clicking 'Next' button: {e}")
        break

    # Close the modal and cookie banner on each subsequent page
    close_modal(browser)
    close_cookie_banner(browser)

Scraping information from page 1...

Attend to click product
/product/performatrin-ultra-limited-ingredient-kangaroo-recipe-adult-dog-food/FCM07083CA
Product Name: performatrin Ultra Limited Ingredient Kangaroo Recipe Adult Dog Food
Product Price: $112.99
Selected Variant: 24 lb


Clicked product
ATTEND TO CLOSE MODAL
CLOSED MODAL
ATTEND TO CLOSE COOKIE
CLOSED COOKIE
Attend to click product
/product/performatrin-ultra-wholesome-grains-lamb-brown-rice-recipe-adult-dog-food/FCM06325CA
Product Name: performatrin Ultra Wholesome Grains Lamb & Brown Rice Recipe Adult Dog Food
Product Price: $86.99
Selected Variant: 24 lb


Clicked product
ATTEND TO CLOSE COOKIE
CLOSED COOKIE
Attend to click product
/product/performatrin-ultra-limited-ingredient-sweet-potato-venison-recipe-adult-dog-food/FCM06351CA
Product Name: performatrin Ultra Limited Ingredient Sweet Potato & Venison Recipe Adult Dog Food
Product Price: $115.99
Selected Variant: 24 lb


Clicked product


KeyboardInterrupt: 