In [1]:
from splinter import Browser
from selenium.common.exceptions import WebDriverException, ElementClickInterceptedException
from bs4 import BeautifulSoup
import time
import csv

In [2]:
# Path of the CSV file that the scraped product rows are appended to.
csv_file = "scraped_data.csv"

In [3]:
def close_modal(browser):
    """Dismiss the modal overlay if one is showing.

    Waits up to 5 seconds for ``div.Modal__Backdrop``. When it is present,
    the modal's close button (``span.Modal__Close``) is clicked. If there is
    no close button, or the click is intercepted by another element, the
    backdrop itself is clicked through JavaScript instead.

    Args:
        browser: Splinter browser driving the page.

    Returns:
        None.
    """
    modal_selector = 'div.Modal__Backdrop'

    # Nothing to do when no modal appeared within the wait window.
    if not browser.is_element_present_by_css(modal_selector, wait_time=5):
        return

    close_buttons = browser.find_by_css('span.Modal__Close')

    # `.first` on an empty result raises, so only use it when a button was
    # actually found; otherwise fall through to the JavaScript fallback.
    if not close_buttons.is_empty():
        try:
            close_buttons.first.click()
            return
        except ElementClickInterceptedException:
            # Something is covering the button; fall back to JavaScript below.
            pass

    # Null-safe: the backdrop may have disappeared since the presence check.
    browser.execute_script(
        "var backdrop = document.querySelector('.Modal__Backdrop'); "
        "if (backdrop) { backdrop.click(); }"
    )

In [4]:
def close_cookie_banner(browser):
    """Hide the cookie banner when it is present on the page.

    Waits up to 10 seconds for ``div#cookieBanner``; if it shows up, its CSS
    ``display`` is set to ``none`` through JavaScript. The banner is hidden,
    not clicked.

    Args:
        browser: Splinter browser driving the page.

    Returns:
        None.
    """
    cookie_banner_selector = 'div#cookieBanner'

    banner_found = browser.is_element_present_by_css(cookie_banner_selector, wait_time=10)
    if not banner_found:
        return

    hide_script = "document.querySelector('" + cookie_banner_selector + "').style.display = 'none'"
    browser.execute_script(hide_script)

In [5]:
def expand_accordion_button(browser, index):
    """Expand the accordion section ``undefined_content_{index}`` if it is collapsed.

    The button is matched on ``aria-expanded="false"`` so that only a
    collapsed section is clicked. Matching ``"true"`` would click a section
    that is already open, and per the ARIA convention that toggles it shut.

    Args:
        browser: Splinter browser driving the page.
        index: Numeric suffix of the accordion content id the button controls.

    Returns:
        None.
    """
    button_selector = (
        f'button.accordionButton[aria-expanded="false"]'
        f'[aria-controls="undefined_content_{index}"]'
    )

    # No collapsed button for this section within the wait window: nothing to do.
    if not browser.is_element_present_by_css(button_selector, wait_time=5):
        return

    try:
        browser.find_by_css(button_selector).first.click()
    except ElementClickInterceptedException:
        # Another element covers the button; click it through JavaScript.
        # The selector only contains double quotes, so it is safe inside the
        # single-quoted JavaScript string.
        browser.execute_script(f"document.querySelector('{button_selector}').click()")

In [6]:
def extract_text_from_div(browser, index):
    """Return the visible text of ``div#undefined_content_{index}``.

    The div's inner HTML is fetched through JavaScript, any ``<style>`` tags
    are removed, and the remaining text is joined with single spaces. For
    indexes 0, 1 and 2 the text is also printed under a section label.

    Args:
        browser: Splinter browser driving the page.
        index: Numeric suffix of the content div to read.

    Returns:
        The cleaned text, or ``None`` if anything fails (the error is printed).
    """
    # Section labels printed for the known indexes, compared with ``==``.
    section_labels = (
        (0, "Product Description"),
        (1, "Ingredients"),
        (2, "Guaranteed Analysis"),
    )
    js_snippet = f"return document.querySelector('div#undefined_content_{index}').innerHTML.trim();"

    try:
        markup = browser.execute_script(js_snippet)
        parsed = BeautifulSoup(markup, 'html.parser')

        # Drop embedded stylesheets so their CSS does not end up in the text.
        for css_block in parsed('style'):
            css_block.decompose()

        plain_text = parsed.get_text(separator=' ', strip=True)

        label = next((name for key, name in section_labels if index == key), None)
        if label is not None:
            print(f"{label}: {plain_text}")

        return plain_text
    except Exception as exc:
        print(f"Error extracting text from <div_{index}>: {exc}")
        return None

In [7]:
def _text_or_none(soup, tag_name, css_class):
    """Return the stripped text of the first ``tag_name.css_class`` element, or None if absent."""
    element = soup.find(tag_name, class_=css_class)
    return element.text.strip() if element is not None else None


def scrape_product_page(browser):
    """Scrape the product page currently loaded in ``browser``.

    Reads the product name, price and selected variant from the page HTML,
    and the description, ingredients and guaranteed analysis through
    ``extract_text_from_div`` (indexes 0, 1 and 2). Every field is printed as
    it is read. A field whose element is missing from the page is recorded as
    ``None`` instead of raising, matching how the selected variant and the
    accordion sections are handled.

    Args:
        browser: Splinter browser positioned on a product page.

    Returns:
        A one-element list holding a dict with the keys 'Product Name',
        'Price', 'Bag Size', 'Description', 'Ingredients' and
        'Guaranteed Analysis'.
    """
    # Give the product container up to 10 seconds to appear. The result is
    # not checked: missing elements simply yield None fields below.
    browser.is_element_present_by_css('div.productContainer__productContainer', wait_time=10)

    # Parse a snapshot of the page for the static header fields.
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')

    product_name = _text_or_none(soup, 'h1', 'productHeader__title')
    product_price = _text_or_none(soup, 'div', 'productDetails__price')
    selected_variant = _text_or_none(soup, 'button', 'productDetails__selected')

    print(f"Product Name: {product_name}")
    print(f"Product Price: {product_price}")
    print(f"Selected Variant: {selected_variant}")

    # The accordion sections are read from the live DOM, not the snapshot.
    description = extract_text_from_div(browser, 0)
    ingredients = extract_text_from_div(browser, 1)
    guaranteed_analysis = extract_text_from_div(browser, 2)

    print("\n" + "=" * 70 + "\n")

    scraped_data = [{
        'Product Name': product_name,
        'Price': product_price,
        'Bag Size': selected_variant,
        'Description': description,
        'Ingredients': ingredients,
        'Guaranteed Analysis': guaranteed_analysis
    }]

    # Clear the browser cookies to avoid interference with the next product page
    browser.cookies.delete()

    return scraped_data

In [8]:
def scrape_page(browser):
    """Visit every product linked from the current listing page and append its data to the CSV.

    For each ``a.ProductResultName`` link found in the listing HTML, the link
    is clicked through JavaScript, the product page is scraped with
    ``scrape_product_page`` and the resulting rows are appended to
    ``csv_file``. The browser then navigates back to the listing. Failures on
    one product are printed and do not stop the remaining products.

    Args:
        browser: Splinter browser positioned on a product listing page.

    Returns:
        The BeautifulSoup object parsed from the listing page before any
        product was visited.
    """
    # Wait for the product grid to be present. The result is not checked:
    # an empty listing is reported below.
    browser.is_element_present_by_css('div.DynamicProductList__Table', wait_time=10)

    # Snapshot the listing HTML once; links are re-located by href on each visit.
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')

    product_cards = soup.find_all('a', class_='ProductResultName', href=True)

    if not product_cards:
        print("No product cards found on the page.")
        return soup

    for product_card in product_cards:
        product_href = product_card['href']

        # The hrefs seen on this site start with '/', so strip it to avoid
        # printing 'petvalu.ca//product/...'.
        print(f"https://www.petvalu.ca/{product_href.lstrip('/')}")

        # Step 1: navigate. Only a successful click leaves the listing page.
        navigated = False
        try:
            browser.execute_script(f"document.querySelector('a.ProductResultName[href=\"{product_href}\"]').click()")
            navigated = True
        except Exception as e:
            print(f"Error clicking the product card link: {e}")

        if navigated:
            # Step 2: scrape and persist. Reported separately from click
            # failures so the message reflects what actually went wrong.
            try:
                scraped_data = scrape_product_page(browser)

                with open(csv_file, 'a', newline='', encoding='utf-8') as file:
                    csv_writer = csv.writer(file)

                    for data in scraped_data:
                        csv_writer.writerow([data['Product Name'], data['Price'], data['Bag Size'], data['Description'],
                                            data['Ingredients'], data['Guaranteed Analysis']])
            except Exception as e:
                print(f"Error scraping the product page: {e}")

            # Step 3: return to the listing. Skipped when the click failed,
            # because going back then would leave the listing page.
            browser.back()

        # Close the modal and cookie banner before handling the next product
        close_modal(browser)
        close_cookie_banner(browser)

    return soup

In [9]:
# Launch a visible (non-headless) Chrome session through Splinter.
browser = Browser('chrome', headless=False)

# Open the category listing page where the scrape starts.
base_url = 'https://www.petvalu.ca/category/dog/dry-food/30-081318-003'
browser.visit(base_url)

In [10]:
# Number of listing pages to walk through, starting from the page that is
# currently open in the browser.
num_pages_to_scrape = 1
page_counter = 1

# CSS selector of the pagination "Next" link.
next_page_link_selector = 'a.page_button.next.link'

while page_counter <= num_pages_to_scrape:
    print(f"Scraping information from page {page_counter}...\n")

    # scrape_page() appends every product row to csv_file itself and returns
    # the parsed listing HTML, not a list of rows, so there is nothing to
    # write here. Writing rows from its return value would emit junk or fail.
    scrape_page(browser)

    # Increment the page counter
    page_counter += 1

    # Stop before paging when the requested number of pages is done, so the
    # browser does not load a page that will never be scraped.
    if page_counter > num_pages_to_scrape:
        break

    # Navigate to the next page using a JavaScript click
    if browser.is_element_present_by_css(next_page_link_selector, wait_time=5):
        browser.execute_script(f"document.querySelector('{next_page_link_selector}').click()")

        # Give the browser a moment to load the next page
        time.sleep(2)
    else:
        # Break out of the loop if the "Next" button is not found
        print("No 'Next' button found. Exiting...")
        break

    # Close the modal and cookie banner on each subsequent page
    close_modal(browser)
    close_cookie_banner(browser)

Scraping information from page 1...

https://www.petvalu.ca//product/performatrin-ultra-limited-ingredient-kangaroo-recipe-adult-dog-food/FCM07083CA
Product Name: performatrin Ultra Limited Ingredient Kangaroo Recipe Adult Dog Food
Product Price: $112.99
Selected Variant: 24 lb
Product Description: performatrin Ultra Limited Ingredient Kangaroo Recipe Adult Dog Food embraces holistic nutrition, focusing on functional, whole foods. This recipe offers exceptional, complete and balanced nutrition for pets with food sensitivities. Key Benefits: Single source animal protein and limited carbohydrate resources. Kangaroo is a highly nutritious lean red meat that provides essential protein and helps maintain strong muscles. Easily digestible Green Peas and Chickpeas. No Grains, Gluten, Chicken, Fish, Corn, Eggs, Dairy, Artificial Flavours, Colours or Preservatives.
Ingredients: Kangaroo, Peas, Chickpeas, Dried Peas, Lentils, Kangaroo Meal, Pea Protein, Canola Oil (preserve with Mixed Tocopherol

KeyboardInterrupt: 