In [1]:
import csv
import random
import time
from queue import PriorityQueue
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

class ProductCrawler:
    """Crawl paginated product listing pages and collect product data.

    URLs to fetch are kept in a priority queue; each fetched page is parsed
    for ``<li class="product">`` entries and for a ``<a class="next">``
    pagination link, which is queued for a later iteration of ``crawl``.
    Collected products are stored in ``self.products`` as dicts with the
    keys ``url``, ``image``, ``title`` and ``price``.
    """

    def __init__(self, start_url, max_pages=50):
        """Initialize the crawler with starting URL and page limit.

        Args:
            start_url: URL of the first listing page to fetch.
            max_pages: Maximum number of pages ``crawl`` will process.
        """
        self.urls = PriorityQueue()
        self.urls.put((1, start_url))  # Priority 1 for the start URL
        self.visited_urls = set()
        self.products = []
        self.max_pages = max_pages
        self.request_delay = 1  # Base delay in seconds
        # Without a timeout, requests waits indefinitely on a stalled server.
        self.request_timeout = 10  # Seconds

    @staticmethod
    def _parse_product(product_element, page_url):
        """Build a product dict from one listing element.

        Args:
            product_element: Parsed ``<li class="product">`` element.
            page_url: URL of the page the element came from; used to
                resolve relative links.

        Returns:
            A dict with ``url``, ``image``, ``title`` and ``price`` keys, or
            ``None`` when the link, image or title is missing.
        """
        link = product_element.find('a')
        image = product_element.find('img')
        title = product_element.find('h2', class_='woocommerce-loop-product__title')
        if link is None or image is None or title is None:
            return None

        href = link.get('href')
        src = image.get('src')
        if not href or not src:
            return None

        # Price is optional; fall back to a placeholder when it is absent.
        price_element = product_element.find('span', class_='woocommerce-Price-amount')
        return {
            'url': urljoin(page_url, href),
            'image': urljoin(page_url, src),
            'title': title.text.strip(),
            'price': price_element.text.strip() if price_element else 'Price not available',
        }

    def extract_products_from_page(self, url):
        """Extract product information from a given page.

        Appends every well-formed product found at ``url`` to
        ``self.products`` and queues the "next" pagination link, if any.
        Request failures (including HTTP error statuses) are reported on
        stdout and leave the crawler state unchanged.

        Args:
            url: Listing page to fetch and parse.
        """
        try:
            response = requests.get(url, timeout=self.request_timeout)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"Error processing {url}: {e}")
            return

        soup = BeautifulSoup(response.content, "html.parser")

        # One malformed entry must not discard the rest of the page or the
        # pagination link, so each product is validated individually.
        for product_element in soup.find_all("li", class_="product"):
            product = self._parse_product(product_element, url)
            if product is None:
                print(f"Skipping malformed product on {url}")
                continue
            self.products.append(product)
            print(f"Extracted product: {product['title']}")

        # Find the pagination link and queue it with high priority.
        next_page = soup.find('a', class_='next')
        next_href = next_page.get('href') if next_page else None
        if next_href:
            next_url = urljoin(url, next_href)  # Resolve relative links
            if next_url != url and next_url not in self.visited_urls:
                self.urls.put((1, next_url))  # Priority 1 for pagination

    def crawl(self):
        """Main crawling method.

        Processes queued URLs until the queue is empty or ``max_pages``
        pages have been handled, pausing a randomized interval between
        consecutive requests.
        """
        pages_processed = 0

        while not self.urls.empty() and pages_processed < self.max_pages:
            _priority, current_url = self.urls.get()

            if current_url in self.visited_urls:
                continue

            print(f"\nProcessing page: {current_url}")
            # Mark as visited first so a page linking to itself is not re-queued.
            self.visited_urls.add(current_url)
            self.extract_products_from_page(current_url)

            pages_processed += 1

            # Only wait when another request will actually follow.
            if pages_processed < self.max_pages and not self.urls.empty():
                delay = self.request_delay + random.uniform(0.5, 2.0)
                time.sleep(delay)

    def save_to_csv(self, filename='products.csv'):
        """Save extracted products to CSV file.

        Writes a header row followed by one row per collected product.
        Failures to open or write the file are reported on stdout rather
        than raised.

        Args:
            filename: Destination path of the CSV file.
        """
        try:
            with open(filename, 'w', newline='', encoding='utf-8') as csv_file:
                writer = csv.DictWriter(csv_file, fieldnames=['url', 'image', 'title', 'price'])
                writer.writeheader()
                writer.writerows(self.products)
            print(f"\nExtraction complete. {len(self.products)} products saved to {filename}")
        except (OSError, csv.Error, ValueError) as e:
            print(f"Error saving to CSV: {e}")

def main():
    """Crawl up to five pages of the demo shop and write the products to CSV."""
    shop_crawler = ProductCrawler("https://scrapeme.live/shop/", max_pages=5)

    print("Starting web crawler...")
    shop_crawler.crawl()
    # Uses the crawler's default output file name.
    shop_crawler.save_to_csv()


if __name__ == "__main__":
    main()

Starting web crawler...

Processing page: https://scrapeme.live/shop/
Extracted product: Bulbasaur
Extracted product: Ivysaur
Extracted product: Venusaur
Extracted product: Charmander
Extracted product: Charmeleon
Extracted product: Charizard
Extracted product: Squirtle
Extracted product: Wartortle
Extracted product: Blastoise
Extracted product: Caterpie
Extracted product: Metapod
Extracted product: Butterfree
Extracted product: Weedle
Extracted product: Kakuna
Extracted product: Beedrill
Extracted product: Pidgey

Processing page: https://scrapeme.live/shop/page/2/
Extracted product: Pidgeotto
Extracted product: Pidgeot
Extracted product: Rattata
Extracted product: Raticate
Extracted product: Spearow
Extracted product: Fearow
Extracted product: Ekans
Extracted product: Arbok
Extracted product: Pikachu
Extracted product: Raichu
Extracted product: Sandshrew
Extracted product: Sandslash
Extracted product: Nidorina
Extracted product: Nidoqueen
Extracted product: Nidorino
Extracted product