In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import json
import time
import logging
import urllib.parse
from collections import deque
import random

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

class GPUCrawler:
    """Breadth-first Selenium crawler collecting GPU listings from eBay and Amazon.

    Each site is crawled from its search-results page. Result links are
    followed to product pages, where name, price and a specification table
    are scraped. Collected products are stored in ``all_data`` (keyed by
    ``"ebay"`` / ``"amazon"``) and written to JSON by ``save_results``.

    Args:
        max_products_per_site: Stop crawling a site once this many products
            have been collected for it.
        max_depth: Maximum link depth to follow; the search page is depth 0
            and product pages linked from it are depth 1.
    """

    # URL substrings that mark a page as irrelevant for product crawling.
    EXCLUDED_TERMS = ('policy', 'help', 'contact', 'about', 'signin', 'login', 'cart')

    def __init__(self, max_products_per_site=50, max_depth=2):
        self.max_products_per_site = max_products_per_site
        self.max_depth = max_depth
        # URLs already loaded in the browser (shared by both site crawls).
        self.visited_urls = set()
        self.all_data = {
            "ebay": [],
            "amazon": []
        }

    def setup_driver(self):
        """Create a Chrome WebDriver, or return None if it cannot be started."""
        options = Options()
        # options.add_argument("--headless")  # Run in background
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")
        options.add_argument("--disable-blink-features=AutomationControlled")  # Try to avoid detection
        options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36")

        try:
            return webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
        except Exception as e:
            logging.error(f"Error setting up driver: {e}")
            return None

    def is_valid_url(self, url, base_domain):
        """Check if URL belongs to the given base domain and hasn't been visited yet.

        The URL's parsed hostname must equal ``base_domain`` or be a subdomain
        of it. A plain substring test would also accept URLs that merely
        mention the domain in their path or query string.

        Args:
            url: Candidate URL (anything that is not a non-empty string is rejected).
            base_domain: Host name such as ``"ebay.com"`` or ``"www.ebay.com"``;
                a port or scheme, if present, is ignored.

        Returns:
            True if the URL is on the domain, unvisited and not an excluded page.
        """
        if not url or not isinstance(url, str):
            return False
        if not base_domain or not isinstance(base_domain, str):
            return False

        if url in self.visited_urls:
            return False

        # Parse the domain like a URL so ports ("host:443") are dropped.
        domain_spec = base_domain.strip().lower()
        if "//" not in domain_spec:
            domain_spec = "//" + domain_spec
        try:
            allowed_host = urllib.parse.urlparse(domain_spec).hostname
            host = urllib.parse.urlparse(url).hostname
        except ValueError:
            # urlparse raises ValueError for malformed netlocs (e.g. bad IPv6).
            return False

        if not host or not allowed_host:
            return False
        if host != allowed_host and not host.endswith("." + allowed_host):
            return False

        # Filter out irrelevant pages
        lowered = url.lower()
        return not any(term in lowered for term in self.EXCLUDED_TERMS)

    def extract_product_links(self, driver, base_url, selector, attribute='href'):
        """Extract product links from search results page.

        Args:
            driver: WebDriver currently showing the page to inspect.
            base_url: URL whose network location the links must belong to.
            selector: CSS selector matching the link elements.
            attribute: Element attribute holding the URL.

        Returns:
            Valid, unvisited links in page order with duplicates removed;
            an empty list if the page could not be inspected.
        """
        links = []
        seen = set()
        try:
            base_host = urllib.parse.urlparse(base_url).netloc
            for element in driver.find_elements(By.CSS_SELECTOR, selector):
                try:
                    link = element.get_attribute(attribute)
                except Exception as e:
                    # A single stale/detached element must not abort the page.
                    logging.debug(f"Error extracting link: {e}")
                    continue
                if link and link not in seen and self.is_valid_url(link, base_host):
                    seen.add(link)
                    links.append(link)

            logging.info(f"Found {len(links)} product links")
            return links
        except Exception as e:
            logging.error(f"Error extracting product links: {e}")
            return []

    def random_delay(self, min_seconds=2, max_seconds=5):
        """Sleep for a random duration between ``min_seconds`` and ``max_seconds``."""
        delay = random.uniform(min_seconds, max_seconds)
        time.sleep(delay)

    @staticmethod
    def _dedupe_key(url):
        """Return a (host, path) key identifying a product page regardless of query string."""
        try:
            parts = urllib.parse.urlsplit(url)
        except ValueError:
            return url
        return (parts.netloc.lower(), parts.path.rstrip("/"))

    def _scrape_ebay_product(self, driver, url):
        """Scrape name, price and item specifics from the loaded eBay item page."""
        title = driver.find_element(By.CSS_SELECTOR, "h1.x-item-title__mainTitle span").text.strip()
        price = driver.find_element(By.CSS_SELECTOR, "div.x-price-primary span").text.strip()

        # Specifications are best-effort: a failure here must not drop the product.
        specs = {}
        try:
            for section in driver.find_elements(By.CSS_SELECTOR, "div.ux-layout-section__item"):
                try:
                    labels = section.find_elements(By.CSS_SELECTOR, "div.ux-labels-values__labels")
                    values = section.find_elements(By.CSS_SELECTOR, "div.ux-labels-values__values")
                    for label, value in zip(labels, values):
                        key = label.text.strip()
                        text = value.text.strip()
                        if key and text:
                            specs[key] = text
                except Exception as e:
                    logging.debug(f"Error reading eBay specification section: {e}")
        except Exception as e:
            logging.debug(f"Error reading eBay specifications: {e}")

        return {
            "name": title,
            "price": price,
            "url": url,
            "specifications": specs
        }

    def _amazon_price(self, driver):
        """Return the first non-empty price found on the loaded Amazon page, else None."""
        # Price could be in different locations
        price_selectors = [
            "#priceblock_ourprice",
            "#priceblock_dealprice",
            ".a-price .a-offscreen",
            ".a-price span.a-offscreen"
        ]
        for selector in price_selectors:
            try:
                price_elem = driver.find_element(By.CSS_SELECTOR, selector)
                price = (price_elem.text or "").strip()
                if not price:
                    # Off-screen elements have no rendered text; read the DOM text instead.
                    price = (price_elem.get_attribute("textContent") or "").strip()
            except Exception as e:
                logging.debug(f"Price selector {selector} failed: {e}")
                continue
            if price:
                return price
        return None

    def _amazon_specs(self, driver):
        """Collect specifications from the tech-spec table, else from the detail bullets."""
        specs = {}
        try:
            for row in driver.find_elements(By.CSS_SELECTOR, "#productDetails_techSpec_section_1 tr"):
                try:
                    key = row.find_element(By.CSS_SELECTOR, "th").text.strip()
                    value = row.find_element(By.CSS_SELECTOR, "td").text.strip()
                except Exception as e:
                    logging.debug(f"Error reading Amazon specification row: {e}")
                    continue
                if key and value:
                    specs[key] = value
        except Exception as e:
            logging.debug(f"Error reading Amazon specification table: {e}")

        if not specs:
            # find_elements returns an empty list when nothing matches, so the
            # alternative format is chosen on "no data", not on an exception.
            try:
                for bullet in driver.find_elements(By.CSS_SELECTOR, "#detailBullets_feature_div li"):
                    text = bullet.text.strip()
                    if ":" in text:
                        key, value = (part.strip() for part in text.split(":", 1))
                        if key and value:
                            specs[key] = value
            except Exception as e:
                logging.debug(f"Error reading Amazon detail bullets: {e}")
        return specs

    def _scrape_amazon_product(self, driver, url):
        """Scrape the loaded Amazon product page; return None when title or price is missing."""
        title = driver.find_element(By.CSS_SELECTOR, "#productTitle").text.strip()
        price = self._amazon_price(driver)
        specs = self._amazon_specs(driver)

        if not (title and price):
            return None
        return {
            "name": title,
            "price": price,
            "url": url,
            "specifications": specs
        }

    def _crawl_site(self, driver, site_key, site_name, base_url, domain,
                    product_marker, scrape_product, result_selector,
                    pagination_selector, delay_range):
        """Run the breadth-first crawl shared by all sites.

        Args:
            driver: WebDriver used to load pages.
            site_key: Key of the result list in ``all_data``.
            site_name: Human-readable site name used in log messages.
            base_url: Search-results URL the crawl starts from.
            domain: Domain that followed links must belong to.
            product_marker: URL substring identifying a product page.
            scrape_product: Callable ``(driver, url) -> dict | None`` that
                scrapes the currently loaded product page.
            result_selector: CSS selector for product links on a results page.
            pagination_selector: CSS selector for pagination links.
            delay_range: ``(min, max)`` seconds to wait after each page load.

        Returns:
            The list of products collected for the site (``all_data[site_key]``).
        """
        results = self.all_data[site_key]
        try:
            logging.info(f"Starting {site_name} crawl from {base_url}")

            # Queue for BFS crawling - each entry is (url, depth)
            url_queue = deque([(base_url, 0)])
            queued = {base_url}      # never enqueue the same URL twice
            seen_products = set()    # product pages already handled, ignoring query strings

            while url_queue and len(results) < self.max_products_per_site:
                current_url, depth = url_queue.popleft()

                if current_url in self.visited_urls or depth > self.max_depth:
                    continue

                is_product = product_marker in current_url
                if is_product:
                    # The same listing is often linked with different tracking
                    # parameters; visit it only once.
                    product_key = self._dedupe_key(current_url)
                    if product_key in seen_products:
                        continue
                    seen_products.add(product_key)

                self.visited_urls.add(current_url)
                logging.info(f"Visiting {site_name} URL (depth {depth}): {current_url}")

                try:
                    driver.get(current_url)
                    # Doubles as page-load wait and politeness delay.
                    self.random_delay(*delay_range)

                    if is_product:
                        try:
                            product = scrape_product(driver, current_url)
                        except Exception as e:
                            logging.error(f"Error scraping {site_name} product page: {e}")
                        else:
                            if product is not None:
                                results.append(product)
                                logging.info(
                                    f"Scraped {site_name} product: "
                                    f"{product['name'][:30]}... - {product['price']}"
                                )
                    else:
                        # Results page: product links go one level deeper;
                        # pagination links stay at the same depth and are only
                        # followed from shallow pages.
                        targets = [(result_selector, depth + 1)]
                        if depth < 1:
                            targets.append((pagination_selector, depth))

                        for selector, next_depth in targets:
                            for link in self.extract_product_links(driver, base_url, selector):
                                if link in queued or not self.is_valid_url(link, domain):
                                    continue
                                queued.add(link)
                                url_queue.append((link, next_depth))

                except Exception as e:
                    logging.error(f"Error processing {site_name} URL {current_url}: {e}")

            logging.info(f"Completed {site_name} crawl. Collected {len(results)} products.")
        except Exception as e:
            logging.error(f"Error during {site_name} crawl: {e}")
        return results

    def crawl_ebay(self, driver):
        """Crawl eBay GPU search results and return the collected products."""
        return self._crawl_site(
            driver,
            site_key="ebay",
            site_name="eBay",
            base_url="https://www.ebay.com/sch/i.html?_nkw=gpu&_sacat=0",
            domain="ebay.com",
            product_marker="/itm/",
            scrape_product=self._scrape_ebay_product,
            result_selector="a.s-item__link",
            pagination_selector="a.pagination__item",
            delay_range=(3, 7),
        )

    def crawl_amazon(self, driver):
        """Crawl Amazon GPU search results and return the collected products."""
        return self._crawl_site(
            driver,
            site_key="amazon",
            site_name="Amazon",
            base_url="https://www.amazon.com/s?k=gpu",
            domain="amazon.com",
            product_marker="/dp/",
            scrape_product=self._scrape_amazon_product,
            result_selector="a.a-link-normal.s-no-outline",
            pagination_selector=".s-pagination-next",
            # Equivalent to the former 3-7 s random delay plus 3 s load wait.
            delay_range=(6, 10),
        )

    def run(self):
        """Crawl every site with one shared browser, then save the results."""
        driver = self.setup_driver()
        if not driver:
            logging.error("Failed to set up the driver. Exiting.")
            return

        try:
            # Run crawlers for each site; a failure on one must not skip the other.
            try:
                logging.info("Starting eBay crawling...")
                self.crawl_ebay(driver)
            except Exception as e:
                logging.error(f"Error in eBay crawling: {e}")

            try:
                logging.info("Starting Amazon crawling...")
                self.crawl_amazon(driver)
            except Exception as e:
                logging.error(f"Error in Amazon crawling: {e}")

        except Exception as e:
            logging.error(f"Unexpected error during crawl: {e}")
        finally:
            # Always release the browser, even if a crawl is interrupted.
            driver.quit()

        # Save results
        self.save_results()

    def save_results(self, output_path="gpu_prices_crawled.json"):
        """Write ``all_data`` to ``output_path`` as indented JSON.

        Args:
            output_path: Destination file; defaults to the file name the
                rest of the notebook reads.
        """
        # Check if any data was collected
        total_items = sum(len(items) for items in self.all_data.values())
        logging.info(f"Total items crawled: {total_items}")

        if total_items == 0:
            logging.warning("No data was collected! Check the selectors and website structures.")

        # json.dump escapes non-ASCII by default, so the file content is plain
        # ASCII and readable whatever encoding the reader opens it with.
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(self.all_data, f, indent=4)

        logging.info(f"Crawling complete. Data saved to {output_path}")


def main():
    """Build a GPUCrawler with this notebook's limits and run the full crawl."""
    settings = {
        # Maximum products to collect per site
        "max_products_per_site": 30,
        # Maximum depth of links to follow
        # (0=just search page, 1=product pages, 2=additional related pages)
        "max_depth": 2,
    }

    GPUCrawler(**settings).run()


# Entry-point guard: start the crawl only when this code runs as the main module.
if __name__ == "__main__":
    main()

2025-03-20 07:37:04,348 - INFO - Get LATEST chromedriver version for google-chrome
2025-03-20 07:37:04,691 - INFO - Get LATEST chromedriver version for google-chrome
2025-03-20 07:37:04,872 - INFO - There is no [win64] chromedriver "134.0.6998.90" for browser google-chrome "134.0.6998" in cache
2025-03-20 07:37:04,873 - INFO - Get LATEST chromedriver version for google-chrome
2025-03-20 07:37:05,212 - INFO - WebDriver version 134.0.6998.90 selected
2025-03-20 07:37:05,216 - INFO - Modern chrome version https://storage.googleapis.com/chrome-for-testing-public/134.0.6998.90/win32/chromedriver-win32.zip
2025-03-20 07:37:05,217 - INFO - About to download new driver from https://storage.googleapis.com/chrome-for-testing-public/134.0.6998.90/win32/chromedriver-win32.zip
2025-03-20 07:37:05,700 - INFO - Driver downloading response is 200
2025-03-20 07:37:06,430 - INFO - Get LATEST chromedriver version for google-chrome
2025-03-20 07:37:07,576 - INFO - Driver has been saved in cache [C:\Users\

In [34]:
import pandas as pd
import json
import streamlit as st

# Load the crawl results written by the crawler cell.
with open("gpu_prices_crawled.json", "r") as f:
    data = json.load(f)

# Show the raw price string of every eBay listing, one per line.
for listing in data['ebay']:
    print(listing['price'])

US $81.00
US $95.00
US $184.99
US $47.99
US $96.00
GBP 75.00
US $19.99
EUR 499.00
US $278.00
US $139.00
US $205.00
US $21.99
C $119.98
US $148.31
EUR 150.00
EUR 99.00
US $26.00
US $49.00
EUR 99.88
EUR 149.00
C $119.98
GBP 99.95
US $82.99
AU $120.00
US $65.88
US $21.88
US $97.98
US $340.00
US $99.99
GBP 78.88
