In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import logging

# Set up logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(message)s")


In [2]:
# WebDriver setup (ensure the path points to your WebDriver)
driver_path = "/home/wohldan/Downloads/chromedriver-linux64/chromedriver"  # Replace with your ChromeDriver path
BASE_URL = "https://noteb.com/"
START_URL = f"{BASE_URL}?search/search.php?browse_by=budget&sort_by=value"  # Example: Budget laptops


In [3]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service


In [4]:

def extract_laptop_urls_with_selenium(url, driver_path):
    """
    Extract laptop URLs using Selenium to handle JavaScript-rendered content.
    """
    logging.info(f"Opening browser and navigating to {url}")
    service = Service(driver_path)
    driver = webdriver.Chrome(service=service)
    driver.get(url)
    
    # Wait for the page to load completely
    time.sleep(5)  # Adjust this based on your connection speed and page complexity
    
    # Extract all elements with "onmousedown" containing "OpenPage"
    elements = driver.find_elements(By.CSS_SELECTOR, "div.searchresult-content")
    laptop_urls = []
    
    for elem in elements:
        onmousedown = elem.get_attribute("onmousedown")
        if onmousedown and "OpenPage" in onmousedown:
            # Extract the URL inside the 'OpenPage' function
            start = onmousedown.find("('") + 2
            end = onmousedown.find("',event")
            if start != -1 and end != -1:
                laptop_url = BASE_URL + '?' + onmousedown[start:end]
                laptop_urls.append(laptop_url)
    
    logging.info(f"Found {len(laptop_urls)} laptop URLs")
    driver.quit()
    return laptop_urls, driver

In [5]:
laptop_urls, driver = extract_laptop_urls_with_selenium(START_URL,driver_path)

2024-11-29 16:52:55,990 - Opening browser and navigating to https://noteb.com/?search/search.php?browse_by=budget&sort_by=value
2024-11-29 16:53:03,829 - Found 20 laptop URLs


In [6]:
laptop_urls[0]

'https://noteb.com/?model/model.php?conf=5027952706180836093_4298&ex=USD'

In [11]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import logging

def scrape_laptop_data(url, driver_path):
    """
    Scrape data from an individual laptop page.
    Args:
        url (str): URL of the laptop page.
        driver (webdriver.Chrome): Selenium WebDriver instance.

    Returns:
        dict: A dictionary containing the laptop specifications.
    """
    logging.info(f"Scraping data from {url}")
    service = Service(driver_path)
    driver = webdriver.Chrome(service=service)
    # Navigate to the URL
    driver.get(url)

    # Initialize dictionary to store the specifications
    laptop_data = {"URL": url}

    # Find the specifications accordion
    try:
        specifications_section = driver.find_element(By.ID, "specificationsAccordion")
    except Exception as e:
        logging.error(f"Specifications section not found: {e}")
        return laptop_data

    # Extract data from the first and second columns
    for col_id in ["firstCol", "secondCol"]:
        try:
            column = specifications_section.find_element(By.ID, col_id)
            # Find all specification cards within the column
            spec_cards = column.find_elements(By.CLASS_NAME, "specification-card")
            for card in spec_cards:
                # Extract key-value pairs for each specification
                try:
                    key = card.find_element(By.TAG_NAME, "h4").text.strip()  # Adjust to match the actual HTML structure
                    value = card.find_element(By.TAG_NAME, "p").text.strip()  # Adjust to match the actual HTML structure
                    laptop_data[key] = value
                except Exception as e:
                    logging.warning(f"Failed to extract key-value from a card in {col_id}: {e}")
        except Exception as e:
            logging.warning(f"Failed to process column {col_id}: {e}")

    return laptop_data


In [12]:
scrape_laptop_data(laptop_urls[0], driver_path)

2024-11-29 16:54:58,422 - Scraping data from https://noteb.com/?model/model.php?conf=5027952706180836093_4298&ex=USD
2024-11-29 16:55:01,184 - Specifications section not found: Message: no such element: Unable to locate element: {"method":"css selector","selector":"[id="specificationsAccordion"]"}
  (Session info: chrome=131.0.6778.85); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
#0 0x57a0b55a931a <unknown>
#1 0x57a0b50bf6e0 <unknown>
#2 0x57a0b510e3e6 <unknown>
#3 0x57a0b510e681 <unknown>
#4 0x57a0b5153b04 <unknown>
#5 0x57a0b513248d <unknown>
#6 0x57a0b5150ed7 <unknown>
#7 0x57a0b5132203 <unknown>
#8 0x57a0b5100cc0 <unknown>
#9 0x57a0b5101c9e <unknown>
#10 0x57a0b5576d0b <unknown>
#11 0x57a0b557ac92 <unknown>
#12 0x57a0b5563b3c <unknown>
#13 0x57a0b557b807 <unknown>
#14 0x57a0b55490df <unknown>
#15 0x57a0b5598578 <unknown>
#16 0x57a0b5598740 <unknown>
#17 0x57a0b55a8196 <u

{'URL': 'https://noteb.com/?model/model.php?conf=5027952706180836093_4298&ex=USD'}

In [5]:
# Main function to scrape laptops by class
def scrape_laptops_by_class():
    """
    Main function to scrape laptops for all classes and save the data.
    """
    all_laptops = []
    for laptop_class in CLASSES:
        # Fetch laptop links for the current class
        links = fetch_laptop_links(laptop_class)
        
        for link in links:
            laptop_data = scrape_laptop_data(link)
            if laptop_data:
                laptop_data["Class"] = laptop_class
                all_laptops.append(laptop_data)
            time.sleep(1)  # Be polite to the server
        
        time.sleep(2)  # Avoid hammering the server between classes
    
    # Save the data to a CSV
    df = pd.DataFrame(all_laptops)
    df.to_csv("laptops_data.csv", index=False)
    logging.info("Scraping completed. Data saved to 'laptops_data.csv'.")


In [7]:
scrape_laptops_by_class()

2024-11-29 15:40:27,873 - Fetching laptops for class: budget from https://noteb.com/?search/search.php?browse_by=budget&sort_by=value
2024-11-29 15:40:28,701 - Found 0 laptops for class: budget
2024-11-29 15:40:30,703 - Fetching laptops for class: mainstream from https://noteb.com/?search/search.php?browse_by=mainstream&sort_by=value
2024-11-29 15:40:31,360 - Found 0 laptops for class: mainstream
2024-11-29 15:41:02,845 - Fetching laptops for class: ultraportable from https://noteb.com/?search/search.php?browse_by=ultraportable&sort_by=value
2024-11-29 15:41:03,714 - Found 0 laptops for class: ultraportable
2024-11-29 15:42:07,296 - Fetching laptops for class: business from https://noteb.com/?search/search.php?browse_by=business&sort_by=value
2024-11-29 15:42:08,021 - Found 0 laptops for class: business
2024-11-29 15:42:10,023 - Fetching laptops for class: gaming from https://noteb.com/?search/search.php?browse_by=gaming&sort_by=value
2024-11-29 15:42:10,790 - Found 0 laptops for class