In [1]:
pip install selenium

Collecting selenium
  Downloading selenium-4.27.1-py3-none-any.whl.metadata (7.1 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.27.0-py3-none-any.whl.metadata (8.6 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.11.1-py3-none-any.whl.metadata (4.7 kB)
Collecting sortedcontainers (from trio~=0.17->selenium)
  Downloading sortedcontainers-2.4.0-py2.py3-none-any.whl.metadata (10 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading selenium-4.27.1-py3-none-any.whl (9.7 MB)
   ---------------------------------------- 0.0/9.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/9.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/9.7 MB 660.6 kB/s eta 0:00:15
   ---------------------------------------- 0.0/9.7 


[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# **Amazon Best Sellers Scraper**

# **Overview**
# This script automates the process of logging into Amazon, navigating to various "Best Sellers" categories, and scraping product details.
# The extracted data includes product name, price, rating, discount, and category. 
# The script filters products offering discounts greater than 50% and saves the data in a JSON file.

# **Dependencies**
# - `selenium`: For web automation.
# - `time`: To manage delays during web scraping.
# - `json`: To save the scraped data in a structured format.

# **Steps:**
# 1. **Configuration**: Setup Amazon login URL, best seller URLs, and credentials.
# 2. **Functions**: Define helper functions to log in, scrape product data, and save the results.
# 3. **Execution**: Automate the scraping and save results.

# **Code Below** :-


from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import json

# Configuration
LOGIN_URL = "https://www.amazon.com/ap/signin?openid.pape.max_auth_age=900&openid.return_to=https%3A%2F%2Fwww.amazon.com%2Fgp%2Fyourstore%2Fhome%3Fpath%3D%252Fgp%252Fyourstore%252Fhome%26useRedirectOnSuccess%3D1%26signIn%3D1%26action%3Dsign-out%26ref_%3Dnav_AccountFlyout_signout&openid.assoc_handle=usflex&openid.mode=checkid_setup&openid.ns=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0"
BEST_SELLER_URLS = [
    "https://www.amazon.in/gp/bestsellers/kitchen/ref=zg_bs_nav_kitchen_0",
    "https://www.amazon.in/gp/bestsellers/shoes/ref=zg_bs_nav_shoes_0",
    "https://www.amazon.in/gp/bestsellers/computers/ref=zg_bs_nav_computers_0",
    "https://www.amazon.in/gp/bestsellers/electronics/ref=zg_bs_nav_electronics_0",
    # Add 6 more categories
]
USERNAME = "ankit"  # Your username
PASSWORD = "Ankit@1234"  # Your password
OUTPUT_FILE = "amazon_best_sellers.json"

# Initialize WebDriver
driver = webdriver.Chrome()  # Ensure you have ChromeDriver installed
driver.maximize_window()

def login_amazon():
    """
    Logs into Amazon using provided credentials.
    """
    driver.get(LOGIN_URL)

    try:
        # Wait for the email input field
        email_input = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, "ap_email"))
        )
        email_input.send_keys(USERNAME)

        # Click the 'Continue' button
        continue_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.ID, "continue"))
        )
        continue_button.click()

        # Wait for the password input field
        password_input = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, "ap_password"))
        )
        password_input.send_keys(PASSWORD)

        # Click the 'Sign-In' button
        sign_in_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.ID, "signInSubmit"))
        )
        sign_in_button.click()

        # Wait for post-login confirmation element
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, "nav-link-accountList"))
        )
        print("Login successful!")

    except Exception as e:
        print(f"Error during login: {e}")
        driver.quit()
        exit()

def scrape_category(url):
    driver.get(url)
    time.sleep(3)
    products = []
    for _ in range(15):  # Scroll multiple times to load more products
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)
    items = driver.find_elements(By.CSS_SELECTOR, ".zg-item-immersion")
    for item in items[:1500]:  # Limit to top 1500 products
        try:
            name = item.find_element(By.CSS_SELECTOR, ".p13n-sc-truncated").text
            price = item.find_element(By.CSS_SELECTOR, ".p13n-sc-price").text
            rating = item.find_element(By.CSS_SELECTOR, ".a-icon-alt").text
            discount = item.find_element(By.CSS_SELECTOR, ".a-text-price").text

            # Parse discount value
            try:
                discount_value = int(discount.strip('%').strip())
            except ValueError:
                discount_value = 0

            if discount_value > 50:  # Filter for discounts > 50%
                category = url.split("/")[-2]
                products.append({
                    "Product Name": name,
                    "Product Price": price,
                    "Best Seller Rating": rating,
                    "Sale Discount": discount,
                    "Category Name": category,
                })
        except Exception as e:
            print(f"Error scraping product: {e}")
    return products

def save_data(data):
    """
    Saves the scraped data into a JSON file.
    """
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

def main():
    """
    Main function to run the scraper and save data.
    """
    login_amazon()
    all_data = []
    for url in BEST_SELLER_URLS:
        print(f"Scraping category: {url}")
        category_data = scrape_category(url)
        all_data.extend(category_data)
    save_data(all_data)
    print(f"Data saved to {OUTPUT_FILE}")
    driver.quit()

if __name__ == "__main__":
    main()


Error during login: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=129.0.6668.90)
Stacktrace:
	GetHandleVerifier [0x00007FF683E6B095+29557]
	(No symbol) [0x00007FF683DDFA50]
	(No symbol) [0x00007FF683C9B56A]
	(No symbol) [0x00007FF683C6FCC5]
	(No symbol) [0x00007FF683D1EE07]
	(No symbol) [0x00007FF683D37E21]
	(No symbol) [0x00007FF683D16F33]
	(No symbol) [0x00007FF683CE116F]
	(No symbol) [0x00007FF683CE22D1]
	GetHandleVerifier [0x00007FF68419C96D+3378253]
	GetHandleVerifier [0x00007FF6841E8497+3688311]
	GetHandleVerifier [0x00007FF6841DD1CB+3642539]
	GetHandleVerifier [0x00007FF683F2A6B6+813462]
	(No symbol) [0x00007FF683DEAB5F]
	(No symbol) [0x00007FF683DE6B74]
	(No symbol) [0x00007FF683DE6D10]
	(No symbol) [0x00007FF683DD5C1F]
	BaseThreadInitThunk [0x00007FFD5630259D+29]
	RtlUserThreadStart [0x00007FFD5812AF38+40]

Scraping category: https://www.amazon.in/gp/bestsellers/kitchen/ref=zg_bs_nav_kitchen_0


MaxRetryError: HTTPConnectionPool(host='localhost', port=50523): Max retries exceeded with url: /session/1d89412f88f8e3891b3312654d6f6244/url (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x000001D4631B42C0>: Failed to establish a new connection: [WinError 10061] No connection could be made because the target machine actively refused it'))