In [3]:
import time
import json
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.edge.service import Service
from webdriver_manager.microsoft import EdgeChromiumDriverManager

def _extract_product(tile):
    """Collect the product fields found inside one search-result container.

    Each field is looked up independently; a field whose element is missing is
    left out of the result, so an empty dict means nothing was found in *tile*.

    Args:
        tile: A BeautifulSoup tag to search within.

    Returns:
        A dict with any of the keys 'product', 'price', 'ratings', 'reviews',
        all holding stripped strings.
    """
    details = {}

    name_tag = tile.find('span', class_='a-size-base-plus a-color-base a-text-normal')
    if name_tag is not None:
        details['product'] = name_tag.text.strip()

    price_tag = tile.find('span', class_='a-offscreen')
    if price_tag is not None:
        # Drop the dollar sign and thousands separators, e.g. "$1,299.00" -> "1299.00".
        price_text = price_tag.text.strip()
        details['price'] = price_text.replace("$", "").replace(",", "").strip()

    rating_tag = tile.find('span', class_='a-icon-alt')
    if rating_tag is not None:
        # Keep only the first space-separated token
        # (presumably the numeric score of "4.5 out of 5 stars" - TODO confirm).
        details['ratings'] = rating_tag.text.strip().split(" ")[0]

    # NOTE(review): 'a-size-base' is a generic class, so the first matching span
    # may not be the review count - confirm against the live markup.
    reviews_tag = tile.find('span', class_='a-size-base')
    if reviews_tag is not None:
        details['reviews'] = reviews_tag.text.strip().split(" ")[0].strip()

    return details


def scrape_amazon(
    *,
    base_url: str = 'https://www.amazon.com/s?k=all+items',
    first_page: int = 1,
    last_page: int = 49,
    page_load_delay: float = 5,
) -> str:
    """Scrape product details from a range of search-result pages.

    Opens a headless Edge browser, loads each results page in turn, parses the
    rendered HTML with BeautifulSoup and gathers whatever product fields can be
    found in every 'sg-col-inner' container.

    Args:
        base_url: Search URL. It must already contain a query string, because
            the page number is appended as '&page=<n>'.
        first_page: First page number to fetch (inclusive).
        last_page: Last page number to fetch (inclusive).
        page_load_delay: Seconds to sleep after each navigation before the
            page source is read.

    Returns:
        A JSON string encoding a list of dicts, one per container that yielded
        at least one field.
    """
    # Setup driver
    options = webdriver.EdgeOptions()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    driver = webdriver.Edge(
        service=Service(EdgeChromiumDriverManager().install()),
        options=options,
    )

    products = []
    try:
        for page in range(first_page, last_page + 1):
            # Navigate to the search results page
            driver.get(f'{base_url}&page={page}')
            time.sleep(page_load_delay)  # fixed wait for the page to load

            soup = BeautifulSoup(driver.page_source, 'html.parser')
            # NOTE(review): find_all searches recursively, so if 'sg-col-inner'
            # divs are nested the same product may be captured more than once -
            # confirm against the live markup.
            for tile in soup.find_all('div', class_='sg-col-inner'):
                details = _extract_product(tile)
                # Only keep containers that produced some data
                if details:
                    products.append(details)
    finally:
        # Always shut the browser down, even when a page load or parse fails,
        # so no headless Edge process is left behind.
        driver.quit()

    # Output the result as JSON
    return json.dumps(products)

if __name__ == '__main__':
    # Run the scraper, then decode its JSON string into a list of dicts so the
    # data is available for further processing.
    scraped_json = scrape_amazon()
    amazon_data = json.loads(scraped_json)

    # Persist the decoded data as JSON on disk.
    with open('amazon_data.json', 'w') as output_file:
        output_file.write(json.dumps(amazon_data))
