In [18]:
import time
import json
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.edge.service import Service
from webdriver_manager.microsoft import EdgeChromiumDriverManager

def scrape_amazon(categories):
    # Setup driver
    options = webdriver.EdgeOptions()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    driver = webdriver.Edge(service=Service(EdgeChromiumDriverManager().install()), options=options)

    # Extract product details from each category
    all_products = []
    for category, base_url in categories.items():
        products = []
        for page in range (1,3):
            # Navigate to the search results page
            url = f'{base_url}&page={page}'
            driver.get(url)
            time.sleep(5) # wait for page to load

            # Extract product details
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            for product in soup.find_all('div', class_='sg-col-inner'): 
                product_dict = {}

                item_name = product.find('span', class_='a-size-base-plus a-color-base a-text-normal')
                if item_name is not None:
                    product_dict['product'] = item_name.text.strip()

                product_price = product.find('span', class_='a-offscreen')
                if product_price is not None:
                    product_price = product_price.text.strip()
                    product_dict['price'] = product_price.replace("$", "").replace(",", "").strip()

                rating = product.find('span', class_='a-icon-alt')
                if rating is not None:
                    product_dict['ratings'] = rating.text.strip().split(" ")[0]

                item_reviews = product.find('span', class_='a-size-base')
                if item_reviews is not None:
                    reviews_text = item_reviews.text.strip()
                    reviews_count = reviews_text.split(" ")[0]
                    product_dict['reviews'] = reviews_count.strip()

                # Add category to product_dict
                product_dict['category'] = category

                # Only append product_dict if it has some data
                if product_dict:
                    products.append(product_dict)

        all_products.extend(products)

    # Close the browser
    driver.quit()

    # Output the result as JSON
    return json.dumps(all_products)

if __name__ == '__main__':
  # Define the categories and their base URLs
  categories = {
    'electronics': 'https://www.amazon.com/s?rh=n%3A16225009011&fs=true&ref=lp_16225009011_sar',
    'Computers': 'https://www.amazon.com/s?rh=n%3A16225007011&fs=true&ref=lp_16225007011_sar',
    'video_games': 'https://www.amazon.com/s?rh=n%3A16225016011&fs=true&ref=lp_16225016011_sar',
    'boys_clothing_8-20':'https://www.amazon.com/s?i=specialty-aps&bbn=16225021011&rh=n%3A16225021011%2Cn%3A1040666%2Cp_n_size_six_browse-vebin%3A4940401011&dc&fst=as%3Aoff&pf_rd_i=16225021011&pf_rd_m=ATVPDKIKX0DER&pf_rd_p=d84623b2-8aff-40df-9701-224067aef31e&pf_rd_r=BRC30MHTYX0XAFZA3N94&pf_rd_s=merchandised-search-3&pf_rd_t=101&qid=1511397964&rnid=49403&ref=s9_acss_bw_cg_AEGFVN2E_1a1_w',
    'boys_clothing_2-7':'https://www.amazon.com/s?i=specialty-aps&bbn=16225021011&rh=n%3A16225021011%2Cn%3A1040666%2Cp_n_size_six_browse-vebin%3A4940400011&dc&fst=as%3Aoff&pf_rd_i=16225021011&pf_rd_m=ATVPDKIKX0DER&pf_rd_p=d84623b2-8aff-40df-9701-224067aef31e&pf_rd_r=BRC30MHTYX0XAFZA3N94&pf_rd_s=merchandised-search-3&pf_rd_t=101&qid=1511397964&rnid=49&ref=s9_acss_bw_cg_AEGFVN2E_1b1_w',
    'boys_clothing_ 0-24M':'https://www.amazon.com/s?i=specialty-aps&bbn=16225005011&rh=n%3A%2116225005011%2Cn%3A7147444011%2Cn%3A7628013011&dc&fst=as%3Aoff&pf_rd_i=16225021011&pf_rd_m=ATVPDKIKX0DER&pf_rd_m=ATVPDKIKX0DER&pf_rd_p=d84623b2-8aff-40df-9701-224067aef31e&pf_rd_r=BRC30MHTYX0XAFZA3N94&pf_rd_s=merchandised-sear&pf_rd_s=merchandised-search-3&pf_rd_t=101&qid=1510445001&rnid=7147444011&ref=s9_acss_bw_cg_AEGFVN2E_1c1_w',
    'girls_clothing_7-16':'https://www.amazon.com/s?bbn=16225020011&rh=n%3A7141123011%2Cn%3A16225020011%2Cn%3A1040664%2Cp_n_size_six_browse-vebin%3A4940398011&dc&fst=as%3Aoff&pf_rd_i=16225020011&pf_rd_m=ATVPDKIKX0DER&pf_rd_p=2a239f2b-0318-4c5d-be33-9cc1f0eed9b3&pf_rd_r=QGPCTNEA3G2MD9BWY49S&pf_rd_s=merchandised-search-3&pf_rd_t=101&qid=1489098061&rnid=4940396011&ref=s9_acss_bw_cg_AEGFVN2E_1a1_w',
    'girls_clothing_2-6X':'https://www.amazon.com/s?bbn=16225020011&rh=n%3A7141123011%2Cn%3A16225020011%2Cn%3A1040664%2Cp_n_size_six_browse-vebin%3A4940397011&dc&fst=as%3Aoff&pf_rd_i=16225020011&pf_rd_m=ATVPDKIKX0DER&pf_rd_p=2a239f2b-0318-4c5d-be33-9cc1f0eed9b3&pf_rd_r=QGPCTNEA3G2MD9BWY49S&pf_rd_s=merchandised-search-3&pf_rd_t=101&qid=1489098061&rnid=4940396011&ref=s9_acss_bw_cg_AEGFVN2E_1b1_w',
    'girls_clothing_0-24M':'https://www.amazon.com/s?keywords=Baby+Girls%27+Clothing+%26+Shoes&bbn=7628012011&rh=n%3A7141123011%2Cn%3A7147444011%2Cn%3A7628012011%2Cp_n_shipping_option-bin%3A3242350011&dc&c=ts&ts_id=7628012011&ref=s9_acss_bw_cg_AEGFVN2E_1c1_w',
    

    # Add more categories as needed
  }

  # Load the JSON output string into a Python List of dictionaries for further processing
  amazon_data = json.loads(scrape_amazon(categories))

  # Save the JSON data to a file
  with open('amazon_data_cat.json', 'w') as file:
    json.dump(amazon_data, file)


In [16]:

import time
import json
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.edge.service import Service
from webdriver_manager.microsoft import EdgeChromiumDriverManager

def scrape_amazon(categories):
    # Setup driver
    options = webdriver.EdgeOptions()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    driver = webdriver.Edge(service=Service(EdgeChromiumDriverManager().install()), options=options)

    # Extract product details from each category
    all_products = []
    for category, base_url in categories.items():
        products = []
        for page in range (1,2):
            # Navigate to the search results page
            url = f"{base_url}&page={page}"
            driver.get(url)
            time.sleep(5) # wait for page to load

            # Extract product details
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            for product in soup.find_all('div', class_='sg-col-inner'): 
                product_dict = {}

                item_name = product.find('span', class_='a-size-base-plus a-color-base a-text-normal')
                if review_responder_population is not None:
                    product_dict['product'] = review_responder_population.text.strip()

                product_price = product.find('span', class_='a-offscreen')
                if product_price is not None:
                    product_price = product_price.text.strip()
                    product_dict['price'] = product_price.replace("$", "").replace(",", "").strip()

                rating = product.find('span', class_='a-icon-alt')
                if rating is not None:
                    product_dict['ratings'] = rating.text.strip().split(" ")[0]

                review_responder_population = product.find('span', class_='a-size-base s-underline-text')
                if review_responder_population is not None:
                    review_responders = review_responder_population.text.strip().split(" ")[0]
                else:
                    review_responders = None

                product_dict['review_responders'] = review_responders


                item_reviews = product.find('span', class_='a-size-base')
                if item_reviews is not None:
                    reviews_text = item_reviews.text.strip()
                    reviews_count = reviews_text.split(" ")[0]
                    product_dict['reviews'] = reviews_count.strip()

                # Add category to product_dict
                product_dict['category'] = category

                # Only append product_dict if it has some data
                if product_dict:
                    products.append(product_dict)

        all_products.extend(products)

    # Close the browser
    driver.quit()

    # Output the result as JSON
    return json.dumps(all_products)

if __name__ == '__main__':
  # Define the categories and their base URLs
  categories = {

    'Smartphones': 'https://www.amazon.com/s?k=smartphone&crid=3L33Q517U80C6&sprefix=smartphone%2Caps%2C934&ref=nb_sb_noss_1',
    'Laptops': 'https://www.amazon.com/s?k=Laptop&crid=3GD9HX5GPNA0R&sprefix=laptop%2Caps%2C569&ref=nb_sb_noss_1',
    'video_games': 'https://www.amazon.com/s?rh=n%3A16225016011&fs=true&ref=lp_16225016011_sar',
    'Dresses':'https://www.amazon.com/s?k=dress&crid=1ZKY6X4I4VARF&sprefix=dress%2Caps%2C1200&ref=nb_sb_noss_1',
    'Shoes':'https://www.amazon.com/s?k=shoes&crid=BWUHHU3UQ38H&sprefix=shoes%2Caps%2C346&ref=nb_sb_noss_1',
    'Accessories':'https://www.amazon.com/s?k=accessories+for+clothes&crid=17CJNG61JS7OX&sprefix=accessories+for+clo%2Caps%2C750&ref=nb_sb_ss_ts-doa-p_3_19', # Accessories for Clothes


    # Add more categories as needed
  }

  # Load the JSON output string into a Python List of dictionaries for further processing
  amazon_data = json.loads(scrape_amazon(categories))

  # Save the JSON data to a file
  with open('amazon_data_cat.json', 'w') as file:
    json.dump(amazon_data, file)



In [38]:
import time
import json
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.edge.service import Service
from selenium.common.exceptions import NoSuchElementException
from webdriver_manager.microsoft import EdgeChromiumDriverManager


def scrape_amazon(categories):
    # Setup driver
    options = webdriver.EdgeOptions()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    driver = webdriver.Edge(service=Service(EdgeChromiumDriverManager().install()), options=options)

    # Extract product details from each category
    all_products = []

    for category, base_url in categories.items():
        products = []

        # Iterate through pages for this category
        for page in range(1, 3):

            url = f"{base_url}&page={page}"
            driver.get(url)
            time.sleep(5) # wait for page to load

            # Extract product details
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            for product in soup.find_all('div', class_='sg-col-inner'):
                product_dict = {}

                # Item name
                item_name = product.find('span', class_='a-size-base-plus a-color-base a-text-normal')

                # If item name is None, then it is a mouse pad
                if item_name is None:
                    item_name = product.find('span', class_='a-size-medium a-color-base a-text-normal')

                if item_name is not None:
                    product_dict['product'] = item_name.text.strip()

                  

                # Item price
                product_price = product.find('span', class_='a-offscreen')
                rating = product.find('span', class_='a-icon-alt')
                review_responder_population = product.find('span', class_='a-size-base s-underline-text')
                item_reviews = product.find('span', class_='a-size-base')

                # Check if the product_price variable is None before adding it to the product_dict
                if product_price is not None:
                    product_price = product_price.text.strip()
                    product_dict['price'] = product_price.replace("$", "").replace(",", "").strip()

                # Check if the rating variable is None before adding it to the product_dict
                if rating is not None:
                    product_dict['ratings'] = rating.text.strip().split(" ")[0]

                # Check if the review_responder_population variable is None before adding it to the product_dict
                if review_responder_population is not None:
                    review_responders = review_responder_population.text.strip().split(" ")[0]
                    product_dict['review_responders'] = review_responders

                # Check if the item_reviews variable is None before adding it to the product_dict
                if item_reviews is not None:
                    reviews_text = item_reviews.text.strip()
                    reviews_count = reviews_text.split(" ")[0]
                    product_dict['reviews'] = reviews_count.strip()

                # Add category to product_dict
                product_dict['category'] = category

                # Only append product_dict if it has some data
                if len(product_dict.keys()) > 0:
                    products.append(product_dict)

        all_products.extend(products)

    # Close the browser
    driver.quit()

    # Output the result as JSON
    return json.dumps(all_products)


if __name__ == '__main__':

  # Define the categories and their base URLs

  categories = {

    'Smartphones': 'https://www.amazon.com/s?k=smartphone&crid=3L33Q517U80C6&sprefix=smartphone%2Caps%2C934&ref=nb_sb_noss_1',
    'Laptops': 'https://www.amazon.com/s?k=Laptop&crid=3GD9HX5GPNA0R&sprefix=laptop%2Caps%2C569&ref=nb_sb_noss_1',
    'video_games': 'https://www.amazon.com/s?rh=n%3A16225016011&fs=true&ref=lp_16225016011_sar',
    'Dresses':'https://www.amazon.com/s?k=dress&crid=1ZKY6X4I4VARF&sprefix=dress%2Caps%2C1200&ref=nb_sb_noss_1',
    'Shoes':'https://www.amazon.com/s?k=shoes&crid=BWUHHU3UQ38H&sprefix=shoes%2Caps%2C346&ref=nb_sb_noss_1',
    'Accessories':'https://www.amazon.com/s?k=accessories+for+clothes&crid=17CJNG61JS7OX&sprefix=accessories+for+clo%2Caps%2C750&ref=nb_sb_ss_ts-doa-p_3_19', # Accessories for Clothes

    # Add more categories as needed
  }

  # Load the JSON output string into a Python List of dictionaries for further processing
  amazon_data = json.loads(scrape_amazon(categories))

  # Save the JSON data to a file
  with open('amazon_data_cat.json', 'w') as file:
    json.dump(amazon_data, file)



In [37]:
import time
import json
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.edge.service import Service
from selenium.common.exceptions import NoSuchElementException
from webdriver_manager.microsoft import EdgeChromiumDriverManager

def setup_driver():
    options = webdriver.EdgeOptions()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    return webdriver.Edge(service=Service(EdgeChromiumDriverManager().install()), options=options)

def extract_product_details(product, category, seen_products):
    product_dict = {
        'uuid': None,
        'product': None,
        'price': None,
        'ratings': None,
        'reviews': None,
        'review_responders': None,
        'category': category,
    }

    # Item ID
    item_id = product.find('div', class_='data_uuid')
    if item_id is not None:
        product_dict['uuid'] = item_id.get('data-uuid')

    # Item name
    item_name = product.find('span', class_='a-size-base-plus a-color-base a-text-normal') or product.find('span', class_='a-size-medium a-color-base a-text-normal')
    if item_name is not None:
        product_name = item_name.text.strip()
        if product_name in seen_products:
            return None
        seen_products.add(product_name)
        product_dict['product'] = product_name

    # Item price
    product_price = product.find('span', class_='a-offscreen')
    if product_price is not None:
        product_price = product_price.text.strip()
        product_dict['price'] = product_price.replace("$", "").replace(",", "").strip()

    rating = product.find('span', class_='a-icon-alt')
    review_responder_population = product.find('span', class_='a-size-base s-underline-text')
    item_reviews = product.find('span', class_='a-size-base')

    # Check if the rating variable is None before adding it to the product_dict
    if rating is not None:
        product_dict['ratings'] = rating.text.strip().split(" ")[0]

    # Check if the review_responder_population variable is None before adding it to the product_dict
    if review_responder_population is not None:
        review_responders = review_responder_population.text.strip().split(" ")[0]
        product_dict['review_responders'] = review_responders

    # Check if the item_reviews variable is None before adding it to the product_dict
    if item_reviews is not None:
        reviews_text = item_reviews.text.strip()
        reviews_count = reviews_text.split(" ")[0]
        product_dict['reviews'] = reviews_count.strip()

    return product_dict

def scrape_amazon(categories, max_pages=2):
    driver = setup_driver()
    all_products = []
    seen_products = set()

    for category, base_url in categories.items():
        products = []
        for page in range(1, max_pages + 1):
            url = f"{base_url}&page={page}"
            driver.get(url)
            time.sleep(5) # wait for page to load

            soup = BeautifulSoup(driver.page_source, 'html.parser')
            next_page_button = soup.find('li', class_='a-last')
            if next_page_button and next_page_button.find('a') is None:
                break

            for product in soup.find_all('div', class_='sg-col-inner'):
                product_dict = extract_product_details(product, category, seen_products)
                if product_dict:
                    products.append(product_dict)


        all_products.extend(products)

    driver.quit()
    return json.dumps(all_products)

if __name__ == '__main__':
  categories = {

    'Smartphones': 'https://www.amazon.com/s?k=smartphone&crid=3L33Q517U80C6&sprefix=smartphone%2Caps%2C934&ref=nb_sb_noss_1',
    'Laptops': 'https://www.amazon.com/s?k=Laptop&crid=3GD9HX5GPNA0R&sprefix=laptop%2Caps%2C569&ref=nb_sb_noss_1',
    'video_games': 'https://www.amazon.com/s?rh=n%3A16225016011&fs=true&ref=lp_16225016011_sar',
    'Dresses':'https://www.amazon.com/s?k=dress&crid=1ZKY6X4I4VARF&sprefix=dress%2Caps%2C1200&ref=nb_sb_noss_1',
    'Shoes':'https://www.amazon.com/s?k=shoes&crid=BWUHHU3UQ38H&sprefix=shoes%2Caps%2C346&ref=nb_sb_noss_1',
    'Accessories':'https://www.amazon.com/s?k=accessories+for+clothes&crid=17CJNG61JS7OX&sprefix=accessories+for+clo%2Caps%2C750&ref=nb_sb_ss_ts-doa-p_3_19', # Accessories for Clothes

    # Add more categories as needed
  }

    # Load the JSON output string into a Python List of dictionaries for further processing
  amazon_data = json.loads(scrape_amazon(categories))

  # Save the JSON data to a file
  with open('amazon_data_cat.json', 'w') as file:
    json.dump(amazon_data, file)


In [69]:
from selenium import webdriver 
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import time
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

options = Options()
options.headless = True

driver = webdriver.Chrome(options=options)

driver.set_page_load_timeout(30) 
driver.implicitly_wait(20)

from selenium.common.exceptions import TimeoutException

def scrape_amazon(categories):

    options = webdriver.ChromeOptions()

    # Run Chrome in headless mode
    options.add_argument('--headless')

    # Set a higher timeout value
    options.add_argument('--timeout=10')

    # Install latest ChromeDriver and get its path
    webdriver_path = ChromeDriverManager().install()

    # Use the installed webdriver_path
    driver = webdriver.Chrome(service=Service(webdriver_path), options=options)

    all_products = []

    for category, base_url in categories.items():
        products = []
        
        # Iterate through pages for this category
        for page in range(1, 2):
            
            url = f"{base_url}&page={page}"
            try:
                driver.get(url)
                time.sleep(5) # wait for page to load
            except TimeoutException:
                print(f"Timeout occurred while loading {url}. Skipping this page.")
                continue

            # Extract product details
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            for product in soup.find_all('div', class_='sg-col-inner'): 
                product_dict = {}

                # Item name
                item_name = product.find('span', class_='a-size-base-plus a-color-base a-text-normal')
                item_price = product.find('span', class_='a-offscreen')

                # If item name is None, then it is a mouse pad
                if item_name is None:
                    item_name = product.find('span', class_='a-size-medium a-color-base a-text-normal')
                    product_dict['product'] = item_name.text.strip()

                # If item price is None, then it is not a mouse pad
                if item_price is not None:
                    product_price = item_price.text.strip()
                    product_dict['price'] = product_price.replace("$", "").replace(",", "").strip()

                rating = product.find('span', class_='a-icon-alt')
                if rating is not None:
                    product_dict['ratings'] = rating.text.strip().split(" ")[0]

                review_responder_population = product.find('span', class_='a-size-base s-underline-text')
                
                if review_responder_population is not None:
                    review_responders = review_responder_population.text.strip().split(" ")[0]
                else:
                    review_responders = None


                try:
                    review_responder_population = product.find('span', class_='a-size-base s-underline-text')
                    review_responders = review_responder_population.text.strip().split(" ")[0]
                except:
                    review_responders = None

                product_dict['review_responders'] = review_responders


                item_reviews = product.find('span', class_='a-size-base')
                if item_reviews is not None:
                    reviews_text = item_reviews.text.strip()
                    reviews_count = reviews_text.split(" ")[0]
                    product_dict['reviews'] = reviews_count.strip()

                # Add category to product_dict
                product_dict['category'] = category

                # Only append product_dict if it has some data
                if product_dict:
                    products.append(product_dict)

        all_products.extend(products)

    # Close the browser
    driver.quit()

    # Output the result as JSON
    return json.dumps(all_products)

if __name__ == '__main__':
  # Define the categories and their base URLs

  categories = {
    'Smartphones': 'https://www.amazon.com/s?k=smartphone&crid=3L33Q517U80C6&sprefix=smartphone%2Caps%2C934&ref=nb_sb_noss_1',
    'Laptops': 'https://www.amazon.com/s?k=Laptop&crid=3GD9HX5GPNA0R&sprefix=laptop%2Caps%2C569&ref=nb_sb_noss_1',
    'video_games': 'https://www.amazon.com/s?rh=n%3A16225016011&fs=true&ref=lp_16225016011_sar',
    'Dresses':'https://www.amazon.com/s?k=dress&crid=1ZKY6X4I4VARF&sprefix=dress%2Caps%2C1200&ref=nb_sb_noss_1',
    'Shoes':'https://www.amazon.com/s?k=shoes&crid=BWUHHU3UQ38H&sprefix=shoes%2Caps%2C346&ref=nb_sb_noss_1',
    'Accessories':'https://www.amazon.com/s?k=accessories+for+clothes&crid=17CJNG61JS7OX&sprefix=accessories+for+clo%2Caps%2C750&ref=nb_sb_ss_ts-doa-p_3_19', # Accessories for Clothes

    # Add more categories as needed
  }

  # Load the JSON output string into a Python List of dictionaries for further processing
  amazon_data = json.loads(scrape_amazon(categories))

  # Save the JSON data to a file
  with open('amazon_data_cat.json', 'w') as file:
    json.dump(amazon_data, file)



ConnectionError: Could not reach host. Are you offline?

In [79]:
# Load the JSON data into a pandas DataFrame
with open('amazon_data_cat.json') as f:
    data = json.load(f)

df = pd.DataFrame(data)

# Now, print out a few lines of the DataFrame to check if 'product' key is present
print(df.head())


      category                                            product  price  \
0  Smartphones                                                NaN    NaN   
1  Smartphones                                                NaN    NaN   
2  Smartphones                                                NaN    NaN   
3  Smartphones                                                NaN    NaN   
4  Smartphones  Google Pixel 4a - Unlocked Android Smartphone ...  99.99   

  ratings review_responders   reviews  
0     NaN               NaN       NaN  
1     NaN               NaN       NaN  
2     NaN               NaN       NaN  
3     NaN               NaN       NaN  
4     4.4               163  Typical:  


In [53]:
import time
import json
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.edge.service import Service
from selenium.common.exceptions import NoSuchElementException
from webdriver_manager.microsoft import EdgeChromiumDriverManager

def scrape_amazon(categories):
    # Setup driver
    options = webdriver.EdgeOptions()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    driver = webdriver.Edge(service=Service(EdgeChromiumDriverManager().install()), options=options)

    # Extract product details from each category
    all_products = []
    seen_products = set()  # Keep track of products we've already seen

    for category, base_url in categories.items():
        products = []

        # Iterate through pages for this category
        for page in range(1, 5):

            url = f"{base_url}&page={page}"
            driver.get(url)
            time.sleep(5) # wait for page to load

            # Extract product details
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            for product in soup.find_all('div', class_='sg-col-inner'):
                product_dict = {}

                # Item name
                item_name = product.find('span', class_='a-size-base-plus a-color-base a-text-normal')

                # If item name is None, then it is a mouse pad
                if item_name is None:
                    item_name = product.find('span', class_='a-size-medium a-color-base a-text-normal')

                if item_name is not None:
                    product_name = item_name.text.strip()

                    # Skip this product if we've already seen it
                    if product_name in seen_products:
                        continue

                    seen_products.add(product_name)
                    product_dict['product'] = product_name

                # Item price
                product_price = product.find('span', class_='a-offscreen')
                rating = product.find('span', class_='a-icon-alt')
                review_responder_population = product.find('span', class_='a-size-base s-underline-text')
                item_reviews = product.find('span', class_='a-size-base')

                # Check if the product_price variable is None before adding it to the product_dict
                if product_price is not None:
                    product_price = product_price.text.strip()
                    product_dict['price'] = product_price.replace("$", "").replace(",", "").strip()

                # Check if the rating variable is None before adding it to the product_dict
                if rating is not None:
                    product_dict['ratings'] = rating.text.strip().split(" ")[0]

                # Check if the review_responder_population variable is None before adding it to the product_dict
                if review_responder_population is not None:
                    review_responders = review_responder_population.text.strip().split(" ")[0]
                    product_dict['review_responders'] = review_responders

                # Check if the item_reviews variable is None before adding it to the product_dict
                if item_reviews is not None:
                    reviews_text = item_reviews.text.strip()
                    reviews_count = reviews_text.split(" ")[0]
                    product_dict['reviews'] = reviews_count.strip()

                # Add category to product_dict
                product_dict['category'] = category

                # Only append product_dict if it has some data
                if len(product_dict.keys()) > 0:
                    products.append(product_dict)

        all_products.extend(products)

    # Close the browser
    driver.quit()

    # Output the result as JSON
    return json.dumps(all_products)


if __name__ == '__main__':
  categories = {
    'Smartphones': 'https://www.amazon.com/s?k=smartphone&crid=3L33Q517U80C6&sprefix=smartphone%2Caps%2C934&ref=nb_sb_noss_1',
    'Laptops': 'https://www.amazon.com/s?k=Laptop&crid=3GD9HX5GPNA0R&sprefix=laptop%2Caps%2C569&ref=nb_sb_noss_1',
    'video_games': 'https://www.amazon.com/s?rh=n%3A16225016011&fs=true&ref=lp_16225016011_sar',
    'Dresses':'https://www.amazon.com/s?k=dress&crid=1ZKY6X4I4VARF&sprefix=dress%2Caps%2C1200&ref=nb_sb_noss_1',
    'Shoes':'https://www.amazon.com/s?k=shoes&crid=BWUHHU3UQ38H&sprefix=shoes%2Caps%2C346&ref=nb_sb_noss_1',
    'Accessories':'https://www.amazon.com/s?k=accessories+for+clothes&crid=17CJNG61JS7OX&sprefix=accessories+for+clo%2Caps%2C750&ref=nb_sb_ss_ts-doa-p_3_19', # Accessories for Clothes
    # Add more categories as needed
  }

  amazon_data = json.loads(scrape_amazon(categories))

  # Save the JSON data to a file
  with open('amazon_data_cat.json', 'w') as file:
   json.dump(amazon_data, file)





In [66]:
import time
import json
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.edge.service import Service
from selenium.common.exceptions import NoSuchElementException
from webdriver_manager.microsoft import EdgeChromiumDriverManager

def scrape_amazon(categories):
    # Setup driver
    options = webdriver.EdgeOptions()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    driver = webdriver.Edge(service=Service(EdgeChromiumDriverManager().install()), options=options)

    # Extract product details from each category
    all_products = []

    for category, base_url in categories.items():
        products = []

        # Iterate through pages for this category
        for page in range(1, 5):

            url = f"{base_url}&page={page}"
            driver.get(url)
            time.sleep(5) # wait for page to load

            # Extract product details
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            for product in soup.find_all('div', class_='sg-col-inner'):
                product_dict = {}

                # Item name
                item_name = product.find('span', class_='a-size-base-plus a-color-base a-text-normal')

                # If item name is None, then it is a mouse pad
                if item_name is None:
                    item_name = product.find('span', class_='a-size-medium a-color-base a-text-normal')

                if item_name is not None:
                    product_dict['product'] = item_name.text.strip()

                # Item price
                product_price = product.find('span', class_='a-offscreen')
                rating = product.find('span', class_='a-icon-alt')

                
                item_reviews = product.find('span', class_='a-size-base')

                # Check if the product_price variable is None before adding it to the product_dict
                if product_price is not None:
                    product_price = product_price.text.strip()
                    product_dict['price'] = product_price.replace("$", "").replace(",", "").strip()

                # Check if the rating variable is None before adding it to the product_dict
                if rating is not None:
                    product_dict['ratings'] = rating.text.strip().split(" ")[0]

                # Check if the review_responder_population variable is None before adding it to the product_dict
                review_responder_population = product.find('span', class_='a-size-base s-underline-text')
                if review_responder_population is not None:
                    review_responders = review_responder_population.text.strip().split(" ")[0]
                    product_dict['review_responders'] = review_responders

                # Extract number of ratings
                num_ratings = product.find('span', 'a-size-base')
                if num_ratings is not None:
                    product_dict['num_ratings'] = num_ratings['aria-label'].split()[0]

                # If item name is None, then it is a mouse pad
                # Find review_responders
                review_responder_population_1 = product.find('span', class_='a-size-base s-underline-text')
                review_responder_population_2 = product.find('span', 'a-size-base')

                if review_responder_population_1 is not None:
                    review_responders1 = review_responder_population_1.text.strip().split(" ")[0]
                    if review_responders1 > 0:
                     product_dict['review_responders'] = review_responders1
                    else:
                        review_responders2 = review_responder_population_2['aria-label'].split()[0]
                        product_dict['review_responders'] = review_responders2
                    
    
                # Check if the item_reviews variable is None before adding it to the product_dict
                if item_reviews is not None:
                    reviews_text = item_reviews.text.strip()
                    reviews_count = reviews_text.split(" ")[0]
                    product_dict['reviews'] = reviews_count.strip()

                # Add category to product_dict
                product_dict['category'] = category

                # Only append product_dict if it has some data
                if len(product_dict.keys()) > 0:
                    products.append(product_dict)

        all_products.extend(products)

    # Close the browser
    driver.quit()

    # Output the result as JSON
    return json.dumps(all_products)


if __name__ == '__main__':
  categories = {
    'Smartphones': 'https://www.amazon.com/s?k=smartphone&crid=3L33Q517U80C6&sprefix=smartphone%2Caps%2C934&ref=nb_sb_noss_1',
    'Laptops': 'https://www.amazon.com/s?k=Laptop&crid=3GD9HX5GPNA0R&sprefix=laptop%2Caps%2C569&ref=nb_sb_noss_1',
    'video_games': 'https://www.amazon.com/s?rh=n%3A16225016011&fs=true&ref=lp_16225016011_sar',
    'Dresses':'https://www.amazon.com/s?k=dress&crid=1ZKY6X4I4VARF&sprefix=dress%2Caps%2C1200&ref=nb_sb_noss_1',
    'Shoes':'https://www.amazon.com/s?k=shoes&crid=BWUHHU3UQ38H&sprefix=shoes%2Caps%2C346&ref=nb_sb_noss_1',
    'Accessories':'https://www.amazon.com/s?k=accessories+for+clothes&crid=17CJNG61JS7OX&sprefix=accessories+for+clo%2Caps%2C750&ref=nb_sb_ss_ts-doa-p_3_19', # Accessories for Clothes

  }

  amazon_data = json.loads(scrape_amazon(categories))

  # Save the JSON data to a file
  with open('amazon_data_cat.json', 'w') as file:
   json.dump(amazon_data, file)
# BAD

In [95]:
import time
import json
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.edge.service import Service
from selenium.common.exceptions import NoSuchElementException
from webdriver_manager.microsoft import EdgeChromiumDriverManager

def scrape_amazon(categories):
    # Setup driver
    options = webdriver.EdgeOptions()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    driver = webdriver.Edge(service=Service(EdgeChromiumDriverManager().install()), options=options)

    # Extract product details from each category
    all_products = []

    for category, base_url in categories.items():
        products = []

        # Iterate through pages for this category
        for page in range(1, 5):

            url = f"{base_url}&page={page}"
            driver.get(url)
            time.sleep(5) # wait for page to load

            # Extract product details
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            for product in soup.find_all('div', class_='sg-col-inner'):
                product_dict = {}

                # Item name
                item_name = product.find('span', class_='a-size-base-plus a-color-base a-text-normal')

                # If item name is None, then it is a mouse pad
                if item_name is None:
                    item_name = product.find('span', class_='a-size-medium a-color-base a-text-normal')

                if item_name is not None:
                    product_dict['product'] = item_name.text.strip()

                # Item price
                product_price = product.find('span', class_='a-offscreen')
                rating = product.find('span', class_='a-icon-alt')
                review_responder_population = product.find('span', class_='a-size-base s-underline-text')
                item_reviews = product.find('span', class_='a-size-base')

                # Check if the product_price variable is None before adding it to the product_dict
                if product_price is not None:
                    product_price = product_price.text.strip()
                    product_dict['price'] = product_price.replace("$", "").replace(",", "").strip()

                # Check if the rating variable is None before adding it to the product_dict
                if rating is not None:
                    product_dict['ratings'] = rating.text.strip().split(" ")[0]

                # Check if the review_responder_population variable is None before adding it to the product_dict
                if review_responder_population is not None:
                    review_responders = review_responder_population.text.strip().split(" ")[0]
                    product_dict['review_responders'] = review_responders

                # Check if the item_reviews variable is None before adding it to the product_dict
                if item_reviews is not None:
                    reviews_text = item_reviews.text.strip()
                    reviews_count = reviews_text.split(" ")[0]
                    product_dict['reviews'] = reviews_count.strip()

                # Add category to product_dict
                product_dict['category'] = category

                # Only append product_dict if it has some data
                if len(product_dict.keys()) > 0:
                    products.append(product_dict)

        all_products.extend(products)

    # Close the browser
    driver.quit()

    # Output the result as JSON
    return json.dumps(all_products)

if __name__ == '__main__':
  categories = {
    'Smartphones': 'https://www.amazon.com/s?k=smartphone&crid=3L33Q517U80C6&sprefix=smartphone%2Caps%2C934&ref=nb_sb_noss_1',
    'Laptops': 'https://www.amazon.com/s?k=Laptop&crid=3GD9HX5GPNA0R&sprefix=laptop%2Caps%2C569&ref=nb_sb_noss_1',
    'video_games': 'https://www.amazon.com/s?rh=n%3A16225016011&fs=true&ref=lp_16225016011_sar',
    'Dresses':'https://www.amazon.com/s?k=dress&crid=1ZKY6X4I4VARF&sprefix=dress%2Caps%2C1200&ref=nb_sb_noss_1',
    'Shoes':'https://www.amazon.com/s?k=shoes&crid=BWUHHU3UQ38H&sprefix=shoes%2Caps%2C346&ref=nb_sb_noss_1',
    'Accessories':'https://www.amazon.com/s?k=accessories+for+clothes&crid=17CJNG61JS7OX&sprefix=accessories+for+clo%2Caps%2C750&ref=nb_sb_ss_ts-doa-p_3_19', # Accessories for Clothes

  }

  amazon_data = json.loads(scrape_amazon(categories))

  # Save the JSON data to a file
  with open('amazon_data_cat.json', 'w') as file:
   json.dump(amazon_data, file)
# GOOD

In [110]:
import time
import json
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.edge.service import Service
from selenium.common.exceptions import NoSuchElementException
from webdriver_manager.microsoft import EdgeChromiumDriverManager

def scrape_amazon(categories):
    # Setup driver
    options = webdriver.EdgeOptions()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    driver = webdriver.Edge(service=Service(EdgeChromiumDriverManager().install()), options=options)

    # Extract product details from each category
    all_products = []

    for category, base_url in categories.items():
        products = []

        # Iterate through pages for this category
        for page in range(1, 5):

            url = f"{base_url}&page={page}"
            driver.get(url)
            time.sleep(5) # wait for page to load

            # Extract product details
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            for product in soup.find_all('div', class_='sg-col-inner'):
                product_dict = {}

                # Item name
                item_name = product.find('span', class_='a-size-base-plus a-color-base a-text-normal')

                # If item name is None, then it is a mouse pad
                if item_name is None:
                    item_name = product.find('span', class_='a-size-medium a-color-base a-text-normal')

                if item_name is not None:
                    product_dict['product'] = item_name.text.strip()

                # Item price
                product_price = product.find('span', class_='a-offscreen')
                rating = product.find('span', class_='a-icon-alt')




                # **Modified:** Get the review responder population from the `aria-label` attribute of the span element with class `a-size-base s-underline-text`
                review_responder_population_element = product.find('span', class_='a-size-base s-underline-text')
                if review_responder_population_element is not None and 'aria-label' in review_responder_population_element.attrs:
                    review_responder_population = review_responder_population_element['aria-label']
                else:
                    review_responder_population = None

                item_reviews = product.find('span', class_='a-size-base')
                
                # Check if the product_price variable is None before adding it to the product_dict
                if product_price is not None:
                    product_price = product_price.text.strip()
                    product_dict['price'] = product_price.replace("$", "").replace(",", "").strip()

                # Check if the rating variable is None before adding it to the product_dict
                if rating is not None:
                    product_dict['ratings'] = rating.text.strip().split(" ")[0]

                # **Modified:** Check if the review_responder_population variable is None before adding it to the product_dict
                if review_responder_population is not None:
                    product_dict['review_responders'] = review_responder_population

                # Check if the item_reviews variable is None before adding it to the product_dict
                if item_reviews is not None:
                    reviews_text = item_reviews.text.strip()
                    reviews_count = reviews_text.split(" ")[0]
                    product_dict['reviews'] = reviews_count.strip()

                # Add category to product_dict
                product_dict['category'] = category

                # Only append product_dict if it has some data
                if len(product_dict.keys()) > 0:
                    products.append(product_dict)

        all_products.extend(products)

    # Close the browser
    driver.quit()

    # Output the result as JSON
    return json.dumps(all_products)


if __name__ == '__main__':
  categories = {
    'Smartphones': 'https://www.amazon.com/s?k=smartphone&crid=3L33Q517U80C6&sprefix=smartphone%2Caps%2C934&ref=nb_sb_noss_1',
    'Laptops': 'https://www.amazon.com/s?k=Laptop&crid=3GD9HX5GPNA0R&sprefix=laptop%2Caps%2C569&ref=nb_sb_noss_1',
    'video_games': 'https://www.amazon.com/s?rh=n%3A16225016011&fs=true&ref=lp_16225016011_sar',
    'Dresses':'https://www.amazon.com/s?k=dress&crid=1ZKY6X4I4VARF&sprefix=dress%2Caps%2C1200&ref=nb_sb_noss_1',
    'Shoes':'https://www.amazon.com/s?k=shoes&crid=BWUHHU3UQ38H&sprefix=shoes%2Caps%2C346&ref=nb_sb_noss_1',
    'Accessories':'https://www.amazon.com/s?k=accessories+for+clothes&crid=17CJNG61JS7OX&sprefix=accessories+for+clo%2Caps%2C750&ref=nb_sb_ss_ts-doa-p_3_19', # Accessories for Clothes

  }

  amazon_data = json.loads(scrape_amazon(categories))

  # Save the JSON data to a file
  with open('amazon_data_cat.json', 'w') as file:
   json.dump(amazon_data, file)


# ORIGINAL DB QUERY

In [99]:
import psycopg2
import pandas as pd

# Load the JSON data into a pandas DataFrame
df = pd.read_json('amazon_data_cat.json')

# Remove empty rows
df = df.dropna()

# Create a connection to your PostgreSQL database
conn = psycopg2.connect(
    host="localhost",
    database="postgres",
    user="postgres",
    password="demopass"
)

# Create a cursor object
cur = conn.cursor()

# Create a table in your PostgreSQL database to store the data
create_table_query = """
DROP TABLE IF EXISTS amazon_date;
CREATE TABLE IF NOT EXISTS amazon_data (
    product TEXT NOT NULL,
    price NUMERIC NOT NULL,
    ratings NUMERIC NOT NULL,
    reviews NUMERIC,
    review_responders NUMERIC,
    category TEXT NOT NULL
)
"""
cur.execute(create_table_query)
conn.commit()

# Function to clean and format data
def clean_format_data(row):
    # Remove commas from 'reviews' and 'price' fields
    reviews = str(row['reviews']).replace(',', '')
    review_responders = str(row['review_responders']).replace(',', '')
    price = str(row['price']).replace(',', '')

    # Convert reviews to int, if not possible set to 0
    try:
        reviews = int(reviews)
    except ValueError:
        reviews = 0
    
    # Convert reviews to int, if not possible set to 0
    try:
        review_responders = int(review_responders)
    except ValueError:
        review_responders = 0

    # Convert the ratings value to a float
    ratings = float(row['ratings'])

    # Adapt data to correct format for SQL insertion and remove quotes
    product = psycopg2.extensions.adapt(row['product'].encode('utf-8')).getquoted().decode('utf-8')[1:-1]

    # Convert price to float, if not possible set to 0
    try:
        price = float(price)
    except ValueError:
        price = 0

    category = psycopg2.extensions.adapt(row['category']).getquoted().decode('utf-8')[1:-1]

    return product, price, ratings, reviews, review_responders, category

# Insert the data from the pandas DataFrame into the PostgreSQL table
for index, row in df.iterrows():
    product, price, ratings, reviews, review_responders, category = clean_format_data(row)

    insert_query = """
    INSERT INTO amazon_data (product, price, ratings, reviews, review_responders, category) 
    VALUES (%s, %s, %s, %s, %s, %s)
    """

    cur.execute(insert_query, (product, price, ratings, reviews, review_responders, category))

# Commit the changes and close the connection
conn.commit()
cur.close()
conn.close()


KeyError: 'reviews'

# SOME EMPTY COL NO EMPTY ROWS


In [77]:
import psycopg2
import pandas as pd

# Load the JSON data into a pandas DataFrame
df = pd.read_json('amazon_data_cat.json')

# Remove empty rows
df = df.dropna()

# Create a connection to your PostgreSQL database
conn = psycopg2.connect(
    host="localhost",
    database="postgres",
    user="postgres",
    password="demopass"
)

# Create a cursor object
cur = conn.cursor()

# Create a table in your PostgreSQL database to store the data
create_table_query = """
DROP TABLE IF EXISTS amazon_date;
CREATE TABLE IF NOT EXISTS amazon_data (
    product TEXT NOT NULL,
    price NUMERIC NOT NULL,
    ratings NUMERIC NOT NULL,
    reviews NUMERIC,
    review_responders NUMERIC,
    category TEXT NOT NULL
)
"""
cur.execute(create_table_query)
conn.commit()

# Function to clean and format data
def clean_format_data(row):
    # Remove commas from 'reviews' and 'price' fields
    reviews = str(row['reviews']).replace(',', '')
    review_responders = str(row['review_responders']).replace(',', '')
    price = str(row['price']).replace(',', '')

    # Convert reviews to int, if not possible set to 0
    try:
        reviews = int(reviews)
    except ValueError:
        reviews = 0
    
    # Convert reviews to int, if not possible set to 0
    try:
        review_responders = int(review_responders)
    except ValueError:
        review_responders = 0

    # Convert the ratings value to a float
    ratings = float(row['ratings'])

    # Adapt data to correct format for SQL insertion and remove quotes
    product = psycopg2.extensions.adapt(row['product'].encode('utf-8')).getquoted().decode('utf-8')[1:-1]

    # Convert price to float, if not possible set to 0
    try:
        price = float(price)
    except ValueError:
        price = 0

    category = psycopg2.extensions.adapt(row['category']).getquoted().decode('utf-8')[1:-1]

    return product, price, ratings, reviews, review_responders, category

# Insert the data from the pandas DataFrame into the PostgreSQL table
for index, row in df.iterrows():
    product, price, ratings, reviews, review_responders, category = clean_format_data(row)

    insert_query = """
    INSERT INTO amazon_data (product, price, ratings, reviews, review_responders, category) 
    VALUES (%s, %s, %s, %s, %s, %s)
    """

    cur.execute(insert_query, (product, price, ratings, reviews, review_responders, category))

# Commit the changes and close the connection
conn.commit()
cur.close()
conn.close()

# Save the DataFrame to a CSV file
df.to_csv('amazon_data_cat.csv', index=False)


# NO EMPTY ROWS

In [76]:
import psycopg2
import pandas as pd

# Load the JSON data into a pandas DataFrame
df = pd.read_json('amazon_data_cat.json')

# Remove empty rows
df = df.dropna()

# Group by category and take the first 100 rows of each group
df = df.groupby('category').apply(lambda x: x.head(100)).reset_index(drop=True)

# Create a connection to your PostgreSQL database
conn = psycopg2.connect(
    host="localhost",
    database="postgres",
    user="postgres",
    password="demopass"
)

# Create a cursor object
cur = conn.cursor()

# Create a table in your PostgreSQL database to store the data
create_table_query = """
DROP TABLE IF EXISTS amazon_date;
CREATE TABLE IF NOT EXISTS amazon_data (
    product TEXT NOT NULL,
    price NUMERIC NOT NULL,
    ratings NUMERIC NOT NULL,
    reviews NUMERIC,
    review_responders NUMERIC,
    category TEXT NOT NULL
)
"""
cur.execute(create_table_query)
conn.commit()

# Function to clean and format data
def clean_format_data(row):
    # Remove commas from 'reviews' and 'price' fields
    reviews = str(row['reviews']).replace(',', '')
    review_responders = str(row['review_responders']).replace(',', '')
    price = str(row['price']).replace(',', '')

    # Convert reviews to int, if not possible set to 0
    try:
        reviews = int(reviews)
    except ValueError:
        reviews = 0
    
    # Convert reviews to int, if not possible set to 0
    try:
        review_responders = int(review_responders)
    except ValueError:
        review_responders = 0

    # Convert the ratings value to a float
    ratings = float(row['ratings'])

    # Adapt data to correct format for SQL insertion and remove quotes
    product = psycopg2.extensions.adapt(row['product'].encode('utf-8')).getquoted().decode('utf-8')[1:-1]

    # Convert price to float, if not possible set to 0
    try:
        price = float(price)
    except ValueError:
        price = 0

    category = psycopg2.extensions.adapt(row['category']).getquoted().decode('utf-8')[1:-1]

    return product, price, ratings, reviews, review_responders, category

# Insert the data from the pandas DataFrame into the PostgreSQL table
for index, row in df.iterrows():
    product, price, ratings, reviews, review_responders, category = clean_format_data(row)

    insert_query = """
    INSERT INTO amazon_data (product, price, ratings, reviews, review_responders, category) 
    VALUES (%s, %s, %s, %s, %s, %s)
    """

    cur.execute(insert_query, (product, price, ratings, reviews, review_responders, category))

# Commit the changes and close the connection
conn.commit()
cur.close()
conn.close()

# Save the DataFrame to a CSV file
df.to_csv('amazon_data.csv', index=False)
