In [1]:
import time
import json
import random
import html
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.edge.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from webdriver_manager.microsoft import EdgeChromiumDriverManager


def process_review_count(text):
    text = text.strip().replace(',', '')
    if 'K+' in text:
        return str(int(float(text.replace('(', '').replace(')', '').replace('K+', '').strip()) * 1000))
    return text

def setup_driver():
    options = webdriver.EdgeOptions()
    options.add_argument('--no-sandbox')
    try:
        driver = webdriver.Edge(service=Service(EdgeChromiumDriverManager().install()), options=options)
    except Exception as e:
        print(e)
        raise Exception("Failed to install Edge Chromium driver.")
    return driver

def scrape_extra_parameters(url: str, driver: webdriver.Edge) -> dict:
    try:
        driver.get(url)
        WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.CSS_SELECTOR, "span.a-list-item")))
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        actual_category_tag = soup.find('span', class_='a-list-item')
        if actual_category_tag:
            actual_category = html.unescape(actual_category_tag.get_text(strip=True))
            # Item category to preset: "Clothing, Shoes & Jewelry", "Toys & Games"
            return {'actual_category': actual_category}
    except Exception as e:
        print(f"Error scraping extra parameters for {url}: {e}")
    return {}

def scrape_amazon(categories):
    driver = setup_driver()
    all_products = []
    seen_products = set()

    for category, base_url in categories.items():
        products = []

        for page in range(1, 1):
            url = f"{base_url}&page={page}"

            try:
                driver.get(url)
                WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.CSS_SELECTOR, "[data-asin]")))
            except TimeoutException:
                print(f"Timed out waiting for elements on page {page} of category {category}.")
                continue

            time.sleep(random.uniform(3.0, 6.0))
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            for product in soup.find_all('div', attrs={"data-asin": True}):
                product_dict = {}

                product_dict['Product_ID'] = product.attrs.get('data-asin', None)

                item_name = product.find('span', class_='a-text-normal')
                if item_name:
                    product_dict['product'] = item_name.text.strip()

                product_price = product.find('span', class_='a-offscreen')
                if product_price:
                    product_price = product_price.text.strip().replace("$", "").replace(",", "").strip()
                    product_dict['price'] = product_price

                rating_spans = product.find_all('span', attrs={"aria-label": True})
                for rating_span in rating_spans:
                    aria_label_value = rating_span.attrs["aria-label"]
                    if "stars" in aria_label_value:
                        product_dict['ratings'] = aria_label_value.split(" ")[0]
                    else:
                        if 'K+' in aria_label_value:
                            product_dict['review_responders'] = aria_label_value
                        else:
                            try:
                                int_value = int(aria_label_value)
                                product_dict['review_responders'] = aria_label_value
                            except ValueError:
                                pass

                item_reviews = product.find('span', class_='a-size-base s-underline-text')
                if item_reviews:
                    reviews_text = item_reviews.text.strip()
                    reviews_count = process_review_count(reviews_text)
                    product_dict['reviews'] = reviews_count


                # Extract Monthly Sales
                monthly_sales = product.find('span', class_='a-size-base a-color-secondary', string=lambda x: 'K+ bought' in x)
                if monthly_sales:
                    product_dict['monthly_sales'] = process_review_count(monthly_sales.text.strip())

                # Extract Coupon Discount
                coupon_discount = product.find('span', class_='a-size-base s-highlighted-text-padding aok-inline-block s-coupon-highlight-color')
                if coupon_discount:
                    product_dict['coupon_discount'] = coupon_discount.text.strip()

                item_url_tag = product.find('a', class_='a-link-normal s-underline-text s-underline-link-text s-link-style a-text-normal')
                if item_url_tag:
                    item_url = item_url_tag.get('href')
                    product_dict['url'] = "https://www.amazon.com" + item_url
                else:
                    product_dict['url'] = None

                product_dict['category'] = category

                if 'Product_ID' in product_dict and product_dict['Product_ID']:
                    identifier = product_dict['Product_ID']

                    if identifier not in seen_products:
                        seen_products.add(identifier)
                        if product_dict.get('url'):
                            extra_params = scrape_extra_parameters(product_dict['url'], driver)
                            product_dict.update(extra_params)
                        products.append(product_dict)

            all_products.extend(products)

    driver.quit()
    return json.dumps(all_products)


if __name__ == '__main__':
    categories = {
        'Smartphones': 'https://www.amazon.com/s?k=smartphone&ref=nb_sb_noss',
        'Laptops': 'https://www.amazon.com/s?k=Laptops&ref=nb_sb_noss',
        'video_games': 'https://www.amazon.com/s?k=video_games&ref=nb_sb_noss',
        'Dresses':'https://www.amazon.com/s?k=Dresses&ref=nb_sb_noss',
        'Shoes':'https://www.amazon.com/s?k=Shoes&ref=nb_sb_noss',
        'Accessories':'https://www.amazon.com/s?k=accessories+for+clothes&ref=nb_sb_noss',
    }

    amazon_data = json.loads(scrape_amazon(categories))

    with open('amazon_data_ext.json', 'w') as file:
        json.dump(amazon_data, file)


Error scraping extra parameters for https://www.amazon.com/sspa/click?ie=UTF8&spc=MTo2NzM5MzYzMDM0NzYzMDoxNjk1NDY4NDg1OnNwX2F0ZjoyMDAxNjYyMDI2MDA1OTg6OjA6Og&url=%2FGoogle-Pixel-7a-Unlocked-Smartphone%2Fdp%2FB0BZ9XNBRB%2Fref%3Dsr_1_1_sspa%3Fkeywords%3Dsmartphone%26qid%3D1695468485%26sr%3D8-1-spons%26sp_csd%3Dd2lkZ2V0TmFtZT1zcF9hdGY%26psc%3D1: 'NoneType' object has no attribute 'get_text'
Error scraping extra parameters for https://www.amazon.com/sspa/click?ie=UTF8&spc=MTo2NzM5MzYzMDM0NzYzMDoxNjk1NDY4NDg1OnNwX2F0ZjoyMDAxMjM3OTk3MzA3OTg6OjA6Og&url=%2FSAMSUNG-Factory-Unlocked-Smartphone-Lavender%2Fdp%2FB0BLP2B5DZ%2Fref%3Dsr_1_2_sspa%3Fkeywords%3Dsmartphone%26qid%3D1695468485%26sr%3D8-2-spons%26sp_csd%3Dd2lkZ2V0TmFtZT1zcF9hdGY%26psc%3D1: 'NoneType' object has no attribute 'get_text'
Error scraping extra parameters for https://www.amazon.com/Stylus-battery-Unlocked-Motorola-Twilight/dp/B09PFC2DVD/ref=sr_1_3?keywords=smartphone&qid=1695468485&sr=8-3: 'NoneType' object has no attribute 'get_te

In [2]:
import pandas as pd
import psycopg2
import re

# Load the JSON data into a pandas DataFrame
df = pd.read_json('amazon_data_cat.json')

# Remove any duplicates that may have been created due to URL changes
df = df.drop_duplicates(subset=['Product_ID'], keep='first')

# Fill NaN values in 'reviews' with '0'
df['reviews'].fillna(0, inplace=True)

# Fill NaN values in 'ratings' with '0'
df['ratings'].fillna(0, inplace=True)

# Fill NaN values in 'price' with '0'
df['price'].fillna(0, inplace=True)

# Fill NaN values in 'reviews' with 0
# df['reviews'] = df['reviews'].str.replace('(', '').str.replace(')', '')
df['reviews'].fillna(0, inplace=True)
# Check the data type of the reviews column
if pd.api.types.is_string_dtype(df['reviews']):
    df['reviews'] = df['reviews'].str.replace('(', '').str.replace(')', '')

# Convert reviews to integer
df['reviews'] = df['reviews'].astype(int)

# Set negative reviews to 0
df.loc[df['reviews'] < 0, 'reviews'] = 0

# Fill NaN values in 'url' with 'Unknown'
df['url'].fillna('Unknown', inplace=True)

# Function to extract the second occurrence of the URL
def extract_second_url(url):
    prefix = "https://www.amazon.comhttps://"
    if url.startswith(prefix):
        matches = re.findall(r'https://www\.amazon\.com/', url[len(prefix):])
        if len(matches) >= 1:
            second_occurrence_index = url.rfind(matches[0])
            return url[second_occurrence_index:]
    return url

# Apply the function to the 'url' column
df['url'] = df['url'].apply(extract_second_url)

# Remove any duplicates that may have been created due to URL changes
df = df.drop_duplicates(subset=['product', 'price','ratings', 'reviews', 'category', 'actual_category'], keep='first')

# Replace empty product names with NaN and drop those rows
df['product'].replace('', pd.NA, inplace=True)
df.dropna(subset=['product'], inplace=True)

# Drop the 'review_responders' column
df.drop('review_responders', axis=1, inplace=True)

# Connect to PostgreSQL
conn = psycopg2.connect(
    host="localhost",
    database="postgres",
    user="postgres",
    password="demopass"
)
cur = conn.cursor()

# Create a table in the PostgreSQL database
create_table_query = """
DROP TABLE IF EXISTS amazon_data_ext;
CREATE TABLE IF NOT EXISTS amazon_data_ext (
    product_id TEXT NOT NULL,
    product TEXT NOT NULL,
    price NUMERIC NOT NULL,
    ratings NUMERIC NOT NULL,
    reviews INTEGER NOT NULL,
    category TEXT NOT NULL,
    actual_category TEXT NOT NULL,
    url TEXT NOT NULL

)
"""
cur.execute(create_table_query)
conn.commit()

def clean_format_data(row):
    # Convert the ratings value to a float
    ratings = float(row['ratings'])
    
    # Convert the product name to a string and then adapt for SQL insertion
    product = psycopg2.extensions.adapt(str(row['product']).encode('utf-8')).getquoted().decode('utf-8')[1:-1]
    
    # Convert price to float, if not possible set to 0
    try:
        price = float(row['price'])
    except ValueError:
        price = 0

    category = psycopg2.extensions.adapt(row['category']).getquoted().decode('utf-8')[1:-1]
    actual_category = psycopg2.extensions.adapt(row['actual_category']).getquoted().decode('utf-8')[1:-1]
    url = row['url']
    product_id = row['Product_ID']
    reviews = row['reviews']  # Already cleaned and converted to int
    return product_id, product, price, ratings, reviews, category, actual_category, url


# Insert the data from the pandas DataFrame into the PostgreSQL table
for index, row in df.iterrows():
    product_id, product, price, ratings, reviews, category, actual_category, url = clean_format_data(row)
    insert_query = """
    INSERT INTO amazon_data_ext (product_id, product, price, ratings, reviews, category, actual_category, url) 
    VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
    """
    cur.execute(insert_query, (product_id, product, price, ratings, reviews, category, actual_category, url))

conn.commit()
cur.close()
conn.close()

# Save the DataFrame to a CSV file
df.to_csv('amazon_data_ext.csv', index=False)


KeyError: Index(['actual_category'], dtype='object')