In [12]:
import time
import json
import random
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.edge.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from webdriver_manager.microsoft import EdgeChromiumDriverManager


def process_review_count(text):
    """Normalise a scraped review-count label to a plain number string.

    Surrounding whitespace, thousands separators (``,``) and parentheses are
    removed.  A ``K+`` suffix is expanded to thousands, so ``"(1.2K+)"``
    becomes ``"1200"``.  Labels without ``K+`` are returned as the cleaned
    text, e.g. ``"(545)"`` becomes ``"545"``.

    Args:
        text: Raw label text taken from the page.

    Returns:
        The cleaned count as a string.

    Raises:
        ValueError: If the text contains ``K+`` but the remainder is not a
            number.
    """
    # Strip parentheses in every case, not only for "K+" labels, so that both
    # branches hand back the same kind of value.
    cleaned = text.strip().replace(',', '').replace('(', '').replace(')', '').strip()
    if 'K+' in cleaned:
        thousands = float(cleaned.replace('K+', '').strip())
        # round() rather than int(): a binary-float product that lands just
        # below the intended integer must not be truncated downwards.
        return str(round(thousands * 1000))
    return cleaned

def setup_driver():
    """Create and return an Edge WebDriver instance.

    The driver binary is obtained through ``EdgeChromiumDriverManager`` and the
    browser is started with the ``--no-sandbox`` argument.

    Returns:
        The started ``webdriver.Edge`` instance.

    Raises:
        Exception: If obtaining the driver or starting the browser fails.  The
            original error is printed and attached as ``__cause__``.
    """
    options = webdriver.EdgeOptions()
    # options.add_argument('--headless')  # Run browser in headless mode
    options.add_argument('--no-sandbox')
    try:
        service = Service(EdgeChromiumDriverManager().install())
        return webdriver.Edge(service=service, options=options)
    except Exception as e:
        print(e)
        # Chain explicitly so the traceback shows the root cause as the direct
        # cause of this error.
        raise Exception("Failed to install Edge Chromium driver.") from e

def _parse_product(product, category):
    """Build the record dict for one search-result ``div`` carrying ``data-asin``.

    Keys are added only when the corresponding element is found, except
    ``Product_ID``, ``url`` and ``category``, which are always present.
    The caller is expected to pass only elements with a non-empty ASIN.
    """
    asin = product.attrs.get('data-asin', None)
    product_dict = {'Product_ID': asin}

    # Item name
    item_name = product.find('span', class_='a-text-normal')
    if item_name:
        product_dict['product'] = item_name.text.strip()

    # Item price, without currency symbol or thousands separators
    product_price = product.find('span', class_='a-offscreen')
    if product_price:
        product_dict['price'] = (
            product_price.text.strip().replace("$", "").replace(",", "").strip()
        )

    # Ratings and review responders are both read from aria-label attributes
    for rating_span in product.find_all('span', attrs={"aria-label": True}):
        aria_label_value = rating_span.attrs["aria-label"]
        if "stars" in aria_label_value:
            product_dict['ratings'] = aria_label_value.split(" ")[0]
        elif 'K+' in aria_label_value:
            product_dict['review_responders'] = aria_label_value
        else:
            # Any other label is kept only when it is a plain integer.
            try:
                int(aria_label_value)
            except ValueError:
                continue
            product_dict['review_responders'] = aria_label_value

    # Review count shown next to the rating
    item_reviews = product.find('span', class_='a-size-base s-underline-text')
    if item_reviews:
        product_dict['reviews'] = process_review_count(item_reviews.text.strip())

    # Item URL: relative links are rewritten to the product's reviews page,
    # absolute links are stored unchanged.
    item_url_tag = product.find(
        'a',
        class_='a-link-normal s-underline-text s-underline-link-text s-link-style a-text-normal',
    )
    item_url = item_url_tag.get('href') if item_url_tag else None
    if item_url is None:
        # No link element, or a link element without an href attribute.
        product_dict['url'] = None
    elif item_url.startswith('/'):
        product_dict['url'] = (
            "https://www.amazon.com" + item_url.split('/ref')[0]
            + "/product-reviews/" + asin
            + "/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews"
        )
    else:
        product_dict['url'] = item_url

    product_dict['category'] = category
    return product_dict


def scrape_amazon(categories, max_pages=9):
    """Scrape search-result pages for each category and return the products as JSON.

    Args:
        categories: Mapping of category name to a search URL.  ``&page=N`` is
            appended to each URL, so the URL must already contain a query
            string.
        max_pages: Number of result pages to request per category
            (pages ``1`` to ``max_pages``).

    Returns:
        A JSON string encoding a list of product dicts.  Each ASIN appears at
        most once; the first occurrence across all pages and categories wins.

    Raises:
        Exception: Propagated from ``setup_driver`` if the browser cannot be
            started.
    """
    driver = setup_driver()
    all_products = []
    seen_products = set()

    try:
        for category, base_url in categories.items():
            for page in range(1, max_pages + 1):
                url = f"{base_url}&page={page}"

                try:
                    driver.get(url)
                    # Increase the wait time if needed
                    WebDriverWait(driver, 20).until(
                        EC.presence_of_element_located((By.CSS_SELECTOR, "[data-asin]"))
                    )
                except TimeoutException:
                    print(f"Timed out waiting for elements on page {page} of category {category}.")
                    continue

                # Use random sleep to mimic human behavior
                time.sleep(random.uniform(3.0, 6.0))
                soup = BeautifulSoup(driver.page_source, 'html.parser')

                for product in soup.find_all('div', attrs={"data-asin": True}):
                    asin = product.attrs.get('data-asin')
                    # Skip placeholders without an ASIN and products already
                    # recorded, before doing any parsing work.
                    if not asin or asin in seen_products:
                        continue
                    seen_products.add(asin)
                    # Append each new product exactly once.  Extending the
                    # result with a per-category list after every page would
                    # re-add the products of all earlier pages.
                    all_products.append(_parse_product(product, category))
    finally:
        # Always release the browser, even if scraping fails part-way.
        driver.quit()

    return json.dumps(all_products)


if __name__ == '__main__':
    # Category label -> search URL handed to the scraper.
    categories = {
        'Smartphones': 'https://www.amazon.com/s?k=smartphone&ref=nb_sb_noss',
        'Laptops': 'https://www.amazon.com/s?k=Laptops&ref=nb_sb_noss',
        'video_games': 'https://www.amazon.com/s?k=video_games&ref=nb_sb_noss',
        'Dresses': 'https://www.amazon.com/s?k=Dresses&ref=nb_sb_noss',
        'Shoes': 'https://www.amazon.com/s?k=Shoes&ref=nb_sb_noss',
        'Accessories': 'https://www.amazon.com/s?k=accessories+for+clothes&ref=nb_sb_noss',
    }

    # scrape_amazon() returns a JSON string; decode it into Python objects.
    scraped_json = scrape_amazon(categories)
    amazon_data = json.loads(scraped_json)

    # Persist the decoded records as JSON for the later cells to load.
    with open('amazon_data_cat.json', 'w') as out_file:
        out_file.write(json.dumps(amazon_data))


In [14]:
import pandas as pd
import psycopg2
import re

# Load the JSON data into a pandas DataFrame
df = pd.read_json('amazon_data_cat.json')

# Keep a single row per product ID
df = df.drop_duplicates(subset=['Product_ID'], keep='first')

# Fill missing ratings and prices with 0.  The result is assigned back to the
# column: calling fillna(..., inplace=True) on ``df[col]`` acts on an
# intermediate Series, which pandas deprecates and which leaves ``df``
# untouched once Copy-on-Write is enabled.
df['ratings'] = df['ratings'].fillna(0)
df['price'] = df['price'].fillna(0)

# Normalise 'reviews' to integers.  A non-numeric column is converted to text
# and stripped of parentheses first; a string-dtype check is not used because
# it is unreliable for object columns that also hold missing values.
reviews = df['reviews']
if not pd.api.types.is_numeric_dtype(reviews):
    reviews = (
        reviews.astype(str)
        .str.replace('(', '', regex=False)
        .str.replace(')', '', regex=False)
    )
# Missing or unparseable counts become 0, matching the fill policy above.
df['reviews'] = pd.to_numeric(reviews, errors='coerce').fillna(0).astype(int)

# Set negative reviews to 0
df.loc[df['reviews'] < 0, 'reviews'] = 0

# Fill missing values in 'url' with 'Unknown'
df['url'] = df['url'].fillna('Unknown')


# Function to extract the second occurrence of the URL
def extract_second_url(url):
    """Undo a doubled Amazon origin at the start of a URL.

    A value such as ``"https://www.amazon.comhttps://host/path"`` (the Amazon
    origin glued in front of a link that was already absolute) is reduced to
    the embedded absolute URL, ``"https://host/path"``.

    Args:
        url: The URL value to repair.

    Returns:
        The embedded absolute URL when the doubled prefix is present;
        otherwise ``url`` unchanged (including non-string values).
    """
    origin = "https://www.amazon.com"
    if isinstance(url, str) and url.startswith(origin + "https://"):
        # Drop only the leading origin.  The second "https://" belongs to the
        # real URL and must stay, otherwise nothing is left to recover.
        return url[len(origin):]
    return url

# Apply the function to the 'url' column
df['url'] = df['url'].apply(extract_second_url)

# Remove rows that are identical in every descriptive field
df = df.drop_duplicates(subset=['product', 'price','ratings', 'reviews', 'category'], keep='first')

# Replace empty product names with NA and drop those rows.  The replacement is
# assigned back to the column: an inplace call on ``df['product']`` acts on an
# intermediate Series and does not reach ``df`` under Copy-on-Write.
df['product'] = df['product'].replace('', pd.NA)
df = df.dropna(subset=['product'])

# Drop the 'review_responders' column; errors='ignore' keeps this working when
# the scraped data contained no such column.
df = df.drop(columns='review_responders', errors='ignore')

# Connect to PostgreSQL
# NOTE(review): credentials are hard-coded here; consider loading them from
# the environment or a config file.
conn = psycopg2.connect(
    host="localhost",
    database="postgres",
    user="postgres",
    password="demopass"
)
cur = conn.cursor()

# Recreate the target table from scratch (any existing table is dropped)
create_table_query = """
DROP TABLE IF EXISTS amazon_data;
CREATE TABLE IF NOT EXISTS amazon_data (
    product_id TEXT NOT NULL,
    product TEXT NOT NULL,
    price NUMERIC NOT NULL,
    ratings NUMERIC NOT NULL,
    reviews INTEGER NOT NULL,
    category TEXT NOT NULL,
    url TEXT NOT NULL
)
"""
cur.execute(create_table_query)
conn.commit()

def clean_format_data(row):
    """Convert one DataFrame row into the tuple of values for the INSERT.

    The values are returned unquoted.  They are passed to ``cur.execute`` as
    bound parameters, and the driver quotes bound parameters itself, so
    quoting them here as well would store the escape sequences in the table.

    Args:
        row: Mapping-like row with the keys ``Product_ID``, ``product``,
            ``price``, ``ratings``, ``reviews``, ``category`` and ``url``.

    Returns:
        ``(product_id, product, price, ratings, reviews, category, url)``.

    Raises:
        ValueError: If ``ratings`` cannot be converted to ``float``.
    """
    ratings = float(row['ratings'])
    product = str(row['product'])

    # Convert price to float, if not possible set to 0
    try:
        price = float(row['price'])
    except (TypeError, ValueError):
        price = 0

    category = row['category']
    url = row['url']
    product_id = row['Product_ID']
    # Plain Python int, so the value does not depend on the driver knowing how
    # to adapt NumPy scalar types.
    reviews = int(row['reviews'])
    return product_id, product, price, ratings, reviews, category, url


# Insert the data from the pandas DataFrame into the PostgreSQL table.
# The statement is built once; clean_format_data() returns its values in the
# same order as the column list.
insert_query = """
    INSERT INTO amazon_data (product_id, product, price, ratings, reviews, category, url) 
    VALUES (%s, %s, %s, %s, %s, %s, %s)
    """
try:
    for _, row in df.iterrows():
        cur.execute(insert_query, clean_format_data(row))
    conn.commit()
finally:
    # Release the cursor and connection even when an insert fails.
    cur.close()
    conn.close()

# Save the DataFrame to a CSV file
df.to_csv('amazon_data.csv', index=False)


In [13]:
import pandas as pd
import psycopg2
import re

# Load the JSON data into a pandas DataFrame
df = pd.read_json('amazon_data_cat.json')

# Keep a single row per product ID
df = df.drop_duplicates(subset=['Product_ID'], keep='first')

# Strip parentheses from textual review counts, then fill missing values with
# 0.  The result is assigned back to the column: fillna(..., inplace=True) on
# ``df['reviews']`` acts on an intermediate Series and does not reach ``df``
# under Copy-on-Write.
reviews = df['reviews']
if not pd.api.types.is_numeric_dtype(reviews):
    reviews = reviews.str.replace('(', '', regex=False).str.replace(')', '', regex=False)
df['reviews'] = reviews.fillna(0)

df.tail()

Unnamed: 0,Product_ID,product,price,ratings,reviews,url,category,review_responders
10173,B0B6Z4XVVG,HOTOUCH Waffle Button Down Shirt Women Casual ...,32.99,4.1,545.0,https://www.amazon.com/sspa/click?ie=UTF8&spc=...,Accessories,545.0
10174,B07KFB347M,Alise YJ8000-B Clothing Multiple Hook Wardrobe...,11.99,4.3,52.0,https://www.amazon.com/sspa/click?ie=UTF8&spc=...,Accessories,52.0
10175,B0BNVGCY42,"SORON 43"" Garment Bags, 7 Packs Garment Bags f...",26.99,4.6,212.0,https://www.amazon.com/sspa/click?ie=UTF8&spc=...,Accessories,212.0
10176,B0BF8S8KX6,Topwon Bear Ear Women Man Winter Ski Mask Knit...,14.99,4.2,4.0,https://www.amazon.com/sspa/click?ie=UTF8&spc=...,Accessories,4.0
10177,B01N9PB4CX,DreamLily Lace Cat Ears Hair Band Fancy Dress ...,9.99,4.5,922.0,https://www.amazon.com/sspa/click?ie=UTF8&spc=...,Accessories,922.0


# Random sample of up to 100 rows per category

In [11]:
import pandas as pd
import psycopg2
import re

# Load the JSON data into a pandas DataFrame
df = pd.read_json('amazon_data_cat.json')

# Keep a single row per product ID
df = df.drop_duplicates(subset=['Product_ID'], keep='first')

# Fill missing ratings and prices with 0.  The result is assigned back to the
# column: calling fillna(..., inplace=True) on ``df[col]`` acts on an
# intermediate Series, which pandas deprecates and which leaves ``df``
# untouched once Copy-on-Write is enabled.
df['ratings'] = df['ratings'].fillna(0)
df['price'] = df['price'].fillna(0)

# Normalise 'reviews' to integers.  A non-numeric column is converted to text
# and stripped of parentheses first; a string-dtype check is not used because
# it is unreliable for object columns that also hold missing values.
reviews = df['reviews']
if not pd.api.types.is_numeric_dtype(reviews):
    reviews = (
        reviews.astype(str)
        .str.replace('(', '', regex=False)
        .str.replace(')', '', regex=False)
    )
# Missing or unparseable counts become 0, matching the fill policy above.
df['reviews'] = pd.to_numeric(reviews, errors='coerce').fillna(0).astype(int)

# Set negative reviews to 0
df.loc[df['reviews'] < 0, 'reviews'] = 0

# Fill missing values in 'url' with 'Unknown'
df['url'] = df['url'].fillna('Unknown')

# Function to extract the second occurrence of the URL
def extract_second_url(url):
    """Undo a doubled Amazon origin at the start of a URL.

    A value such as ``"https://www.amazon.comhttps://host/path"`` (the Amazon
    origin glued in front of a link that was already absolute) is reduced to
    the embedded absolute URL, ``"https://host/path"``.

    Args:
        url: The URL value to repair.

    Returns:
        The embedded absolute URL when the doubled prefix is present;
        otherwise ``url`` unchanged (including non-string values).
    """
    origin = "https://www.amazon.com"
    if isinstance(url, str) and url.startswith(origin + "https://"):
        # Drop only the leading origin.  The second "https://" belongs to the
        # real URL and must stay, otherwise nothing is left to recover.
        return url[len(origin):]
    return url

# Apply the function to the 'url' column
df['url'] = df['url'].apply(extract_second_url)

# Remove rows that are identical in every descriptive field
df = df.drop_duplicates(subset=['product', 'price','ratings', 'reviews', 'category'], keep='first')

# Replace empty product names with NA and drop those rows.  The replacement is
# assigned back to the column: an inplace call on ``df['product']`` acts on an
# intermediate Series and does not reach ``df`` under Copy-on-Write.
df['product'] = df['product'].replace('', pd.NA)
df = df.dropna(subset=['product'])

# Drop the 'review_responders' column; errors='ignore' keeps this working when
# the scraped data contained no such column.
df = df.drop(columns='review_responders', errors='ignore')

# Take a random sample of at most 100 rows from each category.  Iterating the
# groups (instead of groupby.apply) keeps the 'category' column in every
# sampled frame regardless of how apply treats grouping columns.
sampled_groups = [
    group.sample(min(len(group), 100))
    for _, group in df.groupby('category')
]
if sampled_groups:
    df = pd.concat(sampled_groups, ignore_index=True)
else:
    # Nothing to sample from; pd.concat() rejects an empty list.
    df = df.reset_index(drop=True)

# Connect to PostgreSQL
# NOTE(review): credentials are hard-coded here; consider loading them from
# the environment or a config file.
conn = psycopg2.connect(
    host="localhost",
    database="postgres",
    user="postgres",
    password="demopass"
)
cur = conn.cursor()

# Recreate the target table from scratch (any existing table is dropped)
create_table_query = """
DROP TABLE IF EXISTS amazon_data_100;
CREATE TABLE IF NOT EXISTS amazon_data_100 (
    product_id TEXT NOT NULL,
    product TEXT NOT NULL,
    price NUMERIC NOT NULL,
    ratings NUMERIC NOT NULL,
    reviews INTEGER NOT NULL,
    category TEXT NOT NULL,
    url TEXT NOT NULL
)
"""
cur.execute(create_table_query)
conn.commit()

def clean_format_data(row):
    """Convert one DataFrame row into the tuple of values for the INSERT.

    The values are returned unquoted.  They are passed to ``cur.execute`` as
    bound parameters, and the driver quotes bound parameters itself, so
    quoting them here as well would store the escape sequences in the table.

    Args:
        row: Mapping-like row with the keys ``Product_ID``, ``product``,
            ``price``, ``ratings``, ``reviews``, ``category`` and ``url``.

    Returns:
        ``(product_id, product, price, ratings, reviews, category, url)``.

    Raises:
        ValueError: If ``ratings`` cannot be converted to ``float``.
    """
    ratings = float(row['ratings'])
    product = str(row['product'])

    # Convert price to float, if not possible set to 0
    try:
        price = float(row['price'])
    except (TypeError, ValueError):
        price = 0

    category = row['category']
    url = row['url']
    product_id = row['Product_ID']
    # Plain Python int, so the value does not depend on the driver knowing how
    # to adapt NumPy scalar types.
    reviews = int(row['reviews'])
    return product_id, product, price, ratings, reviews, category, url


# Insert the data from the pandas DataFrame into the PostgreSQL table.
# The statement is built once; clean_format_data() returns its values in the
# same order as the column list.
insert_query = """
    INSERT INTO amazon_data_100 (product_id, product, price, ratings, reviews, category, url) 
    VALUES (%s, %s, %s, %s, %s, %s, %s)
    """
try:
    for _, row in df.iterrows():
        cur.execute(insert_query, clean_format_data(row))
    conn.commit()
finally:
    # Release the cursor and connection even when an insert fails.
    cur.close()
    conn.close()

# Save the DataFrame to a CSV file
df.to_csv('amazon_data_100.csv', index=False)
