In [2]:
import time
import json
import random
import html
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.edge.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from webdriver_manager.microsoft import EdgeChromiumDriverManager
import traceback

def process_review_count(text):
    text = text.strip().replace(',', '')
    if 'K+' in text:
        return str(int(float(text.replace('(', '').replace(')', '').replace('K+', '').strip()) * 1000))
    return text

def setup_driver():
    options = webdriver.EdgeOptions()
    options.add_argument('--no-sandbox')
    try:
        driver = webdriver.Edge(service=Service(EdgeChromiumDriverManager().install()), options=options)
    except Exception as e:
        print(e)
        raise Exception("Failed to install Edge Chromium driver.")
    return driver

from selenium.common.exceptions import TimeoutException
from word2number import w2n
import traceback

def scrape_extra_parameters(url: str, driver: webdriver.Edge) -> dict:
    try:
        driver.get(url)
        try:
            WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "div[data-hook='review']")))
        except TimeoutException:
            print(f"TimeoutException: Could not find reviews for {url}")
            return {}
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        reviews_tags = soup.find_all('div', attrs={'data-hook': 'review'})

        result = {}
        for i, review_tag in enumerate(reviews_tags[:5]):
            result[f'Customer_{i + 1}_ID'] = review_tag.attrs.get('id', 'NaN')
            
            # Extract the Star Rating
            star_rating_tag = review_tag.select_one('i[data-hook="review-star-rating"] span.a-icon-alt')
            star_rating = float(star_rating_tag.text.split()[0]) if star_rating_tag else 0.0
            result[f'Customer_{i+1}_Star_Rating'] = star_rating
            
            # Extract the Comment Title
            comment_title_tag = review_tag.select_one('a[data-hook="review-title"]')
            if comment_title_tag:
                actual_comment_title = comment_title_tag.text.strip()
            else:
                # Handle alternate structure
                comment_title_tag = review_tag.select_one('span.cr-original-review-content')
                actual_comment_title = comment_title_tag.text.strip() if comment_title_tag else 'NaN'
            
            result[f'Customer_{i+1}_Comment'] = actual_comment_title

            # Extract the Number of people who found the review helpful
            helpful_vote_tag = review_tag.select_one('span[data-hook="helpful-vote-statement"]')
            helpful_count = w2n.word_to_num(helpful_vote_tag.text.split()[0]) if helpful_vote_tag else 0
            result[f'Customer_{i+1}_buying_influence'] = helpful_count

        return result
    except Exception as e:
        print(f"Error scraping extra parameters for {url}: {e}")
        traceback.print_exc()
    return {}


def scrape_amazon(categories):
    driver = setup_driver()
    all_products = []
    seen_products = set()

    for category, base_url in categories.items():
        products = []

        for page in range(1, 10):
            url = f"{base_url}&page={page}"

            try:
                driver.get(url)
                WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.CSS_SELECTOR, "[data-asin]")))
            except TimeoutException:
                print(f"Timed out waiting for elements on page {page} of category {category}.")
                continue

            time.sleep(random.uniform(3.0, 6.0))
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            for product in soup.find_all('div', attrs={"data-asin": True}):
                product_dict = {}

                product_dict['Product_ID'] = product.attrs.get('data-asin', None)

                item_name = product.find('span', class_='a-text-normal')
                if item_name:
                    product_dict['product'] = item_name.text.strip()

                product_price = product.find('span', class_='a-offscreen')
                if product_price:
                    product_price = product_price.text.strip().replace("$", "").replace(",", "").strip()
                    product_dict['price'] = product_price

                rating_spans = product.find_all('span', attrs={"aria-label": True})
                for rating_span in rating_spans:
                    aria_label_value = rating_span.attrs["aria-label"]
                    if "stars" in aria_label_value:
                        product_dict['ratings'] = aria_label_value.split(" ")[0]
                    else:
                        if 'K+' in aria_label_value:
                            product_dict['review_responders'] = aria_label_value
                        else:
                            try:
                                int_value = int(aria_label_value)
                                product_dict['review_responders'] = aria_label_value
                            except ValueError:
                                pass

                item_reviews = product.find('span', class_='a-size-base s-underline-text')
                if item_reviews:
                    reviews_text = item_reviews.text.strip()
                    reviews_count = process_review_count(reviews_text)
                    product_dict['reviews'] = reviews_count


                # Extract ASIN
                product_dict['Product_ID'] = product.attrs.get('data-asin', None)

                # Construct the review URL using ASIN
                if product_dict['Product_ID']:
                    asin = product_dict['Product_ID']
                    product_dict['url'] = f"https://www.amazon.com/product-reviews/{asin}/ref=cm_cr_dp_d_show_all_top?ie=UTF8&reviewerType=all_reviews"
                    
                else:
                    product_dict['url'] = None


                product_dict['category'] = category

                if 'Product_ID' in product_dict and product_dict['Product_ID']:
                # Create a unique identifier for the product
                    identifier = product_dict['Product_ID']

                    if identifier not in seen_products:
                        seen_products.add(identifier) #
                        if product_dict.get('url'):
                            extra_params = scrape_extra_parameters(product_dict['url'], driver)
                            product_dict.update(extra_params)
                        products.append(product_dict) #
            all_products.extend(products)
    driver.quit()
    return json.dumps(all_products)


if __name__ == '__main__':
    categories = {
        'Smartphones': 'https://www.amazon.com/s?k=smartphone&ref=nb_sb_noss',
        'Laptops': 'https://www.amazon.com/s?k=Laptops&ref=nb_sb_noss',
        'video_games': 'https://www.amazon.com/s?k=video_games&ref=nb_sb_noss',
        'Dresses':'https://www.amazon.com/s?k=Dresses&ref=nb_sb_noss',
        'Shoes':'https://www.amazon.com/s?k=Shoes&ref=nb_sb_noss',
        'Accessories':'https://www.amazon.com/s?k=accessories+for+clothes&ref=nb_sb_noss',
    }

    all_products = []
    try:
        all_products = json.loads(scrape_amazon(categories))
    except Exception as e:
        print(f"Error occurred during scraping: {e}")
    finally:
        with open('amazon_data_ext.json', 'w') as file:
            json.dump(all_products, file)


TimeoutException: Could not find reviews for https://www.amazon.com/product-reviews/B0C37QXBH3/ref=cm_cr_dp_d_show_all_top?ie=UTF8&reviewerType=all_reviews
TimeoutException: Could not find reviews for https://www.amazon.com/product-reviews/B0CCRMY3BL/ref=cm_cr_dp_d_show_all_top?ie=UTF8&reviewerType=all_reviews
TimeoutException: Could not find reviews for https://www.amazon.com/product-reviews/B0BX5JW5Y1/ref=cm_cr_dp_d_show_all_top?ie=UTF8&reviewerType=all_reviews
TimeoutException: Could not find reviews for https://www.amazon.com/product-reviews/B0CHBBT3ZJ/ref=cm_cr_dp_d_show_all_top?ie=UTF8&reviewerType=all_reviews
TimeoutException: Could not find reviews for https://www.amazon.com/product-reviews/B0CGVFT8PS/ref=cm_cr_dp_d_show_all_top?ie=UTF8&reviewerType=all_reviews
TimeoutException: Could not find reviews for https://www.amazon.com/product-reviews/B0CDX6WGFP/ref=cm_cr_dp_d_show_all_top?ie=UTF8&reviewerType=all_reviews
TimeoutException: Could not find reviews for https://www.amazon.

Traceback (most recent call last):
  File "C:\Users\Kasim\AppData\Local\Temp\ipykernel_13224\4046908382.py", line 69, in scrape_extra_parameters
    helpful_count = w2n.word_to_num(helpful_vote_tag.text.split()[0]) if helpful_vote_tag else 0
  File "C:\Users\Kasim\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\word2number\w2n.py", line 154, in word_to_num
    raise ValueError("No valid number words found! Please enter a valid number word (eg. two million twenty three thousand and forty nine)")
ValueError: No valid number words found! Please enter a valid number word (eg. two million twenty three thousand and forty nine)


TimeoutException: Could not find reviews for https://www.amazon.com/product-reviews/B0C6YRCFM2/ref=cm_cr_dp_d_show_all_top?ie=UTF8&reviewerType=all_reviews
TimeoutException: Could not find reviews for https://www.amazon.com/product-reviews/B0CHC8294C/ref=cm_cr_dp_d_show_all_top?ie=UTF8&reviewerType=all_reviews
TimeoutException: Could not find reviews for https://www.amazon.com/product-reviews/B0CFBF6XL3/ref=cm_cr_dp_d_show_all_top?ie=UTF8&reviewerType=all_reviews
TimeoutException: Could not find reviews for https://www.amazon.com/product-reviews/B0CDQLX7W8/ref=cm_cr_dp_d_show_all_top?ie=UTF8&reviewerType=all_reviews
TimeoutException: Could not find reviews for https://www.amazon.com/product-reviews/B083J4ZJQL/ref=cm_cr_dp_d_show_all_top?ie=UTF8&reviewerType=all_reviews
TimeoutException: Could not find reviews for https://www.amazon.com/product-reviews/B09JFF1S3X/ref=cm_cr_dp_d_show_all_top?ie=UTF8&reviewerType=all_reviews
TimeoutException: Could not find reviews for https://www.amazon.

Traceback (most recent call last):
  File "C:\Users\Kasim\AppData\Local\Temp\ipykernel_13224\4046908382.py", line 69, in scrape_extra_parameters
    helpful_count = w2n.word_to_num(helpful_vote_tag.text.split()[0]) if helpful_vote_tag else 0
  File "C:\Users\Kasim\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\word2number\w2n.py", line 154, in word_to_num
    raise ValueError("No valid number words found! Please enter a valid number word (eg. two million twenty three thousand and forty nine)")
ValueError: No valid number words found! Please enter a valid number word (eg. two million twenty three thousand and forty nine)


TimeoutException: Could not find reviews for https://www.amazon.com/product-reviews/B0C91MLPNG/ref=cm_cr_dp_d_show_all_top?ie=UTF8&reviewerType=all_reviews
TimeoutException: Could not find reviews for https://www.amazon.com/product-reviews/B0B5LMVFQP/ref=cm_cr_dp_d_show_all_top?ie=UTF8&reviewerType=all_reviews
TimeoutException: Could not find reviews for https://www.amazon.com/product-reviews/B0CBNQ9YC3/ref=cm_cr_dp_d_show_all_top?ie=UTF8&reviewerType=all_reviews
TimeoutException: Could not find reviews for https://www.amazon.com/product-reviews/B0CBCFS4GC/ref=cm_cr_dp_d_show_all_top?ie=UTF8&reviewerType=all_reviews
TimeoutException: Could not find reviews for https://www.amazon.com/product-reviews/B0C9NK7ZZB/ref=cm_cr_dp_d_show_all_top?ie=UTF8&reviewerType=all_reviews
TimeoutException: Could not find reviews for https://www.amazon.com/product-reviews/B0C9NX5P1R/ref=cm_cr_dp_d_show_all_top?ie=UTF8&reviewerType=all_reviews
TimeoutException: Could not find reviews for https://www.amazon.

Traceback (most recent call last):
  File "C:\Users\Kasim\AppData\Local\Temp\ipykernel_13224\4046908382.py", line 37, in scrape_extra_parameters
    driver.get(url)
  File "C:\Users\Kasim\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\selenium\webdriver\remote\webdriver.py", line 353, in get
    self.execute(Command.GET, {"url": url})
  File "C:\Users\Kasim\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\selenium\webdriver\remote\webdriver.py", line 344, in execute
    self.error_handler.check_response(response)
  File "C:\Users\Kasim\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\selenium\webdriver\remote\errorhandler.py", line 229, in check_response
    raise exception_class(message, screen, stacktrace)
selenium.common.exceptions.WebDriverException: Message: unkno

TimeoutException: Could not find reviews for https://www.amazon.com/product-reviews/B0CDV76GRS/ref=cm_cr_dp_d_show_all_top?ie=UTF8&reviewerType=all_reviews
TimeoutException: Could not find reviews for https://www.amazon.com/product-reviews/B0C3MTMYT9/ref=cm_cr_dp_d_show_all_top?ie=UTF8&reviewerType=all_reviews
TimeoutException: Could not find reviews for https://www.amazon.com/product-reviews/B0CDGZCHG6/ref=cm_cr_dp_d_show_all_top?ie=UTF8&reviewerType=all_reviews
TimeoutException: Could not find reviews for https://www.amazon.com/product-reviews/B0CDW78FPK/ref=cm_cr_dp_d_show_all_top?ie=UTF8&reviewerType=all_reviews
TimeoutException: Could not find reviews for https://www.amazon.com/product-reviews/B0CB5RVYK1/ref=cm_cr_dp_d_show_all_top?ie=UTF8&reviewerType=all_reviews
TimeoutException: Could not find reviews for https://www.amazon.com/product-reviews/B0BT6FRF2V/ref=cm_cr_dp_d_show_all_top?ie=UTF8&reviewerType=all_reviews
TimeoutException: Could not find reviews for https://www.amazon.

In [1]:
import pandas as pd
import psycopg2
import numpy as np
import re

# Load the JSON data into a pandas DataFrame
df = pd.read_json('amazon_data_ext.json')


# Ensure the columns for the additional information from the five customers exist
for i in range(1, 6):
    df[f'Customer_{i}_ID'] = df[f'Customer_{i}_ID'].fillna('NaN')
    df[f'Customer_{i}_Star_Rating'] = df[f'Customer_{i}_Star_Rating'].fillna(0)
    df[f'Customer_{i}_Comment'] = df[f'Customer_{i}_Comment'].fillna('NaN')
    df[f'Customer_{i}_buying_influence'] = df[f'Customer_{i}_buying_influence'].fillna(0)

# Handle other columns similarly
df['price'].fillna(0, inplace=True)
df['ratings'].fillna(0, inplace=True)
df['reviews'] = df['reviews'].str.replace('(', '').str.replace(')', '')
df['reviews'].fillna(0, inplace=True)

# Check the data type of the reviews column
if pd.api.types.is_string_dtype(df['reviews']):
    df['reviews'] = df['reviews'].str.replace('(', '').str.replace(')', '')

# Convert reviews to integer
try:
    df['reviews'] = df['reviews'].astype(int)
except ValueError:
    df['reviews'] = 0

df['url'].fillna('Unknown', inplace=True)
df['category'].fillna('', inplace=True)
df['product'].fillna('', inplace=True)
# df['monthly_sales'] = df.get('monthly_sales', 0)  # Adding handling for 'monthly_sales'

# Set negative reviews to 0
df.loc[df['reviews'] < 0, 'reviews'] = 0

# Function to extract the second occurrence of the URL
def extract_second_url(url):
    prefix = "https://www.amazon.comhttps://"
    if url.startswith(prefix):
        matches = re.findall(r'https://www\.amazon\.com/', url[len(prefix):])
        if len(matches) >= 1:
            second_occurrence_index = url.rfind(matches[0])
            return url[second_occurrence_index:]
    return url

# Apply the function to the 'url' column
df['url'] = df['url'].apply(extract_second_url)

# Remove any duplicates that may have been created due to URL changes
df = df.drop_duplicates(subset=['product', 'price', 'ratings', 'reviews', 'category'], keep='first')

# Replace empty product names with NaN and drop those rows
df['product'].replace('', pd.NA, inplace=True)
df.dropna(subset=['product'], inplace=True)

# Drop the 'review_responders' column if it exists
if 'review_responders' in df.columns:
    df.drop('review_responders', axis=1, inplace=True)
# Drop the 'review_responders' column if it exists
if 'monthly_sales' in df.columns:
    df.drop('monthly_sales', axis=1, inplace=True)
print(len(df))
# Drop all items with review having zero values
df.drop(df.index[df['reviews'] == 0], inplace=True)

# Define the columns to be updated
columns_to_update = [f'Customer_{i}_Comment' for i in range(1, 6)]

# Iterate through the columns and replace 'NaN' with the desired value (e.g., 'No Comment')
for column in columns_to_update:
    df.loc[df[column] == 'NaN', column] = 'No Comment'

# Define the columns to be updated
columns_to_update = [f'Customer_{i}_ID' for i in range(1, 6)]
# Iterate through the columns and replace 'NaN' with the desired value (e.g., 'No ID')
for column in columns_to_update:
    df.loc[df[column] == 'NaN', column] = 'None'

# Drop all items with review having zero values
df.drop(df.index[df['Customer_1_ID'] == 'None'], inplace=True)



print(len(df))

# Connect to PostgreSQL
conn = psycopg2.connect(
    host="localhost",
    database="postgres",
    user="postgres",
    password="demopass",
    client_encoding='utf8'
)

cur = conn.cursor()

# Create a table in the PostgreSQL database with additional columns for the five customers
create_table_query = """
DROP TABLE IF EXISTS amazon_data_ext;
CREATE TABLE IF NOT EXISTS amazon_data_ext (
    product_id TEXT NOT NULL,
    product TEXT NOT NULL,
    price_dollars NUMERIC NOT NULL,
    -- monthly_sales NUMERIC,
    ratings NUMERIC NOT NULL,
    reviews_qty INTEGER NOT NULL,
    category TEXT NOT NULL,
    url TEXT NOT NULL,
    """ + ",\n    ".join([f"Customer_{i}_ID TEXT, Customer_{i}_Star_Rating INTEGER, Customer_{i}_Comment TEXT, Customer_{i}_buying_influence INTEGER" for i in range(1, 6)]) + """
)
"""
cur.execute(create_table_query)
conn.commit()

def clean_format_data(row):
    # Convert the ratings value to a float
    ratings = float(row['ratings'])
    
    # Convert the product name, reviews, and category to strings and then adapt for SQL insertion
    product = psycopg2.extensions.adapt(str(row['product']).encode('utf-8', 'replace')).getquoted().decode('utf-8')[1:-1]
    category = psycopg2.extensions.adapt(row['category'].encode('utf-8', 'replace')).getquoted().decode('utf-8')[1:-1]
    
    # Convert price to float, if not possible set to 0
    try:
        price = float(row['price'])
    except ValueError:
        price = 0

    url = row['url']
    product_id = row['Product_ID']
    reviews = row['reviews']  # Already cleaned and converted to int
    # monthly_sales = float(row['monthly_sales'])  # Handling for 'monthly_sales'

    
    # Handle additional customer information
    customer_data = []
    for i in range(1, 6):
        customer_id = row[f'Customer_{i}_ID']
        star_rating = row[f'Customer_{i}_Star_Rating']
        comment = psycopg2.extensions.adapt(str(row[f'Customer_{i}_Comment']).encode('utf-8', 'replace')).getquoted().decode('utf-8')[1:-1]
        buying_influence = row[f'Customer_{i}_buying_influence']
        customer_data.extend([customer_id, star_rating, comment, buying_influence])
    
    return product_id, product, price, ratings, reviews, category, url,*customer_data

# Modify the INSERT query to include additional columns for the five customers
insert_query = """
INSERT INTO amazon_data_ext (
    product_id, product, price_dollars, ratings, reviews_qty, category, url,
    """ + ", ".join([f"Customer_{i}_ID, Customer_{i}_Star_Rating, Customer_{i}_Comment, Customer_{i}_buying_influence" for i in range(1, 6)]) + """
) VALUES (%s, %s, %s, %s, %s, %s, %s, """ + ", ".join(["%s"] * 20) + ")"

# Insert the data from the pandas DataFrame into the PostgreSQL table
for index, row in df.iterrows():
    try:
        product_id, product, price,ratings, reviews, category, url, *customer_data = clean_format_data(row)
        cur.execute(insert_query, (product_id, product, price,ratings, reviews, category, url, *customer_data))
    except Exception as e:
        print(f"Error inserting row: {e}")

conn.commit()
cur.close()
conn.close()
# Rename the columns in the DataFrame
df.rename(columns={'reviews': 'reviews_qty', 'price': 'price_dollars'}, inplace=True)

# Save the DataFrame to a CSV file with updated column names
df.to_csv('amazon_data_ext.csv', index=False)



748
0
