# Webscraping demo
## Amazon Product Reviews
### Note: the code may need changes whenever Amazon updates its HTML tags

(c) Nuno António 2020-2025 - Rev. 1.20 (2025-04-17)

### Load packages and do the initializations

In [1]:
# Load libraries
import numpy as np
import pandas as pd
from urllib.request import urlopen
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from fake_useragent import UserAgent
import time
import random

In [3]:
# Generate a random user-agent string.
# The value is kept in `user_agent` so the Firefox options cell can use it to
# override the browser's default user agent.
ua = UserAgent()
user_agent = ua.random

In [4]:
# Create the Firefox options (configuration) object used when the browser is opened
options = Options()

# Override the browser's user-agent header with the randomly generated one
options.set_preference("general.useragent.override", user_agent)

# Optional: uncomment the next line to run Firefox hidden (headless, no visible window)
# options.add_argument('--headless')

In [5]:
# Load the list of products to scrape.
# The "ID" column of the sheet becomes the index; the main loop reads each row's "URL" column.
productsToScrap = pd.read_excel(
    "LaptopsToScrap.xlsx",
    sheet_name="Sheet1",
    index_col="ID",
    engine='openpyxl',
)

In [6]:
# Show the products that were loaded (index = product ID, column = URL)
print(productsToScrap)

                                                                                                  URL
ID                                                                                                   
Lenovo Flex 5 14" 2-in-1 Laptop, 14.0" FHD (192...  https://www.amazon.com/Lenovo-Processor-Graphi...
Acer Chromebook Spin 311 Convertible Laptop, In...  https://www.amazon.com/Acer-Chromebook-Convert...


In [7]:
# Create an empty dataframe for the results: one empty, explicitly typed column per review field
productReviews = pd.DataFrame({
    columnName: pd.Series([], dtype=columnType)
    for columnName, columnType in (
        ('productID', 'string'),
        ('user', 'string'),
        ('verified', 'string'),
        ('rating', 'float'),
        ('reviewDate', 'string'),
        ('reviewCountry', 'string'),
        ('text', 'string'),
    )
})

### Functions to use in the Main Loop

In [8]:
# Open page and read HTML
def open_page_read_html(url, browser, min_wait=3, max_wait=5):
    """Navigate the browser to a URL, pause, and return the parsed page.

    Args:
        url: Address of the page to open.
        browser: Selenium WebDriver instance used to load the page.
        min_wait: Lower bound, in seconds, of the random pause after the page
            is requested (default 3, the value previously hard-coded).
        max_wait: Upper bound, in seconds, of that pause (default 5).

    Returns:
        A BeautifulSoup object built from the browser's page source using the
        'html.parser' parser.
    """
    browser.get(url)
    # Wait for page to load (random duration between min_wait and max_wait seconds)
    time.sleep(random.uniform(min_wait, max_wait))
    return BeautifulSoup(browser.page_source, 'html.parser')

In [9]:
# Process each page
def _tag_text(tag):
    """Return the whitespace-stripped text of a tag, or None when the tag was not found."""
    if tag is None:
        return None
    return tag.text.strip()


def _parse_rating(tag):
    """Convert an 'N out of 5 stars' tag to a float; NaN when the tag is missing or unparseable."""
    ratingText = _tag_text(tag)
    if ratingText is None:
        return float('nan')
    try:
        return float(ratingText.replace('out of 5 stars', '').strip())
    except ValueError:
        return float('nan')


def _split_date_and_country(rawText):
    """Split 'Reviewed in <country> on <date>' into a (date, country) pair."""
    if not rawText:
        return None, None
    cleaned = rawText.strip()
    # Split on the LAST ' on ' (with spaces): a plain split("on") also cuts inside
    # country names that contain "on" (e.g. "Indonesia", "Hong Kong").
    location, separator, datePart = cleaned.rpartition(' on ')
    if not separator:
        # Unexpected format: keep the whole text as the date instead of failing
        return cleaned, None
    country = location.replace('Reviewed in', '').strip()
    return datePart.strip(), (country or None)


def processPage(soup, productID, extractedDF):
    """Extract all reviews found in a parsed page and append them to a dataframe.

    Args:
        soup: Parsed page (BeautifulSoup object). Reviews are the <li> elements
            whose data-hook attribute is "review".
        productID: Identifier stored in the 'productID' column of every new row.
        extractedDF: Dataframe holding the reviews extracted so far.

    Returns:
        A dataframe with the rows of extractedDF followed by one row per review
        found (columns: productID, user, verified, rating, reviewDate,
        reviewCountry, text). When the page has no reviews, extractedDF is
        returned unchanged.

    Notes:
        - Missing elements no longer raise: a missing rating becomes NaN and a
          missing user/text/date becomes a missing value.
        - reviewDate and reviewCountry are stored without surrounding spaces.
    """
    records = []

    for review in soup.find_all("li", {"data-hook": "review"}):
        # Get User (the first profile-name span, when there is one)
        userTags = review.select("span[class*=a-profile-name]")
        user = _tag_text(userTags[0]) if userTags else None

        # Get Verified or not
        verified = _tag_text(review.find('span', {'data-hook': 'avp-badge'}))

        # Get review Date and review Country (both come from the same element)
        reviewDate, reviewCountry = _split_date_and_country(
            _tag_text(review.find('span', {'data-hook': 'review-date'}))
        )

        records.append({
            'productID': productID,
            'user': user,
            'verified': verified if verified else "Not verified",
            'rating': _parse_rating(review.find('i', {'data-hook': 'review-star-rating'})),
            'reviewDate': reviewDate,
            'reviewCountry': reviewCountry,
            'text': _tag_text(review.find('span', {'data-hook': 'review-body'})),
        })

    if not records:
        return extractedDF

    # Build all new rows at once (a concat per review is quadratic) and align
    # their dtypes with the existing dataframe so the concat keeps its column types.
    newRows = pd.DataFrame(records)
    sharedDtypes = {column: extractedDF[column].dtype
                    for column in newRows.columns if column in extractedDF.columns}
    newRows = newRows.astype(sharedDtypes)

    # Return the resulting dataframe
    return pd.concat([extractedDF, newRows], ignore_index=True)

#### Reset the results dataframe (run this cell to discard previously extracted reviews)

In [10]:
# Re-create the results dataframe empty (same typed columns), discarding any rows extracted so far
productReviews = pd.DataFrame(dict(
    productID=pd.Series([], dtype='string'),
    user=pd.Series([], dtype='string'),
    verified=pd.Series([], dtype='string'),
    rating=pd.Series([], dtype='float'),
    reviewDate=pd.Series([], dtype='string'),
    reviewCountry=pd.Series([], dtype='string'),
    text=pd.Series([], dtype='string'),
))

### Main loop

In [11]:
# Open a Firefox browser window configured with the options defined earlier.
# No page is loaded here; navigation happens in the following cells.
print("Opening browser")
browser = webdriver.Firefox(options=options)

Opening browser


In [12]:
# Load the Amazon website in the opened browser so the CAPTCHA / sign-in can be completed manually.
# The parsed page is stored in `temp`; the cells shown in this notebook do not read it afterwards.
temp = open_page_read_html("http://www.amazon.com", browser)

#### DO NOT RUN THE FOLLOWING CODE BEFORE SOLVING THE CAPTCHA AND SIGNING IN USING THE OPENED BROWSER

In [13]:
# Because this is a demo, define how many reviews to obtain per product.
# The main loop stops requesting pages once this count is reached; reviews are added
# one whole page at a time, so the final count per product can end above this value.
reviewsToGet = 20

In [None]:
# Loop for all products.
# For each product: open its URL once, extract the reviews of the current page, and
# move through the following pages with the "Next" button until enough reviews were
# collected or there are no more pages.
for index, row in productsToScrap.iterrows():
    print(f"Processing product {index}")
    urlToUse = row['URL']
    productReviewsCount = 0  # Initialize review count for the current product
    pageNumber = 1  # Page currently being processed

    # Open and read the web page content.
    # This is done only ONCE per product: reloading the URL on every iteration
    # would discard the "Next" click and scrape the first page again and again.
    soup = open_page_read_html(urlToUse, browser)

    while True:
        # Process web page
        productReviews = processPage(soup, index, productReviews)

        # Number of reviews from the current product
        productReviewsCount = len(productReviews[productReviews['productID'] == str(index)])
        print(f"Extracted {productReviewsCount} / {reviewsToGet}")

        # Stop as soon as enough reviews were collected (no need to request another page)
        if productReviewsCount >= reviewsToGet:
            break

        # Click the next page button
        try:
            next_button = WebDriverWait(browser, random.uniform(5,10)).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, "li.a-last a"))
            )
            next_button.click()

            # Wait for page to load after click (the clicked button is replaced by the new page)
            WebDriverWait(browser, random.uniform(5,10)).until(
                EC.staleness_of(next_button)
            )
        except (NoSuchElementException, TimeoutException):
            print("No more pages available")
            break

        # Read the page reached through the "Next" button (same pause used when opening a URL)
        pageNumber += 1
        time.sleep(random.uniform(3, 5))
        soup = BeautifulSoup(browser.page_source, 'html.parser')


# Close browser
print("Closing browser")
browser.quit()
print("Browser closed")

Processing product Lenovo Flex 5 14" 2-in-1 Laptop, 14.0" FHD (1920 x 1080) Touch Display, AMD Ryzen 5 4500U Processor, 16GB DDR4, 256GB SSD, AMD Radeon Graphics, Digital Pen Included, Win 10, 81X20005US, Graphite Grey 
Extracted 20 / 20
Processing product Acer Chromebook Spin 311 Convertible Laptop, Intel Celeron N4020, 11.6" HD Touch, 4GB LPDDR4, 32GB eMMC, Gigabit Wi-Fi 5, Bluetooth 5.0, Google Chrome, CP311-2H-C679 
Extracted 20 / 20
Closing browser
Browser closed


In [15]:
# Save the extracted reviews data frame to an Excel file
# (relative path, so the file is written to the current working directory)
productReviews.to_excel("ExtractedReviews_Amazon.xlsx")