In [1]:
pip install --upgrade selenium webdriver-manager beautifulsoup4

Collecting beautifulsoup4
  Downloading beautifulsoup4-4.14.3-py3-none-any.whl.metadata (3.8 kB)
Downloading beautifulsoup4-4.14.3-py3-none-any.whl (107 kB)
Installing collected packages: beautifulsoup4
  Attempting uninstall: beautifulsoup4
    Found existing installation: beautifulsoup4 4.13.5
    Uninstalling beautifulsoup4-4.13.5:
      Successfully uninstalled beautifulsoup4-4.13.5
Successfully installed beautifulsoup4-4.14.3
Note: you may need to restart the kernel to use updated packages.


In [14]:
# CISB5123 Text Analytics - Lab Assignment 1: Web Scraping
# Name: Tun Danial Adli Bin Tun Ali
# Student ID: IS01083502

# Name: Amirul Irfaan Bin Mohd.Hishamuddin
# Student ID: IS01083864

from selenium import webdriver                                     # For automating browser interaction
from selenium.webdriver.chrome.options import Options              # For configuring Chrome browser
from selenium.webdriver.common.by import By                        # For locating HTML elements
from selenium.webdriver.support.ui import WebDriverWait            # For waiting until elements load
from selenium.webdriver.support import expected_conditions as EC   # For wait conditions
from bs4 import BeautifulSoup                                      # For parsing HTML content
import pandas as pd                                                # For data manipulation and CSV export
import time                                                        # For adding delays between requests
import random                                                      # For randomizing delay intervals

# --- Configuration ---
# The ASIN is extracted from the Amazon product URL (after /dp/)
# Product: Backpack for Women, Carry-On Travel Backpack
# URL: https://www.amazon.com/dp/B09MQWWP87

ASIN = 'B09MQWWP87'   # Amazon product ASIN
MAX_PAGES = 5          # Number of review pages to scrape (limited to 5)


# --- Helper Functions ---

def setup_driver():
    """
    Sets up and returns a Selenium Chrome WebDriver instance.
    Runs with a visible browser window to avoid anti-bot detection.
    
    Returns:
        webdriver.Chrome: Configured Chrome WebDriver instance
    """
    chrome_options = Options()
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_argument('--window-size=1920,1080')
    chrome_options.add_argument('--disable-blink-features=AutomationControlled')
    chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
    chrome_options.add_experimental_option('useAutomationExtension', False)
    
    driver = webdriver.Chrome(options=chrome_options)
    
    # Hide automation detection from websites
    driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
        'source': 'Object.defineProperty(navigator, "webdriver", {get: () => undefined})'
    })
    
    return driver


def get_review_url(asin, page_number):
    """
    Constructs the URL for a specific page of Amazon product reviews.
    
    Parameters:
        asin (str): The Amazon Standard Identification Number of the product
        page_number (int): The page number of reviews to fetch
    
    Returns:
        str: The complete URL for the review page
    """
    url = (f'https://www.amazon.com/product-reviews/{asin}'
           f'/ref=cm_cr_arp_d_paging_btm_next_{page_number}'
           f'?ie=UTF8&reviewerType=all_reviews&pageNumber={page_number}')
    return url


def scroll_page(driver):
    """
    Scrolls down the page gradually to trigger lazy-loading of content.
    Amazon loads review content dynamically as the user scrolls.
    
    Parameters:
        driver (webdriver.Chrome): The Selenium WebDriver instance
    """
    for i in range(5):
        driver.execute_script(f"window.scrollTo(0, {500 + i * 500});")
        time.sleep(1)


def extract_reviews(soup):
    """
    Extracts reviewer name, date, and content from a parsed Amazon review page.
    Uses the 'customer_review' ID prefix to find review containers,
    'a-profile-name' class for reviewer names, 'review-date' data-hook for dates,
    and 'review-text-content' class for review body text.
    
    Parameters:
        soup (BeautifulSoup): Parsed HTML of the review page
    
    Returns:
        list: A list of dictionaries with reviewer_name, review_date, review_content
    """
    reviews = []
    
    # Find all review containers
    review_divs = soup.find_all('div', id=lambda x: x and x.startswith('customer_review'))
    
    for review_div in review_divs:
        try:
            # Extract reviewer name from the profile section
            name_tag = review_div.find('span', class_='a-profile-name')
            reviewer_name = name_tag.text.strip() if name_tag else 'Anonymous'
            
            # Extract review date
            date_tag = review_div.find('span', {'data-hook': 'review-date'})
            review_date = date_tag.text.strip() if date_tag else 'No date'
            if ' on ' in review_date:
                review_date = review_date.split(' on ')[-1]
            
            # Extract review content using the updated class name
            body_tag = review_div.find('span', class_='review-text-content')
            review_content = body_tag.text.strip() if body_tag else ''
            
            # Only include reviews that have actual text content
            if review_content:
                reviews.append({
                    'reviewer_name': reviewer_name,
                    'review_date': review_date,
                    'review_content': review_content
                })
                
        except Exception as e:
            print(f"  Error extracting a review: {e}")
            continue
    
    return reviews


def scrape_all_reviews(asin, max_pages=5):
    """
    Main scraping function. Opens Amazon sign-in page for user to log in,
    then navigates through review pages and extracts all review data.
    
    Parameters:
        asin (str): The Amazon product ASIN
        max_pages (int): Maximum number of pages to scrape (default: 5)
    
    Returns:
        list: A combined list of all review dictionaries from all pages
    """
    all_reviews = []
    
    print("Setting up browser...")
    driver = setup_driver()
    
    try:
        # Open Amazon sign-in page for user to log in
        print("Opening Amazon sign-in page...")
        driver.get('https://www.amazon.com/ap/signin?openid.pape.max_auth_age=0'
                    '&openid.return_to=https%3A%2F%2Fwww.amazon.com%2F'
                    '&openid.identity=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0%2Fidentifier_select'
                    '&openid.assoc_handle=usflex'
                    '&openid.mode=checkid_setup'
                    '&openid.claimed_id=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0%2Fidentifier_select'
                    '&openid.ns=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0')
        
        # Wait for user to complete login
        print("\n" + "=" * 50)
        print("PLEASE LOG IN TO AMAZON IN THE CHROME WINDOW.")
        print("The script will wait and continue automatically.")
        print("=" * 50 + "\n")
        
        while True:
            current_url = driver.current_url.lower()
            page_title = driver.title.lower()
            is_still_login = any(keyword in current_url for keyword in 
                                ['signin', 'captcha', 'validatecaptcha', 'ap/signin'])
            is_still_login = is_still_login or any(keyword in page_title for keyword in 
                                ['sign-in', 'sign in', 'captcha', 'verification'])
            if not is_still_login:
                print("Login successful! Starting to scrape reviews...\n")
                break
            time.sleep(3)
        
        time.sleep(3)
        
        # Scrape reviews page by page
        for page in range(1, max_pages + 1):
            print(f"Scraping page {page} of {max_pages}...")
            
            url = get_review_url(asin, page)
            print(f"  URL: {url}")
            
            # Load the review page
            driver.get(url)
            time.sleep(4)
            
            # Scroll down to load all review content
            scroll_page(driver)
            time.sleep(3)
            
            # Parse the page
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            
            # Extract reviews
            page_reviews = extract_reviews(soup)
            
            if not page_reviews:
                print(f"  No reviews found on page {page}. Stopping.")
                break
            
            all_reviews.extend(page_reviews)
            print(f"  Extracted {len(page_reviews)} reviews from page {page}.")
            print(f"  Total reviews collected so far: {len(all_reviews)}")
            
            # Random delay between pages to avoid detection
            if page < max_pages:
                delay = random.uniform(3, 6)
                print(f"  Waiting {delay:.1f} seconds...\n")
                time.sleep(delay)
    
    finally:
        driver.quit()
        print("\nBrowser closed.")
    
    return all_reviews


def save_to_csv(reviews, filename='amazon_reviews.csv'):
    """
    Saves the collected reviews to a CSV file using pandas.
    
    Parameters:
        reviews (list): List of review dictionaries
        filename (str): Output CSV filename (default: 'amazon_reviews.csv')
    
    Returns:
        DataFrame: The pandas DataFrame created from the reviews
    """
    df = pd.DataFrame(reviews)
    df.to_csv(filename, index=False, encoding='utf-8-sig')
    print(f"\nData saved to '{filename}'")
    print(f"Total records saved: {len(df)}")
    return df


# --- Execute Scraping ---
print("=" * 50)
print("Amazon Review Scraper (Selenium)")
print(f"Product ASIN: {ASIN} | Pages: {MAX_PAGES}")
print("=" * 50)

all_reviews = scrape_all_reviews(ASIN, MAX_PAGES)
print(f"\n{'='*50}")
print(f"Scraping complete! Total reviews collected: {len(all_reviews)}")

# --- 5. Save to CSV ---
df_reviews = save_to_csv(all_reviews, 'amazon_reviews.csv')

# --- 6. Preview the Data ---
print(f"\nShape of dataset: {df_reviews.shape}")
print(f"Columns: {list(df_reviews.columns)}")
print(f"\nNull values:\n{df_reviews.isnull().sum()}")
print(f"\nFirst 10 rows:")
df_reviews.head(10)

Amazon Review Scraper (Selenium)
Product ASIN: B09MQWWP87 | Pages: 5
Setting up browser...
Opening Amazon sign-in page...

PLEASE LOG IN TO AMAZON IN THE CHROME WINDOW.
The script will wait and continue automatically.

Login successful! Starting to scrape reviews...

Scraping page 1 of 5...
  URL: https://www.amazon.com/product-reviews/B09MQWWP87/ref=cm_cr_arp_d_paging_btm_next_1?ie=UTF8&reviewerType=all_reviews&pageNumber=1
  Extracted 8 reviews from page 1.
  Total reviews collected so far: 8
  Waiting 4.5 seconds...

Scraping page 2 of 5...
  URL: https://www.amazon.com/product-reviews/B09MQWWP87/ref=cm_cr_arp_d_paging_btm_next_2?ie=UTF8&reviewerType=all_reviews&pageNumber=2
  Extracted 8 reviews from page 2.
  Total reviews collected so far: 16
  Waiting 5.2 seconds...

Scraping page 3 of 5...
  URL: https://www.amazon.com/product-reviews/B09MQWWP87/ref=cm_cr_arp_d_paging_btm_next_3?ie=UTF8&reviewerType=all_reviews&pageNumber=3
  Extracted 8 reviews from page 3.
  Total reviews col

Unnamed: 0,reviewer_name,review_date,review_content
0,Amanda_D,"February 19, 2026",Bought two of these for my kids awhile back. T...
1,Chonnn,"February 18, 2026",Love the color. Love the size. So many pockets...
2,Amazon Customer,"February 22, 2026",I bought this bag last minute before my travel...
3,sandra smith,"February 7, 2026","This is an amazing bag for travel , work etc. ..."
4,Kritzia Lopez,"February 25, 2026",Exactly as depicted in the image and descripti...
5,M MADSON,"January 5, 2026",This backpack/bag is huge! It has one pocket f...
6,Astghik Ohanyan,"January 12, 2026",I have used many travel backpacks designed for...
7,happykbd26,"February 5, 2026",Absolutely a great buy! Can fit so much while...
8,Amanda_D,"February 19, 2026",Bought two of these for my kids awhile back. T...
9,Chonnn,"February 18, 2026",Love the color. Love the size. So many pockets...
