In [2]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from pythainlp.util import isthai
from bs4 import BeautifulSoup
import pandas as pd
import time
import os
import re

In [2]:
def scrape_comments(url, num_comments, star_rating=None):
    """Scrape product reviews from a Lazada page into a DataFrame and an Excel file.

    Opens ``url`` in Chrome through Selenium, optionally applies the star-rating
    filter, then walks the review pages until ``num_comments`` valid comments
    have been collected or no further page can be loaded.

    Parameters
    ----------
    url : str
        Address of the product page to open.
    num_comments : int
        Maximum number of comments to collect.
    star_rating : int or None, optional
        Star rating to filter by. It also fixes the ``Sentiment`` label of
        every row: 4-5 -> "Positive", 3 -> "Neutrally", 0-2 -> "Negative",
        anything else -> "-". ``None`` (default) leaves the reviews unfiltered
        and skips the filter menu entirely.

    Returns
    -------
    pandas.DataFrame
        One row per kept comment with the columns ``User``, ``Message`` and
        ``Sentiment``; an empty frame when nothing was collected.

    Raises
    ------
    IndexError
        If ``star_rating`` does not map to an entry of the rating menu.
    selenium.common.exceptions.TimeoutException
        If a rating was requested and the rating menu does not appear within
        10 seconds.

    Notes
    -----
    * A non-empty result is also written to ``~/Downloads/comments.xlsx``.
    * The browser session is closed even when scraping fails part-way.
    * A comment is kept only if its text consists solely of Thai-block
      characters, whitespace, digits and non-word characters.
    * NOTE(review): the class names used as locators ("oper", "condition",
      "next-menu-content", "next-menu-item", "item", "middle", "content",
      "next") are assumptions about the page markup -- confirm against the
      live page.
    """
    page_wait_seconds = 5  # fixed pause after each navigation/click; adjust as needed

    # The label depends only on the requested rating, so compute it once.
    if star_rating in [4, 5]:
        sentiment = "Positive"
    elif star_rating == 3:
        sentiment = "Neutrally"
    elif star_rating in [0, 1, 2]:
        sentiment = "Negative"
    else:
        sentiment = "-"

    # Thai characters, spaces, digits, and special characters (compiled once).
    thai_pattern = re.compile(r'[\u0E00-\u0E7F\s\d\W]+')

    def is_valid_comment(comment):
        """Return True when the whole comment matches ``thai_pattern``."""
        return thai_pattern.fullmatch(comment) is not None

    def parse_items(page_source):
        """Return a ``(user, content)`` pair for every review item in the HTML."""
        soup = BeautifulSoup(page_source, 'html.parser')
        pairs = []
        for item in soup.find_all('div', class_='item'):
            # Any of these elements may be missing; fall back to "" rather
            # than failing on a None lookup.
            middle = item.find('div', class_='middle')
            user_element = middle.find('span') if middle is not None else None
            message_element = item.find('div', class_='content')

            user = user_element.text.strip() if user_element is not None else ""
            content = message_element.text.strip() if message_element is not None else ""
            pairs.append((user, content))
        return pairs

    comment_data = []

    # Start a Selenium WebDriver session
    driver = webdriver.Chrome()
    try:
        # Navigate to the product page and give it time to load
        driver.get(url)
        time.sleep(page_wait_seconds)

        if star_rating is not None:
            # Open the rating filter; a failure is reported but not fatal here
            # because the menu wait below decides whether filtering can proceed.
            try:
                filter_container = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CLASS_NAME, "oper"))
                )
                filter_option = filter_container.find_element(By.XPATH, ".//span[@class='condition']")
                filter_option.click()
            except Exception as e:
                print(f"Failed to click on filter option: {e}")

            time.sleep(page_wait_seconds)

            # Find the star rating filter options
            star_filter_options = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, "next-menu-content"))
            )
            star_options = star_filter_options.find_elements(By.CLASS_NAME, "next-menu-item")

            # presumably the menu lists ratings from 5 stars downwards -- TODO confirm
            option_index = 5 - star_rating
            # Reject negative indices too: they would silently wrap around and
            # click an unrelated menu entry.
            if not 0 <= option_index < len(star_options):
                raise IndexError(
                    f"star_rating={star_rating!r} has no matching filter option "
                    f"({len(star_options)} options available)"
                )
            star_options[option_index].click()

            # Wait for the page to reload with the filtered comments
            time.sleep(page_wait_seconds)

        page_items = parse_items(driver.page_source)
        print("Number of comments found:", len(page_items))  # Check number of comments found

        previous_items = None
        while len(comment_data) < num_comments:
            # A "next" click that leaves the page unchanged (e.g. a disabled
            # button on the last page) would otherwise re-collect the same
            # comments forever.
            if page_items == previous_items:
                print("Failed to fetch more comments: next page returned the same comments")
                break

            for user, content in page_items:
                if len(comment_data) >= num_comments:
                    break
                # Keep only comments with a user, a message, and valid text
                if user and content and is_valid_comment(content):
                    comment_data.append({'User': user, 'Message': content, 'Sentiment': sentiment})
                    print(f"Comment {len(comment_data)}: {user} - {content} - {sentiment}")

            if len(comment_data) >= num_comments:
                break

            try:
                # Click on the next page button
                next_button = driver.find_element(By.CLASS_NAME, 'next')
                next_button.click()

                # Wait for the page to reload with the next set of comments
                time.sleep(page_wait_seconds)

                previous_items = page_items
                page_items = parse_items(driver.page_source)

                print("Total comments collected:", len(comment_data))

            except Exception as e:
                # Best effort: running out of pages ends the scrape normally.
                print(f"Failed to fetch more comments: {e}")
                break
    finally:
        # Close the WebDriver session even if scraping raised
        driver.quit()

    # Create a DataFrame from the list of dictionaries
    df = pd.DataFrame(comment_data)

    # Check if there are comments to be saved to Excel
    if not df.empty:
        # Get the path to the Downloads folder, creating it if it is missing
        downloads_path = os.path.join(os.path.expanduser("~"), "Downloads")
        os.makedirs(downloads_path, exist_ok=True)

        # Save DataFrame to Excel file in the Downloads folder
        excel_file_path = os.path.join(downloads_path, "comments.xlsx")
        df.to_excel(excel_file_path, index=False, na_rep='-')
        print("Comments saved to Excel.")

    return df

In [3]:
# Lazada (lazada.co.th) product page whose reviews are passed to scrape_comments.
product_url = "https://www.lazada.co.th/products/-i3546818528-s16377435692.html"

In [4]:
# Collect up to 30 comments from the 3-star filter; scrape_comments labels
# every row of a 3-star run as "Neutrally". Opens a Chrome window while running.
comments_df = scrape_comments(product_url, num_comments=30, star_rating=3)

Number of comments found: 5
Comment 1: จ***. - จัดส่งไว แพ็คมาดี โดยรวมโอเคร ขอลองใช้ก่อน ดีจะกลับมาสั่งอีกค่ะ - Neutrally
Comment 2: ‡∏´‡∏•‡∏∏‡∏¢‡∏™‡πå ‡∏à. - ‡πÑ‡∏°‡πà‡∏ô‡πà‡∏≤‡πÉ‡∏ä‡πà‡∏™‡πÅ‡∏ï‡∏ô‡πÄ‡∏•‡∏™ ‡πÅ‡∏ï‡πà‡πÄ‡∏õ‡πá‡∏ô‡πÄ‡∏´‡∏•‡πá‡∏Å‡πÄ‡∏Ñ‡∏•‡∏∑‡∏≠‡∏ö‡∏™‡∏µ ‡∏Ñ‡∏ß‡∏≤‡∏°‡∏ó‡∏ô‡∏ó‡∏≤‡∏ô‡∏û‡∏≠‡∏™‡∏°‡∏Ñ‡∏ß‡∏£‡∏ï‡∏≤‡∏°‡∏£‡∏≤‡∏Ñ‡∏≤ ‡πÅ‡∏ï‡πà‡∏Å‡πá‡∏ñ‡∏∑‡∏≠‡∏ß‡πà‡∏≤‡∏£‡∏≤‡∏Ñ‡∏≤‡∏ñ‡∏π‡∏Å ‡∏£‡∏±‡∏ö‡∏ô‡πâ‡∏≥‡∏´‡∏ô‡∏±‡∏Å ‡πÑ‡∏î‡πâ‡∏û‡∏≠‡∏™‡∏°‡∏Ñ‡∏ß‡∏£ ‡∏ñ‡πâ‡∏≤‡∏´‡∏ô‡∏±‡∏Å‡πÄ‡∏Å‡∏¥‡∏ô‡πÑ‡∏õ ‡∏ï‡∏±‡∏ß‡∏ó‡∏µ‡πà‡∏ï‡∏¥‡∏î‡∏Å‡∏±‡∏ö‡∏ù‡∏≤‡∏ú‡∏ô‡∏±‡∏á‡∏ï‡∏±‡∏ß‡πÄ‡∏Å‡∏µ‡πà‡∏¢‡∏ß‡∏≠‡∏≤‡∏à‡∏à‡∏∞‡∏´‡∏±‡∏Å‡πÄ‡∏™‡∏µ‡∏¢‡∏Å‡πà‡∏≠‡∏ô ‡∏ï‡∏±‡∏ß‡πÄ‡∏Å‡∏µ‡πà‡∏¢‡∏ß‡πÅ‡∏Ç‡∏ß‡∏ô‡∏ï‡∏¥‡∏î‡∏ú‡∏ô‡∏±‡∏á ‡πÑ‡∏î‡πâ‡πÉ‡∏ä‡πâ‡∏Å‡∏±‡∏ö‡∏ä‡∏±‡πâ‡∏ô‡πÄ‡∏Ç‡πâ‡∏≤‡∏°‡∏∏‡∏°  2 ‡∏Ç‡πâ‡∏≤‡∏á‡∏Ç‡πâ‡∏≤‡∏á‡∏•‡∏∞ 1 ‡∏ï‡∏±‡∏ß ‡∏ï‡∏±‡∏ß‡πÅ‡∏ö‡∏ö‡πÄ‡∏î‡∏µ‡πà‡∏¢‡∏ß ‡πÑ‡∏°‡πà‡∏ô‡πà‡∏≤‡∏à‡∏∞‡∏£‡∏±‡∏ö‡∏ô‡πâ‡∏≥‡∏´‡∏ô