# **Import Libraries**

In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from time import sleep
from bs4 import BeautifulSoup
import pandas as pd
from selenium.common.exceptions import NoSuchElementException, TimeoutException

In [2]:
def get_product_urls(url_main, scroll_steps=20, scroll_px=250, scroll_pause=1):
    """Collect product-page links from a listing page.

    Opens ``url_main`` in a fresh Chrome session, scrolls down in small steps
    so that more product cards get a chance to load, then parses the rendered
    HTML and takes the ``href`` of the first ``<a>`` inside every product card.

    Parameters
    ----------
    url_main : str
        URL of the listing (category) page to open.
    scroll_steps : int, default 20
        How many times the page is scrolled down.
    scroll_px : int, default 250
        Vertical distance, in pixels, of each scroll step.
    scroll_pause : float, default 1
        Seconds to sleep after each scroll step.

    Returns
    -------
    list of str
        Product URLs in page order. Relative links are resolved against
        ``url_main`` and duplicates are dropped (first occurrence wins), so a
        product is not scraped twice downstream.
    """
    # Local import so the function works without touching the notebook's import cell.
    from urllib.parse import urljoin

    product_urls = []
    seen = set()
    driver = webdriver.Chrome()

    try:
        driver.get(url_main)
        sleep(2)  # fixed pause before scrolling starts

        # Scroll step by step; the pause gives newly revealed cards time to render.
        for _ in range(scroll_steps):
            driver.execute_script("window.scrollBy(0, arguments[0])", scroll_px)
            sleep(scroll_pause)

        # Parse the DOM as it looks after scrolling.
        soup = BeautifulSoup(driver.page_source, "html.parser")

        # NOTE(review): these class names look auto-generated and may change
        # when the site is redeployed — confirm if this returns nothing.
        product_divs = soup.find_all('div', {"class": "css-bk6tzz e1nlzfl2"})

        for div in product_divs:
            a_tag = div.find('a')  # first link inside the card
            href = a_tag.get('href') if a_tag else None
            if not href:
                continue

            # urljoin leaves absolute URLs untouched and resolves relative ones.
            absolute_url = urljoin(url_main, href)
            if absolute_url not in seen:
                seen.add(absolute_url)
                product_urls.append(absolute_url)

    finally:
        # Always release the browser, even if loading or parsing failed.
        driver.quit()

    return product_urls

In [3]:
def scrape_reviews_and_ratings(product_urls):
    """Scrape review texts and rating labels from a list of product pages.

    One Chrome session is reused for all products. For each product the page is
    scrolled, every review card in the rendered HTML is parsed, and the
    "next page" pagination button is clicked until it is missing or disabled.

    Parameters
    ----------
    product_urls : iterable of str
        Product page URLs to visit, in order.

    Returns
    -------
    pandas.DataFrame
        Columns ``Review`` (review text) and ``Rating`` (the rating element's
        ``aria-label``). A card without the respective element yields the
        string ``'None'`` in that column, so both columns stay aligned.
    """
    reviews = []
    ratings = []

    driver = webdriver.Chrome()

    try:
        for product_url in product_urls:
            print(f"Processing: {product_url}")
            driver.get(product_url)

            while True:
                # Scroll in small steps so the review section has time to load.
                for _ in range(20):
                    driver.execute_script("window.scrollBy(0, 250)")
                    sleep(1)

                soup = BeautifulSoup(driver.page_source, "html.parser")

                # Each matching div is treated as one review card.
                for product in soup.find_all('div', {"class": "css-1k41fl7"}):
                    review_element = product.find('span', {"data-testid": "lblItemUlasan"})
                    reviews.append(review_element.get_text() if review_element else 'None')

                    rating_element = product.find('div', {"class": "rating"})
                    ratings.append(rating_element.get('aria-label') if rating_element else 'None')

                try:
                    # Wait for the pagination container, then find the "next page" button in it.
                    next_button_container = WebDriverWait(driver, 10).until(
                        EC.presence_of_element_located((By.CLASS_NAME, "css-1xqkwi8"))
                    )
                    next_button = next_button_container.find_element(
                        By.XPATH, './/button[contains(@class, "css-16uzo3v-unf-pagination-item") and @aria-label="Laman berikutnya"]'
                    )

                    # A disabled button can be signalled by the HTML `disabled`
                    # attribute or by a class token. Check both: a JS click on a
                    # disabled button does nothing, and missing that state would
                    # re-scrape the same page forever.
                    # get_attribute may return None, hence the `or ''`.
                    button_classes = next_button.get_attribute('class') or ''
                    is_disabled = bool(next_button.get_attribute('disabled')) or (
                        'css-1xqkwi8--disabled' in button_classes
                    )

                    if is_disabled:
                        print("Next button is disabled or not clickable. Moving to next product.")
                        break  # Move to the next product URL

                    # Click through JavaScript after centring the button in the viewport.
                    driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", next_button)
                    driver.execute_script("arguments[0].click();", next_button)
                    sleep(2)
                except (NoSuchElementException, TimeoutException):
                    print("Next button is not found or the page has no more reviews. Moving to next product.")
                    break  # Move to the next product URL

    finally:
        # Always release the browser, even if a page failed midway.
        driver.quit()

    print(f"Scraped {len(reviews)} reviews and {len(ratings)} ratings.")  # Debugging output
    data = pd.DataFrame({'Review': reviews, 'Rating': ratings})
    return data

In [4]:
def main(url_main, output_path="reviews.csv"):
    """Run the full pipeline: listing pages -> product URLs -> reviews table.

    Parameters
    ----------
    url_main : str or iterable of str
        One listing-page URL, or several. A single string is treated as a
        one-element list rather than being iterated character by character.
    output_path : str or None, default "reviews.csv"
        Where to write the result as CSV (without the index). Pass ``None`` to
        skip writing a file.

    Returns
    -------
    pandas.DataFrame
        The table returned by ``scrape_reviews_and_ratings``.
    """
    if isinstance(url_main, str):
        url_main = [url_main]

    # Gather product links from every listing page into one flat list.
    urls = []
    for url in url_main:
        urls.extend(get_product_urls(url))

    # Scrape reviews and ratings for each product.
    data = scrape_reviews_and_ratings(urls)

    # Saving is optional: only write when a destination was given.
    if output_path is not None:
        data.to_csv(output_path, index=False)
    return data

In [None]:
# Listing pages to crawl for product links.
urlss = ['https://www.tokopedia.com/p/fashion-pria/sepatu-pria/sneakers-pria']

# Accumulate the product links of ALL listing pages here.
urls = []

for url in urlss:
    # Get the product URLs from the main page
    product_urls = get_product_urls(url)
    urls.extend(product_urls)

# Parallel lists: one entry per review card, appended in lockstep below.
reviews = []
ratings = []

driver = webdriver.Chrome()

try:
    # Iterate the accumulated `urls`, not `product_urls`: the latter only holds
    # the links of the last listing page (and is undefined if `urlss` is empty).
    for product_url in urls:
        print(f"Processing: {product_url}")
        driver.get(product_url)

        while True:
            # Scroll the page to load all reviews
            for _ in range(20):
                driver.execute_script("window.scrollBy(0, 250)")
                sleep(1)

            html = driver.page_source
            soup = BeautifulSoup(html, "html.parser")

            # Extract reviews and ratings; missing elements become the string 'None'
            # so both lists keep the same length.
            for product in soup.find_all('div', {"class": "css-1k41fl7"}):
                review_element = product.find('span', {"data-testid": "lblItemUlasan"})
                reviews.append(review_element.get_text() if review_element else 'None')

                rating_element = product.find('div', {"class": "rating"})
                ratings.append(rating_element.get('aria-label') if rating_element else 'None')

            # Check if "Next" button exists and is enabled
            try:
                next_button_container = driver.find_element(By.CLASS_NAME, "css-1xqkwi8")
                next_button = next_button_container.find_element(
                    By.XPATH, './/button[contains(@class, "css-16uzo3v-unf-pagination-item") and @aria-label="Laman berikutnya"]'
                )
                is_disabled = next_button.get_attribute("disabled")  # Check if button is disabled
                if is_disabled:
                    print("No more pages to navigate for this product.")
                    break

                # Scroll to and click the next button
                driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", next_button)
                driver.execute_script("arguments[0].click();", next_button)
                sleep(2)
            except (NoSuchElementException, TimeoutException):
                print("No 'Next' button found. Moving to next product.")
                break

finally:
    # Always release the browser, even if scraping failed midway.
    driver.quit()


Processing: https://ta.tokopedia.com/promo/v1/clicks/8a-xgVY2gmUEH_jFoprdopHRbm-xgVY789CBUsthbm-Orfua9fBjUstiHmUDUSz2Q31i6sJRHpeFosjRo_rDUMoFyaUEH_eFHmFircYpq9z2Qfdi6sJDUMVDgaUEUMoxPcuSQR-N9f-N9RCaQfzOyReibm-XP3Oig9-wQfgwy3zpUsthoZFiQSuWyMua9fVjrOYag9Ji6sJObm-sy9zwq3zpUs2QH_rO6ZFh6AnFbAJdoAuPbm-X9foxQMz2gcV7guYxgIHi6sJ7HmFiy3-wPcupPmUEUjdibm-FQRo79fVDgaUEUMzBgiUDUSgBrSo2Qfdi6i-fHiUDUMVj9RosQR-BUstpHsJX6Ara6_eRHsnaHA1RHiFirpowQcYSUstig9BGqMzUZMggQj2fgAo6QJBkQfBoe7BpZ3O6HcoD692qu7gN3_-Sq1Y2Z9xHqMWE_BzzoJOEgpoozcDa_S2sHJO2Z9o-Q9zDguxjPMoW1MgsHjNfyfOuq1Y2Z9P-q9P2yM7NPujau3Bvq1BE3BzSo1Y9__og1pn7_32uoJh11_V6zVu73uzC8JNAHAzoQugW3BPo8B2939x6Hju2_JoGPMoWQcNxupuM3jP3POKaQcW-qMY2_1o-r7BW69BxufzFyMFNPfoW63Wju7dF3A-Dq7BkQfBoe7BpZ37N83V9gICiQABEy1rNPOKaQcW-qMY2_1o-r7BXzsVq3JtO3AoZqVtp_3Bvq12M_uzSHJN36IPy8j-M_1yNgjOc6IPy8j-M_1o-r7BX_M2iH72D3A-G83UpgI2q17jfZ32o81O_oAz68jjO_jzs81OJZ9P6ucoNZ3BRq3Ha_SgsQugMyp-3qcoW_MY-qMY2_92s81hk1_uvzJ7p_uzuo1Y1ypzvuVV2gBJYvZUDUMoDP9o7g9-wq3zwPsUi6seaoiFirI-2yfuwyMBjUst

In [7]:
# Report how many items were collected from the `reviews` and `ratings` lists.
print(f"Scraped {len(reviews)} reviews and {len(ratings)} ratings.")
# Pair each review text with its rating label, one row per list position.
data = pd.DataFrame({'Review': reviews, 'Rating': ratings})

Scraped 64 reviews and 64 ratings.


In [11]:
# Preview the last 20 rows of the scraped table as a quick sanity check.
data.tail(20)

Unnamed: 0,Review,Rating
44,Modelnya keren! Bahan nya nyaman dipakai pas d...,bintang 5
45,menurut saya pribadi sih produk sangat oke kua...,bintang 4
46,packingnya baik walaupun ya lakbannya gak nutu...,bintang 4
47,"baru pertama kali nyoba brand lokal, dan menco...",bintang 5
48,"Barang sudah sampai, Salut Box nya nyampe sini...",bintang 5
49,Mantap min barang sesuai ekspektasi💯 pokoknya ...,bintang 5
50,"mantap model nya cakep lah, untuk kenyamananny...",bintang 5
51,enak dipakai tapi ya masih ada kurangnya dibag...,bintang 5
52,"barang bagus sesuai,empuk pas dipake,packaging...",bintang 5
53,"satu kata, mantabb\n\nbahan oke\nkenyamanan ok...",bintang 5
