# **Import Libraries**

In [10]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from time import sleep
from bs4 import BeautifulSoup
import pandas as pd
from selenium.common.exceptions import NoSuchElementException, TimeoutException

In [12]:
def get_product_urls(url_main, total_pages):
    """
    Collect product links from a category listing using Selenium.

    The query string of ``url_main`` is discarded and each listing page is
    requested as ``<base>?ob=5&page=<n>`` for ``n`` from 1 to ``total_pages``.
    Every page is scrolled step by step before its HTML is parsed, and the
    ``href`` of the first ``<a>`` inside each product container is collected.
    Links containing the promo click-tracking prefix are skipped.  Paging
    stops early at the first page on which no product container is found.

    Args:
        url_main: URL of the category listing page.
        total_pages: Number of listing pages to visit (chosen manually).

    Returns:
        A list with the collected ``href`` values, in the order encountered.
    """
    promo_prefix = "https://ta.tokopedia.com/promo/v1/clicks/"
    listing_root = url_main.partition('?')[0]  # base URL without its parameters
    collected = []

    # chromedriver must already be available on PATH.
    browser = webdriver.Chrome()

    try:
        for page in range(1, total_pages + 1):
            page_url = f"{listing_root}?ob=5&page={page}"
            print(f"Processing page {page}/{total_pages}: {page_url}")

            browser.get(page_url)
            sleep(3)

            # Scroll down in small steps, pausing after each one, before the
            # page source is captured.
            for _ in range(20):
                browser.execute_script("window.scrollBy(0, 250)")
                sleep(1)

            # Parse the HTML as it stands once scrolling has finished.
            parsed = BeautifulSoup(browser.page_source, "html.parser")

            # Product containers are located by their CSS class.
            containers = parsed.find_all('div', {"class": "css-bk6tzz e1nlzfl2"})
            if not containers:
                print(f"  No products found on page {page}. Stopping scraping for this category.")
                break

            for container in containers:
                anchor = container.find('a')
                if not anchor:
                    continue
                link = anchor.get('href')
                if not link:
                    continue

                if promo_prefix in link:
                    print(f"Skipping promo URL: {link}")
                    continue

                collected.append(link)
                print(f"URL added: {link}")
    finally:
        # Close the browser whether or not scraping succeeded.
        browser.quit()

    print(f"Total product URLs collected: {len(collected)}")
    return collected

In [13]:
def scrape_reviews_and_ratings(product_urls):
    """
    Scrape review texts and rating labels from a list of product pages.

    For each URL the page is opened in a single shared Chrome session and
    scrolled step by step; then every ``div`` with class ``css-1k41fl7`` is
    read as one review entry.  The review text comes from the ``span`` with
    ``data-testid="lblItemUlasan"`` and the rating from the ``aria-label`` of
    the ``div`` with class ``rating``; the string ``'None'`` is stored when
    either element is absent.  The "next page" pagination button is clicked
    repeatedly until it is disabled or cannot be found.

    Args:
        product_urls: Iterable of product page URLs to visit.

    Returns:
        A pandas DataFrame with the columns ``Review`` and ``Rating``.
    """
    next_button_xpath = (
        './/button[contains(@class, "css-16uzo3v-unf-pagination-item") and @aria-label="Laman berikutnya"]'
    )
    review_texts = []
    rating_labels = []

    browser = webdriver.Chrome()

    try:
        for product_url in product_urls:
            print(f"Processing: {product_url}")
            browser.get(product_url)

            has_more_pages = True
            while has_more_pages:
                # Scroll down in small steps, pausing after each one, before
                # the page source is captured.
                for _ in range(20):
                    browser.execute_script("window.scrollBy(0, 250)")
                    sleep(1)

                parsed = BeautifulSoup(browser.page_source, "html.parser")

                # One entry per review container; missing parts become 'None'.
                for card in parsed.find_all('div', {"class": "css-1k41fl7"}):
                    text_node = card.find('span', {"data-testid": "lblItemUlasan"})
                    if text_node:
                        review_texts.append(text_node.get_text())
                    else:
                        review_texts.append('None')

                    rating_node = card.find('div', {"class": "rating"})
                    if rating_node:
                        rating_labels.append(rating_node.get('aria-label'))
                    else:
                        rating_labels.append('None')

                # Advance to the next review page, or finish this product when
                # the button is disabled or cannot be located.
                try:
                    pager = browser.find_element(By.CLASS_NAME, "css-1xqkwi8")
                    next_button = pager.find_element(By.XPATH, next_button_xpath)

                    if next_button.get_attribute("disabled"):
                        print("No more pages to navigate for this product.")
                        has_more_pages = False
                    else:
                        # Bring the button into view and click it via JavaScript.
                        browser.execute_script(
                            "arguments[0].scrollIntoView({block: 'center'});", next_button
                        )
                        browser.execute_script("arguments[0].click();", next_button)
                        sleep(2)
                except (NoSuchElementException, TimeoutException):
                    print("No 'Next' button found. Moving to next product.")
                    has_more_pages = False

    finally:
        # Close the browser whether or not scraping succeeded.
        browser.quit()

    print(f"Scraped {len(review_texts)} reviews and {len(rating_labels)} ratings.")
    return pd.DataFrame({'Review': review_texts, 'Rating': rating_labels})

In [14]:
def main(url_main, total_pages=105):
    """
    Scrape reviews and ratings for every product in the given categories.

    Product URLs are gathered from *all* category URLs before the review
    scrape starts.  (Previously the list was overwritten on each iteration,
    so only the last category was scraped, and an empty ``url_main`` raised
    ``UnboundLocalError``.)

    Args:
        url_main: A list of category listing URLs.  A single URL string is
            also accepted and treated as a one-element list.
        total_pages: Number of listing pages to visit per category.
            Defaults to 105, the value that was previously hard-coded.

    Returns:
        The DataFrame returned by ``scrape_reviews_and_ratings`` for the
        combined list of product URLs.
    """
    # Iterating over a bare string would yield single characters, not URLs.
    if isinstance(url_main, str):
        url_main = [url_main]

    # Accumulate across categories instead of rebinding the list each time.
    product_urls = []
    for url in url_main:
        product_urls.extend(get_product_urls(url, total_pages))

    # Scrape reviews and ratings for each collected product.
    data = scrape_reviews_and_ratings(product_urls)

    return data

In [None]:
# Category listing pages to scrape; replace with your own link(s).
urlss = [
    'https://www.tokopedia.com/p/fashion-pria/sepatu-pria?page=1&rt=1,2,3&ob=5',
]

data = main(urlss)

# Write the results without the index column; change the CSV file name as needed.
data.to_csv("reviewsnrating123.csv", index=False)