In [3]:
import os
import time

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Setup Firefox WebDriver
# Location of the geckodriver binary. Set the GECKODRIVER_PATH environment
# variable to override it without editing the notebook.
GECKODRIVER_PATH = os.environ.get("GECKODRIVER_PATH", "/usr/local/bin/geckodriver")
# Fixed pause (seconds) after the initial navigation before scraping starts.
PAGE_LOAD_SECONDS = 20

options = Options()
# options.add_argument("--headless")  # Enable headless if needed; without a visible window you may not know if something is blocking the page
service = Service(GECKODRIVER_PATH)
driver = webdriver.Firefox(service=service, options=options)

url = "https://www.tripadvisor.co.id/Attraction_Review-g32780-d4135855-Reviews-Inspiration_Point-Newport_Beach_California.html"  # this is where you put the link
try:
    driver.get(url)
except Exception:
    # Do not leave an orphaned browser process behind if navigation fails.
    driver.quit()
    raise
time.sleep(PAGE_LOAD_SECONDS)  # Allow time for page load

reviews = []

In [4]:
CSV_PATH = "reviews_scraping.csv"
# Selectors shared by the Selenium waits and the BeautifulSoup parsing.
REVIEW_CARD_CSS = 'div[data-automation="reviewCard"]'
NEXT_PAGE_XPATH = '//a[@aria-label="Next page"]'

reviews = []          # every parsed review, accumulated across pages
page = 1              # 1-based number of the page currently being scraped
pages_scraped = 0     # pages actually written to CSV_PATH
wait = WebDriverWait(driver, 10)


def _parse_review_card(card):
    """Turn one review-card element (a BeautifulSoup tag) into a flat dict.

    Returns a dict with the keys ``author``, ``title``, ``date``,
    ``location``, ``rating`` and ``review``. A field whose element is not
    found in the card is reported as "N/A", except ``location``, which is an
    empty string when absent.
    """
    # NOTE(review): these class names look auto-generated and presumably
    # change between site builds; verify them if every field comes back "N/A".
    author_el = card.select_one('span.biGQs._P.fiohW.fOtGX a')
    title_el = card.select_one('div.fiohW.qWPrE a span.yCeTE')
    review_el = card.select_one('div.fIrGe span.yCeTE')
    date_el = card.select_one('div.RpeCd')
    rating_el = card.select_one('svg.evwcZ title')

    if date_el:
        # The element's text is split on "•": first part is the date,
        # the optional second part is the location.
        parts = date_el.get_text(strip=True).split("•")
        date = parts[0].strip()
        loc = parts[1].strip() if len(parts) > 1 else ""
    else:
        date, loc = "N/A", ""

    return {
        "author":   author_el.text.strip() if author_el else "N/A",
        "title":    title_el.text.strip() if title_el else "N/A",
        "date":     date,
        "location": loc,
        "rating":   rating_el.text.strip() if rating_el else "N/A",
        "review":   review_el.get_text(" ", strip=True) if review_el else "N/A",
    }


# Remove any existing file so headers are correct on first write
try:
    os.remove(CSV_PATH)
except FileNotFoundError:
    pass

try:
    while True:
        # Wait for reviews to load; the wait returns the located elements.
        try:
            card_elements = wait.until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, REVIEW_CARD_CSS))
            )
        except TimeoutException:
            print(f"[Page {page}] No review cards appeared before the timeout—stopping.")
            break

        # Live handle on the current page's first card; used after the click
        # to detect that the old page content has actually been replaced.
        first_card_el = card_elements[0]

        soup = BeautifulSoup(driver.page_source, "html.parser")
        cards = soup.select(REVIEW_CARD_CSS)
        print(f"[Page {page}] Found {len(cards)} review cards")

        # Scrape this page into a list of dicts
        page_data = [_parse_review_card(card) for card in cards]
        reviews.extend(page_data)

        # Append to CSV: the first write creates the file with a header row,
        # later writes append rows only.
        first_write = pages_scraped == 0
        pd.DataFrame(page_data).to_csv(
            CSV_PATH,
            mode="w" if first_write else "a",
            index=False,
            header=first_write,
            encoding="utf-8",
        )
        pages_scraped += 1

        print(f"→ Appended {len(page_data)} reviews from page {page} to {CSV_PATH}")

        # Try to click “Next page” button
        try:
            next_btn = wait.until(EC.element_to_be_clickable(
                (By.XPATH, NEXT_PAGE_XPATH)
            ))
            next_btn.click()
        except (TimeoutException, NoSuchElementException):
            print("No more pages or next button timed out—stopping.")
            break
        page += 1

        # The previous page's cards already satisfy the "presence" wait at the
        # top of the loop, so wait for them to be detached from the DOM first;
        # otherwise the same page could be scraped twice.
        try:
            wait.until(EC.staleness_of(first_card_el))
        except TimeoutException:
            # Best effort: fall back to the short fixed pause and carry on.
            print(f"[Page {page}] Previous cards were not replaced before the timeout; results may repeat.")
            time.sleep(1)
finally:
    # Clean up — runs even on KeyboardInterrupt or an unexpected error, so the
    # browser process is never left behind.
    driver.quit()

print("Done scraping. Total pages:", pages_scraped)

[Page 1] Found 10 review cards
→ Appended 10 reviews from page 1 to reviews_scraping.csv
[Page 2] Found 10 review cards
→ Appended 10 reviews from page 2 to reviews_scraping.csv
[Page 3] Found 10 review cards
→ Appended 10 reviews from page 3 to reviews_scraping.csv
[Page 4] Found 10 review cards
→ Appended 10 reviews from page 4 to reviews_scraping.csv
[Page 5] Found 10 review cards
→ Appended 10 reviews from page 5 to reviews_scraping.csv


KeyboardInterrupt: 