Data Scraper 1

In [None]:
# Run this in a terminal before using the notebook: pip install selenium beautifulsoup4 pandas webdriver-manager

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import pandas as pd
import time
import math
import re
import os

In [None]:
# Chrome launch flags: start with a maximised window and switch off the Blink
# "AutomationControlled" feature (presumably to make the automated browser
# look less like a bot to the target site).
options = Options()
for chrome_flag in (
    "--start-maximized",
    "--disable-blink-features=AutomationControlled",
):
    options.add_argument(chrome_flag)

# Default start-up. If this line fails, comment it out and uncomment the
# webdriver-manager fallback below instead.
driver = webdriver.Chrome(options=options)

# from selenium.webdriver.chrome.service import Service
# from webdriver_manager.chrome import ChromeDriverManager

# driver = webdriver.Chrome(
#     service=Service(ChromeDriverManager().install()),
#     options=options
# )

In [None]:

# Destination pages to scrape. Replace the placeholder strings with real
# destination URLs before running the scraping cell; that cell derives the
# later review pages by rewriting the "Reviews-" segment of each URL.
URLS = ["Add the Tripadvisro links", "Add the links"]

In [None]:
# --- Scraper configuration -------------------------------------------------
MAX_REVIEWS_PER_DESTINATION = 500   # upper bound on reviews collected per destination
REVIEWS_PER_PAGE = 10               # pagination offsets are built in steps of this size
INITIAL_LOAD_WAIT_SECONDS = 7       # fixed pause after opening a destination page
PAGE_LOAD_WAIT_SECONDS = 5          # fixed pause after opening each further review page
PAGINATION_MARKER = "Reviews-"      # URL segment rewritten to "Reviews-or<offset>-"

output_folder = r"C:\NIBM\Travel recommendation system\Travel-Recommendation-system-in-Sri-Lanka-\scraped_data\charuka" #your file path
os.makedirs(output_folder, exist_ok=True)


def _text_or_none(tag, separator=""):
    """Return the stripped text of a BeautifulSoup tag, or None when the tag is missing."""
    return tag.get_text(separator, strip=True) if tag else None


def _parse_review_count(text):
    """Extract the digits from a review-count label; return 0 when it has none."""
    digits = re.sub(r"[^\d]", "", text)
    return int(digits) if digits else 0


def _parse_review_score(title_text):
    """Parse the score out of an "<n> of 5" rating title.

    Accepts whole numbers ("5 of 5") and decimals ("5.0 of 5"); a single-digit
    pattern would read the latter as 0. The result is truncated to an int.
    Returns None when the text does not contain an "<n> of 5" phrase.
    """
    match = re.search(r"(\d+(?:\.\d+)?) of 5", title_text)
    return int(float(match.group(1))) if match else None


def _review_page_url(base_url, offset):
    """Build the URL of the review page that starts at ``offset``.

    Returns ``base_url`` itself for offset 0, and None when the URL has no
    "Reviews-" segment to rewrite (so later pages cannot be addressed).
    """
    if offset == 0:
        return base_url
    if PAGINATION_MARKER not in base_url:
        return None
    # count=1: only the first marker is the pagination slot.
    return base_url.replace(PAGINATION_MARKER, f"{PAGINATION_MARKER}or{offset}-", 1)


def _parse_review_card(card):
    """Turn one review-card element into a dict of review columns.

    Every field is None when its element is not found in the card.
    """
    written_date = _text_or_none(card.select_one("div.biGQs._P.VImYz.ncFvv.navcl"))
    if written_date is not None:
        written_date = written_date.replace("Written ", "")

    score_title = card.select_one("svg[data-automation='bubbleRatingImage'] title")
    review_score = _parse_review_score(score_title.get_text()) if score_title else None

    # NOTE(review): this searches the whole card text, so a capitalised
    # "Family"/"Friends"/... inside the review body can also match — confirm
    # whether a dedicated element exists for the trip type.
    category_match = re.search(
        r"\b(Couples|Family|Friends|Solo|Business)\b",
        card.get_text(" ", strip=True),
    )

    return {
        "Reviewer Name": _text_or_none(card.select_one("span.biGQs._P.ezezH a")),
        "Reviewer Location": _text_or_none(card.select_one("div.biGQs._P.navcl span")),
        "Review Title": _text_or_none(card.select_one("h3 span.yCeTE")),
        "Review Score": review_score,
        "Reviewer Category": category_match.group(1) if category_match else None,
        "Review Text": _text_or_none(
            card.select_one("div.biGQs._P.VImYz.AWdfh span.yCeTE"), " "
        ),
        "Written Date": written_date,
    }


# --- Main scraping loop ----------------------------------------------------
# One CSV is written per destination; destinations whose CSV already exists
# are skipped. The browser is always closed when the loop ends, including on
# an error, so the driver cell must be re-run before scraping again.
try:
    for full_url in URLS:
        print(f"\nProcessing destination: {full_url}")
        driver.get(full_url)
        time.sleep(INITIAL_LOAD_WAIT_SECONDS)

        soup = BeautifulSoup(driver.page_source, "html.parser")

        destination_title = (
            _text_or_none(soup.select_one("h1[data-test-target='mainH1']")) or "Unknown"
        )

        # Drop characters that are not allowed in Windows file names.
        safe_title = re.sub(r'[\\/*?:"<>|]', "", destination_title)
        filename = os.path.join(output_folder, f"{safe_title}.csv")

        if os.path.exists(filename):
            print(f"CSV already exists for {destination_title}, skipping...")
            continue

        destination_score = _text_or_none(
            soup.select_one("div[data-automation='bubbleRatingValue'] span")
        )

        total_tag = soup.select_one("div[data-automation='bubbleReviewCount'] span")
        total_reviews = _parse_review_count(total_tag.get_text()) if total_tag else 0
        # The capped value is also what ends up in the
        # "Total Reviews (Destination)" column below.
        total_reviews = min(total_reviews, MAX_REVIEWS_PER_DESTINATION)

        img_tag = soup.select_one("img[srcset]")
        # .get() avoids a KeyError when the matched <img> has srcset but no src.
        destination_image = img_tag.get("src") if img_tag else None

        print(f"Destination: {destination_title} | Reviews to scrape: {total_reviews}")

        destination_fields = {
            "Destination URL": full_url,
            "Destination Title": destination_title,
            "Destination Score": destination_score,
            "Total Reviews (Destination)": total_reviews,
            "Destination Image": destination_image,
        }

        total_pages = math.ceil(total_reviews / REVIEWS_PER_PAGE)
        all_reviews = []

        for page in range(total_pages):
            if page > 0:
                page_url = _review_page_url(full_url, page * REVIEWS_PER_PAGE)
                if page_url is None:
                    # Without the marker every "page" would be page 1 again
                    # and the same reviews would be stored repeatedly.
                    print(f"  URL has no '{PAGINATION_MARKER}' segment; cannot open further pages")
                    break

            print(f"  Scraping page {page + 1}/{total_pages}")

            if page > 0:
                driver.get(page_url)
                time.sleep(PAGE_LOAD_WAIT_SECONDS)
                soup = BeautifulSoup(driver.page_source, "html.parser")
            # For page 0 the destination page parsed above is reused, so the
            # first page is not requested a second time.

            for card in soup.select("div[data-automation='reviewCard']"):
                all_reviews.append({**destination_fields, **_parse_review_card(card)})

        if not all_reviews:
            # An empty CSV would make the exists-check above skip this
            # destination on every later run.
            print(f"No reviews collected for {destination_title}; nothing saved.")
            continue

        df = pd.DataFrame(all_reviews)
        df.to_csv(filename, index=False, encoding="utf-8-sig")
        print(f"Saved {len(df)} reviews → {filename}")
finally:
    driver.quit()

print("\n✅ Scraping completed successfully.")