In [8]:
# Cell 1: Import Libraries
import requests
from bs4 import BeautifulSoup
import logging
import csv
import time
import json
import re

In [9]:
# Cell 2: Set up Base URL and Headers
# Define a base URL without the offset parameter for pagination
# The 'oaX' offset will be inserted dynamically
base_url = 'https://www.tripadvisor.com/AttractionProductReview-g304141-d23824956-Private_Jeep_Safari_at_Minneriya_National_Park_to_Visit_Elephants-Sigiriya_Central.html'

headers = {
    "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "DNT": "1",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
}

print("Headers configured successfully")

Headers configured successfully


In [10]:
# Cell 3: Create Session and Make Request
session = requests.Session()
session.headers.update(headers)

try:
    response = session.get(base_url, timeout=10)
    response.raise_for_status()
    
    print(f"✅ Status Code: {response.status_code}")
    print(f"📦 Content Encoding: {response.headers.get('Content-Encoding', 'None')}")
    print(f"📄 Content Type: {response.headers.get('Content-Type', 'None')}")
    print(f"📏 Content Length: {len(response.content)} bytes")
    
except requests.exceptions.RequestException as e:
    print(f"❌ Request failed: {e}")

# Cell 4: Handle Text Encoding
# Ensure proper encoding to avoid garbled text
response.encoding = response.apparent_encoding or 'utf-8'

print(f"🔤 Detected encoding: {response.encoding}")
print(f"📝 Response text length: {len(response.text)} characters")

✅ Status Code: 200
📦 Content Encoding: gzip
📄 Content Type: text/html; charset=utf-8
📏 Content Length: 1630445 bytes
🔤 Detected encoding: utf-8
📝 Response text length: 1630345 characters


In [11]:
# Cell 5: Parse with BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')

print("🍲 BeautifulSoup parsing completed")
print(f"📊 Found {len(soup.find_all())} HTML elements")

🍲 BeautifulSoup parsing completed
📊 Found 5858 HTML elements


In [12]:
# Cell 6: Test the Parsed Content
# Display first 500 characters to verify it's readable
print("🔍 First 500 characters of parsed content:")
print("=" * 50)
print(soup.prettify()[:1000])
print("=" * 50)

🔍 First 500 characters of parsed content:
<!DOCTYPE html>
<html lang="en-US">
 <head>
  <link href="https://static.tacdn.com/img2/brand_refresh_2025/application_icons/favicon_2025.ico" id="favicon" rel="icon" type="image/x-icon"/>
  <link href="https://static.tacdn.com/img2/brand_refresh_2025/application_icons/icon.svg" rel="icon" type="image/svg+xml"/>
  <link href="https://static.tacdn.com/img2/brand_refresh_2025/application_icons/apple_touch_icon.png" rel="apple-touch-icon" sizes="180x180"/>
  <link color="#00210c" href="https://static.tacdn.com/img2/brand_refresh_2025/application_icons/mask_icon.svg" rel="mask-icon" sizes="any"/>
  <meta content="#00eb5b" name="theme-color"/>
  <meta content="telephone=no" name="format-detection"/>
  <meta content="app-id=-1" name="apple-itunes-app"/>
  <script type="application/ld+json">
   [{"@context":"https:\u002F\u002Fschema.org","@type":"Organization","name":"Tripadvisor","url":"https:\u002F\u002Fwww.tripadvisor.com\u002F","logo":"https:\u002

In [13]:
# Initialize a list to store all scraped places from all pages
all_places_data = []

# --- Pagination Control Variables ---
offset = 0 # Starting offset for the first page
total_items = float('inf') # Will be updated after scraping the first page
items_per_page = 30 # Number of items displayed per page on Tripadvisor

# Cell 3 & 4: Create Session, Make Request, Parse, and Extract (now within a loop for pagination)
session = requests.Session()
session.headers.update(headers)

In [14]:
# Loop through pages until all items are collected
while offset < total_items:
    
    print(f"\n--- Current Loop Iteration ---")
    print(f"Current offset before URL construction: {offset}")

    if offset == 0:
        current_page_url = base_url
        print(current_page_url)
    else:
        # Example: 'Attractions-g293961-Activities-c61-t243-oa30-Sri_Lanka.html'
        # We split the base_url to insert the offset parameter correctly
        parts = base_url.rsplit('-Sri_Lanka.html', 1)
        current_page_url = f"{parts[0]}-oa{offset}-Sri_Lanka.html"
        print(current_page_url)

    print(f"Constructed URL for this page: {current_page_url}")

    # --- MODIFIED: Add retry logic for requests ---
    max_retries = 3 # Try up to 3 times for a failed request
    for attempt in range(1, max_retries + 1):

        try:
            print(f"Attempt {attempt} for URL: {current_page_url}") # New print for retry info
            response = session.get(current_page_url, timeout=20) # <--- CHANGED: Increased timeout to 20 seconds
            response.raise_for_status() # Raise HTTPError for bad responses (4xx or 5xx)

            print(f"✅ Status Code for {current_page_url}: {response.status_code}")
            break # Break out of the retry loop if request is successful

        except requests.exceptions.Timeout as e:
            print(f"❌ Timeout error on attempt {attempt} for {current_page_url}: {e}")
            if attempt < max_retries:
                sleep_time = 5 * attempt # Exponential backoff: 5s, 10s, 15s delay
                print(f"Retrying in {sleep_time} seconds...")
                time.sleep(sleep_time)
            else:
                print(f"Max retries reached for {current_page_url}. Skipping this page.")
                response = None # Set response to None to indicate failure
                break # Break out of retry loop if max attempts reached

        except requests.exceptions.RequestException as e:
            print(f"❌ Request failed on attempt {attempt} for {current_page_url}: {e}")
            if attempt < max_retries:
                sleep_time = 5 * attempt # Exponential backoff
                print(f"Retrying in {sleep_time} seconds...")
                time.sleep(sleep_time)
            else:
                print(f"Max retries reached for {current_page_url}. Skipping this page.")
                response = None # Set response to None to indicate failure
                
    # If response is None after retries, skip processing this page and move to next offset
    if response is None:
        offset += items_per_page
        time.sleep(2) # Still add a small delay before attempting next page
        continue # Skip to the next iteration of the while loop

    # Parse the HTML content for the current page
    soup = BeautifulSoup(response.text, 'html.parser')
    print("🍲 BeautifulSoup parsing completed for current page")

    # --- Extract Total Items from the first page (only once) ---
    if offset == 0:
        total_items_text_tag = soup.find('div', class_='Ci')
        if total_items_text_tag:
            full_text = total_items_text_tag.get_text(strip=True)
            match = re.search(r'of\s*([\d,]+)', full_text)
            if match:
                total_items = int(match.group(1).replace(',', ''))
                print(f"🔢 Total items identified for scraping: {total_items}")
            else:
                print("⚠️ Could not find total number of items from the first page. Assuming only one page.")
                total_items = items_per_page # Fallback: if total items not found, only process the first page
        else:
            print("⚠️ Total items text element not found. Assuming only one page.")
            total_items = items_per_page # Fallback: if element not found, only process the first page

    # --- Data Extraction Logic (your existing robust extraction) ---
    listings = soup.find_all('article', class_='GTuVU XJlaI rHoxO')
    print(f"Found {len(listings)} listings on this page.")

    if not listings:
        print(f"🚫 No new listings found on {current_page_url}. This might be the last page or an empty page. Ending scrape.")
        break # Exit loop if no listings are found on the current page

    for listing in listings:
        place_details = {}

        # Extract Title and Detail Page URL (MODIFIED)
        title_link_tag = listing.find('a') # Find the first <a> tag within the listing (expected to be the title link)
        if title_link_tag:
            # Ensure the h3 tag exists for title, as it's the primary text component
            title_h3_tag = title_link_tag.find('h3', class_='biGQs _P fiohW OgHoE')
            if title_h3_tag:
                title_text = title_h3_tag.get_text(strip=True)
                place_details['title'] = re.sub(r'^\d+\.\s*', '', title_text).strip()
            else:
                place_details['title'] = 'N/A'

            detail_page_relative_url = title_link_tag.get('href')
            if detail_page_relative_url:
                # Construct the full detail page URL
                # Assuming base URL is consistent, might need more robust URL join
                detail_page_url = f"https://www.tripadvisor.com{detail_page_relative_url}"
                place_details['detail_page_url'] = detail_page_url # Optional: save the detail page URL too
            else:
                detail_page_url = None
                place_details['detail_page_url'] = 'N/A' # Optional
        else:
            place_details['title'] = 'N/A'
            detail_page_url = None
            place_details['detail_page_url'] = 'N/A'

        # Extract Image links
        image_links = []
        image_container = listing.find('div', class_='IdURT w carousel UznXc wSSLS')
        if image_container:
            img_tags = image_container.find_all('img', src=True)
            for img_tag in img_tags:
                src = img_tag['src']
                if not src.startswith('data:image/svg+xml'):
                    if src.startswith('//'):
                        src = 'https:' + src
                    image_links.append(src)
        place_details['image_links'] = image_links if image_links else 'N/A'

        # Extract Rating
        rating_tag = listing.find('div', {'data-automation': 'bubbleRatingValue'})
        if rating_tag:
            place_details['rating'] = rating_tag.find('span').get_text(strip=True)
        else:
            place_details['rating'] = 'N/A'

        # Extract Total Reviews
        reviews_tag = listing.find('div', {'data-automation': 'bubbleLabel'})
        if reviews_tag:
            place_details['total_reviews'] = reviews_tag.get_text(strip=True)
        else:
            place_details['total_reviews'] = 'N/A'

        # Extract Recommendation
        recommendation_tag = listing.find('span', class_='biGQs _P pZUbB egaXP ZNjnF', string=lambda text: text and 'Recommended by' in text)
        if recommendation_tag:
            place_details['recommendation'] = recommendation_tag.get_text(strip=True)
        else:
            place_details['recommendation'] = 'N/A'

        # Extract Type
        type_div = listing.find('div', class_='alPVI eNNhq PgLKC tnGGX yzLvM')
        if type_div:
            type_element = None
            all_text_divs = type_div.find_all('div', class_='biGQs _P pZUbB ZNjnF')
            for div in all_text_divs:
                if not div.find_parent('div', class_='bRMrl _Y K fOSqw') and \
                   not div.find_parent('div', {'data-automation': 'listCardDescription'}) and \
                   not div.find_parent('div', {'data-automation': 'cardPrice'}) and \
                   not "cancellation" in div.get_text(strip=True).lower() and \
                   not "recommended by" in div.get_text(strip=True).lower():
                    type_element = div
                    break
            if type_element:
                place_details['type'] = type_element.get_text(strip=True)
            else:
                place_details['type'] = 'N/A'
        else:
            place_details['type'] = 'N/A'

        # Extract Travel Duration
        duration_tag = listing.find('div', class_='bRMrl _Y K fOSqw')
        if duration_tag:
            duration_element = duration_tag.find('div', class_='biGQs _P pZUbB ZNjnF')
            if duration_element:
                place_details['travel_duration'] = duration_element.get_text(strip=True)
            else:
                place_details['travel_duration'] = 'N/A'
        else:
            place_details['travel_duration'] = 'N/A'

        # Extract Description
        # description_tag = listing.find('div', {'data-automation': 'listCardDescription'})
        # if description_tag:
        #     span_tag = description_tag.find('span', class_='SwTtt')
        #     if span_tag:
        #         description_text = span_tag.get_text(strip=True).replace('\n', ' ').strip()
        #         if not description_text.endswith('…') and '…' in span_tag.text:
        #             description_text += '…'
        #         place_details['description'] = description_text
        #     else:
        #         place_details['description'] = 'N/A'
        # else:
        #     place_details['description'] = 'N/A'

        # Extract Starting Price
        price_tag = listing.find('div', {'data-automation': 'cardPrice'})
        if price_tag:
            place_details['starting_price'] = price_tag.get_text(strip=True)
        else:
            place_details['starting_price'] = 'N/A'
        pass
        all_places_data.append(place_details)

        # --- NEW LOGIC: Fetch and extract full description from detail page ---
        if detail_page_url and detail_page_url != 'N/A':
            print(f"  ➡️ Fetching full description for: {place_details['title']}")
            time.sleep(2) # <--- IMPORTANT: Delay before fetching detail page
            try:
                detail_response = session.get(detail_page_url, timeout=15) # Shorter timeout for individual detail page
                detail_response.raise_for_status()
                detail_soup = BeautifulSoup(detail_response.text, 'html.parser')

                # Find the div with class '_d' for full description
                full_description_tag = detail_soup.find('div', class_='_d')
                if full_description_tag:
                    place_details['description'] = full_description_tag.get_text(strip=True).replace('\n', ' ')
                else:
                    place_details['description'] = 'Full Description N/A (selector not found)'
                    print(f"    ❌ Full description div ('_d') not found for {place_details['title']}")

            except requests.exceptions.RequestException as e:
                place_details['description'] = f"Full Description N/A (request failed: {e})"
                print(f"    ❌ Failed to fetch detail page for {place_details['title']}: {e}")
        else:
            place_details['description'] = 'Full Description N/A (detail URL missing)'

        # Append the collected details for this place
        all_places_data.append(place_details)

    # Increment the offset for the next page
    offset += items_per_page
    print(f"Offset for next page: {offset}")

    # Add a delay between requests to be polite and avoid being blocked
    time.sleep(3) # IMPORTANT: Adjust this value as needed, typically 1-5 seconds

# Final output: print all collected data after the loop finishes
print(f"\n✅ Scraped {len(all_places_data)} items across all pages.")
print(json.dumps(all_places_data, indent=2, ensure_ascii=False))


--- Current Loop Iteration ---
Current offset before URL construction: 0
https://www.tripadvisor.com/AttractionProductReview-g304141-d23824956-Private_Jeep_Safari_at_Minneriya_National_Park_to_Visit_Elephants-Sigiriya_Central.html
Constructed URL for this page: https://www.tripadvisor.com/AttractionProductReview-g304141-d23824956-Private_Jeep_Safari_at_Minneriya_National_Park_to_Visit_Elephants-Sigiriya_Central.html
Attempt 1 for URL: https://www.tripadvisor.com/AttractionProductReview-g304141-d23824956-Private_Jeep_Safari_at_Minneriya_National_Park_to_Visit_Elephants-Sigiriya_Central.html
✅ Status Code for https://www.tripadvisor.com/AttractionProductReview-g304141-d23824956-Private_Jeep_Safari_at_Minneriya_National_Park_to_Visit_Elephants-Sigiriya_Central.html: 200
🍲 BeautifulSoup parsing completed for current page
⚠️ Could not find total number of items from the first page. Assuming only one page.
Found 0 listings on this page.
🚫 No new listings found on https://www.tripadvisor.com