In [None]:
# Cell 1: Import Libraries
import requests
from bs4 import BeautifulSoup
import logging
import csv
import time
import json # Ensure json is imported for printing structured data
import re # Ensure re is imported for cleaning title.

In [16]:
# Cell 2: Target URL and the browser-like headers sent with every request
url = 'https://www.tripadvisor.com/Attractions-g293961-Activities-c61-t243-Sri_Lanka.html'

# Header (name, value) pairs, listed in the order they end up in the dict.
# Accept-Encoding is deliberately absent: it was removed to fix a garbled text issue.
headers = dict([
    ("User-Agent", 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'),
    ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"),
    ("Accept-Language", "en-US,en;q=0.5"),
    ("DNT", "1"),
    ("Connection", "keep-alive"),
    ("Upgrade-Insecure-Requests", "1"),
])

print("Headers configured successfully")

Headers configured successfully


In [17]:
# Cell 3: Create Session and Make Request
# The custom headers are merged into the session's headers, so they are sent
# with every request made through `session`.
session = requests.Session()
session.headers.update(headers)

try:
    response = session.get(url, timeout=10)
    # Convert 4xx/5xx answers into an exception instead of carrying on with an error page.
    response.raise_for_status()
except requests.exceptions.RequestException as e:
    print(f"❌ Request failed: {e}")
    # Re-raise so execution stops at this cell. Swallowing the error would let the
    # following cells run against a missing `response`, a stale one left over from
    # an earlier run, or an error page that did not pass raise_for_status().
    raise
else:
    # Only reached when the request succeeded: summarise what came back.
    print(f"✅ Status Code: {response.status_code}")
    print(f"📦 Content Encoding: {response.headers.get('Content-Encoding', 'None')}")
    print(f"📄 Content Type: {response.headers.get('Content-Type', 'None')}")
    print(f"📏 Content Length: {len(response.content)} bytes")

✅ Status Code: 200
📦 Content Encoding: gzip
📄 Content Type: text/html; charset=utf-8
📏 Content Length: 1966300 bytes


In [18]:
# Cell 4: Build the BeautifulSoup tree from the decoded response body
page_markup = response.text
soup = BeautifulSoup(page_markup, features='html.parser')

print("🍲 BeautifulSoup parsing completed")
# find_all() with no filter returns every tag in the tree, so its length is the element count.
element_count = len(soup.find_all())
print(f"📊 Found {element_count} HTML elements")

🍲 BeautifulSoup parsing completed
📊 Found 7047 HTML elements


## Extracting from a saved text file

In [19]:
# Write the pretty-printed parse tree to disk as UTF-8 so it can be re-read later.
with open("scraped_page_06.txt", mode="w", encoding="utf-8") as snapshot_file:
    pretty_markup = soup.prettify()
    snapshot_file.write(pretty_markup)

In [20]:
from bs4 import BeautifulSoup
import json
import re

# Load the page snapshot from disk and extract one record per listing card.
# Each record has the keys: title, image_links, rating, total_reviews,
# recommendation, type, travel_duration, description, starting_price.
# Missing fields are filled with the string 'N/A'.
with open('scraped_page_06.txt', 'r', encoding='utf-8') as file:
    html_content = file.read()

# NOTE: this rebinds the notebook-level name `soup` to the tree parsed from the
# saved file; any later cell that reads `soup` sees this tree.
soup = BeautifulSoup(html_content, 'html.parser')

places = []

# Find all article tags that represent a "place" or listing
listings = soup.find_all('article', class_='GTuVU XJlaI rHoxO')

for listing in listings:
    place_details = {}

    # --- Title: strip the leading rank number (e.g. "1.", "2.", "30.") ---
    title_tag = listing.find('h3', class_='biGQs _P fiohW OgHoE')
    if title_tag:
        title_text = title_tag.get_text(strip=True)
        place_details['title'] = re.sub(r'^\d+\.\s*', '', title_text).strip()
    else:
        place_details['title'] = 'N/A'

    # --- Image links: every real <img src> inside the carousel container ---
    image_links = []
    image_container = listing.find('div', class_='IdURT w carousel UznXc wSSLS')
    if image_container:
        for img_tag in image_container.find_all('img', src=True):
            src = img_tag['src']
            # Skip inline SVG data URIs (placeholders).
            if src.startswith('data:image/svg+xml'):
                continue
            # Scheme-relative URLs ("//host/path") are made absolute.
            if src.startswith('//'):
                src = 'https:' + src
            image_links.append(src)
    # Kept as in the original output: a list when images exist, otherwise the string 'N/A'.
    place_details['image_links'] = image_links if image_links else 'N/A'

    # --- Rating ---
    rating_tag = listing.find('div', {'data-automation': 'bubbleRatingValue'})
    # Guard the inner <span> as well: a rating div without one used to raise
    # AttributeError on None and abort the whole loop.
    rating_span = rating_tag.find('span') if rating_tag else None
    if rating_span:
        place_details['rating'] = rating_span.get_text(strip=True)
    else:
        place_details['rating'] = 'N/A'

    # --- Total reviews ---
    reviews_tag = listing.find('div', {'data-automation': 'bubbleLabel'})
    if reviews_tag:
        place_details['total_reviews'] = reviews_tag.get_text(strip=True)
    else:
        place_details['total_reviews'] = 'N/A'

    # --- Recommendation: the span whose own text contains "Recommended by" ---
    recommendation_tag = listing.find(
        'span',
        class_='biGQs _P pZUbB egaXP ZNjnF',
        string=lambda text: text and 'Recommended by' in text,
    )
    if recommendation_tag:
        place_details['recommendation'] = recommendation_tag.get_text(strip=True)
    else:
        place_details['recommendation'] = 'N/A'

    # --- Type ---
    # The class 'biGQs _P pZUbB ZNjnF' is generic, so the type is taken as the
    # first such div in the detail block that is not part of the duration,
    # description or price sections and does not mention cancellation or
    # recommendation. This is a heuristic and may need refinement.
    type_element = None
    type_div = listing.find('div', class_='alPVI eNNhq PgLKC tnGGX yzLvM')
    if type_div:
        for div in type_div.find_all('div', class_='biGQs _P pZUbB ZNjnF'):
            if div.find_parent('div', class_='bRMrl _Y K fOSqw'):
                continue  # travel duration block
            if div.find_parent('div', {'data-automation': 'listCardDescription'}):
                continue  # description block
            if div.find_parent('div', {'data-automation': 'cardPrice'}):
                continue  # price block
            div_text = div.get_text(strip=True).lower()
            if "cancellation" in div_text or "recommended by" in div_text:
                continue
            type_element = div
            break
    if type_element:
        place_details['type'] = type_element.get_text(strip=True)
    else:
        place_details['type'] = 'N/A'

    # --- Travel duration ---
    duration_tag = listing.find('div', class_='bRMrl _Y K fOSqw')
    duration_element = (
        duration_tag.find('div', class_='biGQs _P pZUbB ZNjnF') if duration_tag else None
    )
    if duration_element:
        place_details['travel_duration'] = duration_element.get_text(strip=True)
    else:
        place_details['travel_duration'] = 'N/A'

    # --- Description ---
    description_tag = listing.find('div', {'data-automation': 'listCardDescription'})
    span_tag = description_tag.find('span', class_='SwTtt') if description_tag else None
    if span_tag:
        description_text = span_tag.get_text(strip=True).replace('\n', ' ').strip()
        # If the raw span text contains '…' but the cleaned text does not end
        # with it, append one (never doubling an existing trailing '…').
        if not description_text.endswith('…') and '…' in span_tag.text:
            description_text += '…'
        place_details['description'] = description_text
    else:
        place_details['description'] = 'N/A'

    # --- Starting price ---
    price_tag = listing.find('div', {'data-automation': 'cardPrice'})
    if price_tag:
        place_details['starting_price'] = price_tag.get_text(strip=True)
    else:
        place_details['starting_price'] = 'N/A'

    places.append(place_details)

print(json.dumps(places, indent=2, ensure_ascii=False))

[
  {
    "title": "Ella: Transfer to Tangalle/Mirissa/Galle & Yala/Udawalawe Safari",
    "image_links": [
      "https://dynamic-media-cdn.tripadvisor.com/media/photo-o/30/03/15/e8/caption.jpg?w=500&h=400&s=1"
    ],
    "rating": "4.7",
    "total_reviews": "30",
    "recommendation": "Recommended by 93% of travelers",
    "type": "Private and Luxury",
    "travel_duration": "8–10 hours",
    "description": "After pick-up from your hotel in Ella, head for Udawalawe Or Yala National Park, where you'll switch to an open concept …",
    "starting_price": "$20"
  },
  {
    "title": "Private Jeep Safari at Minneriya National Park to Visit Elephants",
    "image_links": [
      "https://dynamic-media-cdn.tripadvisor.com/media/photo-o/2c/c8/cb/5e/caption.jpg?w=500&h=400&s=1"
    ],
    "rating": "4.9",
    "total_reviews": "215",
    "recommendation": "Recommended by 98% of travelers",
    "type": "4WD Tours",
    "travel_duration": "3–4 hours",
    "description": "Attention: According to

## Direct extraction

In [21]:
# Extract one record per listing card directly from the fetched page.
# Each record has the keys: title, image_links, rating, total_reviews,
# recommendation, type, travel_duration, description, starting_price.
# Missing fields are filled with the string 'N/A'.
places = []

# Parse the HTTP response here instead of reading the shared `soup` name: the
# saved-text cell rebinds `soup` to the tree loaded from scraped_page_06.txt,
# so relying on it would silently make this "direct" extraction read the file.
live_soup = BeautifulSoup(response.text, 'html.parser')

# Find all article tags that represent a "place" or listing
listings = live_soup.find_all('article', class_='GTuVU XJlaI rHoxO')

for listing in listings:
    place_details = {}

    # Title, with the leading rank number (e.g. "1.", "2.", "30.") removed
    title_tag = listing.find('h3', class_='biGQs _P fiohW OgHoE')
    if title_tag:
        title_text = title_tag.get_text(strip=True)
        place_details['title'] = re.sub(r'^\d+\.\s*', '', title_text).strip()
    else:
        place_details['title'] = 'N/A'

    # Image links from the carousel container
    image_links = []
    image_container = listing.find('div', class_='IdURT w carousel UznXc wSSLS')
    if image_container:
        for img_tag in image_container.find_all('img', src=True):
            src = img_tag['src']
            # Inline SVG data URIs are placeholders, not real images.
            if not src.startswith('data:image/svg+xml'):
                # Scheme-relative URLs ("//host/path") are made absolute.
                if src.startswith('//'):
                    src = 'https:' + src
                image_links.append(src)
    # Kept as in the original output: a list when images exist, otherwise the string 'N/A'.
    place_details['image_links'] = image_links if image_links else 'N/A'

    # Rating. The inner <span> is checked too: a rating div without one used to
    # raise AttributeError on None and abort the whole loop.
    rating_tag = listing.find('div', {'data-automation': 'bubbleRatingValue'})
    rating_span = rating_tag.find('span') if rating_tag else None
    place_details['rating'] = rating_span.get_text(strip=True) if rating_span else 'N/A'

    # Total reviews
    reviews_tag = listing.find('div', {'data-automation': 'bubbleLabel'})
    place_details['total_reviews'] = reviews_tag.get_text(strip=True) if reviews_tag else 'N/A'

    # Recommendation: the span whose own text contains "Recommended by"
    recommendation_tag = listing.find(
        'span',
        class_='biGQs _P pZUbB egaXP ZNjnF',
        string=lambda text: text and 'Recommended by' in text,
    )
    place_details['recommendation'] = (
        recommendation_tag.get_text(strip=True) if recommendation_tag else 'N/A'
    )

    # Type: first generic text div in the detail block that is not inside the
    # duration, description or price sections and does not mention
    # cancellation or recommendation (heuristic; class names are generic).
    type_element = None
    type_div = listing.find('div', class_='alPVI eNNhq PgLKC tnGGX yzLvM')
    if type_div:
        for div in type_div.find_all('div', class_='biGQs _P pZUbB ZNjnF'):
            inside_other_field = (
                div.find_parent('div', class_='bRMrl _Y K fOSqw')
                or div.find_parent('div', {'data-automation': 'listCardDescription'})
                or div.find_parent('div', {'data-automation': 'cardPrice'})
            )
            if inside_other_field:
                continue
            div_text = div.get_text(strip=True).lower()
            if "cancellation" not in div_text and "recommended by" not in div_text:
                type_element = div
                break
    place_details['type'] = type_element.get_text(strip=True) if type_element else 'N/A'

    # Travel duration
    duration_tag = listing.find('div', class_='bRMrl _Y K fOSqw')
    duration_element = (
        duration_tag.find('div', class_='biGQs _P pZUbB ZNjnF') if duration_tag else None
    )
    place_details['travel_duration'] = (
        duration_element.get_text(strip=True) if duration_element else 'N/A'
    )

    # Description
    description_tag = listing.find('div', {'data-automation': 'listCardDescription'})
    span_tag = description_tag.find('span', class_='SwTtt') if description_tag else None
    if span_tag:
        description_text = span_tag.get_text(strip=True).replace('\n', ' ').strip()
        # If the raw span text contains '…' but the cleaned text does not end
        # with it, append one (never doubling an existing trailing '…').
        if not description_text.endswith('…') and '…' in span_tag.text:
            description_text += '…'
        place_details['description'] = description_text
    else:
        place_details['description'] = 'N/A'

    # Starting price
    price_tag = listing.find('div', {'data-automation': 'cardPrice'})
    place_details['starting_price'] = price_tag.get_text(strip=True) if price_tag else 'N/A'

    places.append(place_details)

print(json.dumps(places, indent=2, ensure_ascii=False))

[
  {
    "title": "Ella: Transfer to Tangalle/Mirissa/Galle & Yala/Udawalawe Safari",
    "image_links": [
      "https://dynamic-media-cdn.tripadvisor.com/media/photo-o/30/03/15/e8/caption.jpg?w=500&h=400&s=1"
    ],
    "rating": "4.7",
    "total_reviews": "30",
    "recommendation": "Recommended by 93% of travelers",
    "type": "Private and Luxury",
    "travel_duration": "8–10 hours",
    "description": "After pick-up from your hotel in Ella, head for Udawalawe Or Yala National Park, where you'll switch to an open concept …",
    "starting_price": "$20"
  },
  {
    "title": "Private Jeep Safari at Minneriya National Park to Visit Elephants",
    "image_links": [
      "https://dynamic-media-cdn.tripadvisor.com/media/photo-o/2c/c8/cb/5e/caption.jpg?w=500&h=400&s=1"
    ],
    "rating": "4.9",
    "total_reviews": "215",
    "recommendation": "Recommended by 98% of travelers",
    "type": "4WD Tours",
    "travel_duration": "3–4 hours",
    "description": "Attention: According to