In [1]:
# Cell 1: Import Libraries
import requests
from bs4 import BeautifulSoup
import logging
import csv
import time

In [2]:
# Cell 2: Target page and browser-like request headers
# The review page that every later cell fetches and parses.
url = (
    'https://www.tripadvisor.com/'
    'Restaurant_Review-g60763-d478965-Reviews-Gallaghers_Steakhouse-New_York_City_New_York.html'
)

# Desktop Chrome user-agent string, split across lines for readability only;
# the pieces concatenate to a single header value.
user_agent = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/119.0.0.0 Safari/537.36'
)

# Note: there is intentionally no "Accept-Encoding" entry; it was removed
# to fix a garbled-text issue.
headers = {
    "User-Agent": user_agent,
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "DNT": "1",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
}

print("Headers configured successfully")

Headers configured successfully


In [3]:
# Cell 3: Create Session and Make Request
# A Session is used so the configured headers apply to the request.
session = requests.Session()
session.headers.update(headers)

# Reset first: if this cell is re-run and the request fails, later cells must
# not silently keep working on a stale `response` from a previous run.
response = None

try:
    fetched = session.get(url, timeout=10)
    # Turn HTTP 4xx/5xx answers into exceptions instead of parsing an error page.
    fetched.raise_for_status()
except requests.exceptions.RequestException as e:
    print(f"❌ Request failed: {e}")
    # Every following cell needs a successful response, so release the
    # connection and stop here rather than failing later with a confusing
    # NameError/AttributeError.
    session.close()
    raise

# Only publish the response once it is known to be a successful one.
response = fetched

print(f"✅ Status Code: {response.status_code}")
print(f"📦 Content Encoding: {response.headers.get('Content-Encoding', 'None')}")
print(f"📄 Content Type: {response.headers.get('Content-Type', 'None')}")
print(f"📏 Content Length: {len(response.content)} bytes")

✅ Status Code: 200
📦 Content Encoding: gzip
📄 Content Type: text/html; charset=utf-8
📏 Content Length: 910296 bytes


In [4]:
# Cell 4: Handle Text Encoding
# Ensure proper encoding to avoid garbled text.
# When the server declares a charset in Content-Type, requests has already set
# response.encoding from it, so keep that value. Only fall back to the
# content-based guess (apparent_encoding) when no charset was declared;
# the guess is heuristic and should not override an explicit declaration.
content_type = response.headers.get('Content-Type', '')
if 'charset=' not in content_type.lower():
    response.encoding = response.apparent_encoding or 'utf-8'

print(f"🔤 Detected encoding: {response.encoding}")
print(f"📝 Response text length: {len(response.text)} characters")

🔤 Detected encoding: utf-8
📝 Response text length: 910143 characters


In [5]:
# Cell 5: Build the BeautifulSoup tree
# Parse the decoded page text with Python's built-in HTML parser.
soup = BeautifulSoup(markup=response.text, features='html.parser')
print("🍲 BeautifulSoup parsing completed")

# find_all() with no filter returns every tag in the document.
element_count = len(soup.find_all())
print(f"📊 Found {element_count} HTML elements")

🍲 BeautifulSoup parsing completed
📊 Found 4376 HTML elements


In [6]:
# Cell 6: Sanity-check the parsed content
# Show the first 500 characters of the pretty-printed tree to confirm the
# text is readable.
divider = "=" * 50

print("🔍 First 500 characters of parsed content:")
print(divider)
preview = soup.prettify()[:500]
print(preview)
print(divider)

🔍 First 500 characters of parsed content:
<!DOCTYPE html>
<html lang="en-US">
 <head>
  <link href="https://static.tacdn.com/img2/brand_refresh_2025/application_icons/favicon_2025.ico" id="favicon" rel="icon" type="image/x-icon"/>
  <link href="https://static.tacdn.com/img2/brand_refresh_2025/application_icons/icon.svg" rel="icon" type="image/svg+xml"/>
  <link href="https://static.tacdn.com/img2/brand_refresh_2025/application_icons/apple_touch_icon.png" rel="apple-touch-icon" sizes="180x180"/>
  <link color="#00210c" href="https://stat


In [7]:
# Cell 7: Look for a typical TripAdvisor element
# The first <h1> on the page is used as a probe for the restaurant name.
restaurant_name = soup.find('h1')
if not restaurant_name:
    # No <h1> found: report it instead of failing.
    print("⚠️ Restaurant name not found - might be blocked or structure changed")
else:
    heading_text = restaurant_name.get_text().strip()
    print(f"🏪 Restaurant name found: {heading_text}")

🏪 Restaurant name found: Gallaghers Steakhouse


In [8]:
# Extract the general-info summary block (in the recorded run it holds the
# rating, review count, ranking, cuisines and price range).
# NOTE(review): 'CsAqy' looks like a generated CSS class name and may change
# between site releases - confirm the selector if this starts returning nothing.
general_info_div = soup.find('div', class_='CsAqy')

# find() returns None when nothing matches; guard so a missing block produces
# a clear message instead of an AttributeError on `.text`.
if general_info_div is not None:
    general_infos = general_info_div.text.strip()
    print(general_infos)
else:
    general_infos = None
    print("⚠️ General info block not found - might be blocked or structure changed")

4.44.4 of 5 bubbles(6,149 reviews) #45 of 12,852 Restaurants in New York CitySeafood, Steakhouse, $$$$


In [9]:
# Cell 9: Clean up
# Close the HTTP session now that the page has been fetched and parsed.
session.close()
print("🔒 Session closed successfully")

🔒 Session closed successfully
