<h1>Imports</h1>

In [21]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, TimeoutException, StaleElementReferenceException
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import json
from time import sleep

<h1>Establish a driver</h1>

In [23]:
# Command-line switches passed to Chrome, in the order they are applied.
# To run without a visible window, add "--headless" to this tuple.
chrome_flags = (
    "--disable-extensions",
    "--disable-images",  # NOTE(review): may not be a recognised Chrome switch -- confirm
    "--disable-popup-blocking",
    "--disable-gpu",
    "--blink-settings=imagesEnabled=false",  # turn image loading off in Blink
    "--disable-infobars",
    "--mute-audio",
    "--disable-plugins",
    "--no-sandbox",  # NOTE(review): disables Chrome's sandbox -- confirm this is wanted
    "--disable-dev-shm-usage",  # intended to avoid shared-memory issues
)

options = Options()
for flag in chrome_flags:
    options.add_argument(flag)

# Content settings to block; the value 2 is used here to mean "block".
prefs = {
    pref_key: 2
    for pref_key in (
        "profile.managed_default_content_settings.images",
        "profile.default_content_setting_values.media_stream",
        "profile.default_content_setting_values.geolocation",
        "profile.default_content_setting_values.notifications",
    )
}
options.add_experimental_option("prefs", prefs)

# webdriver_manager resolves a chromedriver binary; its path is handed to the
# Service object that the Chrome driver is started with.
chrome_service = ChromeService(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service, options=options)

<h1>Open the restaurant page</h1>

In [24]:
# Restaurant page to scrape. Adjacent string literals are concatenated by the
# parser, so `url` is one single string; it is only split here for readability.
# NOTE(review): the query string looks like it carries session/tracking values
# (corrid, avt) -- confirm whether they are required for the page to load.
url = (
    "https://www.opentable.com/r/5a5-san-francisco-1"
    "?corrid=a3596258-85a5-4f20-9d80-be2f68be5f32"
    "&avt=eyJ2IjoyLCJtIjoxLCJwIjoxLCJzIjowLCJuIjowfQ"
    "&p=2&sd=2024-11-30T19%3A00%3A00&page=1"
)
driver.get(url)

<h1>Extract restaurant name, total reviews and overall ratings</h1>

In [25]:
# Build the restaurant summary as a list of single-key dicts:
#   [{"restaurant_name": ...}, {"total_reviews": ...}, {<category>: <score>}, ...]
# The CSS class names look auto-generated, so they presumably need re-checking
# if the site markup changes.
totalReviews = {"total_reviews": driver.find_element(By.CSS_SELECTOR, ".mRPf1qe386o-").text}
overAllRatings = [
    {"restaurant_name": driver.find_element(By.CSS_SELECTOR, ".E-vwXONV9nc-").text},
    totalReviews,
]

# Category scores and their labels are separate elements that are paired by
# position.
value = driver.find_elements(By.CSS_SELECTOR, ".yEg-cOaKGpI-")
name = driver.find_elements(By.CSS_SELECTOR, "._2IbhjCOldv8-")

# Indexing value[i] by len(name) would raise IndexError when fewer scores than
# labels are found; zip pairs what is available and the mismatch is reported.
if len(name) != len(value):
    print(f"Warning: found {len(name)} rating labels but {len(value)} rating values")
overAllRatings.extend({label.text: score.text} for label, score in zip(name, value))

print(overAllRatings)


[{'restaurant_name': '5A5'}, {'total_reviews': '2,316 Reviews'}, {'Food': '4.3'}, {'Service': '4.3'}, {'Ambience': '4.2'}, {'Value': '1.5'}]


In [30]:
# Locators for the review list. The class names look auto-generated, so they
# presumably change between site releases -- re-check them before re-running.
REVIEW_CARD_CLASS = "afkKaa-4T28-"
RATING_SELECTOR = ".-k5xpTfSXac-"
DATE_SELECTOR = ".iLkEeQbexGs-"
TEXT_SELECTOR = ".l9bbXUdC9v0-"
NEXT_PAGE_SELECTOR = "a[aria-label='Go to the next page']"

WAIT_SECONDS = 3  # timeout, in seconds, of every explicit wait below
MAX_RETRIES = 3   # attempts per page, and attempts per next-button click


def _parse_review_card(card):
    """Turn one review card element into a plain dict.

    Returns:
        {"Ratings": [{<first word>: <second word>}, ...], "Date": str, "Review": str}
        Rating elements whose text has fewer than two words are ignored and
        newlines are removed from the review text.

    Raises:
        NoSuchElementException: the card has no date or no text element.
        StaleElementReferenceException: the card is no longer attached to the DOM.
    """
    ratings = []
    for rating_tag in card.find_elements(By.CSS_SELECTOR, RATING_SELECTOR):
        words = rating_tag.text.split()
        if len(words) >= 2:
            ratings.append({words[0]: words[1]})
    review_date = card.find_element(By.CSS_SELECTOR, DATE_SELECTOR).text
    review_text = card.find_element(By.CSS_SELECTOR, TEXT_SELECTOR).text.replace('\n', '')
    return {"Ratings": ratings, "Date": review_date, "Review": review_text}


def _scrape_current_page(driver, skip_incomplete):
    """Wait for the review cards of the current page and parse all of them.

    Args:
        driver: the active WebDriver.
        skip_incomplete: when True, cards that cannot be parsed are dropped;
            when False, the first parsing error is re-raised so the caller can
            retry the whole page.

    Returns:
        (cards, parsed): the located card elements (never empty) and the list
        of dicts produced by _parse_review_card.

    Raises:
        TimeoutException: no review card appeared within WAIT_SECONDS.
        NoSuchElementException, StaleElementReferenceException: a card could
            not be parsed and skip_incomplete is False.
    """
    # The wait returns the located elements, so no second lookup is needed.
    cards = WebDriverWait(driver, WAIT_SECONDS).until(
        EC.presence_of_all_elements_located((By.CLASS_NAME, REVIEW_CARD_CLASS))
    )
    parsed = []
    for card in cards:
        try:
            parsed.append(_parse_review_card(card))
        except (NoSuchElementException, StaleElementReferenceException):
            if not skip_incomplete:
                raise
    return cards, parsed


def _card_text(card):
    """Return the card's visible text, or None once the card is detached."""
    try:
        return card.text
    except StaleElementReferenceException:
        return None


def _go_to_next_page(driver, old_card):
    """Click the next-page link and wait for the review list to change.

    Args:
        driver: the active WebDriver.
        old_card: a review card of the page being left; it is used to detect
            that the list was replaced.

    Returns:
        True when the link was clicked, False when there is no further page or
        the click failed MAX_RETRIES times.
    """
    text_before = _card_text(old_card)
    for _ in range(MAX_RETRIES):
        try:
            try:
                # element_to_be_clickable only yields a visible, enabled
                # element, so no separate is_displayed() check is needed.
                next_button = WebDriverWait(driver, WAIT_SECONDS).until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, NEXT_PAGE_SELECTOR))
                )
            except TimeoutException:
                # WebDriverWait reports a missing element as a timeout, not as
                # NoSuchElementException: no clickable link means last page.
                print("Scrapping complete, exitting script")
                return False
            next_button.click()
        except StaleElementReferenceException:
            print("Stale element reference for next button. Retrying...")
            sleep(2)
            continue

        # Best effort: give the list a moment to be replaced so the old page
        # is not scraped a second time. A timeout here is not fatal.
        if text_before is not None:
            try:
                WebDriverWait(driver, WAIT_SECONDS).until(
                    lambda _driver: _card_text(old_card) != text_before
                )
            except TimeoutException:
                print("Review list did not change after clicking next; continuing anyway.")
        return True

    print("Failed to click next button after retries. Exiting loop.")
    return False


reviews = []
counter = 1
running = True
retries = 0

while running:
    try:
        # A page is only committed once it has been parsed as a whole, so a
        # retry never leaves partial or duplicated entries in `reviews`.
        # On the last allowed attempt, incomplete cards are skipped instead.
        cards, page_reviews = _scrape_current_page(
            driver, skip_incomplete=retries >= MAX_RETRIES - 1
        )
    except TimeoutException:
        # No review card showed up in time: refresh and try the page again.
        print("Timeout exception occurred, Refreshing page...")
        driver.refresh()
        retries += 1
    except NoSuchElementException:
        # A card is missing its date or text: reload and re-parse the page.
        print("Element not found! Refreshing the page...")
        driver.refresh()
        retries += 1
    except StaleElementReferenceException:
        # The cards were re-rendered while being read: look them up again.
        retries += 1
    else:
        retries = 0
        # Store the reviews under sequential "review_<n>" keys.
        for parsed_review in page_reviews:
            reviews.append({"review_" + str(counter): parsed_review})
            counter += 1
        if _go_to_next_page(driver, cards[0]):
            print(f"Navigating to the next page, collected {len(reviews)} reviews so far.")
        else:
            running = False

    # Give up once the same page has failed MAX_RETRIES times in a row.
    if retries >= MAX_RETRIES:
        running = False
        print("End of reviews reached, exitting script")

print(len(reviews))

Navigating to the next page, collected 10 reviews so far.
Navigating to the next page, collected 20 reviews so far.
Navigating to the next page, collected 30 reviews so far.
Navigating to the next page, collected 40 reviews so far.
Navigating to the next page, collected 50 reviews so far.
Navigating to the next page, collected 60 reviews so far.
Navigating to the next page, collected 70 reviews so far.
Navigating to the next page, collected 80 reviews so far.
Navigating to the next page, collected 90 reviews so far.
Navigating to the next page, collected 100 reviews so far.
Navigating to the next page, collected 110 reviews so far.
Navigating to the next page, collected 120 reviews so far.
Navigating to the next page, collected 130 reviews so far.
Navigating to the next page, collected 140 reviews so far.
Navigating to the next page, collected 150 reviews so far.
Navigating to the next page, collected 160 reviews so far.
Navigating to the next page, collected 170 reviews so far.
Naviga

In [31]:
# End the WebDriver session now that scraping is finished.
driver.quit()

In [32]:
# Write the collected reviews to reviews.json in the working directory,
# pretty-printed with a four-space indent.
with open("reviews.json", mode="w") as out_file:
    json.dump(reviews, out_file, indent=4)