In [12]:
pip install selenium webdriver-manager fake_useragent 

Note: you may need to restart the kernel to use updated packages.


In [13]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time
from webdriver_manager.chrome import ChromeDriverManager

In [14]:
from urllib.parse import urlparse, urljoin
yelp_url = "https://www.yelp.com/biz/mix-kitchen-and-bar-ithaca-11?osq=Reservations"
def extract_business_slug(yelp_url):
    """Extracts the business slug from a Yelp URL"""
    parsed_url = urlparse(yelp_url)
    path_parts = parsed_url.path.split("/")
    
    if len(path_parts) > 2 and path_parts[1] == "biz":
        business_slug = path_parts[2]  
        clean_url = urljoin("https://www.yelp.com", f"/biz/{business_slug}")  
    return clean_url

clean_yelp_url = extract_business_slug(yelp_url)
clean_yelp_url

'https://www.yelp.com/biz/mix-kitchen-and-bar-ithaca-11'

In [15]:
from fake_useragent import UserAgent

ua = UserAgent()
headers = {"User-Agent": ua.random}
page = requests.get(clean_yelp_url, headers=headers)

In [16]:
soup = BeautifulSoup(page.text, 'html.parser')

In [17]:
from selenium.common.exceptions import StaleElementReferenceException, NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
headers = {"User-Agent": ua.random}
def click_next_page(driver):
    """Clicks the 'Next' button to load the next page of reviews, if available."""
    try:
        time.sleep(2)  # <--- Short delay to allow page update

        next_button = driver.find_element(By.XPATH, "//a[contains(@class, 'next-link')]")

        if next_button:
            driver.execute_script("arguments[0].scrollIntoView();", next_button)  # Scroll to button
            time.sleep(2)
            driver.execute_script("arguments[0].click();", next_button)  
            print("Clicked 'Next' button, loading next page...")
            time.sleep(4)  
            return True  

    except (StaleElementReferenceException, NoSuchElementException):
        print("No more pages or 'Next' button not found.")
        return False  

    return False

In [18]:
def get_driver():
    """This function is used to set up a Selenium WebDriver with headless Chrome 
    in order to go through the data within Yelp."""
    options = Options()
    options.add_argument("--headless")  
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")       
    options.add_argument("--window-size=1920,1080")
    time.sleep(5)
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    return driver

def yelp_dates(clean_yelp_url):
    "This function extracts the time at which each yelp rating was posted"
    driver = get_driver()
    driver.get(clean_yelp_url)
    time.sleep(5) # <-- allows time for javascript to load
    ask_community_reviews = driver.find_elements(By.XPATH, "//section[@aria-label='Ask the Community']//ul[contains(@class, 'list__09f24__ynIEd')]/li")
    length = len(ask_community_reviews)
    all_page_dates = []
    page = 1
    while True:
        time.sleep(3)
        review_sections = driver.find_elements(By.XPATH, "//ul[contains(@class, 'list__09f24__ynIEd')]//li") # <--- tells the webscraper to start scraping at the start constumer reviews section
        print(f"Found {len(review_sections)} dates.")  # Debugging
        if not review_sections:
            print("No dates found on this page.")
            break 
        
        dates = []
        for d in review_sections:
            try:
                date_element = d.find_element(By.XPATH, ".//span[contains(@class, 'y-css-1d8mpv1')][not(contains(text(), 'Photos'))]")
                date = date_element.text.strip()  
                if date:
                    dates.append(date)
                else:
                    continue
            except Exception as e:
                print(f"Error extracting rating: {e}") 
        all_page_dates.append(dates)
        print(f"Stored {len(dates)} from Page {page} into all_page_dates")
        if page > 2:
            print(f"Reached page limit {page}")
            break
        if not click_next_page(driver):
            print("No more pages to scrape reviews from")
            break
        page += 1
    driver.quit()
    return all_page_dates
    
all_dates = yelp_dates(clean_yelp_url)
print(all_dates)

Found 0 dates.
No dates found on this page.
[]


In [19]:
def yelp_ratings(clean_yelp_url):
    "This function extracts each yelp rating using Selenium"
    driver = get_driver()
    driver.get(clean_yelp_url)
    time.sleep(5) # <-- allows time for javascript to load
    all_page_ratings = []
    page = 1
    while True:
        time.sleep(2)
        review_sections = driver.find_elements(By.XPATH, "//ul[contains(@class, 'list__09f24__ynIEd')]//li") # <--- tells the webscraper to start scraping at the start constumer reviews section
        print(f"Found {len(review_sections)} reviews.")  # Debugging
        if not review_sections:
            print("No reviews found on this page.")
            break 
        
        ratings = []
        for review in review_sections:
            try:
                rating_element = review.find_elements(By.XPATH, ".//div[contains(@class, 'y-css-dnttlc')]")  
                if rating_element:
                    rating = rating_element[0].get_attribute("aria-label") 
                    ratings.append(rating)
                else:
                    continue
            except Exception as e:
                print(f"Error extracting rating: {e}") 
        time.sleep(2)
        all_page_ratings.append(ratings)
        print(f"Stored {len(ratings)} from Page {page} into all_page_ratings")
        if page > 2:
            print(f"Reached page limit {page}")
            break
        if not click_next_page(driver):
            print("No more pages to scrape reviews from")
            break
        page += 1
    driver.quit()
    return all_page_ratings

all_reviews = yelp_ratings(clean_yelp_url)
print(all_reviews)

Found 0 reviews.
No reviews found on this page.
[]


In [20]:
headers = {"User-Agent": ua.random}
def get_driver():
    """This function is used to set up a Selenium WebDriver with headless Chrome 
    in order to go through the data within Yelp."""
    options = Options()
    options.add_argument("--headless")  
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")       
    options.add_argument("--window-size=1920,1080")
    time.sleep(5)
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    return driver
driver = get_driver()

def community_section_len(driver):
    try:
        ask_community_section = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, "//section[contains(@aria-label, 'Ask the Community')]"))
        )
        community_section = []
        for i in ask_community_section:
            try:
                community_dates = ask_community_section.find_elements(By.XPATH, ".//span[contains(@class, 'y-css-1d8mpv1')]")
                if community_dates:
                    community_section.append(community_dates)
                else: 
                    continue
            except Exception as e:
                print(f"Error extracting rating: {e}") 
        length = len(community_section)
        return length
    
    except TimeoutException:
        print("Ask the Community section not found.")
        return 0

community_section_len(driver)

NameError: name 'TimeoutException' is not defined