In [1]:
from bs4 import BeautifulSoup
import re
import time
import requests
import random
import string

import pandas as pd


In [2]:
def clean_text(text: str) -> str:
    """Normalise the whitespace of a text fragment.

    Newlines, carriage returns and tabs are converted to spaces, every run of
    consecutive spaces is collapsed to a single space, and leading/trailing
    whitespace is stripped.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned text.
    """
    txt = text.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ')
    # One ``replace('  ', ' ')`` pass only halves each run of spaces, so three
    # or more consecutive blanks would survive; collapse whole runs instead.
    txt = re.sub(r' {2,}', ' ', txt)
    return txt.strip()

def extract_by_regex(text: str, regex: str) -> str:
    """Return the first match of ``regex`` in ``text`` as a string.

    Args:
        text: The text to search.
        regex: The regular expression to search for.

    Returns:
        * ``""`` when the pattern does not match;
        * the whole match when the pattern defines no capture groups;
        * otherwise the first two capture groups joined by a single space
          (just the first group when only one is defined). Groups that did
          not participate in the match are skipped.
    """
    match = re.compile(regex).search(text)
    if match is None:
        return ""
    groups = match.groups()
    if not groups:
        # No capture groups defined: fall back to the entire match.
        return match.group(0)
    # An optional group that did not participate is None; concatenating it
    # would raise TypeError (or leak None to the caller), so drop it.
    captured = [group for group in groups[:2] if group is not None]
    return " ".join(captured)

def filter_by_regex(text, pattern):
    """Remove every match of ``pattern`` from ``text``.

    Returns the text that is left, or ``None`` when nothing remains.
    """
    remaining = re.compile(pattern).sub('', text)
    if not remaining:
        return None
    return remaining

In [3]:
class TripAdvisorScraper:
    """
    Base class for TripAdvisor scraping.
    All other classes inherit from this.
    """
    def __init__(self):
        # Scheme and host prepended to every relative URL given to fetch_page.
        self.url_base = "https://www.tripadvisor.fr"
        # Parsed HTML of the last successfully fetched page (None before that).
        self.soup = None
        # Relative URL most recently passed to fetch_page.
        self.url = None
        # url_base + url, i.e. the address actually requested.
        self.full_url = None

    def fetch_page(self, url, timeout=30):
        """Fetch a page and set the soup.

        Args:
            url: URL relative to ``url_base`` (path plus optional query string).
            timeout: Seconds to wait for the server before giving up; passed
                straight to ``requests.get``.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            requests.Timeout: If no response arrives within ``timeout``.
        """
        self.url = url
        self.full_url = self.url_base + url
        # A fresh random identifier is sent with every request.
        random_request_id = "".join(
            random.choice(string.ascii_lowercase + string.digits) for _ in range(180)
        )

        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
            "accept-language": "en-US,en;q=0.9,fr;q=0.8",
            "X-Requested-By": random_request_id,
        }

        # Without an explicit timeout requests waits forever on a stalled
        # connection, which would freeze the whole scraping loop.
        response = requests.get(self.full_url, headers=headers, timeout=timeout)
        response.raise_for_status()
        self.soup = BeautifulSoup(response.content, "html.parser")

    def print_soup(self):
        """Print the soup for debugging purposes."""
        if self.soup is not None:
            print(self.soup.prettify())
        else:
            print("Soup is not initialized. Fetch the page first.")

    def get_next_url(self):
        """Get the next page's URL.

        Returns:
            The ``href`` of the "Suivant" pagination link, or ``None`` when no
            page has been fetched yet, the link is not found, or it has no
            ``href`` attribute.
        """
        if self.soup is None:
            # Nothing fetched yet: there is no page to look for a link in.
            return None
        next_button = self.soup.find(
            "a", class_="BrOJk u j z _F wSSLS tIqAi Vonfv",
            attrs={"aria-label": "Suivant"}
        )
        if next_button is None:
            return None
        # .get avoids a KeyError on an anchor without an href attribute.
        return next_button.get("href")


In [4]:
class TripAdvisorRestaurantsScraper(TripAdvisorScraper):
    """Scraper for the list of restaurants."""
    def __init__(self):
        super().__init__()
        self.restaurant_data = []

    def get_restaurant_cards(self):
        """Extract restaurant cards.

        Returns:
            The list of card ``div`` elements found in the current soup, or an
            empty list when no page has been fetched yet.
        """
        if self.soup:
            cards = self.soup.find_all("div", class_="vIjFZ Gi o VOEhq")
            if not cards:
                print("No restaurant cards found. Check the structure.")
            return cards
        print("Soup not initialized. Fetch the page first.")
        return []

    @staticmethod
    def _split_ranking(raw_name):
        """Split a "<rank>. <name>" title into ``(rank, name)``; rank is None when absent."""
        if not raw_name:
            return None, None
        prefix, dot, remainder = raw_name.partition(".")
        # Only a purely numeric prefix is a ranking; a title without a dot, or
        # one such as "St. Germain", is kept whole instead of being cut up.
        if dot and prefix.strip().isdigit():
            return prefix.strip(), remainder.strip()
        return None, raw_name.strip()

    def parse_restaurant(self, restaurant_card):
        """Parse data from a single restaurant card.

        Args:
            restaurant_card: The element holding one restaurant card.

        Returns:
            A dict with the keys ``ranking``, ``name``, ``url``,
            ``avg_review`` and ``total_reviews``. A value is ``None`` when the
            corresponding element is missing from the card.
        """
        name_class = "BMQDV _F Gv wSSLS SwZTJ FGwzt ukgoS"
        reviews_class = "IiChw"
        median_reviews_class = "Qqwyj"

        # The title anchor carries both the displayed name and the link, so a
        # single lookup serves both fields.
        title_link = restaurant_card.find("a", class_=name_class)
        reviews_tag = restaurant_card.find("span", class_=reviews_class)
        rating_tag = restaurant_card.find("span", class_=median_reviews_class)

        raw_name = title_link.get_text(strip=True) if title_link is not None else None
        url = title_link.get("href") if title_link is not None else None
        reviews = reviews_tag.get_text(strip=True) if reviews_tag is not None else None
        median_reviews = rating_tag.get_text(strip=True) if rating_tag is not None else None

        ranking, name = self._split_ranking(raw_name)
        # Drop punctuation and spaces so that only word characters remain.
        reviews = filter_by_regex(reviews, r'\W+') if reviews else None

        return {
            "ranking": ranking,
            "name": name,
            "url": url,
            "avg_review": extract_by_regex(median_reviews, r"(\d+),(\d+)?") if median_reviews else None,
            # A mandatory digit group finds the count wherever it sits in the
            # text; an optional one matches the empty string at position 0.
            "total_reviews": (extract_by_regex(reviews, r"(\d+)") or None) if reviews else None,
        }

    def get_all_restaurants(self):
        """Get all restaurants.

        Starting from ``self.url``, fetches each page, parses its cards and
        follows the "next" link until none is found. A random pause of one to
        three seconds precedes every request.

        Returns:
            The list of parsed restaurant dicts from all visited pages.
        """
        restaurants = []
        while self.url:
            time.sleep(random.uniform(1, 3))
            self.fetch_page(self.url)
            for card in self.get_restaurant_cards():
                restaurant = self.parse_restaurant(card)
                if restaurant:
                    restaurants.append(restaurant)
            print(f"Scraped {len(restaurants)} restaurants")
            self.url = self.get_next_url()
        return restaurants


In [5]:
# Relative URL (path + query string) of the restaurant listing to scrape.
# fetch_page appends it verbatim to the scraper's url_base, so the path must
# appear exactly once in this string.
restaurants_comp = "/FindRestaurants?geo=187265&offset=0&establishmentTypes=10591&minimumTravelerRating=TRAVELER_RATING_LOW&broadened=false"
restaurants_scraper = TripAdvisorRestaurantsScraper()
restaurants_scraper.fetch_page(restaurants_comp)

In [6]:
# Parse every restaurant card found on the page that is currently loaded and
# print the resulting dict (ranking, name, url, avg_review, total_reviews),
# one restaurant per line.
cards = restaurants_scraper.get_restaurant_cards()
for card in cards:
    print(restaurants_scraper.parse_restaurant(card))

{'ranking': '1', 'name': 'Les Terrasses de Lyon', 'url': 'Restaurant_Reviewg187265d3727154ReviewsLes_Terrasses_de_LyonLyon_Rhone_Auvergne_Rhone_Alpeshtml', 'avg_review': '4 5', 'total_reviews': '918'}
{'ranking': '2', 'name': 'Frazarin Bistrot Franco Italien', 'url': 'Restaurant_Reviewg187265d23110895ReviewsFrazarinLyon_Rhone_Auvergne_Rhone_Alpeshtml', 'avg_review': '5 0', 'total_reviews': '298'}
{'ranking': '3', 'name': 'Le Grand Réfectoire', 'url': 'Restaurant_Reviewg187265d15373478ReviewsLe_Grand_RefectoireLyon_Rhone_Auvergne_Rhone_Alpeshtml', 'avg_review': '4 0', 'total_reviews': '1332'}
{'ranking': '4', 'name': 'Le Comptoir Des Cousins', 'url': 'Restaurant_Reviewg187265d12874430ReviewsLe_Comptoir_Des_CousinsLyon_Rhone_Auvergne_Rhone_Alpeshtml', 'avg_review': '5 0', 'total_reviews': '711'}
{'ranking': '5', 'name': 'Agastache Restaurant', 'url': 'Restaurant_Reviewg187265d20287839ReviewsAgastache_RestaurantLyon_Rhone_Auvergne_Rhone_Alpeshtml', 'avg_review': '5 0', 'total_reviews': '2

In [7]:
# Look up the "next page" link on the loaded page and display the type of the
# result; get_next_url returns None when no matching link is found.
url = restaurants_scraper.get_next_url()
type(url)


NoneType