In [50]:
from bs4 import BeautifulSoup
import re
import time
import requests
import random
import string

import pandas as pd


In [51]:
def clean_text(text: str) -> str:
    """Flatten a text snippet onto one line separated by single spaces.

    Newlines, carriage returns and tabs are treated like spaces, every run of
    such whitespace is collapsed to exactly one space, and the result is
    stripped at both ends.

    Args:
        text: Raw text, e.g. a review body extracted from HTML.

    Returns:
        The normalised text; an empty string if ``text`` holds only whitespace.
    """
    # A single ``replace('  ', ' ')`` pass only halves each run of spaces, so
    # three or more consecutive blanks used to leave a double space behind.
    # Matching the whole run collapses it in one go.
    return re.sub(r'[ \t\r\n]+', ' ', text).strip()

def extract_by_regex(text: str, regex: str) -> str:
    """Return the part of ``text`` captured by the first match of ``regex``.

    Args:
        text: String to search.
        regex: Regular expression, with or without capture groups.

    Returns:
        * ``""`` when the pattern does not match;
        * the whole match when the pattern defines no capture groups;
        * otherwise the first capture group, followed by a space and the
          second capture group when the pattern defines more than one group.
          Groups beyond the second are ignored. Groups that did not take part
          in the match (optional groups) are skipped.
    """
    match = re.search(regex, text)
    if match is None:
        return ""

    groups = match.groups()
    if not groups:
        # No capture groups defined: fall back to the entire match.
        return match.group(0)

    # An optional group that did not participate is None; concatenating it
    # used to raise TypeError (or leak None out of a ``-> str`` function).
    # Filter on ``is not None`` so that empty-string captures are still kept.
    return " ".join(part for part in groups[:2] if part is not None)

def filter_by_regex(text, pattern):
    """Strip every occurrence of ``pattern`` out of ``text``.

    Args:
        text: String to filter.
        pattern: Regular expression whose matches are removed.

    Returns:
        What is left of ``text`` after the removal, or ``None`` when nothing
        is left (empty string).
    """
    remaining = re.sub(pattern, '', text)
    if not remaining:
        return None
    return remaining

In [52]:
class TripAdvisorScraper:
    """
    Base class for TripAdvisor scraping.
    All other classes inherit from this.

    The instance keeps the most recently requested URL and the parsed
    document of the most recently fetched page.
    """
    def __init__(self):
        # Scheme + host prepended to every relative URL given to fetch_page.
        self.url_base = "https://www.tripadvisor.fr"
        # Parsed document of the last successfully fetched page (None before that).
        self.soup = None
        # Relative URL of the last requested page.
        self.url = None
        # url_base + url of the last requested page.
        self.full_url = None

    def fetch_page(self, url, timeout=30):
        """Fetch a page and set the soup.

        Args:
            url: Path relative to ``url_base`` (it is concatenated as-is, so
                it is expected to start with ``/``).
            timeout: Seconds to wait for the server before giving up. Without
                a timeout ``requests`` may block indefinitely on a stalled
                connection.

        Raises:
            requests.exceptions.HTTPError: On a 4xx/5xx response.
            requests.exceptions.RequestException: On connection problems,
                including timeouts.
        """
        self.url = url
        self.full_url = self.url_base + url
        # Fresh random 180-character token sent as X-Requested-By on every request.
        random_request_id = "".join(
            random.choices(string.ascii_lowercase + string.digits, k=180)
        )

        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
            "accept-language": "en-US,en;q=0.9,fr;q=0.8",
            "X-Requested-By": random_request_id,
            "Referer": "https://www.tripadvisor.fr/Hotels",
            "Origin": "https://www.tripadvisor.fr",
            "accept-encoding": "gzip, deflate, br",
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "encoding": "utf-8",
        }

        response = requests.get(self.full_url, headers=headers, timeout=timeout)
        response.raise_for_status()
        self.soup = BeautifulSoup(response.content, "html.parser")

    def print_soup(self):
        """Print the soup for debugging purposes."""
        if self.soup:
            print(self.soup.prettify())
        else:
            print("Soup is not initialized. Fetch the page first.")

    def get_next_url(self):
        """Get the next page's URL.

        Returns:
            The ``href`` of the "Suivant" pagination link, or ``None`` when
            the current page has no such link or no page has been fetched yet.
        """
        if self.soup is None:
            # Nothing fetched yet: there is no next page to report.
            return None
        next_button = self.soup.find(
            "a", class_="BrOJk u j z _F wSSLS tIqAi Vonfv",
            attrs={"aria-label": "Suivant"}
        )
        return next_button["href"] if next_button else None

In [53]:
class TripAdvisorSpecificRestaurantScraper(TripAdvisorScraper):
    """Scraper for reviews of a specific restaurant."""
    def __init__(self):
        super().__init__()
        # Not written by any method of this class; kept for existing callers.
        self.restaurant_data = []

    def get_review_cards(self):
        """Extract review cards.

        Returns:
            Every ``<div class="_c">`` element of the current page, or an
            empty list (after printing a hint) when no page has been fetched.
        """
        if self.soup:
            return self.soup.find_all("div", class_="_c")
        print("Soup not initialized. Fetch the page first.")
        return []

    @staticmethod
    def _find_text(card, tag, css_class):
        """Return the stripped text of the first matching element, or None."""
        node = card.find(tag, class_=css_class)
        return node.get_text(strip=True) if node is not None else None

    def parse_review(self, review_card):
        """Parse data from a single review card.

        Args:
            review_card: One element returned by ``get_review_cards``.

        Returns:
            A dict with the keys ``review_text``, ``contributions``, ``date``,
            ``user_name`` and ``rating``. A field is ``None`` when the
            corresponding element is missing from the card.
        """
        review_text_class = "biGQs _P pZUbB KxBGd"
        contributions_class = "b"
        date_class = "biGQs _P pZUbB ncFvv osNWb"
        user_name_class = "BMQDV _F Gv wSSLS SwZTJ FGwzt ukgoS"
        rating_class = "UctUV d H0"

        # Each element is looked up once (previously every lookup ran twice).
        review_text = self._find_text(review_card, "div", review_text_class)
        contributions = self._find_text(review_card, "span", contributions_class)
        date = self._find_text(review_card, "div", date_class)
        user_name = self._find_text(review_card, "a", user_name_class)

        # The rating is read from the <title> child of the rating <svg>.
        # Guard both levels: an <svg> without <title> used to raise
        # AttributeError on ``None.text``.
        rating_svg = review_card.find("svg", class_=rating_class)
        rating_title = rating_svg.title if rating_svg is not None else None
        rating = rating_title.text if rating_title is not None else None

        # Drop the "Rédigé le" prefix so only the date itself remains.
        date = filter_by_regex(date, r"Rédigé le") if date else None
        # Keep only the "d,d" number found in the title text.
        rating = extract_by_regex(rating, r"(\d\,\d)") if rating else None

        return {
            "review_text": clean_text(review_text) if review_text else None,
            "contributions": contributions,
            "date": date,
            "user_name": user_name,
            "rating": rating,
        }

    def get_all_reviews(self):
        """Get all reviews from the restaurant.

        Starts from the page that is currently loaded and follows the "next"
        link until there is none. Returns an empty list when ``fetch_page``
        has not been called first. On normal completion ``self.url`` is
        ``None``.

        Returns:
            A list of dicts as produced by ``parse_review``.
        """
        reviews = []
        while self.url:
            reviews.extend(
                self.parse_review(card) for card in self.get_review_cards()
            )
            self.url = self.get_next_url()
            if self.url:
                # Random pause before each follow-up request; parsing the page
                # that is already downloaded needs no delay.
                time.sleep(random.uniform(1, 3))
                self.fetch_page(self.url)
        return reviews

In [54]:
# Path of the restaurant page to scrape; fetch_page appends it to the scraper's url_base.
url = '/Restaurant_Review-g187265-d23110895-Reviews-Frazarin-Lyon_Rhone_Auvergne_Rhone_Alpes.html'
restaurants_scrapper = TripAdvisorSpecificRestaurantScraper()
# Performs the HTTP request and stores the parsed page on the scraper instance.
restaurants_scrapper.fetch_page(url)

In [55]:
# Parse only the review cards of the page fetched above (no pagination here).
cards = restaurants_scrapper.get_review_cards()
for card in cards:
    # Each parsed review is a dict with review_text, contributions, date, user_name and rating.
    print(restaurants_scrapper.parse_review(card))

{'review_text': "Ambiance soignée, patrons amicaux et donnant toutes les explications bienvenues sur l'histoire de leur restaurant et sur les plats. De beaux plats, 2 à 3 choix par type. Très bien cuisiné, assaisonné et portions équilibrées. Belle carte des vins, très fournies. Bravo à vous, nous reviendrons", 'contributions': '8', 'date': '15 décembre 2024', 'user_name': 'Fred S', 'rating': '5,0'}
{'review_text': 'Vous voulez bien manger, alors allez dans ce restaurant. Tout est parfait. L\'accueil, la sympathie du personnel, excellent. Vous pouvez même vous rendre en cuisine pour féliciter le cuisinier, il est très content d\'entendre votre satisfaction. Je le recommande vivement. En sortant du restaurant, mes amis m\'ont dit " il faut que l\'on revienne ici, trop bon "', 'contributions': '1', 'date': '27 novembre 2024', 'user_name': 'Guide38409630404', 'rating': '5,0'}
{'review_text': 'Tout simplement parfait de l’entrée au dessert en passant par les vins et le service.  Assurément 