In [120]:
from bs4 import BeautifulSoup
import re
import time
import requests
import random
import string

import pandas as pd


In [121]:
def clean_text(text: str) -> str:
    """
    Normalize the whitespace of a text fragment.

    Newlines, carriage returns and tabs are replaced by spaces, every run of
    consecutive spaces is collapsed into a single space, and leading/trailing
    whitespace is removed.

    @param text: raw text to clean
    @return: the cleaned text
    """
    txt = text.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ')
    # A single str.replace('  ', ' ') pass only halves each run of spaces
    # (four spaces become two), so collapse whole runs with a regex instead.
    txt = re.sub(r' {2,}', ' ', txt)
    return txt.strip()

def extract_by_regex(text: str, regex: str) -> str:
    """
    Search `text` for `regex` and return the captured value as a string.

    - No match: returns "".
    - Pattern without capture groups: returns the whole match.
    - Pattern with one capture group: returns that group.
    - Pattern with two or more capture groups: returns the first two groups
      joined by a single space (any further groups are ignored).

    Groups that did not participate in the match (e.g. an optional group)
    are skipped instead of raising a TypeError on string concatenation, so
    the function always returns a str.

    @param text: text to search
    @param regex: regular expression pattern
    @return: the extracted text, or "" when nothing matched
    """
    match = re.compile(regex).search(text)
    if match is None:
        return ""

    groups = match.groups()
    if not groups:
        # No capture groups defined: return the entire match
        return match.group(0)

    # Only the first two groups are part of the result; a non-participating
    # group is None and must not be concatenated.
    captured = [group for group in groups[:2] if group is not None]
    return " ".join(captured)

In [164]:
class TripAdvisorScraper:
    """
    Base class for TripAdvisor scraping.
    All the other classes will inherit from this one.
    """

    def __init__(self):
        self.url_base = "https://www.tripadvisor.com"
        self.soup = None  # parsed page, set by fetch_page
        self.url = None  # relative URL passed to the last fetch_page call
        self.full_url = None  # url_base + url

    def fetch_page(self, url, timeout=30):
        """
        Fetch the page and set the soup.

        @param url: URL relative to url_base (path and query string)
        @param timeout: seconds passed to requests.get as its timeout, so a
            stalled connection raises instead of blocking forever
        """
        self.url = url
        self.full_url = self.url_base + url
        # Set headers
        random_request_id = "".join(
            random.choice(string.ascii_lowercase + string.digits) for _ in range(180)
        )

        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
            "accept-language": "en-US,en;q=0.9,fr;q=0.8",
            "X-Requested-By": random_request_id,
            "Referer": "https://www.tripadvisor.com/Hotels",
            "Origin": "https://www.tripadvisor.com",
            "accept-encoding": "gzip, deflate, br",
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "encoding": "utf-8",
        }

        # Send a GET request
        response = requests.get(self.full_url, headers=headers, timeout=timeout)

        # requests already undoes the gzip/deflate content encoding, so
        # response.text is the decoded page. Decoding response.content with
        # the "gzip"/"zlib" codec names cannot work: they are not text
        # encodings and bytes.decode raises LookupError for them.
        # NOTE(review): "br" is advertised in accept-encoding; presumably this
        # needs a brotli package installed for requests to decode it - confirm.
        self.soup = BeautifulSoup(response.text, "html.parser")

    def print_soup(self):
        """
        Print the prettified soup, or a hint when no page was fetched yet.
        """
        if self.soup:
            print(self.soup.prettify())
        else:
            print("Soup is not initialized. Please fetch the page first.")


In [None]:
from bs4 import BeautifulSoup
import time
import requests
import random
import string


class TripAdvisorRestaurantsScraper(TripAdvisorScraper):
    """
    Class to scrape the list of restaurants.
    """

    def __init__(self):
        super().__init__()
        # Documents collected by the latest get_all_pages call
        self.restaurant_data = []
        # Running rank, incremented once per extracted card
        self.ranking = 0

    def get_restaurants_cards(self):
        """
        Get the restaurant cards from the soup.
        @return: list of restaurant cards (beautiful soup objects)
        """
        if not self.soup:
            print("Soup is not initialized. Please fetch the page first.")
            return []
        return self.soup.find_all(
            "div", class_="tbrcR _T DxHsn TwZIp rrkMt nSZNd DALUy Re"
        )

    def get_next_url(self):
        """
        Get the next url to scrape.
        @return: href of the "Next page" link, or None when there is no such
            link or no page has been fetched yet
        """
        if self.soup is None:
            print("Soup is not initialized. Please fetch the page first.")
            return None
        next_link = self.soup.find(
            "a",
            class_="BrOJk u j z _F wSSLS tIqAi unMkR",
            attrs={"aria-label": "Next page"},
        )
        return next_link.get("href") if next_link is not None else None

    def extract_restaurant_data(self, restaurant_cards):
        """
        Extract the restaurant data from the restaurant cards.
        @param restaurant_cards: list of restaurant cards (beautiful soup objects)
        @return: list of restaurant data (list of dicts)
        """
        corpus = []

        # CSS class strings used to locate each field inside a card
        restaurant_url = "BMQDV _F Gv wSSLS SwZTJ FGwzt ukgoS"
        restaurant_reviews = "jVDab W f u w JqMhy"
        # Type and price share the same class: the type is taken from the
        # first matching span, the price from the first one containing "$".
        restaurant_type = "biGQs _P pZUbB hmDzD"
        restaurant_price = "biGQs _P pZUbB hmDzD"

        for restaurant_card in restaurant_cards:
            self.ranking += 1

            # Look every element up once and reuse it
            link = restaurant_card.find("a", {"class": restaurant_url})
            reviews = restaurant_card.find("div", {"class": restaurant_reviews})
            type_span = restaurant_card.find("span", {"class": restaurant_type})
            price_texts = [
                span.text
                for span in restaurant_card.find_all("span", {"class": restaurant_price})
                if "$" in span.text
            ]

            name = None
            if link is not None and link.text:
                # Strip only a leading "<number>." prefix. Splitting on every
                # "." would truncate names that contain a dot themselves
                # (e.g. "1. St. Regis" would become "St").
                name = re.sub(r"^\s*\d+\.(?!\d)\s*", "", link.text).strip() or None

            href = link.get("href") if link is not None else None

            corpus.append(
                {
                    "restaurant_ranking": self.ranking,
                    "restaurant_name": name,
                    # An empty href is reported as None
                    "restaurant_url": href or None,
                    "restaurant_reviews": (
                        reviews.get("aria-label") if reviews is not None else None
                    ),
                    "restaurant_type": (
                        type_span.text if type_span is not None else None
                    ),
                    "restaurant_price": price_texts[0] if price_texts else None,
                }
            )
        return corpus

    def get_all_pages(self, max_tries=10, delay_range=(1, 3)):
        """
        Get all the restaurants, following the "Next page" links from the
        URL currently stored in self.url until no next link is found.

        The documents are also accumulated in self.restaurant_data (the same
        list that is returned), so the pages scraped so far are still
        available if the loop is interrupted or aborts.

        @param max_tries: number of consecutive fetches without any card that
            is tolerated for one page before aborting
        @param delay_range: (min, max) seconds of the random pause before
            each request
        @return: list of restaurant data (list of dicts)
        @raise RuntimeError: when a page yields no cards more than max_tries
            times in a row
        """
        page = 1
        corpus = self.restaurant_data = []
        tries = 0
        while self.url is not None:
            time.sleep(random.uniform(*delay_range))
            self.fetch_page(self.url)
            new_cards = self.get_restaurants_cards()
            if not new_cards:
                # Same URL is fetched again on the next iteration
                tries += 1
                if tries > max_tries:
                    raise RuntimeError("No restaurant cards found - Aborting")
                continue
            corpus.extend(self.extract_restaurant_data(new_cards))
            print(f"Page {page} done, corpus size: {len(corpus)}")
            page += 1
            self.url = self.get_next_url()
            tries = 0
        return corpus


In [284]:
# Relative URL of the restaurant search used as the starting page
# (presumably offset=0 is the first results page - confirm).
restaurants_comp = "/FindRestaurants?geo=187265&offset=0&establishmentTypes=10591&minimumTravelerRating=TRAVELER_RATING_LOW&broadened=false"
restaurants_scraper = TripAdvisorRestaurantsScraper()
# fetch_page stores the URL on the scraper; get_all_pages starts from that stored URL.
restaurants_scraper.fetch_page(restaurants_comp)

In [285]:
# Scrape every results page starting from the scraper's stored URL.
# get_all_pages fetches that URL itself before reading the cards.
corpus = restaurants_scraper.get_all_pages()

try 0
No restaurant cards found - Trying again
try 1
No restaurant cards found - Trying again
try 2
Page 1 - 33 cards found
Page 1 done, corpus size: 33
try 0
No restaurant cards found - Trying again
try 1
Page 2 - 33 cards found
Page 2 done, corpus size: 66
try 0
No restaurant cards found - Trying again
try 1
No restaurant cards found - Trying again
try 2
Page 3 - 33 cards found
Page 3 done, corpus size: 99
try 0
Page 4 - 32 cards found
Page 4 done, corpus size: 131
try 0
Page 5 - 33 cards found
Page 5 done, corpus size: 164
try 0


KeyboardInterrupt: 

In [237]:
# Inspect the relative URL currently stored on the scraper.
restaurants_scraper.url

'/FindRestaurants?geo=187265&offset=0&establishmentTypes=10591&minimumTravelerRating=TRAVELER_RATING_LOW&broadened=false'

In [238]:
# Collect the restaurant cards from the page currently held in the scraper's soup.
cards = restaurants_scraper.get_restaurants_cards()


{'restaurant_class': '1', 'restaurant_name': 'Les Terrasses de Lyon', 'restaurant_url': '/Restaurant_Review-g187265-d3727154-Reviews-Les_Terrasses_de_Lyon-Lyon_Rhone_Auvergne_Rhone_Alpes.html', 'restaurant_reviews': '4.5 of 5 bubbles. 916 reviews', 'restaurant_type': 'French', 'restaurant_price': '$$$$'}
{'restaurant_class': '2', 'restaurant_name': 'Frazarin', 'restaurant_url': '/Restaurant_Review-g187265-d23110895-Reviews-Frazarin-Lyon_Rhone_Auvergne_Rhone_Alpes.html', 'restaurant_reviews': '5.0 of 5 bubbles. 297 reviews', 'restaurant_type': 'Italian, French', 'restaurant_price': '$$ - $$$'}
{'restaurant_class': '3', 'restaurant_name': 'Agastache Restaurant', 'restaurant_url': '/Restaurant_Review-g187265-d20287839-Reviews-Agastache_Restaurant-Lyon_Rhone_Auvergne_Rhone_Alpes.html', 'restaurant_reviews': '5.0 of 5 bubbles. 202 reviews', 'restaurant_type': 'French, Contemporary', 'restaurant_price': '$$ - $$$'}
{'restaurant_class': '4', 'restaurant_name': 'Le Comptoir Des Cousins', 'rest

In [231]:
# NOTE(review): `doc` is not assigned by any visible notebook cell (it only
# exists as a local inside extract_restaurant_data), so this cell presumably
# relies on leftover kernel state - confirm or remove.
doc

{'restaurant_class': '30',
 'restaurant_name': 'Les Encavés',
 'restaurant_url': '/Restaurant_Review-g187265-d24048789-Reviews-Les_Encaves-Lyon_Rhone_Auvergne_Rhone_Alpes.html',
 'restaurant_reviews': '5.0 of 5 bubbles. 212 reviews',
 'restaurant_type': 'French, Healthy',
 'restaurant_price': None}

In [30]:
# Fetch one results page directly (offset=30 in the query string) for manual inspection.
# This also overwrites the URL stored on the scraper.
restaurants_scraper.fetch_page('/FindRestaurants?geo=187265&offset=30&establishmentTypes=10591&minimumTravelerRating=TRAVELER_RATING_LOW&broadened=false')

In [68]:
# Show the cards found in the current soup; an empty list means no element
# matched the card CSS classes.
restaurants_scraper.get_restaurants_cards()

[]