In [1]:
import random
import re
import string
import time
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup


In [2]:
def clean_text(text: str) -> str:
    """Normalise the whitespace of a scraped text fragment.

    Newlines, carriage returns and tabs are treated like spaces, every run of
    these characters is collapsed into a single space, and leading/trailing
    whitespace is stripped.

    Args:
        text: Raw text, e.g. the ``.text`` of a review element.

    Returns:
        The text on a single line with exactly one space between words.
    """
    # A single str.replace('  ', ' ') pass only halves a run of blanks, so
    # three or more consecutive spaces would leave a double space behind;
    # the regex collapses runs of any length in one pass.
    return re.sub(r'[ \n\r\t]+', ' ', text).strip()

def extract_by_regex(text: str, regex: str) -> str:
    """Return the part of ``text`` selected by the first match of ``regex``.

    Args:
        text: String to search.
        regex: Regular expression; it may define zero, one or more capturing
            groups.

    Returns:
        * ``""`` when the pattern does not match;
        * the whole match when the pattern defines no capturing group;
        * the first group when exactly one group is defined;
        * the first two groups joined by a single space otherwise.
        Groups that did not take part in the match (optional groups) are
        skipped, so the result is always a string.
    """
    match = re.search(regex, text)
    if match is None:
        return ""

    groups = match.groups()
    if not groups:
        # No capturing group defined: fall back to the entire match.
        return match.group(0)

    # Only the first two groups are ever used. An optional group that did not
    # participate is None; concatenating it would raise TypeError, so drop it.
    captured = [group for group in groups[:2] if group is not None]
    return " ".join(captured)

In [None]:
class TripAdvisorScraper:
    """Base scraper: downloads a TripAdvisor page and parses it with BeautifulSoup.

    Instance attributes (all set in ``__init__``):
        url_base: Scheme and host prepended to every path given to ``fetch_page``.
        soup: Parsed document of the last fetched page, ``None`` before any fetch.
        url: Absolute URL of the last requested page, ``None`` before any fetch.
    """

    def __init__(self):
        self.url_base = 'https://www.tripadvisor.com'
        self.soup = None
        self.url = None

    def fetch_page(self, url, timeout=30):
        """Download ``self.url_base + url`` and store the parsed HTML in ``self.soup``.

        Args:
            url: Path relative to ``self.url_base`` (e.g. ``'/Restaurant_Review-...html'``).
            timeout: Seconds to wait for the server; without it a stalled
                connection would block the notebook indefinitely.

        Raises:
            requests.exceptions.RequestException: Propagated from ``requests.get``
                on network failures or when ``timeout`` expires.
        """
        self.url = self.url_base + url

        # Random 180-character lowercase/digit token sent as X-Requested-By.
        random_request_id = "".join(
            random.choice(string.ascii_lowercase + string.digits) for _ in range(180)
        )

        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
            'accept-language': 'en-US,en;q=0.9,fr;q=0.8',
            "X-Requested-By": random_request_id,
            "Referer": "https://www.tripadvisor.com/Hotels",
            "Origin": "https://www.tripadvisor.com",
            # NOTE(review): 'br' is advertised here; requests only decodes
            # brotli bodies when a brotli package is installed - confirm.
            'accept-encoding': 'gzip, deflate, br',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'encoding': 'utf-8',
        }

        response = requests.get(self.url, headers=headers, timeout=timeout)
        if not response.ok:
            # Keep going (the page is still parsed) but make the failure visible
            # instead of silently producing an empty result later on.
            print(f"Warning: received HTTP {response.status_code} for {self.url}")

        # requests has already undone gzip/deflate content encodings, so
        # response.text is the decoded HTML. Calling bytes.decode('gzip') or
        # bytes.decode('zlib') on the body is not a valid text decoding and
        # always raised, so no manual decompression is attempted here.
        self.soup = BeautifulSoup(response.text, 'html.parser')

    def print_soup(self):
        """Pretty-print the parsed page, or a hint when nothing has been fetched yet."""
        if self.soup:
            print(self.soup.prettify())
        else:
            print("Soup is not initialized. Please fetch the page first.")

class TripAdvisorSpecificRestaurantScrapper(TripAdvisorScraper):
    """Scraper for the paginated review list of a single restaurant page."""

    def __init__(self):
        super().__init__()
        self.restaurant_data = []

    def get_review_cards(self):
        """Return all review card elements of the loaded page (``[]`` if none is loaded)."""
        if self.soup:
            return self.soup.find_all('div', class_='_c', attrs={'data-automation': 'reviewCard'})
        print("Soup is not initialized. Please fetch the page first.")
        return []

    @staticmethod
    def _find_text(card, tag, css_class):
        """Return the text of the first ``tag`` with ``css_class`` in ``card``, or ``None``."""
        element = card.find(tag, {'class': css_class})
        return element.text if element is not None else None

    def get_review_page(self, review_cards):
        """Convert review card elements into a list of review dicts.

        Args:
            review_cards: Iterable of card elements as returned by
                ``get_review_cards``.

        Returns:
            One dict per card with the keys ``rating``, ``user_name``, ``date``,
            ``contributions`` and ``review_text``. A field is ``None`` when its
            element is missing or empty.
        """
        # CSS class strings identifying each field inside a review card.
        review_text = 'biGQs _P pZUbB KxBGd'
        contributions = 'biGQs _P pZUbB osNWb'
        date = 'biGQs _P pZUbB ncFvv osNWb'
        user_name = 'biGQs _P fiohW fOtGX'
        rating = 'UctUV d H0'

        corpus = []
        for card in review_cards:
            scrap_review_text = self._find_text(card, 'div', review_text)
            scrap_contributions = self._find_text(card, 'div', contributions)
            scrap_date = self._find_text(card, 'div', date)
            scrap_user_name = self._find_text(card, 'span', user_name)

            # The rating is read from the <title> nested in the rating <svg>;
            # either element may be absent, so guard both lookups.
            rating_svg = card.find('svg', {'class': rating})
            rating_title = rating_svg.find('title') if rating_svg is not None else None
            scrap_rating = rating_title.text if rating_title is not None else None

            corpus.append({
                'rating': extract_by_regex(scrap_rating, r'(\d\.\d) of 5 bubbles') if scrap_rating else None,
                'user_name': scrap_user_name or None,
                'date': extract_by_regex(scrap_date, r'(\w+ \d+), (\d+)') if scrap_date else None,
                'contributions': extract_by_regex(scrap_contributions, r"\d+") if scrap_contributions else None,
                'review_text': clean_text(scrap_review_text) if scrap_review_text else None,
            })

        return corpus

    def get_all_pages(self):
        """Collect the reviews of the loaded page and of every following page.

        Starts from the page already loaded by ``fetch_page`` and follows the
        "Next page" link until a page has no review cards or no next link.

        Returns:
            The concatenated list of review dicts of all visited pages.
        """
        page = 1
        corpus = []
        while self.url is not None:
            new_cards = self.get_review_cards()
            if len(new_cards) == 0:
                break
            corpus.extend(self.get_review_page(new_cards))
            print(f"Page {page} done")
            page += 1
            url = self.get_next_url()
            if url is None:
                break
            # Random pause between consecutive requests only; parsing a page
            # that is already in memory needs no throttling.
            time.sleep(random.uniform(1, 3))
            self.fetch_page(url)
        return corpus

    def get_next_url(self):
        """Return the ``href`` of the "Next page" link, or ``None`` if there is none."""
        if self.soup is None:
            return None
        next_url = self.soup.find('a', class_='BrOJk u j z _F wSSLS tIqAi unMkR', attrs={'aria-label': "Next page"})
        return next_url.get('href') if next_url is not None else None
        

        
        
        
    

In [4]:

# Path (appended to the scraper's url_base) of the restaurant page whose reviews are scraped.
url = '/Restaurant_Review-g187265-d12419021-Reviews-L_Auberge_Des_Canuts-Lyon_Rhone_Auvergne_Rhone_Alpes.html'

# Download and parse that page; the parsed document is kept on the scraper as `soup`.
scraper = TripAdvisorSpecificRestaurantScrapper()
scraper.fetch_page(url)

In [5]:
# Gather the reviews of the loaded page and of every page reached through the "Next page" link.
corpus = scraper.get_all_pages()

Page 1 done
Page 2 done
Page 3 done
Page 4 done
Page 5 done
Page 6 done
Page 7 done
Page 8 done
Page 9 done
Page 10 done
Page 11 done


In [6]:
# One row per scraped review; the keys of each review dict become the columns.
df = pd.DataFrame(corpus)
df


Unnamed: 0,review_text,rating,user_name,date,contributions
0,Lovey simple meal at a bouchon. Good size port...,4.0,AngryofTollcross,October 18 2024,10
1,I had a lunch/dinner here ordering off one of ...,4.0,graceinbeijing,March 1 2020,71
2,"We made the reservation with ""the Fork"" and wa...",5.0,Laura M,October 27 2020,1
3,We could clearly see the hygiene and cleanline...,1.0,Tara Bee,June 11 2022,395
4,"Upon arrival, the real welcome in cork: large ...",5.0,Luna T,October 28 2024,61
...,...,...,...,...,...
148,Very good experience. Quality Lyonnaise cuisin...,5.0,spinuche,August 10 2023,30
149,Address to advise Good typical Lyonnais dishes...,5.0,Jean Benoit B,August 7 2023,2
150,"Impeccable service of Lyonnais dishes, very si...",5.0,Loic S,August 5 2023,1
151,Excellent and pleasant service from start to f...,5.0,Stéphane L,August 4 2023,1


In [7]:
restaurant_name = 'L_Auberge_Des_Canuts'

# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists before writing (a fresh checkout may not have ./data).
output_dir = Path('./data')
output_dir.mkdir(parents=True, exist_ok=True)
df.to_csv(output_dir / f'{restaurant_name}.csv', index=False)

In [None]:
class TripAdvisorRestaurantsScrapper(TripAdvisorScraper):
    """Scraper for the paginated restaurant listing (search result) pages."""

    def __init__(self):
        super().__init__()
        self.restaurant_data = []

    def get_restaurants_cards(self):
        """Return all restaurant card elements of the loaded page (``[]`` if none is loaded)."""
        if self.soup:
            return self.soup.find_all('div', class_='qeraN')
        print("Soup is not initialized. Please fetch the page first.")
        return []

    def get_next_url(self):
        """Return the ``href`` of the "Next page" link, or ``None`` if there is none."""
        if self.soup is None:
            return None
        next_url = self.soup.find('a', class_='BrOJk u j z _F wSSLS tIqAi unMkR', attrs={'aria-label': "Next page"})
        return next_url.get('href') if next_url is not None else None

    @staticmethod
    def _split_rank_and_name(label):
        """Split a card title such as ``'12. Chez Paul'`` into ``('12', 'Chez Paul')``.

        Only a leading ``<digits>.`` prefix is treated as the rank, so dots
        inside the name (``'3. St. Germain'``) are preserved. A title without
        such a prefix yields ``(None, <stripped title>)`` and ``None`` yields
        ``(None, None)``.
        """
        if label is None:
            return None, None
        match = re.match(r'\s*(\d+)\s*\.\s*(.*)', label, flags=re.DOTALL)
        if match is None:
            return None, label.strip()
        return match.group(1), match.group(2).strip()

    def extract_restaurant_data(self, restaurant_cards):
        """Convert restaurant card elements into a list of restaurant dicts.

        Args:
            restaurant_cards: Iterable of card elements as returned by
                ``get_restaurants_cards``.

        Returns:
            One dict per card with the keys ``restaurant_class`` (the numeric
            prefix of the card title), ``restaurant_name``, ``restaurant_url``
            (the link's ``href``) and ``restaurant_reviews`` (the ``aria-label``
            of the reviews element). Missing values are ``None``.
        """
        # CSS class strings of the title link and of the reviews summary element.
        restaurant_url = 'BMQDV _F Gv wSSLS SwZTJ FGwzt ukgoS'
        restaurant_reviews = 'jVDab W f u w JqMhy'

        corpus = []
        for restaurant_card in restaurant_cards:
            # The same <a> element provides both the title text and the href.
            link = restaurant_card.find('a', {'class': restaurant_url})
            reviews = restaurant_card.find('div', {'class': restaurant_reviews})

            scrap_restaurant_name = link.text if link is not None else None
            scrap_restaurant_url = link.get('href') if link is not None else None
            scrap_restaurant_reviews = reviews.get('aria-label') if reviews is not None else None

            rank, name = self._split_rank_and_name(scrap_restaurant_name)
            corpus.append({
                'restaurant_class': rank,
                'restaurant_name': name,
                'restaurant_url': scrap_restaurant_url or None,
                'restaurant_reviews': scrap_restaurant_reviews,
            })
        return corpus

    def get_all_pages(self):
        """Collect the restaurants of the loaded page and of every following page.

        Starts from the page already loaded by ``fetch_page`` and follows the
        "Next page" link until there is none.

        Returns:
            The concatenated list of restaurant dicts of all visited pages.
        """
        page = 1
        corpus = []
        while self.url is not None:
            new_cards = self.get_restaurants_cards()
            corpus.extend(self.extract_restaurant_data(new_cards))
            print(f"Page {page} done")
            page += 1
            url = self.get_next_url()
            if url is None:
                break
            # Random pause between consecutive requests only; parsing a page
            # that is already in memory needs no throttling.
            time.sleep(random.uniform(1, 3))
            self.fetch_page(url)
        return corpus

In [56]:
# Query path of the restaurant listing, starting at offset 0; appended to the scraper's url_base.
url = '/FindRestaurants?geo=187265&offset=0&establishmentTypes=10591&minimumTravelerRating=TRAVELER_RATING_LOW&broadened=false'
# Download and parse the first listing page.
restaurants_scrapper = TripAdvisorRestaurantsScrapper()
restaurants_scrapper.fetch_page(url)

In [57]:
# Restaurant card elements found on the currently loaded listing page.
cards = restaurants_scrapper.get_restaurants_cards()


In [58]:
# Gather the restaurants of every listing page; note this rebinds `corpus` from reviews to restaurants.
corpus = restaurants_scrapper.get_all_pages()

Page 1 done
Page 2 done
Page 3 done
Page 4 done
Page 5 done
Page 6 done
Page 7 done
Page 8 done
Page 9 done
Page 10 done
Page 11 done
Page 12 done
Page 13 done
Page 14 done
Page 15 done
Page 16 done
Page 17 done
Page 18 done
Page 19 done
Page 20 done
Page 21 done
Page 22 done
Page 23 done
Page 24 done
Page 25 done
Page 26 done
Page 27 done
Page 28 done
Page 29 done
Page 30 done
Page 31 done
Page 32 done
Page 33 done
Page 34 done
Page 35 done
Page 36 done
Page 37 done
Page 38 done
Page 39 done
Page 40 done
Page 41 done
Page 42 done
Page 43 done
Page 44 done
Page 45 done
Page 46 done
Page 47 done
Page 48 done
Page 49 done
Page 50 done
Page 51 done
Page 52 done
Page 53 done
Page 54 done
Page 55 done
Page 56 done
Page 57 done
Page 58 done
Page 59 done
Page 60 done
Page 61 done
Page 62 done
Page 63 done
Page 64 done
Page 65 done
Page 66 done
Page 67 done
Page 68 done
Page 69 done
Page 70 done
Page 71 done
Page 72 done
Page 73 done
Page 74 done
Page 75 done


In [59]:
# One row per scraped restaurant; note this rebinds `df` from the reviews frame to the restaurants frame.
df = pd.DataFrame(corpus)
df

Unnamed: 0,restaurant_class,restaurant_name,restaurant_url,restaurant_reviews
0,1,Frazarin,/Restaurant_Review-g187265-d23110895-Reviews-F...,5.0 of 5 bubbles. 297 reviews
1,2,Les Terrasses de Lyon,/Restaurant_Review-g187265-d3727154-Reviews-Le...,4.5 of 5 bubbles. 916 reviews
2,3,Le Casse Museau,/Restaurant_Review-g187265-d949361-Reviews-Le_...,5.0 of 5 bubbles. 407 reviews
3,4,Agastache Restaurant,/Restaurant_Review-g187265-d20287839-Reviews-A...,5.0 of 5 bubbles. 201 reviews
4,5,Le Comptoir Des Cousins,/Restaurant_Review-g187265-d12874430-Reviews-L...,5.0 of 5 bubbles. 710 reviews
...,...,...,...,...
2241,2242,Levantia,/Restaurant_Review-g187265-d28016436-Reviews-L...,5.0 of 5 bubbles. 1 review
2242,2243,L’Epicureuil,/Restaurant_Review-g187265-d24973521-Reviews-L...,5.0 of 5 bubbles. 2 reviews
2243,2244,San Giovanni,/Restaurant_Review-g187265-d21147601-Reviews-S...,3.0 of 5 bubbles. 9 reviews
2244,2245,Le Restaurant Mondial,/Restaurant_Review-g187265-d25161284-Reviews-L...,5.0 of 5 bubbles. 2 reviews


In [13]:
# Scraped names may use a typographic apostrophe (’) instead of ', and may be
# missing altogether, so an exact comparison with "L'Auberge Des Canuts" can
# miss the row. Match on the apostrophe-free part of the name and treat
# missing names as non-matching.
df[df['restaurant_name'].str.contains('Auberge Des Canuts', case=False, na=False, regex=False)]

Unnamed: 0,restaurant_class,restaurant_name,restaurant_url,restaurant_reviews


In [60]:
# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists before writing (a fresh checkout may not have ./data).
output_dir = Path('./data')
output_dir.mkdir(parents=True, exist_ok=True)
df.to_csv(output_dir / 'restaurants.csv', index=False)

In [26]:
# Take the last restaurant card of the currently loaded listing page and display its HTML.
restaurant_card = restaurants_scrapper.get_restaurants_cards()[-1]
restaurant_card

<div class="qeraN _T qMONr iOIte iJfMg ndRxi CpYrl rcibp FKwyn" style="min-height:212px"><div class="yYtes _T Fl y" style="min-height:212px"><div class="MMdJi w"><span><a class="aWhIG _S _Z" href="/Restaurant_Review-g187265-d23594325-Reviews-Nature_Gourmande-Lyon_Rhone_Auvergne_Rhone_Alpes.html" target="_blank"><div class="afQPz eXZKw o pABFk w _Z Gm A"><div class="WTWEM w _Z"><div class="_T w _Z" data-clicksource="Photo"><span aria-label="Nature Gourmande" class="f u j _Z w Gi" role="img"><span class="BoFbV c"><svg class="d Vb UmNoP" height="1em" viewbox="0 0 24 24" width="1em"><path clip-rule="evenodd" d="M13.578 4.891 16.25 2.22l1.06 1.06-2.671 2.672c-.679.679-1.055 1.462-1.12 2.199-.043.5.054 1.003.327 1.472L18.75 4.72l1.06 1.06-4.906 4.906c.473.281.974.387 1.466.354.718-.047 1.467-.394 2.096-1.023A904 904 0 0 0 21.02 7.45l.226-.228h.001l1.066 1.055-.227.23a957 957 0 0 1-2.56 2.57c-.849.849-1.927 1.384-3.057 1.459a4.03 4.03 0 0 1-2.647-.768L12.591 13l7.72 7.72-1.061 1.06-5.97-5.97-

In [54]:
# CSS classes of the element whose aria-label carries the rating/reviews summary.
# A dedicated name is used on purpose: `restaurant_name` already holds the CSV
# file stem of the review export and must not be clobbered by this exploratory cell.
reviews_class = 'jVDab W f u w JqMhy'
res = restaurant_card.find('div', {'class': reviews_class})
res

In [None]:
# The dangling attribute access (`restaurant_card.`) was a SyntaxError that
# stopped "Run All"; display the card itself instead.
restaurant_card

<div class="qeraN _T qMONr iOIte iJfMg ndRxi CpYrl rcibp FKwyn" style="min-height:212px"><div class="yYtes _T Fl y" style="min-height:212px"><div class="MMdJi w"><span><a class="aWhIG _S _Z" href="/Restaurant_Review-g187265-d23594325-Reviews-Nature_Gourmande-Lyon_Rhone_Auvergne_Rhone_Alpes.html" target="_blank"><div class="afQPz eXZKw o pABFk w _Z Gm A"><div class="WTWEM w _Z"><div class="_T w _Z" data-clicksource="Photo"><span aria-label="Nature Gourmande" class="f u j _Z w Gi" role="img"><span class="BoFbV c"><svg class="d Vb UmNoP" height="1em" viewbox="0 0 24 24" width="1em"><path clip-rule="evenodd" d="M13.578 4.891 16.25 2.22l1.06 1.06-2.671 2.672c-.679.679-1.055 1.462-1.12 2.199-.043.5.054 1.003.327 1.472L18.75 4.72l1.06 1.06-4.906 4.906c.473.281.974.387 1.466.354.718-.047 1.467-.394 2.096-1.023A904 904 0 0 0 21.02 7.45l.226-.228h.001l1.066 1.055-.227.23a957 957 0 0 1-2.56 2.57c-.849.849-1.927 1.384-3.057 1.459a4.03 4.03 0 0 1-2.647-.768L12.591 13l7.72 7.72-1.061 1.06-5.97-5.97-