In [12]:
from bs4 import BeautifulSoup
import re
import time
import requests
import random
import string

import pandas as pd


In [6]:
def clean_text(text: str) -> str:
    txt = text.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ')
    txt = txt.replace('  ', ' ')
    return txt.strip()

def extract_by_regex(text: str, regex: str) -> str:
    pattern = re.compile(regex)
    match = pattern.search(text)
    if match:
        # Check if there are any groups and return the first group if it exists
        if match.groups():
            return match.group(1) + " " + match.group(2) if len(match.groups()) > 1 else match.group(1)
        else:
            return match.group(0)  # Return the entire match if no groups are defined
    return ""

In [None]:
class TripAdvisorScraper:
    def __init__(self, url):
        self.url = url
        self.soup = None


    def fetch_page(self):
        
        # Set headers
        random_request_id = "".join(
                random.choice(string.ascii_lowercase + string.digits) for i in range(180)
            )
        
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
            'accept-language': 'en-US,en;q=0.9,fr;q=0.8',
            "X-Requested-By": random_request_id,
            "Referer": "https://www.tripadvisor.com/Hotels",
            "Origin": "https://www.tripadvisor.com",
            'accept-encoding': 'gzip, deflate, br',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'encoding': 'utf-8',
        }
        
        # Send a GET request
        response = requests.get(self.url, headers=headers)
        try:
            # Check encoding
            
            if response.headers.get('Content-Encoding') == 'gzip':
                content = response.content.decode('gzip')
            elif response.headers.get('Content-Encoding') == 'deflate':
                content = response.content.decode('zlib')
            # elif response.headers.get('Content-Encoding') == 'br':
            #     content = brotli.decompress(response.content)
            else:
                content = response.text 
        
        except Exception as e:
            print(f"Error: {e}")
            content = response.text
        finally:    
            self.soup = BeautifulSoup(content, 'html.parser')

    def print_soup(self):
        if self.soup:
            print(self.soup.prettify())
        else:
            print("Soup is not initialized. Please fetch the page first.")

    def get_review_cards(self):
        if self.soup:
            review_cards = self.soup.find_all('div', class_='_c', attrs={'data-automation': 'reviewCard'})
            return review_cards
        else:
            print("Soup is not initialized. Please fetch the page first.")
            return []
        
    def get_next_url(self, base_url):
        next_url = self.soup.find('a', class_='BrOJk u j z _F wSSLS tIqAi unMkR').get('href')
        if next_url is not None:
            next_url = base_url + next_url
            return next_url
        else:
            return None
    

In [9]:
def cards_to_df(review_cards):
    corpus = []
    for card in review_cards:
        # Extract the review text
        review_text = 'biGQs _P pZUbB KxBGd'
        contributions = 'biGQs _P pZUbB osNWb'
        date = 'biGQs _P pZUbB ncFvv osNWb'
        user_name = 'biGQs _P fiohW fOtGX'
        rating = 'UctUV d H0'

        scrap_review_text = card.find('div', {'class': review_text}).text if card.find('div', {'class': review_text}) else None
        scrap_contributions = card.find('div', {'class': contributions}).text if card.find('div', {'class': contributions}) else None
        scrap_date = card.find('div', {'class': date}).text if card.find('div', {'class': date}) else None
        scrap_user_name = card.find('span', {'class': user_name}).text if card.find('span', {'class': user_name}) else None
        scrap_rating = card.find('svg', {'class': rating}).find('title').text if card.find('svg', {'class': rating}) else None

        doc = {
            'review_text': clean_text(scrap_review_text) if scrap_review_text != None and scrap_review_text != '' else None,
            'rating': extract_by_regex(scrap_rating, r'(\d\.\d) of 5 bubbles') if scrap_rating != None and scrap_rating != '' else None,
            'user_name': scrap_user_name if scrap_user_name != None and scrap_user_name != '' else None,
            'date': extract_by_regex(scrap_date, r'(\w+ \d+), (\d+)') if scrap_date != None and scrap_date != '' else None,
            'contributions': extract_by_regex(scrap_contributions, r"\d+") if scrap_contributions is not None and scrap_contributions != '' else None
        }
        corpus.append(doc)

    df = pd.DataFrame(corpus)
    return df



In [23]:


url_base = 'https://www.tripadvisor.com'
complement = '/Restaurant_Review-g187265-d12419021-Reviews-L_Auberge_Des_Canuts-Lyon_Rhone_Auvergne_Rhone_Alpes.html'
url = url_base + complement


scraper = TripAdvisorScraper(url)
scraper.fetch_page()

next_url = scraper.get_next_url(url_base)

review_cards = scraper.get_review_cards()   

df = cards_to_df(review_cards)

    

In [18]:
df

Unnamed: 0,review_text,rating,user_name,date,contributions
0,Lovey simple meal at a bouchon. Good size port...,4.0,AngryofTollcross,October 18 2024,10
1,I had a lunch/dinner here ordering off one of ...,4.0,graceinbeijing,March 1 2020,71
2,"We made the reservation with ""the Fork"" and wa...",5.0,Laura M,October 27 2020,1
3,We could clearly see the hygiene and cleanline...,1.0,Tara Bee,June 11 2022,395
4,"Upon arrival, the real welcome in cork: large ...",5.0,Luna T,October 28 2024,61
5,I see that others have given this tiny place l...,4.0,Mardi S,October 14 2022,156
6,"There’s good attentive service, and there’s be...",1.0,Tim W,February 5 2023,3
7,After a (discreet) dinner with work colleagues...,1.0,Gianluca P,May 27 2024,1
8,I'm not one to write reviews when they're nega...,1.0,Óliver R,June 3 2024,4
9,We come to Lyon regularly and we have frequent...,5.0,Pascal Perret,March 11 2024,39


In [26]:
page = 1
while next_url is not None:
    time.sleep(random.uniform(1, 5) / 1000)
    scraper = TripAdvisorScraper(next_url)
    scraper.fetch_page()
    review_cards = scraper.get_review_cards()
    df = pd.concat([df, cards_to_df(review_cards)], ignore_index=True)
    next_url = scraper.get_next_url(url_base)
    page += 1
    print(f"Page {page} done")

Page 2 done
Page 3 done
Page 4 done
Page 5 done
Page 6 done
Page 7 done
Page 8 done
Page 9 done
Page 10 done
Page 11 done
Page 12 done
Page 13 done
Page 14 done
Page 15 done
Page 16 done
Page 17 done
Page 18 done
Page 19 done
Page 20 done
Page 21 done
Page 22 done
Page 23 done
Page 24 done
Page 25 done
Page 26 done
Page 27 done
Page 28 done
Page 29 done
Page 30 done
Page 31 done
Page 32 done
Page 33 done
Page 34 done
Page 35 done
Page 36 done
Page 37 done
Page 38 done
Page 39 done
Page 40 done
Page 41 done
Page 42 done
Page 43 done
Page 44 done
Page 45 done
Page 46 done
Page 47 done
Page 48 done
Page 49 done
Page 50 done
Page 51 done
Page 52 done
Page 53 done
Page 54 done
Page 55 done
Page 56 done
Page 57 done
Page 58 done
Page 59 done
Page 60 done
Page 61 done
Page 62 done
Page 63 done
Page 64 done
Page 65 done
Page 66 done
Page 67 done


AttributeError: 'NoneType' object has no attribute 'get'