# Tripadvisor Hotel

## Prerequisites

The Firefox browser must be installed before running this notebook.

In [1]:
# %pip install helium
# %pip install bs4
# %pip install pandas
# %pip install requests

## Web scraping

### Import

In [2]:
from helium import *
from bs4 import BeautifulSoup
import pandas as pd
from time import sleep
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.firefox.options import Options as FirefoxOptions
import csv

### Constants

In [3]:
# Header of the output CSV file, in column order.
COLUMNS_NAME = [
    'HOTEL_NAME',
    'HOTEL_RATING',
    'HOTEL_PRICE',
    'HOTEL_LOCATION',
    'HOTEL_REVIEW_URL',
    'REVIEW_RATING',
    'REVIEW_DATE',
    'REVIEW_HELPFUL_VOTES',
    'REVIEW_TEXT',
]
# Prefix prepended to the relative links found in the scraped pages.
ROOT_URL = 'https://www.tripadvisor.com/'

# Input CSV read by get_hotels_by_list().
INPUT_FOLDER_PATH = '../data/'
BYLIST_HOTELS_INPUT_FILE_NAME = 'tripadvisor_hotels_all.csv'

# Output CSV; uncomment the file name matching the scraping mode in use.
OUTPUT_FOLDER_PATH = '../data/'
# OUTPUT_FILE_NAME = 'tripadvisor_best_hotels_data.csv' # Best hotels data output file
# OUTPUT_FILE_NAME = 'tripadvisor_worst_hotels_data.csv' # Worst hotels data output file
OUTPUT_FILE_NAME = 'tripadvisor_hotels_all_reviews.csv' # Hotels-by-list data output file

# Maximum number of hotel listing pages to scrape (used by get_hotels_data_by_location()).
MAX_HOTEL_PAGES = 1
# Maximum number of review pages to scrape per hotel (used by get_hotels_data_by_location(),
# get_hotels_data_by_search() and get_hotels_by_list()).
MAX_REVIEW_PAGES = 20
# Name of the input-CSV column that holds each hotel's URL (used by get_hotels_by_list()).
URL_COLUMN_NAME = 'Tripadvisor'

### CSV Output files

In [4]:
# Open the output CSV once and write its header. The file object is kept in
# `output_file` (instead of being created anonymously inside the DictWriter call)
# so that it can be flushed and closed explicitly once scraping has finished:
# call `output_file.close()` after the last row has been written.
# NOTE(review): lineterminator='\r' ends every row with a bare carriage return;
# the csv module's default is '\r\n' - confirm this is what downstream readers expect.
output_file = None
try:
    output_file = open(OUTPUT_FOLDER_PATH+OUTPUT_FILE_NAME, 'w', encoding='UTF8', newline='')
    writer = csv.DictWriter(output_file, fieldnames=COLUMNS_NAME, delimiter=',', lineterminator='\r')
    writer.writeheader()
    # Push the header to disk right away so the file is valid even if the run is interrupted.
    output_file.flush()
except IOError:
    print("I/O error")
    # Do not leave a half-initialised handle open when the setup failed.
    if output_file is not None:
        output_file.close()

### Methods

In [5]:
# aux method
def start_browser(URL=None):
    """Start a headless Firefox session through helium and return its driver.

    Sets helium's implicit wait to 30 seconds and sets the Firefox preference
    "permissions.default.geo" to 1 before launching.

    Args:
        URL: optional address to open right after start-up; when None the
            browser is started without navigating anywhere.

    Returns:
        Whatever helium's start_firefox() returns for the new session.
    """
    Config.implicit_wait_secs = 30

    firefox_options = webdriver.FirefoxOptions()
    firefox_options.preferences["permissions.default.geo"] = 1

    # Only forward `url` when the caller actually supplied one.
    launch_kwargs = {'options': firefox_options, 'headless': True}
    if URL is not None:
        launch_kwargs['url'] = URL
    return start_firefox(**launch_kwargs)

In [6]:
# aux method
def get_reviews_info(reviews, row, page):
    """Extract the fields of each review on the current page and append one CSV row per review.

    Args:
        reviews: BeautifulSoup elements, one per review container, parsed by the caller.
        row: dict that already holds the hotel-level columns. It is copied for every
            review and is no longer modified, so a value from one review (notably
            REVIEW_HELPFUL_VOTES) can never leak into the next one.
        page: zero-based index of the review page; only used in the progress message.

    Raises:
        Whatever wait_until() raises when no "Read more" link shows up within 60 seconds
        (unchanged from the previous behaviour).
    """
    scroll_down(4500)
    wait_until(S("//span[text()='Read more']").exists, 60)

    # NOTE(review): every caller parses `reviews` from page_source *before* this click,
    # so the expansion cannot change the text extracted below - confirm whether the
    # page should be re-parsed after expanding.
    try:
        # Expand the review
        click('Read more')
    except Exception:
        print('Error review info')
        return

    for index, review in enumerate(reviews):
        # Work on a fresh copy so optional fields start out empty for every review.
        review_row = dict(row)
        try:
            # REVIEW_RATING
            rating_spans = review.find('div', class_='Hlmiy F1').find_all('span')
            review_row[COLUMNS_NAME[5]] = rating_spans[0].get('class')[1]
            # REVIEW_DATE
            review_row[COLUMNS_NAME[6]] = review.find('span', class_='teHYY _R Me S4 H3').text
            # REVIEW_HELPFUL_VOTES (optional: explicitly empty when the review has none)
            votes_tag = review.find('span', class_='hVSKz S2 H2 Ch sJlxi')
            review_row[COLUMNS_NAME[7]] = votes_tag.text if votes_tag is not None else ''
            # REVIEW_TEXT
            review_row[COLUMNS_NAME[8]] = review.find('div', class_='fIrGe _T').text
            try:
                print('Hotel: ' + review_row[COLUMNS_NAME[0]] + ' - ' + 'Review: ' + str(index+1) + ' - Page: ' + str(page+1))
                writer.writerow(review_row)
            except IOError:
                print("I/O error")
        except Exception:
            # A single malformed review is skipped; the remaining ones are still processed.
            print('Error review info')


In [7]:
# aux method
def get_hotels_info(hotels, max_review_pages):
    """Scrape hotel-level data and reviews for every hotel card of a listing page.

    For each card the hotel name, rating, price and review URL are read from the
    card itself. A dedicated headless Firefox is then opened on the review URL to
    read the location and up to `max_review_pages` pages of reviews, which are
    written through get_reviews_info().

    Args:
        hotels: BeautifulSoup elements, one per hotel card.
        max_review_pages: maximum number of review pages to visit per hotel.
    """
    for hotel in hotels:
        row = {}

        # HOTEL_NAME
        row[COLUMNS_NAME[0]] = hotel.find('a', class_='property_title prominent').text
        # HOTEL_RATING
        row[COLUMNS_NAME[1]] = hotel.find('div', class_='prw_rup prw_common_rating_and_review_count_with_popup linespace is-shown-at-mobile').find_all('a')[0].get('alt')
        # HOTEL_PRICE
        row[COLUMNS_NAME[2]] = hotel.find('div', class_='price __resizeWatch').text
        # HOTEL_REVIEW_LINK
        row[COLUMNS_NAME[4]] = ROOT_URL + hotel.find('a', class_='review_count').get('href')

        # Goes to the hotel's reviews
        browser = start_firefox(url=row[COLUMNS_NAME[4]], headless=True)
        try:
            wait_until(S("//div[@class='jvqAy']").exists, 30)
            sleep(10)
            reviews_soup = BeautifulSoup(browser.page_source, 'html.parser')

            # HOTEL_LOCATION
            row[COLUMNS_NAME[3]] = reviews_soup.find('span', class_='fHvkI PTrfg').text

            for page_index in range(max_review_pages):
                # Re-parse on every iteration: the browser has moved to a new page.
                reviews_soup = BeautifulSoup(browser.page_source, 'html.parser')
                get_reviews_info(reviews_soup.find_all('div', class_='YibKl MC R2 Gi z Z BB pBbQr'), row, page_index)
                # Goes to the next reviews page
                try:
                    go_to(ROOT_URL + reviews_soup.find('a', class_='ui_button nav next primary').get('href'))
                except Exception:
                    # No "next" link (find() returned None) or navigation failed.
                    print('Max review page')
                    break
        finally:
            # Always close the per-hotel browser, even when scraping this hotel failed,
            # so that failed hotels do not leave Firefox processes behind.
            kill_browser()
        

In [8]:
#
def get_hotels_data_by_location(URL, max_hotel_pages):
    """Scrape the hotels listed on a location page, following the listing pagination.

    Args:
        URL: address of the location's hotel listing page.
        max_hotel_pages: maximum number of listing pages to scrape.
    """
    # Start the browser
    browser = start_browser(URL)
    try:
        # Press "See all" button to show the navigation menu
        wait_until(S("//span[text()='Check In']").exists, 60)
        wait_until(lambda: len(browser.page_source) > 0, 60)
        sleep(3)
        press(END)
        scroll_up(600)
        wait_until(Button('See all').exists, 60)
        sleep(2)
        click('See all')
        sleep(15)

        for page_index in range(max_hotel_pages):
            soup = BeautifulSoup(browser.page_source, 'html.parser')
            get_hotels_info(soup.find_all('div', class_='ui_column is-8 main_col allowEllipsis'), MAX_REVIEW_PAGES)
            # get_hotels_info() opens and kills its own browsers, so hand control
            # back to the listing browser before touching the page again.
            set_driver(browser)
            if page_index == max_hotel_pages - 1:
                # Last requested page: do not click "Next" (it may not even exist).
                break
            press(END)
            click('Next')
            wait_until(S("//span[text()='Check In']").exists, 60)
            sleep(3)
    finally:
        # Make sure the listing browser is the one being closed, even after an error.
        set_driver(browser)
        kill_browser()

In [9]:
#
def get_hotels_data_by_search(URL):
    """Scrape the hotels returned by a Tripadvisor search results page.

    For every result the rating and review URL are read from the result card; the
    hotel page is then opened to read name, location, price and up to
    MAX_REVIEW_PAGES pages of reviews, written through get_reviews_info().

    Args:
        URL: address of the search results page.
    """
    from selenium.common.exceptions import TimeoutException

    # The price is rendered in one of these containers depending on the page layout.
    price_xpaths = (
        "//div[@class='WXMFC b autoResize']",
        "//div[@class='WXMFC b']",
        "//div[@class='JPNOn b Wi']",
    )
    price_classes = ('WXMFC b', 'JPNOn b Wi', 'WXMFC b autoResize')

    # Start the browser
    browser = start_browser(URL)
    try:
        wait_until(S("//span[@class='title-match']").exists, 60)
        # Get html source code of the website
        soup = BeautifulSoup(browser.page_source, 'html.parser')
        hotels = soup.find_all('div', class_='ui_columns is-mobile result-content-columns')

        for hotel in hotels:
            row = {}

            # HOTEL_RATING
            row[COLUMNS_NAME[1]] = hotel.find('div', class_='prw_rup prw_common_responsive_rating_and_review_count').find('span').get('alt')
            # HOTEL_REVIEW_LINK
            row[COLUMNS_NAME[4]] = ROOT_URL + hotel.find('a', class_='review_count').get('href')

            # Goes to the hotel's reviews
            go_to(url=row[COLUMNS_NAME[4]])
            # `exists` is a method: it must be *called*. Returning the bound method
            # itself is always truthy, which made the previous wait return immediately.
            try:
                wait_until(lambda: any(S(xpath).exists() for xpath in price_xpaths), 60)
            except TimeoutException:
                # No price container appeared; carry on and leave HOTEL_PRICE empty,
                # as happened before when the page showed no price.
                pass
            wait_until(lambda: len(browser.page_source) > 0, 60)
            reviews_soup = BeautifulSoup(browser.page_source, 'html.parser')

            # HOTEL_NAME
            row[COLUMNS_NAME[0]] = reviews_soup.find('h1', class_='QdLfr b d Pn').text
            # HOTEL_LOCATION
            row[COLUMNS_NAME[3]] = reviews_soup.find('span', class_='fHvkI PTrfg').text
            # HOTEL_PRICE (first matching container wins; column stays unset if none match)
            for price_class in price_classes:
                price_tag = reviews_soup.find('div', class_=price_class)
                if price_tag is not None:
                    row[COLUMNS_NAME[2]] = price_tag.text
                    break

            for page_index in range(MAX_REVIEW_PAGES):
                reviews_soup = BeautifulSoup(browser.page_source, 'html.parser')
                get_reviews_info(reviews_soup.find_all('div', class_='YibKl MC R2 Gi z Z BB pBbQr'), row, page_index)
                # Goes to the next reviews page
                try:
                    go_to(ROOT_URL + reviews_soup.find('a', class_='ui_button nav next primary').get('href'))
                except Exception:
                    # No "next" link (find() returned None) or navigation failed.
                    print('Max review page')
                    break
    finally:
        # Close the browser even when scraping aborts with an error.
        kill_browser()

In [10]:
#
def get_hotels_by_list(csv_path, column_name, max_review_pages):
    """Scrape hotel data and reviews for every hotel URL listed in a CSV file.

    Args:
        csv_path: path of the input CSV file.
        column_name: name of the CSV column that holds the hotel URLs.
        max_review_pages: maximum number of review pages to visit per hotel.

    A failure while scraping one hotel is reported with 'Hotel info error' and the
    loop moves on to the next URL.
    """
    from selenium.common.exceptions import TimeoutException

    # The price is rendered in one of these containers depending on the page layout.
    price_xpaths = (
        "//div[@class='WXMFC b autoResize']",
        "//div[@class='WXMFC b']",
        "//div[@class='JPNOn b Wi']",
    )
    price_classes = ('WXMFC b', 'JPNOn b Wi', 'WXMFC b autoResize')

    hotels_df = pd.read_csv(csv_path)
    # Rows without a URL are read as NaN by pandas and cannot be visited.
    hotels = hotels_df[column_name].dropna().to_list()
    browser = start_browser()

    try:
        for hotel_url in hotels:
            try:
                row = {}
                go_to(hotel_url)
                # `exists` is a method: it must be *called*. Returning the bound method
                # itself is always truthy, which made the previous wait return immediately.
                try:
                    wait_until(lambda: any(S(xpath).exists() for xpath in price_xpaths), 60)
                except TimeoutException:
                    # No price container appeared; carry on and leave HOTEL_PRICE empty,
                    # as happened before when the page showed no price.
                    pass
                wait_until(lambda: len(browser.page_source) > 0, 60)
                reviews_soup = BeautifulSoup(browser.page_source, 'html.parser')

                # Check if the hotel is unclaimed
                # NOTE(review): the hotel is scraped only when this element is absent;
                # presumably the element marks an unclaimed listing - confirm.
                if reviews_soup.find('div', class_='XAnbq _S ZUJme') is not None:
                    continue

                # HOTEL_NAME
                row[COLUMNS_NAME[0]] = reviews_soup.find('h1', class_='QdLfr b d Pn').text
                # HOTEL_RATING
                row[COLUMNS_NAME[1]] = reviews_soup.find('a', class_='BNPpl q wfOFe _T Gi').find('span').get('class')[1]
                # HOTEL_PRICE (first matching container wins; column stays unset if none match)
                for price_class in price_classes:
                    price_tag = reviews_soup.find('div', class_=price_class)
                    if price_tag is not None:
                        row[COLUMNS_NAME[2]] = price_tag.text
                        break
                # HOTEL_LOCATION
                row[COLUMNS_NAME[3]] = reviews_soup.find('span', class_='fHvkI PTrfg').text
                # HOTEL_REVIEW_LINK
                row[COLUMNS_NAME[4]] = hotel_url

                for page_index in range(max_review_pages):
                    reviews_soup = BeautifulSoup(browser.page_source, 'html.parser')
                    get_reviews_info(reviews_soup.find_all('div', class_='YibKl MC R2 Gi z Z BB pBbQr'), row, page_index)
                    # Goes to the next reviews page
                    try:
                        go_to(ROOT_URL + reviews_soup.find('a', class_='ui_button nav next primary').get('href'))
                        wait_until(lambda: len(browser.page_source) > 0, 60)
                    except Exception:
                        # No "next" link (find() returned None) or navigation failed.
                        print('Max review page')
                        break
            except Exception:
                # Best effort: report the failure and continue with the next hotel.
                # KeyboardInterrupt is deliberately not caught so the run can be stopped.
                print('Hotel info error')
    finally:
        # Close the browser even when the run is interrupted.
        kill_browser()

### Getting the data

In [11]:
# Pick exactly one scraping mode; the other two entry points are left commented out.
#get_hotels_data_by_location('https://www.tripadvisor.com/Hotels-g4-Europe-Hotels.html', MAX_HOTEL_PAGES)
#get_hotels_data_by_search('https://www.tripadvisor.com/Search?q=worst%20hotel&searchSessionId=CFC88E23E91B7FA8D0D751E5DEE9F5D31663450804735ssid&sid=9D518B65B4454EF1A82B6656FCF7E39F1663452594116&blockRedirect=true&ssrc=h&isSingleSearch=true&geo=1&rf=3')
get_hotels_by_list(
    csv_path=INPUT_FOLDER_PATH+BYLIST_HOTELS_INPUT_FILE_NAME,
    column_name=URL_COLUMN_NAME,
    max_review_pages=MAX_REVIEW_PAGES,
)

Hotel: Hotel Restaurant VILLINO - Review: 1 - Page: 1
Hotel: Hotel Restaurant VILLINO - Review: 2 - Page: 1
Hotel: Hotel Restaurant VILLINO - Review: 3 - Page: 1
Hotel: Hotel Restaurant VILLINO - Review: 4 - Page: 1
Hotel: Hotel Restaurant VILLINO - Review: 5 - Page: 1
Hotel: Hotel Restaurant VILLINO - Review: 6 - Page: 1
Hotel: Hotel Restaurant VILLINO - Review: 7 - Page: 1
Hotel: Hotel Restaurant VILLINO - Review: 8 - Page: 1
Hotel: Hotel Restaurant VILLINO - Review: 9 - Page: 1
Hotel: Hotel Restaurant VILLINO - Review: 10 - Page: 1
Hotel: Hotel Restaurant VILLINO - Review: 1 - Page: 2
Hotel: Hotel Restaurant VILLINO - Review: 2 - Page: 2
Hotel: Hotel Restaurant VILLINO - Review: 3 - Page: 2
Hotel: Hotel Restaurant VILLINO - Review: 4 - Page: 2
Hotel: Hotel Restaurant VILLINO - Review: 5 - Page: 2
Hotel: Hotel Restaurant VILLINO - Review: 6 - Page: 2
Hotel: Hotel Restaurant VILLINO - Review: 7 - Page: 2
Hotel: Hotel Restaurant VILLINO - Review: 8 - Page: 2
Hotel: Hotel Restaurant VIL