# Tripadvisor Hotel

## Prerequisites

Firefox browser needs to be installed!

In [1]:
# %pip install selenium==4.9.0
# %pip install helium
# %pip install bs4
# %pip install pandas
# %pip install requests

## Web scraping

### Import

In [2]:
from helium import *
from bs4 import BeautifulSoup
import pandas as pd
from time import sleep
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.firefox.options import Options as FirefoxOptions
import csv

### Constants

In [3]:
# Header row of the output CSV files. The scraping functions address the
# columns by position (COLUMNS_NAME[0] ... COLUMNS_NAME[8]), so order matters.
COLUMNS_NAME = [
    'HOTEL_NAME',
    'HOTEL_RATING',
    'HOTEL_PRICE',
    'HOTEL_LOCATION',
    'HOTEL_REVIEW_URL',
    'REVIEW_RATING',
    'REVIEW_DATE',
    'REVIEW_HELPFUL_VOTES',
    'REVIEW_TEXT',
]
ROOT_URL = 'https://www.tripadvisor.com/'

INPUT_FOLDER_PATH = '../data/'
BYLIST_HOTELS_INPUT_FILE_NAME = 'tripadvisor_hotels_not_sustainable.csv'

OUTPUT_FOLDER_PATH = '../data/'
# Alternative output files; keep exactly one OUTPUT_FILE_NAME active.
# OUTPUT_FILE_NAME = 'tripadvisor_best_hotels_data.csv'  # Best hotels data output file
# OUTPUT_FILE_NAME = 'tripadvisor_worst_hotels_data.csv'  # Worst hotels data output file
OUTPUT_FILE_NAME = 'tripadvisor_hotels_not_sustainable_reviews.csv'  # Hotels-by-list data output file

# Maximum number of hotel pages the script scrapes
# (used in get_hotels_data_by_location()).
MAX_HOTEL_PAGES = 2
# Maximum number of review pages the script scrapes for each hotel
# (used in get_hotels_data_by_location(), get_hotels_data_by_search() and get_hotels_by_list()).
MAX_REVIEW_PAGES = 10
# Name of the column holding the hotel URLs in the CSV input file
# (used in get_hotels_by_list()).
URL_COLUMN_NAME = 'Tripadvisor'

# Web scraping constants: values of the HTML `class` attribute used to locate
# each element on the page.
HOTEL_HEADER_INFO = 'UJWmn f k'
HOTEL_NAME = 'biGQs _P rRtyp'
HOTEL_REVIEW = 'epdRj P'
HOTEL_PRICE = 'gbXAQ'
HOTEL_LOCATION = 'fHvkI PTrfg'
REVIEW = 'YibKl MC R2 Gi z Z BB pBbQr'
NEXT_BUTTON = 'ui_button nav next primary'
READ_MORE_BUTTON = 'Ignyf _S Z'
REVIEW_RATING = 'Hlmiy F1'
REVIEW_DATE = 'teHYY _R Me S4 H3'
REVIEW_HELPFULNESS = 'hVSKz S2 H2 Ch sJlxi'
REVIEW_TEXT = 'fIrGe _T'

### Constants documentation

- HOTEL_HEADER_INFO: UJWmn f k
\
\
![image](../img/HOTEL_HEADER_INFO.png)
---
- HOTEL_NAME: biGQs _P rRtyp
\
\
![image](../img/HOTEL_NAME.png)
---
- HOTEL_REVIEW: epdRj P
\
\
![image](../img/HOTEL_REVIEW.png)
---
- HOTEL_PRICE: gbXAQ
\
\
![image](../img/HOTEL_PRICE.png)
---
- HOTEL_LOCATION: fHvkI PTrfg
\
\
![image](../img/HOTEL_LOCATION.png)
---
- REVIEW: YibKl MC R2 Gi z Z BB pBbQr
\
\
![image](../img/REVIEW.png)
---
- NEXT_BUTTON: ui_button nav next primary
\
\
![image](../img/NEXT_BUTTON.png)
---
- REVIEW_RATING: Hlmiy F1
\
\
![image](../img/REVIEW_RATING.png)
---
- REVIEW_DATE: teHYY _R Me S4 H3
\
\
![image](../img/REVIEW_DATE.png)
---
- REVIEW_HELPFULNESS: hVSKz S2 H2 Ch sJlxi
\
\
![image](../img/REVIEW_HELPFULNESS.png)
---
- REVIEW_TEXT: fIrGe _T
\
\
![image](../img/REVIEW_TEXT.png)

### CSV Output files

In [28]:
# Open the output CSV and write its header row.
# `file` and `writer` are intentionally module-level: get_reviews_info() appends
# rows through `writer`, and the last cell of the notebook closes `file`.
try:
    file = open(OUTPUT_FOLDER_PATH+OUTPUT_FILE_NAME, 'w', encoding='UTF8', newline='')
    # Use the standard CRLF row terminator. The previous CR-only terminator
    # ('\r') is non-standard and is not recognised as a row separator by many
    # CSV readers and text tools.
    writer = csv.DictWriter(file, fieldnames=COLUMNS_NAME, delimiter=',', lineterminator='\r\n')
    writer.writeheader()
except IOError as e:
    print(f"I/O error: {e}")
    # Without a usable `writer` the scraping cells cannot store anything, so
    # stop here instead of failing later on every single row.
    raise

### Methods

In [5]:
# aux method
def start_browser(URL=None):
    """Start a headless Firefox session through helium.

    Args:
        URL: Optional address to open right after start-up. When None, the
            browser is started without passing any URL to helium.

    Returns:
        The value returned by helium's ``start_firefox()`` (the callers in
        this notebook read ``page_source`` from it).
    """
    # Setting browser settings
    Config.implicit_wait_secs = 30
    options = webdriver.FirefoxOptions()
    # presumably pre-answers the geolocation permission prompt — TODO confirm
    options.preferences["permissions.default.geo"] = 1
    profile = webdriver.FirefoxProfile()
    profile.set_preference('intl.accept_languages', 'en-US, en')

    # Build the call once; only add `url` when the caller supplied one.
    launch_kwargs = {'options': options, 'headless': True, 'profile': profile}
    if URL is not None:
        launch_kwargs['url'] = URL
    return start_firefox(**launch_kwargs)

In [23]:
# aux method
def get_reviews_info(reviews, row, page):
    """Write one CSV row per review found on the current review page.

    For every element of ``reviews`` the review columns of ``row`` are
    overwritten in place and the row is written through the module-level
    ``writer``.

    Args:
        reviews: Parsed review containers (the caller passes the result of
            ``find_all('div', class_=REVIEW)``).
        row: Dict keyed by the names in COLUMNS_NAME that already holds the
            hotel columns; the review columns are filled in here (mutated).
        page: Zero-based index of the review page, used only in the progress
            message.

    A failure while parsing or writing a single review is printed and the
    remaining reviews are still processed. A time-out while waiting for the
    hotel header propagates to the caller.
    """
    # wait_until() polls a callable, so pass the bound `exists` method itself.
    # (A lambda returning the un-called method is always truthy and never
    # waits.) The class value has to be quoted to form a valid XPath.
    wait_until(S("//div[@class='" + HOTEL_HEADER_INFO + "']").exists, 60)
    scroll_down(4500)

    # Expanding truncated reviews is best-effort: a page whose reviews are all
    # short has no "Read more" button, and that must not discard its reviews.
    try:
        wait_until(S("//span[@class='" + READ_MORE_BUTTON + "']").exists, 60)
        # Expand the review
        click('Read more')
    except Exception as e:
        print(f'Read more not available: {e}')
    # NOTE(review): `reviews` was parsed by the caller before this click, so
    # the expanded text may not be present in it — confirm whether the page
    # should be re-parsed after expanding.

    # Get review's info
    for index, review in enumerate(reviews):
        try:
            # REVIEW_RATING
            rating_spans = review.find('div', class_=REVIEW_RATING).find_all('span')
            row[COLUMNS_NAME[5]] = rating_spans[0].get('class')[1]
            # REVIEW_DATE
            row[COLUMNS_NAME[6]] = review.find('span', class_=REVIEW_DATE).text
            # REVIEW_HELPFUL_VOTES
            # Always assign: `row` is reused across reviews, so leaving the key
            # untouched would repeat the previous review's vote count.
            helpful_votes = review.find('span', class_=REVIEW_HELPFULNESS)
            row[COLUMNS_NAME[7]] = helpful_votes.text if helpful_votes is not None else ''
            # REVIEW_TEXT
            row[COLUMNS_NAME[8]] = review.find('div', class_=REVIEW_TEXT).text

            print('Hotel: ' + row[COLUMNS_NAME[0]] + ' - ' + 'Review: ' + str(index+1) + ' - Page: ' + str(page+1))
            writer.writerow(row)
        except IOError as e:
            print(f"I/O error: {e}")
        except Exception as e:
            print(f'Error review info: {e}')


In [27]:
import traceback

def get_hotels_by_list(csv_path, column_name, max_review_pages):
    """Scrape hotel details and reviews for every hotel URL listed in a CSV file.

    Args:
        csv_path: Path of the input CSV file.
        column_name: Name of the column that holds the hotel page URLs.
        max_review_pages: Maximum number of review pages visited per hotel.

    Returns:
        None. Rows are emitted by get_reviews_info(). An error on one hotel is
        printed together with its traceback and the loop continues with the
        next hotel. The browser started here is closed before returning.
    """
    from urllib.parse import urljoin  # local import keeps this cell self-contained

    hotels_df = pd.read_csv(csv_path)
    hotels = hotels_df[column_name].to_list()
    browser = start_browser()

    try:
        for hotel_url in hotels:
            try:
                row = {}
                go_to(hotel_url)
                # Pass the bound `exists` method so wait_until() really polls,
                # and quote the class value to form a valid XPath.
                wait_until(S("//div[@class='" + HOTEL_HEADER_INFO + "']").exists, 60)
                wait_until(lambda: len(browser.page_source) > 0, 60)
                reviews_soup = BeautifulSoup(browser.page_source, 'html.parser')
                # HOTEL INFORMATION
                # HOTEL_NAME
                row[COLUMNS_NAME[0]] = reviews_soup.find('h1', class_=HOTEL_NAME).text
                # HOTEL_RATING
                row[COLUMNS_NAME[1]] = reviews_soup.find('a', class_=HOTEL_REVIEW).find('span').get('class')[1]
                # HOTEL_PRICE
                price_element = reviews_soup.find('div', class_=HOTEL_PRICE)
                row[COLUMNS_NAME[2]] = price_element.text if price_element is not None else 'N/A'
                # HOTEL_LOCATION
                row[COLUMNS_NAME[3]] = reviews_soup.find('span', class_=HOTEL_LOCATION).text
                # HOTEL_REVIEW_LINK
                row[COLUMNS_NAME[4]] = hotel_url

                for page in range(max_review_pages):
                    reviews_soup = BeautifulSoup(browser.page_source, 'html.parser')
                    get_reviews_info(reviews_soup.find_all('div', class_=REVIEW), row, page)

                    # No need to load a further page once the limit is reached.
                    if page + 1 >= max_review_pages:
                        break

                    # Goes to the next reviews page
                    next_link = reviews_soup.find('a', class_=NEXT_BUTTON)
                    next_href = next_link.get('href') if next_link is not None else None
                    if not next_href:
                        print(f'Max review page')
                        break
                    # urljoin avoids the double slash produced by concatenating
                    # ROOT_URL (ends with '/') and an href that starts with '/'.
                    go_to(urljoin(ROOT_URL, next_href))
                    wait_until(lambda: len(browser.page_source) > 0, 60)
            except Exception as e:
                print(traceback.format_exc())
                print(f'Hotel info error: {e}')
    finally:
        # Always release the headless Firefox process, even on interruption.
        kill_browser()

### Getting the data

In [29]:
# Scrape the hotels listed in the input CSV; rows are appended to the output
# CSV opened in the "CSV Output files" cell.
get_hotels_by_list(
    csv_path=INPUT_FOLDER_PATH+BYLIST_HOTELS_INPUT_FILE_NAME,
    column_name=URL_COLUMN_NAME,
    max_review_pages=MAX_REVIEW_PAGES,
)

Hotel: Strubel-Roos - Review: 1 - Page: 1
Hotel: Strubel-Roos - Review: 2 - Page: 1
Hotel: Strubel-Roos - Review: 3 - Page: 1
Max review page
Hotel: Belle Maison - Das kleine Hotel - Review: 1 - Page: 1
Hotel: Belle Maison - Das kleine Hotel - Review: 2 - Page: 1
Hotel: Belle Maison - Das kleine Hotel - Review: 3 - Page: 1
Hotel: Belle Maison - Das kleine Hotel - Review: 4 - Page: 1
Hotel: Belle Maison - Das kleine Hotel - Review: 5 - Page: 1
Hotel: Belle Maison - Das kleine Hotel - Review: 6 - Page: 1
Hotel: Belle Maison - Das kleine Hotel - Review: 7 - Page: 1
Hotel: Belle Maison - Das kleine Hotel - Review: 8 - Page: 1
Hotel: Belle Maison - Das kleine Hotel - Review: 9 - Page: 1
Hotel: Belle Maison - Das kleine Hotel - Review: 10 - Page: 1
Hotel: Belle Maison - Das kleine Hotel - Review: 1 - Page: 2
Hotel: Belle Maison - Das kleine Hotel - Review: 2 - Page: 2
Hotel: Belle Maison - Das kleine Hotel - Review: 3 - Page: 2
Hotel: Belle Maison - Das kleine Hotel - Review: 4 - Page: 2
Hot

In [30]:
# Close the output CSV opened in the "CSV Output files" cell so that the
# written rows are flushed to disk.
file.close()