In [1]:
import numpy as np
import pandas as pd
import os

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import NoSuchElementException, ElementNotInteractableException
from selenium.common.exceptions import TimeoutException

from datetime import datetime
import time
import re

## 1) Define functions

In [2]:
# Load a CSV file when it exists, otherwise fall back to a caller-supplied value
def load_csv_file(file_path, default_df=None):
    """Read ``file_path`` as CSV if it exists, else return ``default_df``.

    Args:
        file_path: Path of the CSV file to look for.
        default_df: Value handed back unchanged when the file is absent
            (``None`` unless the caller supplies one).

    Returns:
        The DataFrame produced by ``pd.read_csv(file_path)``, or ``default_df``.
    """
    if not os.path.exists(file_path):
        # Nothing on disk: give the caller its fallback
        return default_df
    return pd.read_csv(file_path)

In [3]:
# Block until an element is visible, reporting success as a boolean
def wait_for_element(driver, selector, timeout=60):
    """Wait for the element matching a CSS selector to become visible.

    Args:
        driver: WebDriver instance to poll.
        selector: CSS selector of the element to wait for.
        timeout: Maximum number of seconds to wait (default 60).

    Returns:
        True when the element became visible, False when the wait timed out.
    """
    locator = (By.CSS_SELECTOR, selector)
    is_visible = EC.visibility_of_element_located(locator)
    try:
        WebDriverWait(driver, timeout).until(is_visible)
    except TimeoutException:
        return False
    return True

In [4]:
# Click the "Review" navigation button through injected JavaScript
def click_review_button(driver, selector):
    """Click the element matching ``selector`` with a JavaScript ``click()`` call.

    Does nothing when ``wait_for_element`` reports that the element never
    became visible.

    Args:
        driver: WebDriver instance showing the page.
        selector: CSS selector of the button to click.
    """
    if not wait_for_element(driver, selector):
        return
    target = driver.find_element(By.CSS_SELECTOR, selector)
    # Scripted click rather than a simulated mouse click
    driver.execute_script("arguments[0].click();", target)


# Move the pointer onto an element and click it
def click_button(driver, selector):
    """Click the element matching ``selector`` via an ``ActionChains`` move-and-click.

    Does nothing when ``wait_for_element`` reports that the element never
    became visible.

    Args:
        driver: WebDriver instance showing the page.
        selector: CSS selector of the element to click.
    """
    if not wait_for_element(driver, selector):
        return
    target = driver.find_element(By.CSS_SELECTOR, selector)
    ActionChains(driver).move_to_element(target).click().perform()

        
# Pick the second entry of the review-language dropdown (treated as "English" by this notebook)
def select_english_reviews(driver):
    """Open the review-language dropdown and choose its second option.

    Each of the two steps is attempted independently and is skipped when its
    element does not become visible.

    Args:
        driver: WebDriver instance showing a hotel page.
    """
    # Step 1: open the language dropdown with a move-and-click
    dropdown_selector = "#reviewFilterSection > div > div:nth-child(3) > span"
    if wait_for_element(driver, dropdown_selector):
        dropdown = driver.find_element(By.CSS_SELECTOR, dropdown_selector)
        ActionChains(driver).move_to_element(dropdown).click().perform()

    # Step 2: click the second list item of the opened dropdown
    option_selector = "#reviews-language-filter_list li:nth-child(2)"
    if wait_for_element(driver, option_selector):
        driver.find_element(By.CSS_SELECTOR, option_selector).click()
    

# Switch the review sort order to "Most recent"
def sort_by_recent(driver):
    """Select the "Most recent" entry of the review sort dropdown.

    Does nothing when the dropdown does not become visible.

    Args:
        driver: WebDriver instance showing a hotel page.
    """
    sort_selector = "#review-sort-id"
    if not wait_for_element(driver, sort_selector):
        return
    sort_dropdown = Select(driver.find_element(By.CSS_SELECTOR, sort_selector))
    sort_dropdown.select_by_visible_text("Most recent")

In [5]:
# Read the text of the first element matching a CSS selector
def get_element_text(driver, selector):
    """Return the text of the first element matching ``selector``.

    Args:
        driver: WebDriver instance to query.
        selector: CSS selector to look up.

    Returns:
        The ``text`` of the first match, or None when nothing matches.
    """
    matches = driver.find_elements(By.CSS_SELECTOR, selector)
    if len(matches) == 0:
        return None
    return matches[0].text
    
    
# Function to extract rating category
def extract_rating_cat(rating_cat):
    """Return the label that precedes the first "(" in a rating-filter caption.

    Args:
        rating_cat: Caption text, or None when the caption element was not
            found (``get_element_text`` returns None in that case).

    Returns:
        The stripped text before the first "(". An empty string is returned
        when the input is None or empty, or when no "(" preceded by at least
        one character is present.
    """
    # A missing caption used to raise TypeError inside re.search; treat it
    # like a caption without a category instead.
    if not rating_cat:
        return ""

    match = re.search(r'(.+?)\(', rating_cat)
    return match.group(1).strip() if match else ""


# Function to extract reviewer and country values from a string
def extract_reviewer_country(text):
    """Split a "<reviewer> from <country>" string into its two parts.

    The split happens at the LAST " from ", so a reviewer name that itself
    contains " from " is kept intact instead of discarding both values.

    Args:
        text: Reviewer line, or None when the element was not found
            (``get_element_text`` returns None in that case).

    Returns:
        A ``(reviewer, country)`` tuple of stripped strings, or
        ``(None, None)`` when the input is None or empty, or contains no
        " from " separator.
    """
    # A missing reviewer line used to raise AttributeError on .split
    if not text:
        return None, None

    reviewer, separator, country = text.rpartition(' from ')
    if not separator:
        return None, None
    return reviewer.strip(), country.strip()

In [6]:
# Function to extract hotel details and update into df
def extract_hotel_details(driver, index, df):
    """Scrape the hotel summary from the current page and store it in ``df``.

    Writes "Hotel Address", "Total Reviews", "Overall Rating" and
    "Overall Rating Category" for row ``index``, plus one "Rating_<category>"
    column per category/score pair found in the rating breakdown.

    A field whose element is missing is stored as None. A missing rating
    breakdown or review-count link no longer discards the fields that were
    found.

    Args:
        driver: WebDriver instance showing the hotel page.
        index: Row label of the hotel in ``df``.
        df: DataFrame updated in place.

    Returns:
        None. Any exception is reported on stdout and swallowed so that the
        caller can go on with the review scraping.
    """
    try:
        hotel_address = get_element_text(driver, "span[data-selenium='hotel-address-map']")
        overall_rating = get_element_text(driver, "#reviewSection .Review-traveler span:nth-child(2)")
        overall_rating_cat = get_element_text(driver, "#reviewSection .ReviewScoreText")
        rating_cat = get_element_text(driver, "#reviewSection .Review-traveler-Cell > div")
        total_reviews_text = get_element_text(driver, "#reviewSection a")

        # Keep only the digits of the review-count text (thousands separators dropped)
        total_reviews = None
        if total_reviews_text is not None:
            total_reviews = re.sub("[^0-9]", "", total_reviews_text)

        # Update extracted data into dataframe
        df.loc[index, "Hotel Address"] = hotel_address
        df.loc[index, "Total Reviews"] = total_reviews
        df.loc[index, "Overall Rating"] = overall_rating
        df.loc[index, "Overall Rating Category"] = overall_rating_cat

        # The breakdown text alternates category and score lines; pair them up
        if rating_cat:
            breakdown_lines = rating_cat.split('\n')
            for category, score in zip(breakdown_lines[0::2], breakdown_lines[1::2]):
                df.loc[index, f"Rating_{category}"] = score

    except Exception as e:
        # Best effort: report the cause and let the caller continue
        print(f"Error extracting hotel details at index {index}: {e!r}")


# Function to extract and return review details
def extract_review_details(driver, index, hotel_id, hotel_name, rating_cat):
    """Scrape one review card (``#review-<index>``) from the current page.

    Fields whose element is missing are returned as None instead of aborting
    the whole review; previously a review without a title, date or reviewer
    line was dropped. A card for which nothing at all could be read is still
    reported as a failure.

    Args:
        driver: WebDriver instance showing the review list.
        index: Numeric suffix of the review card's element id.
        hotel_id: Hotel identifier copied into the result.
        hotel_name: Hotel name copied into the result.
        rating_cat: Rating category currently being scraped, copied into the result.

    Returns:
        A dict mapping each review column to a one-element list (ready for
        ``pd.DataFrame``), or None when extraction failed.
    """
    try:
        card = f"#review-{index}"

        def text_of(relative_selector):
            # Text of the first match below this review card, or None
            return get_element_text(driver, f"{card} > {relative_selector}")

        review_score = text_of("div > div > div > div.Review-comment-leftScore")
        review_score_cat = text_of("div > div > div > div.Review-comment-leftScoreText")

        reviewer_info = text_of("div.Review-comment-left > div > div[data-info-type='reviewer-name']")
        if reviewer_info:
            reviewer, country = extract_reviewer_country(reviewer_info)
        else:
            reviewer, country = None, None

        group_name = text_of("div > div > div[data-info-type='group-name']")
        room_type = text_of("div > div > div[data-info-type='room-type']")
        stay_details = text_of("div > div > div[data-info-type='stay-detail']")

        # Strip the closing curly quote from the title
        review_title = text_of("div > div > div > h3")
        if review_title is not None:
            review_title = review_title.replace('”', '')

        review = text_of("div > div > div > p")

        # Strip the "Reviewed " prefix from the date line
        review_date = text_of("div > div > div > div > div > span")
        if review_date is not None:
            review_date = review_date.replace('Reviewed ', '')

        scraped_fields = (
            review_score, review_score_cat, reviewer_info, group_name, room_type,
            stay_details, review_title, review, review_date,
        )
        if all(field is None for field in scraped_fields):
            # Nothing readable under this id: do not emit an all-empty row
            print(f"Error extracting review details at index {index}")
            return None

        return {
            "hotel_id": [hotel_id],
            "hotel_name": [hotel_name],
            "rating_cat": [rating_cat],
            "reviewer": [reviewer],
            "country": [country],
            "group_name": [group_name],
            "room_type": [room_type],
            "stay_details": [stay_details],
            "review_score": [review_score],
            "review_score_cat": [review_score_cat],
            "review_date": [review_date],
            "review_title": [review_title],
            "review": [review]
        }
    except Exception as e:
        print(f"Error extracting review details at index {index}: {e!r}")
        return None  # Return "None" means review details extraction failed

## 2) Load datasets

In [7]:
# Resume from "agoda_hotels_details.csv" when an earlier run saved it;
# otherwise start from the base hotel list "agoda_hotels.csv".
# The base list is read only when it is needed, so a missing "agoda_hotels.csv"
# cannot break a resumed run.
hotels_file_path = "agoda_hotels_details.csv"
df_hotel = load_csv_file(hotels_file_path)
if df_hotel is None:
    df_hotel = pd.read_csv("agoda_hotels.csv")

# Resume from "agoda_reviews_details.csv" when present, otherwise start with an
# empty frame that already has the review columns
reviews_file_path = "agoda_reviews_details.csv"
df_review = load_csv_file(reviews_file_path)
if df_review is None:
    df_review = pd.DataFrame(columns=["hotel_id", "hotel_name", "rating_cat", "reviewer",
                                      "country", "group_name", "room_type", "stay_details",
                                      "review_score", "review_score_cat", "review_date",
                                      "review_title", "review"])

print(f"Records in df_hotel: {len(df_hotel)}")
print(f"Records in df_review: {len(df_review)}")

Records in df_hotel: 361
Records in df_review: 51329


## 3) Extract hotel details and review details using Selenium WebDriver

In [8]:
# Start a Chrome browser session; `driver` is the WebDriver handle used by the
# scraping cell and closed by the final cell
driver = webdriver.Chrome()

In [9]:
print(f"Web scraping starts: {datetime.now()}")

# Seconds to pause after a navigation or click before reading the page
PAUSE_SECONDS = 5
# Last card of the review list; its "data-element-index" is the highest review index shown
LAST_REVIEW_SELECTOR = "#reviewSectionComments > div:last-child"
# n-th guest-rating filter checkbox, e.g. "9+ Exceptional", "8-9 Excellent", "7-8 Very Good", etc.
RATING_FILTER_SELECTOR = "#reviewSection .ReviewSideFilter__GuestRatingColumn > div > div > div:nth-child({n})"
# Last page number shown by the paginator
PAGE_COUNT_SELECTOR = "#reviewSection > div:nth-child(5) > div > span.Review-paginator-numbers > span:last-child"
# "Next page" arrow of the paginator
NEXT_PAGE_SELECTOR = "#reviewSection > div:nth-child(5) > div > span:nth-child(3) > i"


def _last_review_index(driver):
    """Return the highest review index on the current page, or None if it cannot be read."""
    if not wait_for_element(driver, LAST_REVIEW_SELECTOR):
        return None
    raw_index = driver.find_element(By.CSS_SELECTOR, LAST_REVIEW_SELECTOR).get_attribute("data-element-index")
    try:
        return int(raw_index)
    except (TypeError, ValueError):
        return None


def _page_count(page_num_text):
    """Convert the paginator's last label into a page count; 1 when absent or not a number."""
    if page_num_text is None:  # If only 1 page, extracted page_num = None
        return 1
    try:
        return int(page_num_text)
    except ValueError:
        return 1


def _scrape_reviews(driver, last_index, hotel_id, hotel_name, rating_cat):
    """Return one single-row DataFrame per review with index 0..last_index that could be extracted."""
    if last_index is None:
        # Review list not readable: skip this page instead of reusing a stale index
        return []
    frames = []
    for idx in range(last_index + 1):
        review_data = extract_review_details(driver, idx, hotel_id, hotel_name, rating_cat)
        if review_data is not None:
            frames.append(pd.DataFrame(review_data))
    return frames


# Loop over the hotel urls in the dataframe to extract hotel and review details.
# Only rows 321-360 are processed in this run; iterate over `df_hotel` itself to process every hotel.
for index, row in df_hotel.iloc[321:361].iterrows():
    # Reviews are collected here and appended to df_review once per hotel,
    # instead of copying the whole review frame for every single review.
    new_review_frames = []
    try:
        hotel_id = row["Hotel ID"]
        hotel_name = row["Hotel Name"]
        hotel_url = row["Hotel URL"]
        print(hotel_url)

        # Open Agoda hotel detailed page
        driver.get(hotel_url)
        time.sleep(PAUSE_SECONDS)

        # == Extract hotel details ==
        # Only once the review list is visible
        if wait_for_element(driver, LAST_REVIEW_SELECTOR):
            extract_hotel_details(driver, index, df_hotel)

        # == Extract review details ==
        # Go to reviews section in the webpage, and configure setting
        click_review_button(driver, "#hotelNavBar > div > ul > li:nth-child(4) > button > span") # Click "Review" button
        sort_by_recent(driver) # Sort reviews by most recent
        time.sleep(PAUSE_SECONDS)

        # Loop over the rating categories of reviews
        for n in range(1, 6):
            rating_filter = RATING_FILTER_SELECTOR.format(n=n)

            # Tick checkbox for this rating category
            time.sleep(PAUSE_SECONDS)
            click_button(driver, rating_filter)
            time.sleep(PAUSE_SECONDS)

            # Highest review index of the first page
            last_index = _last_review_index(driver)

            # Rating category label of the ticked filter
            rating_cat_text = get_element_text(driver, rating_filter)
            rating_cat = extract_rating_cat(rating_cat_text) if rating_cat_text else ""

            # Page count as shown right after the filter is applied; later
            # changes of the paginator are not followed.
            max_page_num = _page_count(get_element_text(driver, PAGE_COUNT_SELECTOR))

            # Extract review details in first page of current rating category
            new_review_frames.extend(_scrape_reviews(driver, last_index, hotel_id, hotel_name, rating_cat))

            # Extract review details in next pages
            for page in range(2, max_page_num + 1):
                # Click next page button
                click_button(driver, NEXT_PAGE_SELECTOR)
                time.sleep(PAUSE_SECONDS)

                last_index = _last_review_index(driver)
                new_review_frames.extend(_scrape_reviews(driver, last_index, hotel_id, hotel_name, rating_cat))

            # Untick checkbox for this rating category
            click_button(driver, rating_filter)
            time.sleep(PAUSE_SECONDS)

    # Report the failure and move on to the next hotel
    except Exception as e:
        print(f"Error encountered in hotel_index: {index} ({e!r})")

    # Keep whatever was scraped for this hotel, even after an error or interrupt
    finally:
        if new_review_frames:
            df_review = pd.concat([df_review, *new_review_frames], ignore_index=True)

print(f"Web scraping ends: {datetime.now()}")

Web scraping starts: 2024-02-02 21:27:33.554748
https://www.agoda.com/lion-peak-hotel-hamilton/hotel/singapore-sg.html
Error extracting review details at index 2
Error extracting review details at index 2
Error extracting review details at index 2
Error extracting review details at index 2
Error extracting review details at index 2
https://www.agoda.com/k-hotel-aliwal-premier/hotel/singapore-sg.html
Error extracting review details at index 1
https://www.agoda.com/k-hotel-14/hotel/singapore-sg.html
https://www.agoda.com/bluewaters-pods-hong-kong-street/hotel/singapore-sg.html
Error extracting review details at index 0
Error extracting review details at index 3
https://www.agoda.com/beat-capsule-hostel-boat-quay/hotel/singapore-sg.html
https://www.agoda.com/fragrance-hotel-oasis/hotel/singapore-sg.html
https://www.agoda.com/ibis-budget-singapore-clarke-quay-sg-clean-certified-staycation-approved/hotel/singapore-sg.html
Error extracting review details at index 2
https://www.agoda.com/asco

In [10]:
# Save hotels and reviews details into csv, at the same paths the load cell reads from.
# try/finally makes sure Chrome is closed even when writing a file fails.
try:
    df_hotel.to_csv(hotels_file_path, index=False, encoding="utf-8")
    df_review.to_csv(reviews_file_path, index=False, encoding="utf-8")
finally:
    # Close Chrome
    driver.quit()