In [None]:
%pip install webdriver_manager

In [1]:
import os
import csv
import glob
import time
import pandas as pd
from webdriver_manager.chrome import ChromeDriverManager
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException, ElementNotInteractableException, StaleElementReferenceException, TimeoutException

In [2]:
def scrape_hotel_links(url, number_of_hotels=30):
    """
    Scrape hotel links from the given URL and return them as a list of dictionaries.

    Only the page loaded from ``url`` is read; this function does not paginate.
    Links are de-duplicated, so fewer than ``number_of_hotels`` entries may be
    returned when the page does not expose enough distinct hotels.

    Args:
    - url (str): the URL to scrape hotel links from
    - number_of_hotels (int): maximum number of links to collect (default: 30)

    Returns:
    - list: a list of dictionaries, each with a single 'Hotel_Link' key
    """

    # Build the driver the same way scrape_hotel_reviews does: the
    # `executable_path` keyword is no longer accepted by recent Selenium 4
    # releases, and a hard-coded 'chromedriver.exe' only works on Windows.
    options = webdriver.ChromeOptions()
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    urls = []
    seen_links = set()

    try:
        driver.set_window_size(1120, 1000)

        # Load the webpage with the given URL
        driver.get(url)

        # Give the DOM time to load (3 seconds)
        time.sleep(3)

        # Collapse the date panel if it is present
        try:
            collapse_date_panel = driver.find_element(By.XPATH, ".//div[@class='KWdaU Za f e']")
        except NoSuchElementException:
            collapse_date_panel = None

        if collapse_date_panel is not None:
            collapse_date_panel.click()

        # Hotels may render lazily, so poll the page a few times. Stop as soon
        # as enough links were collected, or after several consecutive passes
        # that produced nothing new (otherwise this loop would never end).
        passes_without_new_links = 0
        while len(urls) < number_of_hotels and passes_without_new_links < 3:

            # Wait for the DOM to load (3 seconds)
            time.sleep(3)

            # Find the anchor of every hotel currently present in the page
            anchors = driver.find_elements(
                By.XPATH,
                ".//div[contains(@class, 'jsTLT K') or contains(@class, 'listing_title')]/a",
            )

            added = 0
            for anchor in anchors:
                # If we have enough hotel links, stop parsing
                if len(urls) >= number_of_hotels:
                    break

                hotel_link = anchor.get_attribute("href")

                # Skip anchors without a target and links already collected
                if not hotel_link or hotel_link in seen_links:
                    continue

                seen_links.add(hotel_link)
                urls.append({
                    'Hotel_Link': hotel_link
                })
                added += 1

                # Print progress information
                print("Progress: {}/{}".format(len(urls), number_of_hotels))

            passes_without_new_links = 0 if added else passes_without_new_links + 1
    finally:
        # Always release the browser, even when scraping fails midway
        driver.quit()

    # Return the list of hotel links
    return urls

In [None]:
def scrape_hotel_links_and_update_csv(url: str) -> None:
    """
    Scrape hotel links from the given URL, drop the ones already recorded in
    'Data/hotels_links.csv', and append the remaining new links to that file.

    The new links of this run are additionally written to
    'Data/hotels_links_30.csv' (overwriting its previous content).

    Args:
    - url (str): the URL to scrape hotel links from
    example: https://www.tripadvisor.com/Hotels-g293734-oa30-Marrakech_Marrakech_Safi-Hotels.html

    Returns:
    - None: the function does not return anything, but it updates the CSV files
    """

    master_path = 'Data/hotels_links.csv'
    batch_path = 'Data/hotels_links_30.csv'

    # Scrape the links; naming the column explicitly keeps 'Hotel_Link'
    # available even when nothing was scraped (an empty list would otherwise
    # produce a DataFrame without columns and a KeyError below).
    links = scrape_hotel_links(url)
    df = pd.DataFrame(links, columns=['Hotel_Link']).drop_duplicates(subset='Hotel_Link')

    # Load the links collected by previous runs; start empty on the first run
    if os.path.exists(master_path):
        df1 = pd.read_csv(master_path)
    else:
        df1 = pd.DataFrame(columns=['Hotel_Link'])

    # Keep only the links that are not already in the existing CSV file
    df = df[~df['Hotel_Link'].isin(df1['Hotel_Link'])]

    # Append the new links to the existing ones. DataFrame.append no longer
    # exists in pandas 2.x, so pd.concat is used instead.
    df1 = pd.concat([df1, df], ignore_index=True)

    os.makedirs('Data', exist_ok=True)
    df1.to_csv(master_path, index=False)

    # Save the new links of this run to a separate CSV file
    df.to_csv(batch_path, index=False)


In [None]:
def _read_review_field(card, xpath, extract, retries=3, delay=2):
    """
    Read one field from a review card, retrying when the element goes stale.

    Args:
        card: The review card element to search in.
        xpath (str): XPath of the field, relative to ``card``.
        extract (callable): Receives the located element and returns the value.
        retries (int): Number of attempts when the element is stale.
        delay (int): Seconds to wait between attempts.

    Returns:
        The extracted value, or None when the field is absent or still stale
        after all attempts.
    """
    for _ in range(retries):
        try:
            return extract(card.find_element(By.XPATH, xpath))
        except StaleElementReferenceException:
            # If the element has become stale, wait and try again
            time.sleep(delay)
        except NoSuchElementException:
            return None
    # Never leak a value from a previous review when every attempt was stale
    return None


def _click_expand_review(driver):
    """Click the first "expand review" control of the page, if there is one."""
    expand_xpath = ".//span[@class='Ignyf _S Z']"
    try:
        driver.find_element(By.XPATH, expand_xpath).click()
    except NoSuchElementException:
        # Nothing to expand on this page
        return
    except (ElementNotInteractableException, ElementClickInterceptedException):
        # Fall back to a JavaScript click when the native click is rejected
        try:
            element = driver.find_element(By.XPATH, expand_xpath)
        except NoSuchElementException:
            return
        driver.execute_script("arguments[0].click();", element)


def _go_to_next_page(driver):
    """Click the "next page" link; return False when no further page can be opened."""
    next_xpath = './/a[contains(@class, "ui_button nav next primary ")]'
    for _ in range(2):
        try:
            next_link = driver.find_element(By.XPATH, next_xpath)
            # NOTE(review): presumably the link carries a "disabled" class on
            # the last page; confirm against the live markup.
            if "disabled" in (next_link.get_attribute("class") or ""):
                return False
            next_link.click()
            return True
        except StaleElementReferenceException:
            # If the element has become stale, wait and try again
            time.sleep(2)
        except NoSuchElementException:
            return False
    return False


def scrape_hotel_reviews(url):
    """
    Scrapes the reviews of a hotel from TripAdvisor.

    Args:
        url (str): The URL of the TripAdvisor page of the hotel.

    Returns:
        reviews (list): A list of dictionaries, each containing the following keys:
            - Hotel_Name: The name of the hotel.
            - Review_Date: The date of the review.
            - Review_Rating: The rating given by the reviewer.
            - Review_Title: The title of the review.
            - Review_Text: The text of the review (newlines and commas removed).
            - Reviewer_Date_Of_Stay: The date of the stay.
            - Reviewer_Trip_Type: The type of trip.
            - Reviewer_Location: The location of the reviewer.
            - Reviewer_Profile_Link: The link to the profile page of the reviewer.
        A field that cannot be read from the page is stored as None. The list
        may hold fewer reviews than the page announces when pagination ends
        early.
    """

    # Set up the Chrome driver with specified options
    options = webdriver.ChromeOptions()
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    # Initialize an empty list to store the reviews
    reviews = []

    def element_text(element):
        return element.text

    try:
        driver.set_window_size(1120, 1000)

        # Load the webpage with the given URL
        driver.get(url)

        # Give the DOM time to load (3 seconds)
        time.sleep(3)

        # Grab the hotel name
        hotel_name = driver.find_element(By.XPATH, "//h1[@id='HEADING']").text

        # Grab the number of english reviews, e.g. "(1,234)" -> 1234
        count_text = driver.find_element(
            By.XPATH,
            ".//label[@class='Qukvo Vm _S']/span[text()='English']/following-sibling::span[1]",
        ).text
        number_of_reviews = int(count_text.strip('()').replace(',', ''))

        # Scrape hotel reviews until we have enough
        while len(reviews) < number_of_reviews:

            # Give the DOM time to load (3 seconds)
            time.sleep(3)

            # Click the "expand review" link to reveal the entire review
            _click_expand_review(driver)

            # Find all reviews in the current page
            cards = driver.find_elements(By.XPATH, "//div[contains(@data-test-target, 'HR_CC_CARD')]")
            if not cards:
                # A page without review cards cannot make progress
                print('no reviews found on page')
                break

            for card in cards:
                # If we have enough hotel reviews, stop parsing
                if len(reviews) >= number_of_reviews:
                    break

                # Print progress information
                print("Progress: {}/{}".format(len(reviews), number_of_reviews))

                reviews.append({
                    'Hotel_Name': hotel_name,
                    'Review_Date': _read_review_field(
                        card, ".//div[@class='cRVSd']/span", element_text),
                    # The rating is the numeric suffix of the "bubble_NN" class
                    'Review_Rating': _read_review_field(
                        card,
                        ".//span[contains(@class, 'ui_bubble_rating bubble_')]",
                        lambda element: element.get_attribute("class").split("_")[3]),
                    'Review_Title': _read_review_field(
                        card, ".//div[contains(@data-test-target, 'review-title')]", element_text),
                    # Newlines and commas are removed from the review text
                    'Review_Text': _read_review_field(
                        card,
                        ".//q[@class='QewHA H4 _a']",
                        lambda element: element.text.replace("\n", "  ").replace(",", "")),
                    'Reviewer_Date_Of_Stay': _read_review_field(
                        card, ".//span[@class='teHYY _R Me S4 H3']", element_text),
                    'Reviewer_Trip_Type': _read_review_field(
                        card, ".//span[@class='TDKzw _R Me']", element_text),
                    'Reviewer_Location': _read_review_field(
                        card, ".//span[@class='default LXUOn small']", element_text),
                    'Reviewer_Profile_Link': _read_review_field(
                        card,
                        ".//div[@class='cRVSd']/span/a",
                        lambda element: element.get_attribute("href")),
                })

            if len(reviews) >= number_of_reviews:
                break

            # Move to the next page; stop instead of re-reading the same page
            # over and over when there is no next page to open.
            if not _go_to_next_page(driver):
                print('next page failed')
                break
    finally:
        # Always release the browser, even when scraping fails midway
        driver.quit()

    # Return the list of hotel reviews
    return reviews

In [None]:
def _review_file_stem(url):
    """
    Derive the CSV file stem for a hotel from its URL.

    For URLs shaped like '...Reviews-<name>-Marrakech_Marrakech_Safi...' the
    stem is '<name>'. When either marker is missing, the remainder of the URL
    (without a trailing '.html') is used instead of slicing with the -1 that
    ``str.find`` returns for a missing marker.
    """
    _, start_marker, tail = url.partition("Reviews-")
    if not start_marker:
        # No "Reviews-" marker: fall back to the last path segment
        tail = url.rstrip("/").rsplit("/", 1)[-1]

    stem, end_marker, _ = tail.partition("-Marrakech_Marrakech_Safi")
    if not end_marker:
        stem = tail[:-len(".html")] if tail.endswith(".html") else tail

    # Keep the stem usable as a single file name
    return stem.replace("/", "_")


def scrape_hotel_reviews_and_save(n):
    """
    Scrapes and saves hotel reviews for a given range of hotels.

    The hotel links are read from 'Data/hotels_links_30.csv'. The reviews of
    each hotel are written to their own CSV file inside 'Data/Reviews/'.

    Args:
        n (int): The starting index in the list of hotel links to scrape from.

    Returns:
        None
    """

    hotel_links = pd.read_csv('Data/hotels_links_30.csv')

    # Make sure the output folder exists before the first (slow) scrape
    os.makedirs('Data/Reviews', exist_ok=True)

    for index in range(n, len(hotel_links)):
        url = hotel_links['Hotel_Link'][index]

        # First scrape and save the reviews to a DataFrame
        reviews = scrape_hotel_reviews(url)
        df = pd.DataFrame(reviews)

        hotel_name = _review_file_stem(url)

        # Save the DataFrame to the CSV file
        df.to_csv('Data/Reviews/' + hotel_name + '.csv', index=False)

In [None]:
def merge_csv_files():
    """
    Merges all the CSV files in the 'Data/Reviews' folder into a single file named
    'Data/marrakech_hotels_reviews.csv'.

    Files are merged in sorted file-name order. Files without any content are
    skipped. The number of rows of every merged file is printed.

    Returns:
        None
    """

    # glob returns files in arbitrary order; sort for a reproducible output
    file_list = sorted(glob.glob("Data/Reviews/*.csv"))

    frames = []

    for file_name in file_list:
        try:
            temp_df = pd.read_csv(file_name)
        except pd.errors.EmptyDataError:
            # A file without any content would otherwise abort the whole merge
            print("Skipping empty file: {}".format(file_name))
            continue
        print(len(temp_df))
        frames.append(temp_df)

    # Concatenate once at the end: DataFrame.append no longer exists in
    # pandas 2.x, and growing a frame inside the loop copies it every time.
    # pd.concat rejects an empty list, hence the explicit empty fallback.
    df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    df.to_csv('Data/marrakech_hotels_reviews.csv', index=False)
    