In [None]:
import time
import csv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

# Create a CSV file to write the data.
# The handle stays open while the cell runs and is closed at the end of the cell.
csv_file = open("lagos_hotels.csv", "w", newline='', encoding="utf-8")
csv_writer = csv.writer(csv_file)

# Initialize the web driver (starts a Chrome session controlled by Selenium)
driver = webdriver.Chrome()

# Define the URL of the TripAdvisor page.
# base_url is prefixed to the hrefs scraped from the listing page further down;
# url is the hotel listing page the crawl starts from.
base_url = "https://www.tripadvisor.com"
url = "https://www.tripadvisor.com/Hotels-g304026-Lagos_Lagos_State-Hotels.html"
driver.get(url)

# Get the page source and parse it with BeautifulSoup
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')
# Remembered so later waits can detect navigation through a URL change
current_url = driver.current_url

# Helpers and function to extract review details
def _text_after(tag):
    """Return the stripped text node directly following *tag*, or None.

    None is returned when *tag* is None, or when its next sibling is missing
    or is not a string (for example another tag).
    """
    sibling = tag.next_sibling if tag is not None else None
    return sibling.strip() if isinstance(sibling, str) else None


def _second_class(tag):
    """Return the second entry of *tag*'s class list, or None if there is none."""
    classes = tag.get('class') if tag is not None else None
    return classes[1] if classes and len(classes) > 1 else None


def extract_review_details(review_div):
    """Extract the fields of one review, write them as a CSV row, return them.

    Every lookup is guarded, so a review that lacks an element yields None
    (or an empty list) for that field instead of raising.

    Args:
        review_div: BeautifulSoup tag containing a single review
            (assumed to be one review card - confirm against the caller).

    Returns:
        dict with the keys "Review Title", "Review Text", "Stay Date",
        "Trip Type", "Room Tips", "Review Date", "Author Location",
        "Author Info" (list of strings), "Overall Rating" and
        "Specific Ratings" (list of strings).

    Side effects:
        Writes one row with the same values to the module-level csv_writer;
        the two list fields are joined with ", ".
    """
    # Extract the review title
    title_tag = review_div.find('div', class_='KgQgP MC _S b S6 H5 _a')
    title = title_tag.text.strip() if title_tag is not None else None

    # Extract the review text (text of the first child element of the span)
    text_span = review_div.find('span', class_='QewHA H4 _a')
    first_child = text_span.select_one(':first-child') if text_span is not None else None
    review_text = first_child.text.strip() if first_child is not None else None

    # Stay date and trip type are the text nodes right after their label spans
    stay_date = _text_after(review_div.find('span', class_='usajM'))
    trip_type = _text_after(review_div.find('span', class_='trip_type_label'))

    # Extract the room tips (the span following the label span)
    tip_label = review_div.find('span', class_='tkWaG b')
    tip_span = tip_label.find_next_sibling('span') if tip_label is not None else None
    room_tip = tip_span.text if tip_span is not None else None

    # Extract Review Date (text node following the header link)
    author_info = _text_after(review_div.find('a', class_='ui_header_link uyyBf'))

    # Extract location of author (text node following the map-pin icon)
    author_location = _text_after(
        review_div.find('span', class_='ui_icon map-pin-fill fXexN')
    )

    # Extract helpful votes, contributions, and review author
    author_div = review_div.find('div', class_='MziKN')
    elements = (
        [span.text for span in author_div.select('span.phMBo > span')]
        if author_div is not None
        else []  # an empty list keeps the ', '.join below safe
    )

    # Extract the overall review rating from the second CSS class of the bubble span
    rating_div = review_div.find('div', class_='Hlmiy F1')
    rating_span = (
        rating_div.find('span', class_='ui_bubble_rating')
        if rating_div is not None
        else None
    )
    overall_bubble = _second_class(rating_span)
    rating_class = overall_bubble.replace('bubble_', '') if overall_bubble else None

    # Extract the specific review ratings; malformed entries are skipped
    specific_ratings = []
    for rating_row in review_div.find_all('div', class_='hemdC S2 H2 WWOoy'):
        marker = rating_row.find('span', class_='Nd')
        if marker is None:
            continue
        label_tag = marker.find_next_sibling()
        row_bubble = _second_class(marker.select_one(':first-child'))
        if label_tag is None or row_bubble is None:
            continue
        specific_ratings.append(f"{label_tag.text} {row_bubble}")

    # Create a dictionary to store all the extracted details
    review_details = {
        "Review Title": title,
        "Review Text": review_text,
        "Stay Date": stay_date,
        "Trip Type": trip_type,
        "Room Tips": room_tip,
        "Review Date": author_info,
        "Author Location": author_location,
        "Author Info": elements,
        "Overall Rating": rating_class,
        "Specific Ratings": specific_ratings,
    }

    # Write the details to the CSV file, in the same field order as the dict
    csv_writer.writerow([
        title, review_text, stay_date, trip_type, room_tip,
        author_info, author_location, ', '.join(elements),
        rating_class, ', '.join(specific_ratings),
    ])

    return review_details

# Print every review found on one parsed page
def get_page_reviews(soup, counter):
    """Print the details of each review contained in *soup*.

    Args:
        soup: parsed page; reviews are looked up inside the
            'uNacK PS' container div.
        counter: number shown in the banner of the first review; it grows by
            one for each following review on this page.

    Returns:
        None. Nothing is printed when the container div is absent.
    """
    container = soup.find('div', class_='uNacK PS')
    if not container:
        return

    cards = container.find_all('div', class_='YibKl MC R2 Gi z Z BB pBbQr')
    for review_number, card in enumerate(cards, start=counter):
        print(f"----------- Review {review_number} -----------")
        # Only fields that actually hold a value are shown
        for label, value in extract_review_details(card).items():
            if value:
                print(f"{label}: {value}")

# Add a hotel to the list
def add_hotel(hotel_list: list, unique_hotel_names: set, hotel_name: str, hotel_url: str):
    """Append a hotel record unless a hotel with the same name was already added.

    The name is stripped before the duplicate check, so the name stored in
    the record and the name tracked in the set always agree. Hotels named
    "NIL" are appended every time, even when already seen.

    Args:
        hotel_list: list receiving {'name': ..., 'url': ...} records.
        unique_hotel_names: set of names added so far; updated in place.
        hotel_name: hotel name; surrounding whitespace is ignored.
        hotel_url: hotel URL; surrounding whitespace is removed.
    """
    name = hotel_name.strip()
    if name in unique_hotel_names and name != "NIL":
        return

    hotel_list.append({'name': name, 'url': hotel_url.strip()})
    unique_hotel_names.add(name)

# Get Reviews for hotel
def get_hotel_reviews(hotel):
    """Open a hotel's page and print the reviews of every result page.

    Paging stops when no "Next" button is found or when the button's class
    attribute contains "disabled".

    Args:
        hotel: mapping with the keys 'name' and 'url'.

    Raises:
        selenium TimeoutException: if the URL does not change within
            10 seconds after clicking "Next".
    """
    driver.get(hotel['url'])

    # Get the page source
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    current_url = driver.current_url
    # NOTE(review): get_page_reviews does not return the advanced counter, so
    # review numbering restarts at this value on every page.
    counter = 1

    page = 1
    print('-----------')
    print(f"Hotel Name: {hotel['name']}")
    print('-----------')
    print(f"----------- Page {page} -----------")
    print('-----------')

    get_page_reviews(soup, counter)

    # Loop to navigate to the next page
    while True:
        # find_elements returns an empty list when nothing matches, whereas
        # find_element raises NoSuchElementException on the last page.
        next_buttons = driver.find_elements(By.CLASS_NAME, 'ui_button.nav.next.primary ')
        if not next_buttons:
            break
        next_button = next_buttons[0]

        # get_attribute returns None when the attribute is absent
        if "disabled" in (next_button.get_attribute("class") or ""):
            break

        next_button.click()

        # Wait for the next page to load by checking the URL change
        WebDriverWait(driver, 10).until(EC.url_changes(current_url))

        # Wait for 2 secs for page to be loaded, it can be adjusted based on your internet speed
        time.sleep(2)

        page += 1

        print('-----------')
        print(f"----------- Page {page} -----------")
        print('-----------')

        # Parse the new page with BeautifulSoup
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        current_url = driver.current_url

        get_page_reviews(soup, counter)

# List to hold found hotels
hotel_list = []

# Track hotel names
unique_hotel_names = set()

# CSS class string identifying the hotel links on a listing page
_HOTEL_LINK_CLASS = 'BMQDV _F Gv wSSLS SwZTJ FGwzt ukgoS'


def _click_see_all_and_parse(current_soup, settle_seconds):
    """Click the "See all" button when present and return the page's soup.

    find_elements is used because it returns an empty list when nothing
    matches, whereas find_element raises NoSuchElementException. When the
    button is absent, *current_soup* is returned unchanged.
    """
    buttons = driver.find_elements(By.CLASS_NAME, 'rmyCe._G')
    if not buttons:
        return current_soup
    buttons[0].click()
    # Let the extra content load before taking the snapshot; the delay can be
    # adjusted based on your internet speed.
    time.sleep(settle_seconds)
    return BeautifulSoup(driver.page_source, "html.parser")


def _collect_hotels(links, hotels):
    """Add the hotel behind every link in *links* to *hotels* via add_hotel."""
    for hotel_link in links:
        hotel_url = base_url + hotel_link['href']
        h3_tag = hotel_link.find('h3')  # find the first h3 tag in the link
        if h3_tag is None:
            continue
        # The name is whatever follows the first '.'; headings without a '.'
        # are skipped (presumably the prefix is a ranking number - confirm).
        text_list = h3_tag.text.split('.', 1)
        if len(text_list) == 1:
            continue
        add_hotel(hotels, unique_hotel_names, text_list[-1].strip(), hotel_url)


def _write_hotels(hotels):
    """Write one CSV row (name, url) per hotel record."""
    csv_writer.writerows([hotel['name'], hotel['url']] for hotel in hotels)


try:
    # Give the first page time to render before looking for the button
    time.sleep(5)
    soup = _click_see_all_and_parse(soup, 4)

    # get all links to hotels, store in a list
    hotel_links = soup.find_all('a', class_=_HOTEL_LINK_CLASS)
    _collect_hotels(hotel_links, hotel_list)
    _write_hotels(hotel_list)

    while True:
        # An empty result means there is no further listing page
        next_buttons = driver.find_elements(By.CLASS_NAME, 'BrOJk.u.j.z._F.wSSLS.tIqAi.unMkR')
        if not next_buttons:
            break
        next_buttons[0].click()

        # Wait for the next page to load by checking the URL change
        WebDriverWait(driver, 10).until(EC.url_changes(current_url))

        # Wait for 2 secs for page to be loaded, it can be adjusted based on your internet speed
        time.sleep(2)

        # Parse the new page with BeautifulSoup
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        current_url = driver.current_url
        soup = _click_see_all_and_parse(soup, 7)

        # Collect this page's hotels separately so each hotel is written once
        hotel_links = soup.find_all('a', class_=_HOTEL_LINK_CLASS)
        page_hotels = []
        _collect_hotels(hotel_links, page_hotels)
        _write_hotels(page_hotels)
        hotel_list.extend(page_hotels)

        # NOTE(review): this compares the number of links on the current page,
        # not the running total - confirm which limit is intended.
        if len(hotel_links) > 100:
            break

    # To also scrape the reviews, call get_hotel_reviews(hotel) for each
    # entry of hotel_list here.
finally:
    # Always flush the CSV and release the browser, even when the crawl fails
    try:
        csv_file.close()
    finally:
        driver.quit()


In [5]:
import time
import csv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd

# Initialize the web driver (starts a Chrome session controlled by Selenium)
driver = webdriver.Chrome()
# Running hotel number shown in the progress output of get_hotel_reviews;
# incremented by the main loop after each hotel.
hotel_count = 1
# Helpers and function to extract review details
def _text_after(tag):
    """Return the stripped text node directly following *tag*, or None.

    None is returned when *tag* is None, or when its next sibling is missing
    or is not a string (for example another tag).
    """
    sibling = tag.next_sibling if tag is not None else None
    return sibling.strip() if isinstance(sibling, str) else None


def _second_class(tag):
    """Return the second entry of *tag*'s class list, or None if there is none."""
    classes = tag.get('class') if tag is not None else None
    return classes[1] if classes and len(classes) > 1 else None


def extract_review_details(review_div):
    """Extract the fields of one review and return them as a dict.

    Every lookup is guarded, so a review that lacks an element yields None
    (or an empty list) for that field instead of raising.

    Args:
        review_div: BeautifulSoup tag containing a single review
            (assumed to be one review card - confirm against the caller).

    Returns:
        dict with the keys "Review Title", "Review Text", "Stay Date",
        "Trip Type", "Room Tips", "Review Date", "Author Location",
        "Author Info" (list of strings), "Overall Rating" and
        "Specific Ratings" (list of strings).
    """
    # Extract the review title
    title_tag = review_div.find('div', class_='KgQgP MC _S b S6 H5 _a')
    title = title_tag.text.strip() if title_tag is not None else None

    # Extract the review text (text of the first child element of the span)
    text_span = review_div.find('span', class_='QewHA H4 _a')
    first_child = text_span.select_one(':first-child') if text_span is not None else None
    review_text = first_child.text.strip() if first_child is not None else None

    # Stay date and trip type are the text nodes right after their label spans
    stay_date = _text_after(review_div.find('span', class_='usajM'))
    trip_type = _text_after(review_div.find('span', class_='trip_type_label'))

    # Extract the room tips (the span following the label span)
    tip_label = review_div.find('span', class_='tkWaG b')
    tip_span = tip_label.find_next_sibling('span') if tip_label is not None else None
    room_tip = tip_span.text if tip_span is not None else None

    # Extract Review Date (text node following the header link)
    author_info = _text_after(review_div.find('a', class_='ui_header_link uyyBf'))

    # Extract location of author (text node following the map-pin icon)
    author_location = _text_after(
        review_div.find('span', class_='ui_icon map-pin-fill fXexN')
    )

    # Extract helpful votes, contributions, and review author
    author_div = review_div.find('div', class_='MziKN')
    elements: list = (
        [span.text for span in author_div.select('span.phMBo > span')]
        if author_div is not None
        else []
    )

    # Extract the overall review rating from the second CSS class of the bubble span
    rating_div = review_div.find('div', class_='Hlmiy F1')
    rating_span = (
        rating_div.find('span', class_='ui_bubble_rating')
        if rating_div is not None
        else None
    )
    overall_bubble = _second_class(rating_span)
    rating_class = overall_bubble.replace('bubble_', '') if overall_bubble else None

    # Extract the specific review ratings; malformed entries are skipped
    specific_ratings = []
    for rating_row in review_div.find_all('div', class_='hemdC S2 H2 WWOoy'):
        marker = rating_row.find('span', class_='Nd')
        if marker is None:
            continue
        label_tag = marker.find_next_sibling()
        row_bubble = _second_class(marker.select_one(':first-child'))
        if label_tag is None or row_bubble is None:
            continue
        specific_ratings.append(f"{label_tag.text} {row_bubble}")

    # Create a dictionary to store all the extracted details
    return {
        "Review Title": title,
        "Review Text": review_text,
        "Stay Date": stay_date,
        "Trip Type": trip_type,
        "Room Tips": room_tip,
        "Review Date": author_info,
        "Author Location": author_location,
        "Author Info": elements,
        "Overall Rating": rating_class,
        "Specific Ratings": specific_ratings,
    }

# Collect the reviews found on one parsed page
def get_page_reviews(soup):
    """Return the extracted details of every review contained in *soup*.

    Args:
        soup: parsed page; reviews are looked up inside the
            'uNacK PS' container div.

    Returns:
        list with one details dict per review, in page order; an empty list
        when the container div is absent.
    """
    container = soup.find('div', class_='uNacK PS')
    if not container:
        return []

    cards = container.find_all('div', class_='YibKl MC R2 Gi z Z BB pBbQr')
    return [extract_review_details(card) for card in cards]

# Get Reviews for hotel
def _append_reviews_csv(reviews, csv_path):
    """Append *reviews* (a list of dicts) to *csv_path*.

    The header row is written only when the file does not exist yet or is
    empty, so appending page after page does not repeat it. Nothing is
    written for an empty list.
    """
    import os  # local import: only this helper needs it

    if not reviews:
        return
    needs_header = not os.path.exists(csv_path) or os.path.getsize(csv_path) == 0
    pd.DataFrame(reviews).to_csv(csv_path, mode='a', index=False, header=needs_header)


def get_hotel_reviews(hotel):
    """Scrape every review page of one hotel into "<hotel name>.csv".

    Paging stops when no "Next" button is found or when the button's class
    attribute contains "disabled".

    Args:
        hotel: sequence whose first item is the hotel name and whose second
            item is the URL of the hotel page.

    Raises:
        selenium TimeoutException: if the URL does not change within
            20 seconds after clicking "Next".
    """
    driver.get(hotel[1])
    driver.maximize_window()

    # Get the page source
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    current_url = driver.current_url

    # Get number of reviews (0 when the counter element is absent)
    no_of_reviews = soup.find('span', class_='biGQs _P pZUbB KxBGd')
    no_of_reviews = no_of_reviews.text.strip() if no_of_reviews else 0

    print('-----------')
    print(f"Hotel {hotel_count}: {hotel[0]} : {no_of_reviews}")
    print('-----------')

    # NOTE(review): the hotel name is used as the file name unchanged; names
    # containing path separators would need sanitizing.
    csv_path = f"{hotel[0]}.csv"
    _append_reviews_csv(get_page_reviews(soup), csv_path)

    # Loop to navigate to the next page
    while True:
        next_buttons = driver.find_elements(By.CLASS_NAME, 'ui_button.nav.next.primary ')
        if not next_buttons:
            print(f'No next button in {hotel[0]}')
            break
        next_button = next_buttons[0]

        # get_attribute returns None when the attribute is absent
        if "disabled" in (next_button.get_attribute("class") or ""):
            break

        next_button.click()

        # Wait for the next page to load by checking the URL change
        WebDriverWait(driver, 20).until(EC.url_changes(current_url))

        # Wait for 2 secs for page to be loaded, it can be adjusted based on your internet speed
        time.sleep(2)

        # Parse the new page with BeautifulSoup
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        current_url = driver.current_url

        _append_reviews_csv(get_page_reviews(soup), csv_path)

# List to hold found hotels
hotel_list: list[list] = []

# Read the hotel list as UTF-8 so accented names (e.g. "Mövenpick") are not
# garbled by the platform's default encoding. Rows with fewer than two fields
# (such as blank lines) are skipped because a name and a URL are both needed.
with open('hotel_list_3.csv', newline='', encoding='utf-8') as csvfile:
    hotel_list.extend(row for row in csv.reader(csvfile) if len(row) >= 2)

try:
    # loop thru the hotel list to get the reviews
    for hotel in hotel_list:
        if len(hotel) > 2:
            # A name containing commas was split over several columns: glue
            # the pieces back together; the URL is always the last column.
            temp_hotel = [''.join(hotel[:-1]).replace(',', ' '), hotel[-1]]
        else:
            temp_hotel = hotel

        get_hotel_reviews(temp_hotel)

        hotel_count += 1
finally:
    # Close the web driver even when scraping a hotel fails
    driver.quit()


-----------
Hotel 1: Radisson Blu Anchorage Hotel Lagos V.I. : 1,460 reviews
-----------
-----------
Hotel 2: Mövenpick Hotel Ikoyi Lagos : 1,086 reviews
-----------
-----------
Hotel 3: Oakspring Hotel and Luxury Suites : 2 reviews
-----------
No next button in Oakspring Hotel and Luxury Suites
-----------
Hotel 4: La Cour Hotel Cooper : 55 reviews
-----------
-----------
Hotel 5: The Sojourner By Genesis : 70 reviews
-----------
-----------
Hotel 6: Providence by Mantis : 28 reviews
-----------
-----------
Hotel 7: Jara Beach Resort : 62 reviews
-----------
-----------
Hotel 8: Jeromes Garden & Suites : 11 reviews
-----------
