In [1]:
import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from urllib.parse import urlparse
import time

# Initialize Chrome: start a browser session and keep the handle in the
# module-level `driver`, which the scrolling/extraction helpers below use.
driver = webdriver.Chrome()

In [2]:
# Function to scroll down page to load all hotels
def scroll_down():
    """Scroll the page down in 1000-pixel steps until the bottom is reached.

    After every step the function pauses for 5 seconds and then reads both
    the document height and the current vertical scroll offset. It stops
    only when neither value changed compared with the previous step, i.e.
    the window could not scroll any further and the document did not grow.

    Comparing the height alone is not enough: on a tall page whose height
    stays constant between two steps, that check would end the loop long
    before the bottom of the page is reached.

    Uses the module-level ``driver``. Returns nothing.
    """
    previous_height = 0
    previous_offset = None  # no offset observed yet, so the first step never stops

    while True:
        # Scroll down by 1000 pixels, then wait before measuring
        driver.execute_script("window.scrollBy(0, 1000)")
        time.sleep(5)

        # Document height and scroll position after this step
        new_scroll_height = driver.execute_script("return Math.max(document.body.scrollHeight, document.body.offsetHeight, document.documentElement.clientHeight, document.documentElement.scrollHeight, document.documentElement.offsetHeight);")
        new_scroll_offset = driver.execute_script("return window.pageYOffset")

        # Stop only when the window did not move AND the page did not grow
        if new_scroll_height == previous_height and new_scroll_offset == previous_offset:
            break

        previous_height = new_scroll_height
        previous_offset = new_scroll_offset
        

# Function to scroll up page to load all hotels
def scroll_up():
    """Scroll the page up in 1000-pixel steps until the top is reached.

    Each step is followed by a 5-second pause; the loop ends once
    ``window.pageYOffset`` reports 0. Uses the module-level ``driver``.
    """
    reached_top = False
    while not reached_top:
        # Move up by 1000 pixels, then wait before checking the position
        driver.execute_script("window.scrollBy(0, -1000)")
        time.sleep(5)

        # An offset of 0 means the viewport is at the top of the page
        reached_top = driver.execute_script("return window.pageYOffset") == 0

            
# Function to go to the next page
def next_page():
    """Click the element with id ``paginationNext`` and pause for 5 seconds.

    Uses the module-level ``driver``. ``find_element`` raises if no such
    element is present on the current page.
    """
    next_button = driver.find_element(By.ID, "paginationNext")
    next_button.click()
    time.sleep(5)

In [3]:
# Function to parse url to get base url
def parse_url(url):
    """Return ``url`` reduced to ``scheme://netloc/path``.

    The params, query string and fragment reported by ``urlparse`` are
    discarded; the remaining three components are joined unchanged.
    """
    scheme, netloc, path, *_ = urlparse(url)
    return "".join((scheme, "://", netloc, path))

In [4]:
# Function to extract hotel data return as dataframe
def extract_data(ol_num):
    """Collect id, name and base URL of every hotel in one result list.

    The list is the ``<ol>`` found under
    ``#contentContainer > div:nth-child(ol_num)``. Three element lists are
    read from it (hotel items, hotel names, links) and paired by position.
    If their lengths differ, the page is scrolled (up when the viewport is
    at the bottom of the document, down otherwise) and the lists are read
    again until the three counts agree. The counts are printed on every
    attempt.

    Parameters:
        ol_num: 1-based ``nth-child`` index of the container ``div``.

    Returns:
        pandas.DataFrame with columns "Hotel ID", "Hotel Name", "Hotel URL".

    Raises:
        IndexError: if the selector matches no ``<ol>`` element.

    Uses the module-level ``driver`` and the helpers ``scroll_up``,
    ``scroll_down`` and ``parse_url``.
    """
    selector = "#contentContainer > div:nth-child(" + str(ol_num) + ") > ol"

    # True when the bottom edge of the viewport has reached the end of the
    # document (1px tolerance for fractional scroll offsets). Comparing the
    # document height with window.innerHeight alone would only be true for a
    # page that does not scroll at all.
    at_bottom_script = (
        "var pageHeight = Math.max(document.documentElement.scrollHeight, "
        "document.documentElement.offsetHeight, document.documentElement.clientHeight);"
        "return window.innerHeight + window.pageYOffset >= pageHeight - 1;"
    )

    while True:
        # Extract hotel elements
        hotel_lists = driver.find_elements(By.CSS_SELECTOR, selector)
        if not hotel_lists:
            raise IndexError(f"No hotel list found for selector: {selector}")
        hotel_list = hotel_lists[0]

        id_elements = hotel_list.find_elements(By.CSS_SELECTOR, "li[data-selenium='hotel-item']")
        name_elements = hotel_list.find_elements(By.CSS_SELECTOR, "h3[data-selenium='hotel-name']")
        url_elements = hotel_list.find_elements(By.CSS_SELECTOR, "div > a")

        # Progress output: number of items, names and links found
        print(len(id_elements))
        print(len(name_elements))
        print(len(url_elements))
        print("---")

        # Equal counts mean the three lists can be paired by position
        if len(id_elements) == len(name_elements) == len(url_elements):
            break

        # Counts differ: scroll to change what is on screen, then read again
        if driver.execute_script(at_bottom_script):
            scroll_up()  # Scroll up if we are at the bottom
        else:
            scroll_down()  # Scroll down if not at the bottom

    # Build one [id, name, base url] row per hotel
    hotel_data = [
        [
            id_element.get_attribute("data-hotelid"),
            name_element.text,
            parse_url(url_element.get_attribute("href")),
        ]
        for id_element, name_element, url_element in zip(id_elements, name_elements, url_elements)
    ]

    # Create a DataFrame from the list of data
    return pd.DataFrame(hotel_data, columns=["Hotel ID", "Hotel Name", "Hotel URL"])

In [5]:
# Open Agoda website
# The search URL is written as adjacent string literals, which Python joins
# into a single string; the resulting value is one unbroken URL.
# NOTE(review): the query string carries what look like session/user
# identifiers (ckuid, userId, sessionId, ...) - confirm they are needed.
url = (
    "https://www.agoda.com/search"
    "?city=4064&checkIn=2023-11-17&los=1&rooms=1&adults=1&children=0"
    "&locale=en-us&ckuid=7026c8ca-26e7-48e3-859c-8b9a562c7482&prid=0&currency=SGD"
    "&correlationId=b42e66fb-3385-4078-8db7-e981120cdce6"
    "&analyticsSessionId=1094851885165923957&pageTypeId=1&realLanguageId=1&languageId=1"
    "&origin=SG&cid=-999&userId=7026c8ca-26e7-48e3-859c-8b9a562c7482&whitelabelid=1"
    "&loginLvl=0&storefrontId=3&currencyId=5&currencyCode=SGD&htmlLanguage=en-us"
    "&cultureInfoName=en-us&machineName=sg-pc-6g-acm-web-user-746d798d6f-pljfd"
    "&trafficGroupId=4&sessionId=1kqnitcr15hmxmit5prbwly1&trafficSubGroupId=6&aid=178961"
    "&useFullPageLogin=true&cttp=4&isRealUser=true&mode=production&browserFamily=Chrome"
    "&cdnDomain=agoda.net&checkOut=2023-11-18&priceCur=SGD&textToSearch=Singapore"
    "&travellerType=0&familyMode=off&productType=-1"
)
driver.get(url)

# Element lookups issued after this call wait up to 10 seconds
driver.implicitly_wait(10)

In [6]:
# Loop through the result pages and gather every hotel into one DataFrame
NUM_PAGES = 4  # number of result pages to scrape
page_frames = []  # per-list DataFrames, concatenated once after the loop

for page in range(NUM_PAGES):
    print(f"Page: {page+1}")
    scroll_down()

    # Extract data from both hotel lists on the current page
    hotel_df1 = extract_data(2)
    print("=====")
    hotel_df2 = extract_data(3)
    page_frames.extend([hotel_df1, hotel_df2])

    # Only navigate while there is another page to scrape: clicking "next"
    # after the final page wastes time and raises if the button is absent.
    if page < NUM_PAGES - 1:
        # Scroll to the bottom of the page
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")

        # Go to the next page
        next_page()
        time.sleep(5)  # extra settle time on top of next_page's own pause

# Concatenate once instead of growing the DataFrame inside the loop
if page_frames:
    combined_hotel_df = pd.concat(page_frames, ignore_index=True)
else:
    combined_hotel_df = pd.DataFrame(columns=["Hotel ID", "Hotel Name", "Hotel URL"])

Page: 1
10
10
10
---
=====
88
88
88
---
Page: 2
10
10
10
---
=====
88
88
88
---
Page: 3
10
10
10
---
=====
88
67
88
---
88
88
88
---
Page: 4
10
10
10
---
=====
88
88
88
---


In [7]:
# Report how many rows were collected, then preview the first rows
record_total = len(combined_hotel_df)
print(f"Total Number of Records: {record_total}")
combined_hotel_df.head()

Total Number of Records: 392


Unnamed: 0,Hotel ID,Hotel Name,Hotel URL
0,408551,Dorsett Singapore,https://www.agoda.com/dorsett-singapore/hotel/...
1,1635,M Hotel Singapore,https://www.agoda.com/m-hotel/hotel/singapore-...
2,10672643,Dusit Thani Laguna Singapore,https://www.agoda.com/dusit-thani-laguna-singa...
3,51381,Hotel Royal @ Queens,https://www.agoda.com/hotel-royal-queens/hotel...
4,10604,Singapore Marriott Tang Plaza Hotel,https://www.agoda.com/singapore-marriott-tang-...


In [8]:
# Save hotel data as csv; the browser is closed even if the write fails
try:
    combined_hotel_df.to_csv("agoda_hotels.csv", index=False, encoding="utf-8")
finally:
    # Close Chrome
    driver.quit()