In [None]:
"""
https://www.scrapingbee.com/blog/web-scraping-booking/
https://selenium-python.readthedocs.io/locating-elements.html
https://stackoverflow.com/questions/72165030/selenium-script-searches-previous-tabs-html-after-opening-a-new-tab

"""

In [None]:
pip install selenium selenium-wire webdriver-manager pandas

In [1]:
import pandas as pd
import numpy as np
import selenium
import time
from selenium.webdriver.common.by import By
from seleniumwire import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

To use Selenium with Google Chrome, we need to create a driver and point it at the desired URL. This function creates the driver, opens the URL, and returns the driver object.

In [2]:
def create_driver(url):
    """Start a Chrome session and navigate it to ``url``.

    Args:
        url: Address of the page to open.

    Returns:
        The Selenium Wire Chrome driver, already pointing at ``url``. The
        caller owns it and is responsible for calling ``quit()``.

    Raises:
        Whatever ``driver.get`` raises; the browser is shut down first so a
        failed navigation does not leave an orphaned Chrome process behind.
    """
    driver = webdriver.Chrome()
    try:
        driver.get(url)
    except Exception:
        # The caller never receives the driver in this case, so nobody else
        # could release it.
        driver.quit()
        raise
    return driver

The following function takes the driver and walks through the result pages, extracting each hotel's name, the link to its page, and its distance from the center.
It returns a DataFrame of the results.

In [6]:
def get_hotel_links(city_driver):
    """Collect name, detail-page link and distance for every listed hotel.

    Walks through all result pages of the search currently open in
    ``city_driver``: reads every property card on the page, then clicks the
    "Next page" button until the last page has been processed.

    Args:
        city_driver: Selenium Wire driver showing a search-results page. Its
            ``requests`` / ``wait_for_request`` API is used to detect when the
            next page has been requested.

    Returns:
        pandas.DataFrame with the columns ``hotel_names``, ``links`` and
        ``distance_from_center``, one row per property card. A field that is
        missing from a card is stored as ``None`` so the columns stay aligned.
    """
    def first_match(parent, selector):
        """Return the first element under ``parent`` matching ``selector``, or None."""
        matches = parent.find_elements(By.CSS_SELECTOR, selector)
        return matches[0] if matches else None

    names = []
    links_for_hotel_pages = []
    distance_from_center = []

    # If no pagination control is found, treat the search as a single page
    # (presumably this happens for short result lists - verify on the site).
    last_page_item = first_match(city_driver, 'div[data-testid="pagination"]  li:last-child')
    number_of_pages = int(last_page_item.text) if last_page_item is not None else 1

    for page in range(1, number_of_pages + 1):
        # Drop previously captured requests so that wait_for_request below can
        # only match a request issued after this point.
        del city_driver.requests

        # Read the three fields from every hotel card on the current page.
        for hotel_card in city_driver.find_elements(By.CSS_SELECTOR, 'div[data-testid="property-card"]'):
            title = first_match(hotel_card, 'div[data-testid="title"]')
            title_link = first_match(hotel_card, 'a[data-testid="title-link"]')
            distance = first_match(hotel_card, 'span[data-testid="distance"]')
            names.append(title.text if title is not None else None)
            links_for_hotel_pages.append(title_link.get_attribute('href') if title_link is not None else None)
            distance_from_center.append(distance.text if distance is not None else None)

        # The last page has no further page to move to.
        if page == number_of_pages:
            break

        next_page_btn = city_driver.find_element(By.XPATH, '//button[contains(@aria-label, "Next page")]')
        next_page_btn.click()
        # Wait (up to 20 seconds) for the results request of the next page,
        # then give the page a fixed pause before reading the new cards.
        city_driver.wait_for_request("/dml/graphql", timeout=20)
        time.sleep(5)

    hotels_info = {"hotel_names": names, "links": links_for_hotel_pages, 'distance_from_center': distance_from_center}
    city_df = pd.DataFrame(hotels_info)
    print("finished loading data")
    return city_df

This function goes through the list of cities, creates a driver and calls the function that will extract the hotel links from the browser. Then it closes the connection and saves the data into a file.

In [7]:
def get_hotel_links_from_cities():
    """Scrape the hotel links of every configured city and save them to disk.

    For each city a fresh driver is opened on that city's search URL, the
    result pages are scraped with ``get_hotel_links`` and the resulting table
    is written to ``./data/<city>_hotel_links`` in CSV format (the file name
    has no extension; ``get_links_from_file`` reads the same name).

    The driver is always shut down, even when scraping raises, so a failing
    city does not leave a browser process behind.
    """
    import os  # local import: only needed here, to make sure the output folder exists

    cities_and_urls = {
        "paris": "https://www.booking.com/searchresults.en-gb.html?label=Catch_All-EN-131006968001-bPiN0WYm7x7ddzlXSroMLwS548793046706%3Apl%3Ata%3Ap1%3Ap2%3Aac%3Aap%3Aneg%3Afi%3Atiaud-294889297093%3Adsa-1642216383571%3Alp1008002%3Ali%3Adec%3Adm&sid=176ab549c47fbf99c9ee1b8552278873&aid=318615&ss=Paris&ssne=Paris&ssne_untouched=Paris&efdco=1&lang=en-gb&src=searchresults&dest_id=-1456928&dest_type=city&checkin=2023-04-10&checkout=2023-04-13&group_adults=2&no_rooms=1&group_children=0&sb_travel_purpose=leisure&nflt=ht_id%3D204",
        "nice": "https://www.booking.com/searchresults.en-gb.html?label=Catch_All-EN-131006968001-bPiN0WYm7x7ddzlXSroMLwS548793046706%3Apl%3Ata%3Ap1%3Ap2%3Aac%3Aap%3Aneg%3Afi%3Atiaud-294889297093%3Adsa-1642216383571%3Alp1008002%3Ali%3Adec%3Adm&sid=176ab549c47fbf99c9ee1b8552278873&aid=318615&ss=Nice%2C+Provence-Alpes-C%C3%B4te+d%27Azur%2C+France&ssne=Paris&ssne_untouched=Paris&lang=en-gb&src=searchresults&dest_id=-1454990&dest_type=city&ac_position=0&ac_click_type=b&ac_langcode=en&ac_suggestion_list_length=5&search_selected=true&search_pageview_id=5eb2597cf50b0023&ac_meta=GhA1ZWIyNTk3Y2Y1MGIwMDIzIAAoATICZW46BG5pY2VAAEoAUAA%3D&checkin=2023-04-10&checkout=2023-04-13&group_adults=2&no_rooms=1&group_children=0&sb_travel_purpose=leisure&nflt=ht_id%3D204",
        "marseille": "https://www.booking.com/searchresults.en-gb.html?label=Catch_All-EN-131006968001-bPiN0WYm7x7ddzlXSroMLwS548793046706%3Apl%3Ata%3Ap1%3Ap2%3Aac%3Aap%3Aneg%3Afi%3Atiaud-294889297093%3Adsa-1642216383571%3Alp1008002%3Ali%3Adec%3Adm&sid=176ab549c47fbf99c9ee1b8552278873&aid=318615&ss=Marseille%2C+Provence-Alpes-C%C3%B4te+d%27Azur%2C+France&ssne=Nice&ssne_untouched=Nice&lang=en-gb&src=searchresults&dest_id=-1449947&dest_type=city&ac_position=0&ac_click_type=b&ac_langcode=en&ac_suggestion_list_length=5&search_selected=true&search_pageview_id=00145996f939055a&ac_meta=GhAwMDE0NTk5NmY5MzkwNTVhIAAoATICZW46CW1hcnNlaWxsZUAASgBQAA%3D%3D&checkin=2023-04-10&checkout=2023-04-13&group_adults=2&no_rooms=1&group_children=0&sb_travel_purpose=leisure&nflt=ht_id%3D204",
        "lyon": "https://www.booking.com/searchresults.en-gb.html?label=Catch_All-EN-131006968001-bPiN0WYm7x7ddzlXSroMLwS548793046706%3Apl%3Ata%3Ap1%3Ap2%3Aac%3Aap%3Aneg%3Afi%3Atiaud-294889297093%3Adsa-1642216383571%3Alp1008002%3Ali%3Adec%3Adm&sid=176ab549c47fbf99c9ee1b8552278873&aid=318615&ss=Lyon%2C+Rh%C3%B4ne-Alps%2C+France&ssne=Marseille&ssne_untouched=Marseille&lang=en-gb&src=searchresults&dest_id=-1448468&dest_type=city&ac_position=0&ac_click_type=b&ac_langcode=en&ac_suggestion_list_length=1&search_selected=true&search_pageview_id=6d9459a8df7400df&ac_meta=GhA2ZDk0NTlhOGRmNzQwMGRmIAAoATICZW46CNea15jXndeeQABKAFAA&checkin=2023-04-10&checkout=2023-04-13&group_adults=2&no_rooms=1&group_children=0&sb_travel_purpose=leisure&nflt=ht_id%3D204",
    }

    # Create the output folder up front so a long scrape cannot fail at the
    # very end just because ./data is missing.
    os.makedirs("./data", exist_ok=True)

    for city, search_url in cities_and_urls.items():
        city_driver = create_driver(search_url)
        try:
            city_links_df = get_hotel_links(city_driver)
        finally:
            # quit() closes every window and ends the session for this city
            city_driver.quit()
        # save the information in a csv file
        file_name = "./data/" + city + "_hotel_links"
        city_links_df.to_csv(file_name, index=False)
        # pause before opening the browser for the next city
        time.sleep(5)

Run the function that gets the hotel links for the desired cities and saves them to CSV files.

In [8]:
# Scrape the search results of every configured city; one links file per city is written under ./data/.
get_hotel_links_from_cities()

finished loading data
finished loading data
finished loading data
finished loading data


In [31]:
def _scrape_hotel_page(hotel_driver, hotel_name, facility_labels):
    """Read one hotel's attributes from the page loaded in ``hotel_driver``.

    Args:
        hotel_driver: Selenium Wire driver that has already opened the hotel link.
        hotel_name: Name of the hotel, used only in diagnostic messages.
        facility_labels: Facility / policy texts to look for on the page.

    Returns:
        dict with ``star_rating``, ``swimmingpool``, ``breakfast``,
        ``breakfast_and_dinner``, ``free_cancellation``, ``address`` and one
        0/1 flag per entry of ``facility_labels``.

    Raises:
        Whatever the address or facilities wait raises when the element does
        not appear in time (``WebDriverWait.until`` times out).
    """
    time.sleep(3)

    try:
        # The link may land on a page where the wanted hotel is the first entry:
        # click its name, which opens the hotel page in a new tab.
        hotel_page_btn = hotel_driver.find_element(By.CSS_SELECTOR, 'a[class="e13098a59f"]')
        hotel_page_btn.click()
        # wait (up to 30 seconds) for the hotel page request before going on
        hotel_driver.wait_for_request("/fragment.en-gb.json", timeout=30)
        # the second window handle is the tab with the hotel; switch to it
        hotel_driver.switch_to.window(hotel_driver.window_handles[1])
        time.sleep(1)
    except Exception as e:
        # Best effort: if any of the steps above fails, report it and keep
        # scraping whatever page is currently open.
        print(e)

    time.sleep(10)

    # number of stars = number of star icons inside the rating element
    star_rating = len(hotel_driver.find_elements(By.CSS_SELECTOR, 'span[data-testid="rating-stars"]  span[class="b6dc9a9e69 adc357e4f1 fe621d6382"]'))

    # meals: presence of the matching elements is what counts
    has_breakfast_and_dinner = int(bool(hotel_driver.find_elements(By.CSS_SELECTOR, 'span[class="bui-text--color-constructive"]')))
    has_breakfast = int(bool(hotel_driver.find_elements(By.CSS_SELECTOR, 'span[class="ungreen-condition-green"]')))

    # pool: one of the listed titles names an indoor or outdoor swimming pool
    pool_titles = hotel_driver.find_elements(By.CSS_SELECTOR, 'div[class="db29ecfbe2 f0d4d6a2f5"] span[class="db312485ba"]')
    has_pool = int(any(title.text in ("Indoor swimming pool", "Outdoor swimming pool") for title in pool_titles))

    # wait for the address element to be present on the page, then read it
    address_element = WebDriverWait(hotel_driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "#showMap2 > span.hp_address_subtitle.js-hp_address_subtitle.jq_tooltip")))
    address = address_element.text

    # wait for the facilities list, then turn it into plain text
    facility_elements = WebDriverWait(hotel_driver, 40).until(
        EC.visibility_of_all_elements_located((By.CSS_SELECTOR, 'span[class="db312485ba"]')))
    page_texts = [element.text for element in facility_elements]
    if not page_texts:
        print(hotel_name, "has empty facilities list")

    # Policies can be rendered with either of two class sets; try both.
    # Starting from an empty list guarantees that a hotel without policies
    # neither raises NameError nor reuses elements from a previous hotel.
    policy_services = []
    policy_wait = WebDriverWait(hotel_driver, 25)
    for policy_selector in ('div[class="db29ecfbe2 c90c0a70d3 a34d1a4138"]',
                            'div[class="bui-spacer--medium hotel-facilities-group__policy"]'):
        try:
            policy_services = policy_wait.until(
                EC.visibility_of_all_elements_located((By.CSS_SELECTOR, policy_selector)))
            break
        except Exception:
            # Typically a wait timeout; Exception (not a bare except) keeps
            # Ctrl-C / kernel interrupts working.
            continue
    else:
        print(hotel_name, "has empty policies list")
    page_texts.extend(policy.text for policy in policy_services)

    # check if there is free cancellation
    cancellation_items = hotel_driver.find_elements(By.CSS_SELECTOR, 'span[class="hprt-item--emphasised"]')
    has_free_cancellation = int(any(item.text == "Free cancellation" for item in cancellation_items))

    page_fields = {
        'star_rating': star_rating,
        'swimmingpool': has_pool,
        'breakfast': has_breakfast,
        'breakfast_and_dinner': has_breakfast_and_dinner,
        'free_cancellation': has_free_cancellation,
        'address': address,
    }
    # 1 if the exact label text appears among the facilities / policies, else 0
    texts_on_page = set(page_texts)
    for label in facility_labels:
        page_fields[label] = 1 if label in texts_on_page else 0

    time.sleep(3)
    return page_fields


def get_parameters_from_hotels(df_links):
    """Visit every hotel in ``df_links`` and build a table of its parameters.

    Args:
        df_links: DataFrame with the columns ``hotel_names``, ``links`` and
            ``distance_from_center``. Any index is accepted (for example a
            slice of a larger frame).

    Returns:
        pandas.DataFrame with one row per hotel: ``hotel_names``,
        ``star_rating``, ``distance_from_center``, ``swimmingpool``,
        ``breakfast``, ``breakfast_and_dinner``, ``free_cancellation``,
        ``address`` followed by one 0/1 column per facility / policy label.

    Raises:
        Exceptions raised while scraping a hotel page propagate to the caller;
        the browser opened for that hotel is shut down first.
    """
    facility_labels = [
        "Air conditioning", "Heating", "WiFi is available in all areas and is free of charge.",
        "City view", "Wardrobe or closet", "Soundproofing", "TV", "Coffee machine",
        "Private bathroom", "Lift", "Safety deposit box", "Pets are allowed. Charges may be applicable.",
        "24-hour front desk", "Bar", "Room service", "Fitness centre", "Spa lounge/relaxation area",
        "Electric kettle", "Minibar", "Tea/Coffee maker", "Restaurant", "Parking garage",
        "Airport shuttle", "Non-smoking rooms", "24-hour security", "Towels",
        "Facilities for disabled guests", "Hairdryer", "Iron", "Electric vehicle charging station",
    ]
    # the data taken from other parts of the page (not the facilities list) comes first
    base_columns = ['hotel_names', 'star_rating', 'distance_from_center',
                    'swimmingpool', 'breakfast', 'breakfast_and_dinner',
                    'free_cancellation', 'address']

    records = []
    # iterrows() pairs every row with its own values, so the lookup works for
    # any index (positional .iloc with index labels only works for 0..n-1).
    for _, row in df_links.iterrows():
        hotel_driver = create_driver(row['links'])
        try:
            page_fields = _scrape_hotel_page(hotel_driver, row['hotel_names'], facility_labels)
        finally:
            # always release the browser of this hotel, also when scraping failed
            hotel_driver.quit()
        record = {'hotel_names': row['hotel_names'],
                  'distance_from_center': row['distance_from_center']}
        record.update(page_fields)
        records.append(record)

    # explicit column list keeps the column order (and the columns of an empty result)
    return pd.DataFrame(records, columns=base_columns + facility_labels)

Get the links for the hotels in the city from the csv file we saved earlier.

In [13]:
def get_links_from_file(city):
    """Load the saved hotel-links table of ``city``.

    Reads ``./data/<city>_hotel_links`` as CSV and returns it as a DataFrame
    (hotel names, links and distance from center, as saved earlier).
    """
    links_path = "".join(["./data/", city, "_hotel_links"])
    return pd.read_csv(links_path)

Go through the list of hotels and extract the wanted parameters from each hotel's page.

In [29]:
def get_parameters_for_all_cities(cities=("lyon",), max_hotels=5):
    """Scrape and save the hotel parameters of the given cities.

    For every city the saved links file is loaded, the hotel pages are scraped
    with ``get_parameters_from_hotels`` and the result is written to
    ``./data/<city>_hotels_parameters`` in CSV format.

    Args:
        cities: Names of the cities to process. A links file must already
            exist for each of them (e.g. "paris", "lyon", "marseille",
            "nice"). Defaults to Lyon only.
        max_hotels: Number of hotels, taken from the top of the links file, to
            scrape per city. ``None`` scrapes every hotel in the file.
            Defaults to 5.
    """
    for city in cities:
        df_city_links = get_links_from_file(city)
        if max_hotels is not None:
            df_city_links = df_city_links.head(max_hotels)
        # get dataframe with all the parameters of the hotels
        df_parameters = get_parameters_from_hotels(df_city_links)
        # save the information in a csv file
        file_name = "./data/" + city + "_hotels_parameters"
        df_parameters.to_csv(file_name, index=False)
        time.sleep(10)
        print("finished saving parameters for {} hotels".format(city))
        print(df_parameters)
        print("----------------")
    

In [32]:
# Scrape the hotel pages and write ./data/<city>_hotels_parameters for each processed city.
get_parameters_for_all_cities()

TimeoutException: Message: 


In [None]:
def get_parameters_for_file(city):
    """Load the saved hotel-parameters table of ``city``.

    Args:
        city: City name used in the file name ``./data/<city>_hotels_parameters``.

    Returns:
        pandas.DataFrame whose columns are taken from the file's header row.
    """
    file_name = "./data/" + city + "_hotels_parameters"
    # The file is written with to_csv(index=False), i.e. with a header row.
    # Reading it with header=None would turn that header into a data row and
    # replace the column names with 0..n-1.
    df_city_parameters = pd.read_csv(file_name)
    return df_city_parameters

In [None]:
# Load the saved parameters table for Lyon and display it.
df = get_parameters_for_file("lyon")
df

In [None]:
# for checking: scrape the first five Lyon hotels and display the result
# (get_links_from_file is the loader defined in this notebook; `get_links` does not exist)
df_links = get_links_from_file("lyon")


df_lyon = get_parameters_from_hotels(df_links.head(5))
df_lyon

In [None]:
# Show the link stored under index 2 among the first five rows (df_links must be loaded by the cell above).
df_links.head(5)['links'][2]

In [None]:
# for checking: open one hotel link by hand and probe the selectors used for scraping
link ="https://www.booking.com/hotel/fr/etap-nice-aeroport.en-gb.html?aid=318615&label=Catch_All-EN-131006968001-bPiN0WYm7x7ddzlXSroMLwS548793046706%3Apl%3Ata%3Ap1%3Ap2%3Aac%3Aap%3Aneg%3Afi%3Atiaud-294889297093%3Adsa-1642216383571%3Alp1008002%3Ali%3Adec%3Adm&sid=63acef8e80078e4e369e8ece284dec93&all_sr_blocks=38782203_115246907_3_2_0;checkin=2023-01-10;checkout=2023-01-14;dest_id=-1454990;dest_type=city;dist=0;group_adults=2;group_children=0;hapos=4;highlighted_blocks=38782203_115246907_3_2_0;hpos=4;matching_block_id=38782203_115246907_3_2_0;no_rooms=1;req_adults=2;req_children=0;room1=A%2CA;sb_price_type=total;sr_order=popularity;sr_pri_blocks=38782203_115246907_3_2_0__23100;srepoch=1673292021;srpvid=e4bf87f976660261;type=total;ucfs=1&#hotelTmpl"
hotel_driver = create_driver(link)

try:
    hotel_page_btn = hotel_driver.find_element(By.CSS_SELECTOR, 'a[class="fc63351294 a822bdf511 d4b6b7a9e7 fa565176a8 f7db01295e f4605622ad b2f0d6a80e"]')
    hotel_page_btn.click()
    # wait for the next page to load before acquiring more data or till timeout in 20 seconds
    hotel_driver.wait_for_request("/fragment.en-gb.json", timeout=20)
    window_after = hotel_driver.window_handles[1]
    # switches the current window to the new tab
    hotel_driver.switch_to.window(window_after)

    # free cancellation: print 1 for every emphasised item with that exact text
    l = hotel_driver.find_elements(By.CSS_SELECTOR, 'span[class="hprt-item--emphasised"]')
    for f in l:
        if f.text == "Free cancellation":
            print(1)

    address = hotel_driver.find_element(By.CSS_SELECTOR, 'p[class="address address_clean"] span[data-node_tt_id="location_score_tooltip"]').text
finally:
    # always release the browser, even when one of the lookups above fails
    hotel_driver.quit()

In [None]:
# for checking: open one hotel link by hand and probe the selectors used for scraping
link ="https://www.booking.com/hotel/fr/etap-nice-aeroport.en-gb.html?aid=318615&label=Catch_All-EN-131006968001-bPiN0WYm7x7ddzlXSroMLwS548793046706%3Apl%3Ata%3Ap1%3Ap2%3Aac%3Aap%3Aneg%3Afi%3Atiaud-294889297093%3Adsa-1642216383571%3Alp1008002%3Ali%3Adec%3Adm&sid=63acef8e80078e4e369e8ece284dec93&all_sr_blocks=38782203_115246907_3_2_0;checkin=2023-01-10;checkout=2023-01-14;dest_id=-1454990;dest_type=city;dist=0;group_adults=2;group_children=0;hapos=4;highlighted_blocks=38782203_115246907_3_2_0;hpos=4;matching_block_id=38782203_115246907_3_2_0;no_rooms=1;req_adults=2;req_children=0;room1=A%2CA;sb_price_type=total;sr_order=popularity;sr_pri_blocks=38782203_115246907_3_2_0__23100;srepoch=1673292021;srpvid=e4bf87f976660261;type=total;ucfs=1&#hotelTmpl"
hotel_driver = create_driver(link)

try:
    hotel_page_btn = hotel_driver.find_element(By.CSS_SELECTOR, 'a[class="fc63351294 a822bdf511 d4b6b7a9e7 fa565176a8 f7db01295e f4605622ad b2f0d6a80e"]')
    hotel_page_btn.click()
    # wait for the next page to load before acquiring more data or till timeout in 20 seconds
    hotel_driver.wait_for_request("/fragment.en-gb.json", timeout=20)
    window_after = hotel_driver.window_handles[1]
    # switches the current window to the new tab
    hotel_driver.switch_to.window(window_after)

    # free cancellation: print 1 for every emphasised item with that exact text
    l = hotel_driver.find_elements(By.CSS_SELECTOR, 'span[class="hprt-item--emphasised"]')
    for f in l:
        if f.text == "Free cancellation":
            print(1)

    address = hotel_driver.find_element(By.CSS_SELECTOR, 'p[class="address address_clean"] span[data-node_tt_id="location_score_tooltip"]').text
finally:
    # always release the browser, even when one of the lookups above fails
    hotel_driver.quit()