In [None]:
"""
https://www.scrapingbee.com/blog/web-scraping-booking/
https://selenium-python.readthedocs.io/locating-elements.html

"""

In [None]:
pip install selenium selenium-wire webdriver-manager

In [1]:
import os

import numpy as np
import pandas as pd
import selenium
# from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from seleniumwire import webdriver
from webdriver_manager.chrome import ChromeDriverManager


In [2]:
def create_driver(url):
    """Start a Chrome session and navigate it to ``url``.

    Args:
        url: Address of the page to load.

    Returns:
        The ``webdriver.Chrome`` instance after ``get(url)`` has returned.
        The caller owns the driver and must call ``quit()`` on it.

    Raises:
        Whatever ``webdriver.Chrome()`` or ``driver.get`` raises. If the
        navigation fails, the browser is shut down before the error is
        re-raised so no Chrome process is left behind.
    """
    driver = webdriver.Chrome()
    try:
        driver.get(url)
    except BaseException:
        # Includes KeyboardInterrupt: interrupting a slow page load from the
        # notebook must not orphan the browser that was just started.
        driver.quit()
        raise
    return driver

In [None]:
def get_hotel_links(city_driver):
    """Collect name, link and distance for every hotel card in a search result.

    Walks through all result pages of the search that ``city_driver`` is
    currently showing, reading each ``property-card`` element, and clicks
    "Next page" between pages.

    Args:
        city_driver: A selenium-wire driver already showing the first page of
            search results. The driver is always quit before this function
            returns or raises, so it cannot be reused afterwards.

    Returns:
        pandas.DataFrame with the columns ``hotel_names``, ``links`` and
        ``distance_from_center``, one row per hotel card found.

    Raises:
        Any Selenium / selenium-wire error raised while locating elements or
        waiting for the next page (the driver is still shut down).
    """
    names = []
    links_for_hotel_pages = []
    distance_from_center = []

    try:
        # implicitly_wait is a session-wide setting for element lookups, not a
        # pause, so it is configured once instead of after every page turn.
        city_driver.implicitly_wait(10)

        # A search that fits on one page shows no pagination widget; treat
        # that as a single page instead of failing on the missing element.
        pagination_items = city_driver.find_elements(By.CSS_SELECTOR, 'div[data-testid="pagination"]  li:last-child')
        number_of_pages = int(pagination_items[0].text) if pagination_items else 1

        for page in range(1, number_of_pages + 1):
            # forget the requests captured so far, so the wait below only
            # reacts to traffic caused by the upcoming click
            del city_driver.requests
            # get hotel cards from search results
            hotels_from_search = city_driver.find_elements(By.CSS_SELECTOR, 'div[data-testid="property-card"]')
            # get info from each hotel card
            for hotel_card in hotels_from_search:
                names.append(hotel_card.find_element(By.CSS_SELECTOR, 'div[data-testid="title"]').text)
                links_for_hotel_pages.append(hotel_card.find_element(By.CSS_SELECTOR, 'a[data-testid="title-link"]').get_attribute('href'))
                distance_from_center.append(hotel_card.find_element(By.CSS_SELECTOR, 'span[data-testid="distance"]').text)
            # on the last page there is no next page to open
            if page == number_of_pages:
                break
            # click on next page button
            next_page_btn = city_driver.find_element(By.XPATH, '//button[contains(@aria-label, "Next page")]')
            next_page_btn.click()
            # wait (at most 15 seconds) until the request that loads the next page is seen
            # NOTE(review): this confirms the request was sent, not that the new
            # cards are rendered - confirm the cards read next are not stale.
            city_driver.wait_for_request("/dml/graphql", timeout=15)
    finally:
        # quit() ends the whole session (all windows), also when scraping failed.
        city_driver.quit()

    hotels_info = {"hotel_names": names, "links": links_for_hotel_pages, 'distance_from_center': distance_from_center}
    return pd.DataFrame(hotels_info)



In [None]:
# Paris
# Search-result URL for Paris; the query string fixes the stay to
# checkin=2023-01-10 / checkout=2023-01-14 for 2 adults in 1 room.
# NOTE(review): `sid` and `label` look like values copied from one browser session - confirm they are still needed.
paris_url = "https://www.booking.com/searchresults.en-gb.html?label=Catch_All-EN-131006968001-bPiN0WYm7x7ddzlXSroMLwS548793046706%3Apl%3Ata%3Ap1%3Ap2%3Aac%3Aap%3Aneg%3Afi%3Atiaud-294889297093%3Adsa-1642216383571%3Alp1008002%3Ali%3Adec%3Adm&sid=176ab549c47fbf99c9ee1b8552278873&aid=318615&ss=Paris&ssne=Paris&ssne_untouched=Paris&efdco=1&lang=en-gb&src=index&dest_id=-1456928&dest_type=city&checkin=2023-01-10&checkout=2023-01-14&group_adults=2&no_rooms=1&group_children=0&sb_travel_purpose=leisure&nflt=ht_id%3D204"
# Open the search page in Chrome, then scrape every result page.
# get_hotel_links shuts the driver down itself, so paris_driver is unusable afterwards.
paris_driver = create_driver(paris_url)
paris_links = get_hotel_links(paris_driver)
# Display the scraped table as the cell output.
paris_links

In [None]:
# Save the Paris table where get_links("paris") expects to find it.
# The target folder is created first: to_csv does not create missing directories.
os.makedirs("./data", exist_ok=True)
file_name = "./data/" + "paris_hotel_links"
# The index is written on purpose; get_links removes the resulting 'Unnamed: 0' column.
paris_links.to_csv(file_name)

In [None]:
# Nice
# Search-result URL for Nice; the query string fixes the stay to
# checkin=2023-01-10 / checkout=2023-01-14 for 2 adults in 1 room.
# NOTE(review): `sid` and `label` look like values copied from one browser session - confirm they are still needed.
nice_url = "https://www.booking.com/searchresults.en-gb.html?label=Catch_All-EN-131006968001-bPiN0WYm7x7ddzlXSroMLwS548793046706%3Apl%3Ata%3Ap1%3Ap2%3Aac%3Aap%3Aneg%3Afi%3Atiaud-294889297093%3Adsa-1642216383571%3Alp1008002%3Ali%3Adec%3Adm&sid=176ab549c47fbf99c9ee1b8552278873&aid=318615&ss=Nice%2C+Provence-Alpes-C%C3%B4te+d%27Azur%2C+France&ssne=Marseille&ssne_untouched=Marseille&lang=en-gb&src=searchresults&dest_id=-1454990&dest_type=city&ac_position=0&ac_click_type=b&ac_langcode=en&ac_suggestion_list_length=5&search_selected=true&search_pageview_id=b7cb7f7427a70019&ac_meta=GhBiN2NiN2Y3NDI3YTcwMDE5IAAoATICZW46BG5pY2VAAEoAUAA%3D&checkin=2023-01-10&checkout=2023-01-14&group_adults=2&no_rooms=1&group_children=0&sb_travel_purpose=leisure&nflt=ht_id%3D204"
# Open the search page in Chrome, then scrape every result page.
# get_hotel_links shuts the driver down itself, so nice_driver is unusable afterwards.
nice_driver = create_driver(nice_url)
nice_links = get_hotel_links(nice_driver)
# Display the scraped table as the cell output.
nice_links

In [None]:
# Save the Nice table where get_links("nice") expects to find it.
# The target folder is created first: to_csv does not create missing directories.
os.makedirs("./data", exist_ok=True)
file_name = "./data/" + "nice_hotel_links"
# The index is written on purpose; get_links removes the resulting 'Unnamed: 0' column.
nice_links.to_csv(file_name)

In [None]:
# Marseille
# Search-result URL for Marseille; the query string fixes the stay to
# checkin=2023-01-10 / checkout=2023-01-14 for 2 adults in 1 room.
# NOTE(review): `sid` and `label` look like values copied from one browser session - confirm they are still needed.
marseille_url = "https://www.booking.com/searchresults.en-gb.html?label=Catch_All-EN-131006968001-bPiN0WYm7x7ddzlXSroMLwS548793046706%3Apl%3Ata%3Ap1%3Ap2%3Aac%3Aap%3Aneg%3Afi%3Atiaud-294889297093%3Adsa-1642216383571%3Alp1008002%3Ali%3Adec%3Adm&sid=176ab549c47fbf99c9ee1b8552278873&aid=318615&ss=Marseille%2C+Provence-Alpes-C%C3%B4te+d%27Azur%2C+France&ssne=Lyon&ssne_untouched=Lyon&lang=en-gb&src=searchresults&dest_id=-1449947&dest_type=city&ac_position=0&ac_click_type=b&ac_langcode=en&ac_suggestion_list_length=5&search_selected=true&search_pageview_id=17077f4de7c00140&ac_meta=GhAxNzA3N2Y0ZGU3YzAwMTQwIAAoATICZW46BG1hcnNAAEoAUAA%3D&checkin=2023-01-10&checkout=2023-01-14&group_adults=2&no_rooms=1&group_children=0&sb_travel_purpose=leisure&nflt=ht_id%3D204"
# Open the search page in Chrome, then scrape every result page.
# get_hotel_links shuts the driver down itself, so marseille_driver is unusable afterwards.
marseille_driver = create_driver(marseille_url)
marseille_links = get_hotel_links(marseille_driver)
# Display the scraped table as the cell output.
marseille_links

In [None]:
# Save the Marseille table where get_links("marseille") expects to find it.
# The target folder is created first: to_csv does not create missing directories.
os.makedirs("./data", exist_ok=True)
file_name = "./data/" + "marseille_hotel_links"
# The index is written on purpose; get_links removes the resulting 'Unnamed: 0' column.
marseille_links.to_csv(file_name)

In [None]:
# Lyon
# Search-result URL for Lyon; the query string fixes the stay to
# checkin=2023-01-10 / checkout=2023-01-14 for 2 adults in 1 room.
# NOTE(review): `sid` and `label` look like values copied from one browser session - confirm they are still needed.
lyon_url = "https://www.booking.com/searchresults.en-gb.html?label=Catch_All-EN-131006968001-bPiN0WYm7x7ddzlXSroMLwS548793046706%3Apl%3Ata%3Ap1%3Ap2%3Aac%3Aap%3Aneg%3Afi%3Atiaud-294889297093%3Adsa-1642216383571%3Alp1008002%3Ali%3Adec%3Adm&sid=176ab549c47fbf99c9ee1b8552278873&aid=318615&ss=Lyon%2C+Rh%C3%B4ne-Alps%2C+France&ssne=Paris&ssne_untouched=Paris&lang=en-gb&src=searchresults&dest_id=-1448468&dest_type=city&ac_position=0&ac_click_type=b&ac_langcode=en&ac_suggestion_list_length=5&search_selected=true&search_pageview_id=79e77f0759c207cf&ac_meta=GhA3OWU3N2YwNzU5YzIwN2NmIAAoATICZW46BGx5b25AAEoAUAA%3D&checkin=2023-01-10&checkout=2023-01-14&group_adults=2&no_rooms=1&group_children=0&sb_travel_purpose=leisure&nflt=ht_id%3D204"
# Open the search page in Chrome, then scrape every result page.
# get_hotel_links shuts the driver down itself, so lyon_driver is unusable afterwards.
lyon_driver = create_driver(lyon_url)
lyon_links = get_hotel_links(lyon_driver)
# Display the scraped table as the cell output.
lyon_links

In [None]:
# Save the Lyon table where get_links("lyon") expects to find it.
# The target folder is created first: to_csv does not create missing directories.
os.makedirs("./data", exist_ok=True)
file_name = "./data/" + "lyon_hotel_links"
# The index is written on purpose; get_links removes the resulting 'Unnamed: 0' column.
lyon_links.to_csv(file_name)

In [9]:
def get_links(city):
    """Load the saved hotel-link table of one city.

    Reads ``./data/<city>_hotel_links`` as CSV and removes the
    ``'Unnamed: 0'`` column (the index column written when the table was
    saved).

    Args:
        city: City name used as the file-name prefix, e.g. ``"lyon"``.

    Returns:
        pandas.DataFrame with the remaining columns of the file (hotel names,
        links and distance from center).

    Raises:
        KeyError: If the file has no ``'Unnamed: 0'`` column.
    """
    saved_table = pd.read_csv("./data/" + city + "_hotel_links")
    return saved_table.drop(columns='Unnamed: 0')

In [None]:
def check_for_facility(facility, facilities=None):
    """Return 1 when ``facility`` is among the given facilities, else 0.

    Args:
        facility: The facility value to look for.
        facilities: Optional container to search in. When omitted, the
            notebook-level variable ``facilities_list`` is used, which keeps
            the original one-argument call working.

    Returns:
        int: 1 if ``facility in facilities``, otherwise 0.

    Raises:
        NameError: If ``facilities`` is omitted and no global
            ``facilities_list`` exists. Passing ``facilities`` explicitly
            avoids this dependency on hidden notebook state.
    """
    if facilities is None:
        # Legacy fallback: look the container up in the notebook globals.
        facilities = facilities_list
    return 1 if facility in facilities else 0

In [3]:
#def get_parameters_from_hotel(hotel_name, link):
def _collect_texts(elements):
    """Return the stripped ``.text`` of every element as a set."""
    return {element.text.strip() for element in elements}


def func(df_links):
    """Visit every hotel page listed in ``df_links`` and extract its features.

    For each row a fresh driver is opened with ``create_driver``, the page is
    inspected, and the driver is quit again (also when scraping fails).

    Args:
        df_links: DataFrame with the columns ``hotel_names``, ``links`` and
            ``distance_from_center``. Rows are processed in order, whatever
            the index looks like.

    Returns:
        dict mapping a column name to a list with one entry per row of
        ``df_links``: ``hotel_names``, ``star_rating``,
        ``distance_from_center``, ``swimmingpool``, ``breakfast``,
        ``breakfast_and_dinner``, ``address`` and one 0/1 list per facility
        name (including ``TV``).

    Raises:
        Any error raised by ``create_driver`` or by the element lookups.
    """
    # free cancelation
    facility_flags = {"Air conditioning": [], "Heating": [], "WiFi is available in all areas and is free of charge.": [],
                      "City view": [], "Wardrobe or closet": [], "Soundproofing": [], "TV": [], "Coffee machine": [],
                      "Private bathroom": [], "Lift": [], "Safety deposit box": [], "Pets are allowed. Charges may be applicable.": [],
                      "24-hour front desk": [], "Bar": [], "Room service": [], "Fitness centre": [], "Spa lounge/relaxation area": [],
                      "Electric kettle": [], "Minibar": [], "Tea/Coffee maker": [], "Restaurant": [], "Parking garage": [],
                      "Airport shuttle": [], "Non-smoking rooms": [], "24-hour security": [], "Towels": [],
                      "Facilities for disabled guests": [], "Hairdryer": [], "Iron": [], "Electric vehicle charging station": []}
    names = []
    stars = []
    distance = []
    pool = []
    breakfast = []
    breakfast_and_dinner = []
    addresses = []

    # Iterate over the row values directly: this is independent of the index
    # labels and yields the per-row scalar instead of the whole column.
    rows = zip(df_links['hotel_names'], df_links['links'], df_links['distance_from_center'])
    for hotel_name, link, hotel_distance in rows:
        # create the driver for this hotel
        hotel_driver = create_driver(link)
        try:
            # number of stars = number of star icons inside the rating widget
            # NOTE(review): the class names look auto-generated - confirm they still match the live page.
            number_of_stars = len(hotel_driver.find_elements(By.CSS_SELECTOR, 'span[data-testid="rating-stars"]  span[class="b6dc9a9e69 adc357e4f1 fe621d6382"]'))
            # facility descriptions plus the policy entries of the facilities section
            facility_elements = hotel_driver.find_elements(By.CSS_SELECTOR, 'div[class="hotel-facilities__list"] div[class="bui-list__description"]')
            policy_elements = hotel_driver.find_elements(By.CSS_SELECTOR, 'div[class="hotel-facilities__list"] div[class="bui-spacer--medium hotel-facilities-group__policy"]')
            # Compare against the visible text: the elements themselves never equal a string.
            # Assumes the page text matches the facility names exactly - TODO confirm.
            facility_texts = _collect_texts(facility_elements) | _collect_texts(policy_elements)
            # find if there are meals
            # NOTE(review): presence of these spans is taken as the meal indicator - confirm against the page.
            has_breakfast_and_dinner = len(hotel_driver.find_elements(By.CSS_SELECTOR, 'span[class="bui-text--color-constructive"]')) > 0
            has_breakfast = len(hotel_driver.find_elements(By.CSS_SELECTOR, 'span[class="ungreen-condition-green"]')) > 0
            # find if there is a pool
            titles = hotel_driver.find_elements(By.CSS_SELECTOR, 'div[class="bui-title bui-title--strong_1 bui-spacer--medium hotel-facilities-group__title"] div[class="bui-title__text hotel-facilities-group__title-text"]')
            has_pool = any(title.text == "Indoor swimming pool" for title in titles)
            # get address; a page without the element yields None instead of aborting the whole run
            address_elements = hotel_driver.find_elements(By.CSS_SELECTOR, 'span[class="hp_address_subtitle js-hp_address_subtitle jq_tooltip"]')
            address = address_elements[0].text if address_elements else None
        finally:
            # always release the browser of this hotel, also on errors
            hotel_driver.quit()

        # Append only after the page was read completely so all lists stay aligned.
        names.append(hotel_name)
        stars.append(number_of_stars)
        distance.append(hotel_distance)
        breakfast_and_dinner.append(1 if has_breakfast_and_dinner else 0)
        breakfast.append(1 if has_breakfast else 0)
        pool.append(1 if has_pool else 0)
        addresses.append(address)
        for parameter, flags in facility_flags.items():
            flags.append(1 if parameter in facility_texts else 0)

    # 'TV' is listed here to keep its column position; its values are the facility flags.
    full_data_city = {'hotel_names': names, 'star_rating': stars, 'distance_from_center': distance, 'TV': facility_flags['TV'],
                      'swimmingpool': pool, 'breakfast': breakfast, 'breakfast_and_dinner': breakfast_and_dinner,
                      'address': addresses,
                      }
    # merge the dictionaries
    full_data_city.update(facility_flags)
    return full_data_city

In [36]:
# for checking
# Scratch cell: open one hotel page, follow its title link in a second browser,
# and make sure both browsers are shut down again, also when a lookup fails.
link = "https://www.booking.com/hotel/fr/globe-et-cecil.en-gb.html?label=Catch_All-EN-131006968001-bPiN0WYm7x7ddzlXSroMLwS548793046706%3Apl%3Ata%3Ap1%3Ap2%3Aac%3Aap%3Aneg%3Afi%3Atiaud-294889297093%3Adsa-1642216383571%3Alp1008002%3Ali%3Adec%3Adm&sid=176ab549c47fbf99c9ee1b8552278873&aid=318615&ucfs=1&arphpl=1&checkin=2023-01-10&checkout=2023-01-14&dest_id=-1448468&dest_type=city&group_adults=2&req_adults=2&no_rooms=1&group_children=0&req_children=0&hpos=2&hapos=2&sr_order=popularity&nflt=ht_id%3D204&srpvid=67c880e6395a05e0&srepoch=1673288398&all_sr_blocks=25601002_195446048_0_2_0&highlighted_blocks=25601002_195446048_0_2_0&matching_block_id=25601002_195446048_0_2_0&sr_pri_blocks=25601002_195446048_0_2_0__75820&from_sustainable_property_sr=1&from=searchresults#hotelTmpl"

hotel_driver = create_driver(link)
new_hotel_driver = None
try:
    hotel_driver.delete_all_cookies()
    # NOTE(review): "title-link" is the test id read from search-result cards elsewhere in this notebook -
    # confirm it also exists on a hotel page, otherwise this lookup raises.
    new_link = hotel_driver.find_element(By.CSS_SELECTOR, 'a[data-testid="title-link"]').get_attribute('href')
    new_hotel_driver = create_driver(new_link)
    # Disabled probe: list the emphasised room items.
    # l = hotel_driver.find_elements(By.CSS_SELECTOR, 'span[class="hprt-item--emphasised"]')
    # for f in l:
    #     print(f.text)
    # print("l: ",l)
    # Disabled probe: print the address.
    # address = hotel_driver.find_element(By.CSS_SELECTOR, 'p[class="address address_clean"] span[class="hp_address_subtitle js-hp_address_subtitle jq_tooltip"]').text
    # print(address)
finally:
    # The second browser was never closed before; quit both sessions here.
    if new_hotel_driver is not None:
        new_hotel_driver.quit()
    hotel_driver.quit()

In [35]:
# Load the saved Lyon table and show the link stored under index label 1.
get_links("lyon")['links'][1]

'https://www.booking.com/hotel/fr/globe-et-cecil.en-gb.html?label=Catch_All-EN-131006968001-bPiN0WYm7x7ddzlXSroMLwS548793046706%3Apl%3Ata%3Ap1%3Ap2%3Aac%3Aap%3Aneg%3Afi%3Atiaud-294889297093%3Adsa-1642216383571%3Alp1008002%3Ali%3Adec%3Adm&sid=176ab549c47fbf99c9ee1b8552278873&aid=318615&ucfs=1&arphpl=1&checkin=2023-01-10&checkout=2023-01-14&dest_id=-1448468&dest_type=city&group_adults=2&req_adults=2&no_rooms=1&group_children=0&req_children=0&hpos=2&hapos=2&sr_order=popularity&nflt=ht_id%3D204&srpvid=67c880e6395a05e0&srepoch=1673288398&all_sr_blocks=25601002_195446048_0_2_0&highlighted_blocks=25601002_195446048_0_2_0&matching_block_id=25601002_195446048_0_2_0&sr_pri_blocks=25601002_195446048_0_2_0__75820&from_sustainable_property_sr=1&from=searchresults#hotelTmpl'