In [1]:
#Import relevant libraries
from bs4 import BeautifulSoup
import requests
from selenium.webdriver import Chrome
import pandas as pd

In [2]:
#Download the Booking.com landing page and parse it into THE SOUP.
#The timeout stops the cell from hanging forever on a stalled connection, and
#raise_for_status() makes an HTTP error fail here rather than later, when an
#error page would simply contain no carousels.
landing_response = requests.get("https://booking.com", timeout=30)
landing_response.raise_for_status()
main_html_link = landing_response.text  #NOTE: despite the name, this holds the page's HTML text, not a link
soup = BeautifulSoup(main_html_link, 'lxml')

#Column store that is later turned into a Pandas DataFrame: one list per column.
#Every scraped hotel card appends exactly one value to each of these lists.
hotel_dict = {
    column_name: []
    for column_name in (
        "Location",
        "HotelName",
        "NumAdults",
        "NumChildren",
        "DaysStayed",
        "Bed",
        "Rating",
        "Price",
    )
}

In [3]:
#Navigate to the web carousels on the landing page; one of them lists the regions.
carousels = soup.find_all("ul", attrs = {"class":"bui-carousel__inner", "data-bui-ref":"carousel-container"})
carousel_item_attrs = {"class":"bui-carousel__item","data-bui-ref":"carousel-item"}

#Look for the carousel with the regions on it.  The loop is necessary because the order of the carousels changes upon reload.
#`regions` starts as an empty list so that a page without a matching carousel is
#reported clearly below instead of surfacing as a NameError.
regions = []
for carousel in carousels:
    first_item = carousel.find('li', attrs = carousel_item_attrs)
    first_anchor = first_item.a if first_item is not None else None
    #Carousels whose first item has no link (or no href) cannot be the region carousel; skip them.
    if first_anchor is None:
        continue
    if "https://www.booking.com/searchresults" in first_anchor.get('href', ''):
        regions = carousel.find_all('li', attrs = carousel_item_attrs)
        break

if not regions:
    raise RuntimeError("No carousel linking to https://www.booking.com/searchresults was found on the landing page")

#Extract all the links to each region's hotels, ignoring items that carry no usable link.
region_links = [
    region.a['href']
    for region in regions
    if region.a is not None and region.a.has_attr('href')
]
for link in region_links:
    print(link)
    

https://www.booking.com/searchresults.html?dest_id=20079110&dest_type=city&
https://www.booking.com/searchresults.html?dest_id=20117718&dest_type=city&
https://www.booking.com/searchresults.html?dest_id=20088325&dest_type=city&
https://www.booking.com/searchresults.html?dest_id=20023488&dest_type=city&
https://www.booking.com/searchresults.html?dest_id=20058684&dest_type=city&
https://www.booking.com/searchresults.html?dest_id=20015725&dest_type=city&
https://www.booking.com/searchresults.html?dest_id=20023182&dest_type=city&
https://www.booking.com/searchresults.html?dest_id=20014181&dest_type=city&
https://www.booking.com/searchresults.html?dest_id=20142297&dest_type=city&
https://www.booking.com/searchresults.html?dest_id=20124359&dest_type=city&


In [4]:
#Used to Scrape each region link
def ScrapeRegion(region_link, days, numAdults, numChildren, checkin="2022-08-14",
                 driver_path="/Users/blakechang/Programming/chromedriver"):
    """Scrape every hotel card on one region's search-results page into hotel_dict.

    Parameters
    ----------
    region_link : str
        Search-results URL of the region. The query parameters are appended
        directly, so the link is expected to end with "?" or "&".
    days : int
        Length of the stay in days; the check-out date is `checkin` + `days`.
    numAdults : int
        Number of adults placed in the search URL and recorded for every row.
    numChildren : int
        Number of children placed in the search URL and recorded for every row.
    checkin : str, optional
        Check-in date formatted as YYYY-MM-DD. Defaults to the date the
        notebook was originally run with.
    driver_path : str, optional
        Path to the chromedriver executable handed to Selenium.

    Returns
    -------
    None
        For each hotel card found, one value is appended to the "NumAdults",
        "NumChildren", "DaysStayed" and "Location" columns of hotel_dict, and
        the card is passed to ScrapeHotel.

    Raises
    ------
    ValueError
        If `checkin` is not a valid YYYY-MM-DD date.
    """
    from datetime import datetime, timedelta

    #Real date arithmetic: "14 + days" pasted into the day field produced
    #impossible dates such as 2022-08-34 for any stay that crossed the month end.
    checkin_date = datetime.strptime(checkin, "%Y-%m-%d").date()
    checkout_date = checkin_date + timedelta(days=days)

    #creates the relevant url based on the parameters given
    setCheckInCheckOut = (
        f"checkin={checkin_date.isoformat()}&checkout={checkout_date.isoformat()}"
        f"&group_adults={numAdults}&no_rooms=1&group_children={numChildren}"
        "&sb_travel_purpose=leisure"
    )
    filterByHotel = "&nflt=%3Bht_id%3D204"
    working_link = region_link + setCheckInCheckOut + filterByHotel

    #Since the website needs to be scraped after all the javascript is loaded,
    #Selenium is used instead of requests to get the HTML String.
    #NOTE(review): `executable_path` is deprecated in Selenium 4 in favour of a
    #Service object - confirm against the installed Selenium version.
    driver = Chrome(executable_path=driver_path)
    try:
        driver.get(working_link)
        soup = BeautifulSoup(driver.page_source, 'lxml')
    finally:
        #Always shut the browser down, even when loading the page fails,
        #so a failed scrape does not leave a Chrome process behind.
        driver.quit()

    #Finds the Region Name: the part of the page heading before the first ":".
    heading = soup.find('h1', attrs = {"class":["e1f827110f","d3a14d00da"]})
    region = heading.text.split(":")[0] if heading is not None else None

    #Each Hotel listing is on a hotelcard, which has all the relevant information.  We find each hotel card to scrape it.
    hotelcards = soup.find_all('div', attrs = {"data-testid":"property-card"})
    for hotelcard in hotelcards:
        hotel_dict["NumAdults"].append(numAdults)
        hotel_dict["NumChildren"].append(numChildren)
        hotel_dict["DaysStayed"].append(days)
        hotel_dict["Location"].append(region)
        ScrapeHotel(hotelcard)
    
def ScrapeHotel(hotelcard):
    """Append one hotel card's price, rating, bed description and name to hotel_dict.

    Each field is looked up on `hotelcard` with `find` and the `.text` of the
    located element is appended to the matching hotel_dict column. When a
    lookup raises AttributeError (for example because `find` returned None),
    None is appended instead, so every column grows by exactly one entry.
    """
    #(hotel_dict column, function returning the element whose text fills that column)
    field_locators = (
        ("Price", lambda card: card.find('span', attrs={"class": ["fcab3ed991", "bd73d13072"]})),
        ("Rating", lambda card: card.find('div', attrs={"class": ["b5cd09854e", "d10a6220b4"]})),
        #The bed description is the text of the first <div> nested in the matched element.
        ("Bed", lambda card: card.find('div', attrs={"class": "cb5b4b68a4"}).div),
        ("HotelName", lambda card: card.find('div', attrs={"class": ["fcab3ed991", "a23c043802"], "data-testid": "title"})),
    )

    for column, locate in field_locators:
        try:
            value = locate(hotelcard).text
        except AttributeError:
            value = None
        hotel_dict[column].append(value)



In [5]:
#Easily modifiable/changeable inputs for the question, "how many adults, how many children, how long"
AdultChildCombination = [(1,0), (2,0), (2,2)]
lengthsOfStay = [1,2,5,7]

#Start from empty columns so that re-running this cell replaces the previous
#scrape instead of appending a second copy of every row to hotel_dict.
for column in hotel_dict.values():
    column.clear()

#For each combination of length of stay and Adult/Child combination, scrape the data.
for region_link in region_links:
    for numAdult, numChild in AdultChildCombination:
        for lengthOfStay in lengthsOfStay:
            ScrapeRegion(region_link, lengthOfStay, numAdult, numChild)

In [7]:
#Build the DataFrame from the scraped columns: each hotel_dict key becomes a
#column and each position in the lists becomes a row.
hotels_df = pd.DataFrame.from_dict(hotel_dict)
hotels_df

Unnamed: 0,Location,HotelName,NumAdults,NumChildren,DaysStayed,Bed,Rating,Price
0,Las Vegas,Downtown Grand Hotel & Casino,1,0,1,2 queen beds,8.0,$45
1,Las Vegas,Fairfield Inn Las Vegas Convention Center,1,0,1,2 queen beds,8.1,$101
2,Las Vegas,Ellis Island Hotel Casino & Brewery,1,0,1,1 king bed,7.9,$40
3,Las Vegas,Polo Towers By Diamond Resorts,1,0,1,Private studio • 1 bathroom • 28m²,8.5,$71
4,Las Vegas,Plaza Hotel & Casino,1,0,1,Multiple bed types,7.6,$50
...,...,...,...,...,...,...,...,...
3092,Pigeon Forge,Tennessee Mountain Lodge Riverside by OYO,2,2,7,2 queen beds,6.9,$516
3093,Pigeon Forge,Timbers Lodge,2,2,7,2 queen beds,7.2,$710
3094,Pigeon Forge,Comfort Suites Mountain Mile Area,2,2,7,2 queen beds,6.5,$717
3095,Pigeon Forge,Guest Inn Pigeon Forge,2,2,7,2 queen beds,6.6,$505


In [8]:
#Save the DataFrame to a CSV.
#Passing the path lets pandas open the file itself. A text handle opened without
#newline="" can produce blank lines between rows on Windows, and without an
#explicit encoding the output depends on the machine's locale.
hotels_df.to_csv("hotels_df.csv", index=False, encoding="utf-8")